// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/ceph_crypto.h"
#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
using TOPNSPC::common::cmd_getval;

template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

using namespace ceph::osd::scheduler;

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Destructor is virtual so implementations can be destroyed via the base class.
  ~CopyCallback() override {}
};
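
/*
 * An illustrative sketch only (CopyFromCallback further down in this file
 * is the real in-tree implementer): a minimal CopyCallback could look
 * like the code below. Note that finish() takes ownership of the
 * CopyResults pointer.
 *
 * @code
 * struct MinimalCopyCallback : public PrimaryLogPG::CopyCallback {
 *   void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *     int r = results.get<0>();
 *     std::unique_ptr<PrimaryLogPG::CopyResults> res(results.get<1>());
 *     if (r == -ECANCELED)
 *       return;  // cancelled by the local OSD; nothing else to do
 *     // ... otherwise associate res with the copy operation we started ...
 *   }
 * };
 * @endcode
 */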

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
  }
  bool sync_finish(int r) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};

Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}
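
/*
 * Usage sketch (illustrative only): "blessing" wraps a completion so it
 * runs under the PG lock and is silently dropped if the PG has been
 * reset since the wrapper was created, e.g.
 *
 * @code
 * Context *fin = bless_context(new LambdaContext([this](int r) {
 *   // only runs if the PG has not been reset since the current epoch
 * }));
 * @endcode
 */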

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
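
/*
 * Illustrative sketch (not in-tree code): an op handler queues async
 * reads by appending (offset, length, flags) -> (buffer, completion)
 * entries to pending_async_reads; start_async_reads() then hands the
 * whole batch to the backend in a single objects_read_async() call:
 *
 * @code
 * ctx->pending_async_reads.push_back(
 *   make_pair(
 *     boost::make_tuple(offset, length, flags),
 *     make_pair(&osd_op.outdata,
 *               new SomeReadFinisher(...))));  // hypothetical completion
 * ctx->start_async_reads(pg);
 * @endcode
 */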

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;
  uint32_t truncate_seq;
  uint64_t truncate_size;
  bool have_truncate = false;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();
    // Only use truncate_{seq,size} from the original object if the client
    // did not send us these parameters
    if (!have_truncate) {
      truncate_seq = results->truncate_seq;
      truncate_size = results->truncate_size;
    }

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  void set_truncate(uint32_t seq, uint64_t size) {
    truncate_seq = seq;
    truncate_size = size;
    have_truncate = true;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
      recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  recovery_state.recover_got(
    recovery_info.soid,
    recovery_info.version,
    is_delete,
    *t);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  recovery_state.object_recovered(soid, stat_diff);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}

void PrimaryLogPG::replica_clear_repop_obc(
  const vector<pg_log_entry_t> &logv,
  ObjectStore::Transaction &t)
{
  for (auto &&e: logv) {
    /* Have to blast all clones, they share a snapset */
    object_contexts.clear_range(
      e.soid.get_object_boundary(), e.soid.get_head());
    ceph_assert(
      snapset_contexts.find(e.soid.get_head()) ==
      snapset_contexts.end());
  }
}

bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(recovery_state.has_peer_info(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= recovery_state.get_peer_info(peer).last_backfill;
  if (!should_send) {
    ceph_assert(is_backfill_target(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << recovery_state.get_peer_info(peer).last_backfill
             << ")" << dendl;
    return should_send;
  }
  if (is_async_recovery_target(peer) &&
      recovery_state.get_peer_missing(peer).is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
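
/*
 * Worked example (illustrative only): with last_backfill_started = "m"
 * and a backfill peer whose last_backfill = "g", an op touching object
 * "c" is sent (the peer already has it), while an op touching "t" is
 * not; the peer instead receives an empty transaction so its log and
 * ordering state stay in step. Objects still missing on an async
 * recovery target are likewise skipped until they have been recovered.
 */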

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return recovery_state.get_pg_log().get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
    } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
    return true;
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (is_async_recovery_target(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_target(peer) &&
        recovery_state.get_peer_info(peer).last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
{
  for (auto &i: get_async_recovery_targets()) {
    auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      dout(30) << __func__ << " " << soid << dendl;
      return true;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_head(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (recovery_state.get_pg_log().get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = recovery_state.get_pg_log().get_log().head.version;
  hobject_t soid;
  if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
    min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
    soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
       it != get_acting_recovery_backfill().end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = recovery_state.get_peer_missing().find(peer);
    if (it_missing != recovery_state.get_peer_missing().end() &&
        !it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

bool PrimaryLogPG::check_laggy(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
    return true;
  }
  if (state_test(PG_STATE_WAIT)) {
    dout(10) << __func__ << " PG is WAIT state" << dendl;
  } else if (!state_test(PG_STATE_LAGGY)) {
    auto mnow = osd->get_mnow();
    auto ru = recovery_state.get_readable_until();
    if (mnow <= ru) {
      // not laggy
      return true;
    }
    dout(10) << __func__
             << " mnow " << mnow
             << " > readable_until " << ru << dendl;

    if (!is_primary()) {
      osd->reply_op_error(op, -EAGAIN);
      return false;
    }

    // go to laggy state
    state_set(PG_STATE_LAGGY);
    publish_stats_to_osd();
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_back(op);
  op->mark_delayed("waiting for readable");
  return false;
}

bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    return true;
  }
  if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
    return true; // not laggy
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_front(op);
  op->mark_delayed("waiting for readable");
  return false;
}

void PrimaryLogPG::recheck_readable()
{
  if (!is_wait() && !is_laggy()) {
    dout(20) << __func__ << " wasn't wait or laggy" << dendl;
    return;
  }
  auto mnow = osd->get_mnow();
  bool pub = false;
  if (is_wait()) {
    auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
    if (mnow < prior_readable_until_ub) {
      dout(10) << __func__ << " still wait (mnow " << mnow
               << " < prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer wait (mnow " << mnow
               << " >= prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
      state_clear(PG_STATE_WAIT);
      recovery_state.clear_prior_readable_until_ub();
      pub = true;
    }
  }
  if (is_laggy()) {
    auto ru = recovery_state.get_readable_until();
    if (ru == ceph::signedspan::zero()) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << ", readable_until zero)" << dendl;
    } else if (mnow >= ru) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << " >= readable_until " << ru << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer laggy (mnow " << mnow
               << " < readable_until " << ru << ")" << dendl;
      state_clear(PG_STATE_LAGGY);
      pub = true;
    }
  }
  if (pub) {
    publish_stats_to_osd();
  }
  if (!is_laggy() && !is_wait()) {
    requeue_ops(waiting_for_readable);
  }
}

bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter.get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter.get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter.reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter.filter(sobj, bl);
}

std::pair<int, std::unique_ptr<const PGLSFilter>>
PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
{
  string type;
  // storing non-const PGLSFilter for the sake of ::init()
  std::unique_ptr<PGLSFilter> filter;

  try {
    decode(type, iter);
  }
  catch (buffer::error& e) {
    return { -EINVAL, nullptr };
  }

  if (type.compare("plain") == 0) {
    filter = std::make_unique<PGLSPlainFilter>();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return { -EINVAL, nullptr };
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = ClassHandler::get_instance().open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return { r, nullptr };
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return { -EINVAL, nullptr };
    }
    filter.reset(class_filter->fn());
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return { -EINVAL, nullptr };
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    return { -EINVAL, nullptr };
  } else {
    // Successfully constructed and initialized, return it.
    return std::make_pair(0, std::move(filter));
  }
}
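
/*
 * Filter type strings, for reference: "plain" selects the built-in
 * PGLSPlainFilter; anything else must have the form "<class>.<filter>"
 * (e.g. a hypothetical "myclass.myfilter") and is resolved through
 * ClassHandler against a loaded object class. Whatever remains in the
 * input iterator is then passed to the filter's init() as
 * filter-specific arguments.
 */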


// ==========================================================

void PrimaryLogPG::do_command(
  const string_view& orig_prefix,
  const cmdmap_t& cmdmap,
  const bufferlist& idata,
  std::function<void(int,const std::string&,bufferlist&)> on_finish)
{
  string format;
  cmd_getval(cmdmap, "format", format);
  std::unique_ptr<Formatter> f(Formatter::create(
    format, "json-pretty", "json-pretty"));
  int ret = 0;
  stringstream ss;   // stderr error message stream
  bufferlist outbl;  // if empty at end, we'll dump formatter as output

  // get final prefix:
  // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
  // - ceph tell <pgid> foo -> prefix=foo
  string prefix(orig_prefix);
  string command;
  cmd_getval(cmdmap, "cmd", command);
  if (command.size()) {
    prefix = command;
  }

  if (prefix == "query") {
    f->open_object_section("pg");
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    recovery_state.dump_peering_state(f.get());
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
  }

  else if (prefix == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.is_erasure()) {
        ss << "mode must be 'delete' for ec pool";
        ret = -EINVAL;
        goto out;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      ret = -EINVAL;
      goto out;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
                mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      ret = -EROFS;
      goto out;
    }

    uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      goto out;  // make command idempotent
    }

    if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      ret = -EINVAL;
      goto out;
    }

    mark_all_unfound_lost(mode, on_finish);
    return;
  }

  else if (prefix == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        ret = -EINVAL;
        goto out;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = recovery_state.get_missing_loc()
      .get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() &&
             num < cct->_conf->osd_command_max_records;
           ++p) {
        if (recovery_state.get_missing_loc().is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (auto &&r : recovery_state.get_missing_loc().get_locations(
                   p->first)) {
              f->dump_stream("shard") << r;
            }
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
  }

  else if (prefix == "scrub" ||
           prefix == "deep_scrub") {
    bool deep = (prefix == "deep_scrub");
    int64_t time;
    cmd_getval(cmdmap, "time", time, (int64_t)0);

    if (is_primary()) {
      const pg_pool_t *p = &pool.info;
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      if (deep) {
        p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
        p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      if (time == 0)
        stamp -= scrub_max_interval;
      else
        stamp -= (float)time;
      stamp -= 100.0;  // push back last scrub more for good measure
      if (deep) {
        set_last_deep_scrub_stamp(stamp);
      } else {
        set_last_scrub_stamp(stamp);
      }
      f->open_object_section("result");
      f->dump_bool("deep", deep);
      f->dump_stream("stamp") << stamp;
      f->close_section();
    } else {
      ss << "Not primary";
      ret = -EPERM;
    }
    outbl.append(ss.str());
  }

  else {
    ret = -ENOSYS;
    ss << "prefix '" << prefix << "' not implemented";
  }

 out:
  if (ret >= 0 && outbl.length() == 0) {
    f->flush(outbl);
  }
  on_finish(ret, ss.str(), outbl);
}
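
/*
 * For reference, these prefixes correspond to admin commands along the
 * lines of (exact CLI syntax may vary by release):
 *
 *   ceph tell <pgid> query
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *   ceph pg <pgid> list_unfound
 *   ceph pg <pgid> scrub  /  ceph pg <pgid> deep_scrub
 */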


// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    std::unique_ptr<const PGLSFilter> filter;
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
                 << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash()
                   << std::dec << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        encode(response, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        encode(response, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
}

int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
                           const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  recovery_state.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  auto m = op->get_req<MOSDBackoff>();
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  int msg_type = m->get_type();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.

    if (msg_type == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf()->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (msg_type == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      ceph_assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (recovery_state.needs_flush()) {
    dout(20) << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  ceph_assert(is_peered() && !recovery_state.needs_flush());
  if (pgbackend->handle_message(op))
    return;

  switch (msg_type) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (msg_type) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      auto m = op->get_req<MOSDScrubReserve>();
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    ceph_abort_msg("bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (const pg_shard_t& bt : get_backfill_targets()) {
    const pg_info_t &pi = recovery_state.get_peer_info(bt);
    e = std::min(pi.last_backfill, e);
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE(cct);
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();   // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  const hobject_t head = m->get_hobj().get_head();

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  ceph::ref_t<Session> session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv().get());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  {
    int r = op->maybe_init_op_info(*get_osdmap());
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_nonprimary())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!check_laggy(op)) {
    return;
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().oid.name.empty()) {
    dout(4) << "do_op empty oid name is not allowed" << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

  // discard due to cluster full transition? (we discard any op that
  // originates before the cluster or pool is marked full; the client
  // will resend after the full flag is removed or if they expect the
1950 // op to succeed despite being full). The exceptions are FULL_FORCE and
1951 // FULL_TRY ops, which there is no reason to discard because they
1952 // bypass all full checks anyway. If this op isn't write-ordered, we
1953 // skip the check entirely.
1954 // FIXME: we exclude mds writes for now.
1955 if (write_ordered && !(m->get_source().is_mds() ||
1956 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1957 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1958 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1959 dout(10) << __func__ << " discarding op sent before full " << m << " "
1960 << *m << dendl;
1961 return;
1962 }
1963 // mds should have stopped writing before this point.
1964 // We can't allow OSD to become non-startable even if mds
1965 // could be writing as part of file removals.
1966 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
1967 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
1968 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
1969 return;
1970 }
1971 int64_t poolid = get_pgid().pool();
1972 if (op->may_write()) {
1973
1974 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1975 if (!pi) {
1976 return;
1977 }
1978
1979 // invalid?
1980 if (m->get_snapid() != CEPH_NOSNAP) {
1981 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1982 osd->reply_op_error(op, -EINVAL);
1983 return;
1984 }
1985
1986 // too big?
1987 if (cct->_conf->osd_max_write_size &&
1988 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1989 // journal can't hold commit!
1990 derr << "do_op msg data len " << m->get_data_len()
1991 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1992 << " on " << *m << dendl;
1993 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1994 return;
1995 }
1996 }
1997
1998 dout(10) << "do_op " << *m
1999 << (op->may_write() ? " may_write" : "")
2000 << (op->may_read() ? " may_read" : "")
2001 << (op->may_cache() ? " may_cache" : "")
2002 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2003 << " flags " << ceph_osd_flag_string(m->get_flags())
2004 << dendl;
2005
2006 // missing object?
2007 if (is_unreadable_object(head)) {
2008 if (!is_primary()) {
2009 osd->reply_op_error(op, -EAGAIN);
2010 return;
2011 }
2012 if (can_backoff &&
2013 (g_conf()->osd_backoff_on_degraded ||
2014 (g_conf()->osd_backoff_on_unfound &&
2015 recovery_state.get_missing_loc().is_unfound(head)))) {
2016 add_backoff(session, head, head);
2017 maybe_kick_recovery(head);
2018 } else {
2019 wait_for_unreadable_object(head, op);
2020 }
2021 return;
2022 }
2023
2024 if (write_ordered) {
2025 // degraded object?
2026 if (is_degraded_or_backfilling_object(head)) {
2027 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2028 add_backoff(session, head, head);
2029 maybe_kick_recovery(head);
2030 } else {
2031 wait_for_degraded_object(head, op);
2032 }
2033 return;
2034 }
2035
2036 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
2037 dout(20) << __func__ << ": waiting for scrub" << dendl;
2038 waiting_for_scrub.push_back(op);
2039 op->mark_delayed("waiting for scrub");
2040 return;
2041 }
2042 if (!check_laggy_requeue(op)) {
2043 return;
2044 }
2045
2046 // blocked on snap?
2047 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2048 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2049 hobject_t to_wait_on(head);
2050 to_wait_on.snap = blocked_iter->second;
2051 wait_for_degraded_object(to_wait_on, op);
2052 return;
2053 }
2054 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2055 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2056 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2057 return;
2058 }
2059 if (objects_blocked_on_cache_full.count(head)) {
2060 block_write_on_full_cache(head, op);
2061 return;
2062 }
2063 }
2064
2065 // dup/resent?
2066 if (op->may_write() || op->may_cache()) {
2067 // warning: we will get back *a* request for this reqid, but not
2068 // necessarily the most recent. this happens with flush and
2069 // promote ops, but we can't possibly have both in our log where
2070 // the original request is still not stable on disk, so for our
2071 // purposes here it doesn't matter which one we get.
2072 eversion_t version;
2073 version_t user_version;
2074 int return_code = 0;
2075 vector<pg_log_op_return_item_t> op_returns;
2076 bool got = check_in_progress_op(
2077 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
2078 if (got) {
2079 dout(3) << __func__ << " dup " << m->get_reqid()
2080 << " version " << version << dendl;
2081 if (already_complete(version)) {
2082 osd->reply_op_error(op, return_code, version, user_version, op_returns);
2083 } else {
2084 dout(10) << " waiting for " << version << " to commit" << dendl;
2085 // always queue ondisk waiters, so that we can requeue if needed
2086 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2087 op_returns);
2088 op->mark_delayed("waiting for ondisk");
2089 }
2090 return;
2091 }
2092 }
2093
2094 ObjectContextRef obc;
2095 bool can_create = op->may_write();
2096 hobject_t missing_oid;
2097
2098 // kludge around the fact that LIST_SNAPS uses CEPH_SNAPDIR as its snapid
2099 const hobject_t& oid =
2100 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
2101
2102 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2103 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2104 OSDOp& osd_op = *p;
2105
2106 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2107 if (m->get_snapid() != CEPH_SNAPDIR) {
2108 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2109 osd->reply_op_error(op, -EINVAL);
2110 return;
2111 }
2112 } else {
2113 if (m->get_snapid() == CEPH_SNAPDIR) {
2114 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2115 osd->reply_op_error(op, -EINVAL);
2116 return;
2117 }
2118 }
2119 }
2120
2121 // io blocked on obc?
2122 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2123 maybe_await_blocked_head(oid, op)) {
2124 return;
2125 }
2126
2127 if (!is_primary()) {
2128 if (!recovery_state.can_serve_replica_read(oid)) {
2129 dout(20) << __func__ << ": oid " << oid
2130 << " unstable write on replica, bouncing to primary: "
2131 << *m << dendl;
2132 osd->reply_op_error(op, -EAGAIN);
2133 return;
2134 } else {
2135 dout(20) << __func__ << ": serving replica read on oid " << oid
2136 << dendl;
2137 }
2138 }
2139
2140 int r = find_object_context(
2141 oid, &obc, can_create,
2142 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2143 &missing_oid);
2144
2145 // LIST_SNAPS needs the ssc too
2146 if (obc &&
2147 m->get_snapid() == CEPH_SNAPDIR &&
2148 !obc->ssc) {
2149 obc->ssc = get_snapset_context(oid, true);
2150 }
2151
2152 if (r == -EAGAIN) {
2153 // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
2154 // we have to wait for the object.
2155 if (is_primary()) {
2156 // missing the specific snap we need; requeue and wait.
2157 ceph_assert(!op->may_write()); // only happens on a read/cache
2158 wait_for_unreadable_object(missing_oid, op);
2159 return;
2160 }
2161 } else if (r == 0) {
2162 if (is_unreadable_object(obc->obs.oi.soid)) {
2163 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2164 << " is unreadable, waiting" << dendl;
2165 wait_for_unreadable_object(obc->obs.oi.soid, op);
2166 return;
2167 }
2168
2169 // degraded object? (the check above was for head; this could be a clone)
2170 if (write_ordered &&
2171 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2172 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2173 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2174 << " is degraded, waiting" << dendl;
2175 wait_for_degraded_object(obc->obs.oi.soid, op);
2176 return;
2177 }
2178 }
2179
2180 bool in_hit_set = false;
2181 if (hit_set) {
2182 if (obc.get()) {
2183 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2184 in_hit_set = true;
2185 } else {
2186 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2187 in_hit_set = true;
2188 }
2189 if (!op->hitset_inserted) {
2190 hit_set->insert(oid);
2191 op->hitset_inserted = true;
2192 if (hit_set->is_full() ||
2193 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2194 hit_set_persist();
2195 }
2196 }
2197 }
2198
2199 if (agent_state) {
2200 if (agent_choose_mode(false, op))
2201 return;
2202 }
2203
2204 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2205 if (maybe_handle_manifest(op,
2206 write_ordered,
2207 obc))
2208 return;
2209 }
2210
2211 if (maybe_handle_cache(op,
2212 write_ordered,
2213 obc,
2214 r,
2215 missing_oid,
2216 false,
2217 in_hit_set))
2218 return;
2219
2220 if (r && (r != -ENOENT || !obc)) {
2221 // copy the reqids for copy get on ENOENT
2222 if (r == -ENOENT &&
2223 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2224 fill_in_copy_get_noent(op, oid, m->ops[0]);
2225 return;
2226 }
2227 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2228 if (op->may_write() &&
2229 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2230 record_write_error(op, oid, nullptr, r);
2231 } else {
2232 osd->reply_op_error(op, r);
2233 }
2234 return;
2235 }
2236
2237 // make sure locator is consistent
2238 object_locator_t oloc(obc->obs.oi.soid);
2239 if (m->get_object_locator() != oloc) {
2240 dout(10) << " provided locator " << m->get_object_locator()
2241 << " != object's " << obc->obs.oi.soid << dendl;
2242 osd->clog->warn() << "bad locator " << m->get_object_locator()
2243 << " on object " << oloc
2244 << " op " << *m;
2245 }
2246
2247 // io blocked on obc?
2248 if (obc->is_blocked() &&
2249 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2250 wait_for_blocked_object(obc->obs.oi.soid, op);
2251 return;
2252 }
2253
2254 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2255
2256 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2257
2258 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2259 dout(20) << __func__ << ": skipping rw locks" << dendl;
2260 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2261 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2262
2263 // verify there is in fact a flush in progress
2264 // FIXME: we could make this a stronger test.
2265 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2266 if (p == flush_ops.end()) {
2267 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2268 reply_ctx(ctx, -EINVAL);
2269 return;
2270 }
2271 } else if (!get_rw_locks(write_ordered, ctx)) {
2272 dout(20) << __func__ << " waiting for rw locks " << dendl;
2273 op->mark_delayed("waiting for rw locks");
2274 close_op_ctx(ctx);
2275 return;
2276 }
2277 dout(20) << __func__ << " obc " << *obc << dendl;
2278
2279 if (r) {
2280 dout(20) << __func__ << " returned an error: " << r << dendl;
2281 if (op->may_write() &&
2282 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2283 record_write_error(op, oid, nullptr, r,
2284 ctx->op->allows_returnvec() ? ctx : nullptr);
2285 } else {
2286 osd->reply_op_error(op, r);
2287 }
2288 close_op_ctx(ctx);
2289 return;
2290 }
2291
2292 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2293 ctx->ignore_cache = true;
2294 }
2295
2296 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2297 // This object is lost. Reading from it returns an error.
2298 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2299 << " is lost" << dendl;
2300 reply_ctx(ctx, -ENFILE);
2301 return;
2302 }
2303 if (!op->may_write() &&
2304 !op->may_cache() &&
2305 (!obc->obs.exists ||
2306 ((m->get_snapid() != CEPH_SNAPDIR) &&
2307 obc->obs.oi.is_whiteout()))) {
2308 // copy the reqids for copy get on ENOENT
2309 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2310 fill_in_copy_get_noent(op, oid, m->ops[0]);
2311 close_op_ctx(ctx);
2312 return;
2313 }
2314 reply_ctx(ctx, -ENOENT);
2315 return;
2316 }
2317
2318 op->mark_started();
2319
2320 execute_ctx(ctx);
2321 utime_t prepare_latency = ceph_clock_now();
2322 prepare_latency -= op->get_dequeued_time();
2323 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2324 if (op->may_read() && op->may_write()) {
2325 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2326 } else if (op->may_read()) {
2327 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2328 } else if (op->may_write() || op->may_cache()) {
2329 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2330 }
2331
2332 // force recovery of the oldest missing object if the log has grown too long
2333 maybe_force_recovery();
2334 }
2335
2336 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2337 OpRequestRef op,
2338 bool write_ordered,
2339 ObjectContextRef obc)
2340 {
2341 ceph_assert(obc);
2342 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2343 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2344 return cache_result_t::NOOP;
2345 }
2346
2347 // if it is write-ordered and blocked, stop now
2348 if (obc->is_blocked() && write_ordered) {
2349 // we're already doing something with this object
2350 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2351 return cache_result_t::NOOP;
2352 }
2353
2354 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
2355 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2356 OSDOp& osd_op = *p;
2357 ceph_osd_op& op = osd_op.op;
2358 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2359 op.op == CEPH_OSD_OP_SET_CHUNK ||
2360 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2361 op.op == CEPH_OSD_OP_TIER_FLUSH) {
2362 return cache_result_t::NOOP;
2363 } else if (op.op == CEPH_OSD_OP_TIER_PROMOTE) {
2364 bool is_dirty = false;
2365 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2366 if (p.second.is_dirty()) {
2367 is_dirty = true;
2368 }
2369 }
2370 if (is_dirty) {
2371 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
2372 }
2373 return cache_result_t::NOOP;
2374 }
2375 }
2376
2377 switch (obc->obs.oi.manifest.type) {
2378 case object_manifest_t::TYPE_REDIRECT:
2379 if (op->may_write() || write_ordered) {
2380 do_proxy_write(op, obc);
2381 } else {
2382 // a non-zero size means the object was already promoted; serve it locally
2383 if (obc->obs.oi.size != 0) {
2384 return cache_result_t::NOOP;
2385 }
2386 do_proxy_read(op, obc);
2387 }
2388 return cache_result_t::HANDLED_PROXY;
2389 case object_manifest_t::TYPE_CHUNKED:
2390 {
2391 if (can_proxy_chunked_read(op, obc)) {
2392 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2393 if (p != flush_ops.end()) {
2394 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2395 return cache_result_t::HANDLED_PROXY;
2396 }
2397 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2398 return cache_result_t::HANDLED_PROXY;
2399 }
2400
2401 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2402 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2403 hobject_t head = m->get_hobj();
2404
2405 if (is_degraded_or_backfilling_object(head)) {
2406 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2407 wait_for_degraded_object(head, op);
2408 return cache_result_t::BLOCKED_RECOVERY;
2409 }
2410
2411 if (write_blocked_by_scrub(head)) {
2412 dout(20) << __func__ << ": waiting for scrub" << dendl;
2413 waiting_for_scrub.push_back(op);
2414 op->mark_delayed("waiting for scrub");
2415 return cache_result_t::BLOCKED_RECOVERY;
2416 }
2417 if (!check_laggy_requeue(op)) {
2418 return cache_result_t::BLOCKED_RECOVERY;
2419 }
2420
2421 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2422 if (p.second.is_missing()) {
2423 auto m = op->get_req<MOSDOp>();
2424 const object_locator_t oloc = m->get_object_locator();
2425 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2426 return cache_result_t::BLOCKED_PROMOTE;
2427 }
2428 }
2429
2430 bool all_dirty = true;
2431 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2432 if (!p.second.is_dirty()) {
2433 all_dirty = false;
2434 }
2435 }
2436 if (all_dirty) {
2437 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
2438 }
2439 return cache_result_t::NOOP;
2440 }
2441 default:
2442 ceph_abort_msg("unrecognized manifest type");
2443 }
2444
2445 return cache_result_t::NOOP;
2446 }
2447
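/**
 * Completion for a single chunk write issued by do_manifest_flush().
 * It re-takes the PG lock, feeds the per-chunk result back into
 * handle_manifest_flush(), and records the tier flush latency.
 */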
2448 struct C_ManifestFlush : public Context {
2449 PrimaryLogPGRef pg;
2450 hobject_t oid;
2451 epoch_t lpr;
2452 ceph_tid_t tid;
2453 utime_t start;
2454 uint64_t offset;
2455 uint64_t last_offset;
2456 C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e)
2457 : pg(p), oid(o), lpr(e),
2458 tid(0), start(ceph_clock_now())
2459 {}
2460 void finish(int r) override {
2461 if (r == -ECANCELED)
2462 return;
2463 std::scoped_lock locker{*pg};
2464 pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr);
2465 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
2466 }
2467 };
2468
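/**
 * Record the result of one chunk write of a manifest flush. On the
 * first error the FlushOp's rval is set and the flush is wound down;
 * once results for every issued chunk have arrived (and no peering
 * reset has intervened) the flush advances via finish_manifest_flush().
 */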
2469 void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2470 uint64_t offset, uint64_t last_offset,
2471 epoch_t lpr)
2472 {
2473 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2474 if (p == flush_ops.end()) {
2475 dout(10) << __func__ << " no flush_op found" << dendl;
2476 return;
2477 }
2478 if (p->second->rval < 0) {
2479 return;
2480 }
2481 p->second->io_results[offset] = r;
2482 for (auto &ior: p->second->io_results) {
2483 if (ior.second < 0) {
2484 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2485 p->second->rval = r;
2486 return;
2487 }
2488 }
2489 if (p->second->chunks == p->second->io_results.size()) {
2490 if (lpr == get_last_peering_reset()) {
2491 ceph_assert(p->second->obc);
2492 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2493 }
2494 }
2495 }
2496
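/**
 * Kick off a flush of the dirty chunks of a manifest object. Returns
 * -EINPROGRESS once the first batch of chunk writes is in flight (the
 * FlushOp is tracked in flush_ops), or a negative error if the initial
 * batch could not be issued.
 */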
2497 int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
2498 std::optional<std::function<void()>> &&on_flush)
2499 {
2500 auto p = obc->obs.oi.manifest.chunk_map.begin();
2501 FlushOpRef manifest_fop(std::make_shared<FlushOp>());
2502 manifest_fop->op = op;
2503 manifest_fop->obc = obc;
2504 manifest_fop->flushed_version = obc->obs.oi.user_version;
2505 manifest_fop->blocking = blocking;
2506 manifest_fop->on_flush = std::move(on_flush);
2507 int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking);
2508 if (r < 0) {
2509 return r;
2510 }
2511
2512 flush_ops[obc->obs.oi.soid] = manifest_fop;
2513 return -EINPROGRESS;
2514 }
2515
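/**
 * Issue writes for the dirty chunks of a manifest object, starting at
 * start_offset and batching until get_copy_chunk_size() is exceeded;
 * the remainder is picked up later by finish_manifest_flush(). When
 * the pool has a fingerprint algorithm configured and the chunk holds
 * a reference, the data is written via the "cas" object class under
 * its fingerprint oid and the old chunk's refcount is dropped.
 */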
2516 int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
2517 uint64_t start_offset, bool block)
2518 {
2519 struct object_manifest_t &manifest = obc->obs.oi.manifest;
2520 hobject_t soid = obc->obs.oi.soid;
2521 ceph_tid_t tid;
2522 SnapContext snapc;
2523 uint64_t max_copy_size = 0, last_offset = 0;
2524
2525 map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset);
2526 ceph_assert(iter != manifest.chunk_map.end());
2527 for (;iter != manifest.chunk_map.end(); ++iter) {
2528 if (iter->second.is_dirty()) {
2529 last_offset = iter->first;
2530 max_copy_size += iter->second.length;
2531 }
2532 if (get_copy_chunk_size() < max_copy_size) {
2533 break;
2534 }
2535 }
2536
2537 iter = manifest.chunk_map.find(start_offset);
2538 for (;iter != manifest.chunk_map.end(); ++iter) {
2539 if (!iter->second.is_dirty()) {
2540 continue;
2541 }
2542 uint64_t tgt_length = iter->second.length;
2543 uint64_t tgt_offset = iter->second.offset;
2544 hobject_t tgt_soid = iter->second.oid;
2545 object_locator_t oloc(tgt_soid);
2546 ObjectOperation obj_op;
2547 bufferlist chunk_data;
2548 int r = pgbackend->objects_read_sync(
2549 soid, iter->first, tgt_length, 0, &chunk_data);
2550 if (r < 0) {
2551 dout(0) << __func__ << " read fail, offset: " << tgt_offset
2552 << " len: " << tgt_length << " r: " << r << dendl;
2553 return r;
2554 }
2555 if (!chunk_data.length()) {
2556 return -ENODATA;
2557 }
2558
2559 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
2560 CEPH_OSD_FLAG_RWORDERED;
2561 tgt_length = chunk_data.length();
2562 if (pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
2563 iter->second.has_reference() &&
2564 fp_algo != pg_pool_t::TYPE_FINGERPRINT_NONE) {
2565 object_t fp_oid = [fp_algo, &chunk_data]() -> string {
2566 switch (fp_algo) {
2567 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
2568 return crypto::digest<crypto::SHA1>(chunk_data).to_str();
2569 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
2570 return crypto::digest<crypto::SHA256>(chunk_data).to_str();
2571 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
2572 return crypto::digest<crypto::SHA512>(chunk_data).to_str();
2573 default:
2574 ceph_abort_msg("unrecognized fingerprint type");
2575 return {};
2576 }
2577 }();
2578 bufferlist in;
2579 if (fp_oid != tgt_soid.oid) {
2580 // decrement old chunk's reference count
2581 ObjectOperation dec_op;
2582 cls_chunk_refcount_put_op put_call;
2583 put_call.source = soid;
2584 ::encode(put_call, in);
2585 dec_op.call("cas", "chunk_put", in);
2586 // we don't care about dec_op's completion; the dedup scrub will fix this.
2587 tid = osd->objecter->mutate(
2588 tgt_soid.oid, oloc, dec_op, snapc,
2589 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2590 flags, NULL);
2591 in.clear();
2592 }
2593 tgt_soid.oid = fp_oid;
2594 iter->second.oid = tgt_soid;
2595 // add data op
2596 ceph_osd_op osd_op;
2597 osd_op.extent.offset = 0;
2598 osd_op.extent.length = chunk_data.length();
2599 encode(osd_op, in);
2600 encode(soid, in);
2601 in.append(chunk_data);
2602 obj_op.call("cas", "cas_write_or_get", in);
2603 } else {
2604 obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data);
2605 }
2606
2607 C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset());
2608 fin->offset = iter->first;
2609 fin->last_offset = last_offset;
2610 manifest_fop->chunks++;
2611
2612 tid = osd->objecter->mutate(
2613 tgt_soid.oid, oloc, obj_op, snapc,
2614 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2615 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())));
2616 fin->tid = tid;
2617 manifest_fop->io_tids[iter->first] = tid;
2618
2619 dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length
2620 << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name
2621 << " tid: " << tid << dendl;
2622 if (last_offset < iter->first) {
2623 break;
2624 }
2625 }
2626
2627 return 0;
2628 }
2629
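/**
 * Called when one batch of chunk writes completes: either issue the
 * next batch (starting from the first dirty chunk past last_offset)
 * or, if no dirty chunks remain, complete the flush via finish_flush().
 */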
2630 void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2631 ObjectContextRef obc, uint64_t last_offset)
2632 {
2633 dout(10) << __func__ << " " << oid << " tid " << tid
2634 << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl;
2635 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2636 if (p == flush_ops.end()) {
2637 dout(10) << __func__ << " no flush_op found" << dendl;
2638 return;
2639 }
2640 map<uint64_t, chunk_info_t>::iterator iter =
2641 obc->obs.oi.manifest.chunk_map.find(last_offset);
2642 ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end());
2643 for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) {
2644 if (iter->second.is_dirty() && last_offset < iter->first) {
2645 do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking);
2646 return;
2647 }
2648 }
2649 finish_flush(oid, tid, r);
2650 }
2651
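/**
 * Record a failed write as an ERROR entry in the PG log so a resend of
 * the same reqid is answered consistently; the queued reply (if any)
 * is sent to the client once the log entry commits.
 */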
2652 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2653 MOSDOpReply *orig_reply, int r,
2654 OpContext *ctx_for_op_returns)
2655 {
2656 dout(20) << __func__ << " r=" << r << dendl;
2657 ceph_assert(op->may_write());
2658 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
2659 mempool::osd_pglog::list<pg_log_entry_t> entries;
2660 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2661 get_next_version(), eversion_t(), 0,
2662 reqid, utime_t(), r));
2663 if (ctx_for_op_returns) {
2664 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2665 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2666 }
2667
2668 struct OnComplete {
2669 PrimaryLogPG *pg;
2670 OpRequestRef op;
2671 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2672 int r;
2673 OnComplete(
2674 PrimaryLogPG *pg,
2675 OpRequestRef op,
2676 MOSDOpReply *orig_reply,
2677 int r)
2678 : pg(pg), op(op),
2679 orig_reply(orig_reply, false /* take over ref */), r(r)
2680 {}
2681 void operator()() {
2682 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2683 auto m = op->get_req<MOSDOp>();
2684 MOSDOpReply *reply = orig_reply.detach();
2685 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2686 pg->osd->send_message_osd_client(reply, m->get_connection());
2687 }
2688 };
2689
2690 ObcLockManager lock_manager;
2691 submit_log_entries(
2692 entries,
2693 std::move(lock_manager),
2694 std::optional<std::function<void(void)> >(
2695 OnComplete(this, op, orig_reply, r)),
2696 op,
2697 r);
2698 }
2699
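/**
 * Cache-tier entry point for an op: depending on the pool's cache
 * mode, serve it here, proxy it to the base pool, redirect the client,
 * block it behind a promotion or a full cache, or fall through to
 * normal processing (NOOP).
 */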
2700 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2701 OpRequestRef op,
2702 bool write_ordered,
2703 ObjectContextRef obc,
2704 int r, hobject_t missing_oid,
2705 bool must_promote,
2706 bool in_hit_set,
2707 ObjectContextRef *promote_obc)
2708 {
2709 // return quickly if caching is not enabled
2710 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2711 return cache_result_t::NOOP;
2712
2713 if (op &&
2714 op->get_req() &&
2715 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2716 (op->get_req<MOSDOp>()->get_flags() &
2717 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2718 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2719 return cache_result_t::NOOP;
2720 }
2721
2722 must_promote = must_promote || op->need_promote();
2723
2724 if (obc)
2725 dout(25) << __func__ << " " << obc->obs.oi << " "
2726 << (obc->obs.exists ? "exists" : "DNE")
2727 << " missing_oid " << missing_oid
2728 << " must_promote " << (int)must_promote
2729 << " in_hit_set " << (int)in_hit_set
2730 << dendl;
2731 else
2732 dout(25) << __func__ << " (no obc)"
2733 << " missing_oid " << missing_oid
2734 << " must_promote " << (int)must_promote
2735 << " in_hit_set " << (int)in_hit_set
2736 << dendl;
2737
2738 // if it is write-ordered and blocked, stop now
2739 if (obc.get() && obc->is_blocked() && write_ordered) {
2740 // we're already doing something with this object
2741 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2742 return cache_result_t::NOOP;
2743 }
2744
2745 if (r == -ENOENT && missing_oid == hobject_t()) {
2746 // we know this object is logically absent (e.g., an undefined clone)
2747 return cache_result_t::NOOP;
2748 }
2749
2750 if (obc.get() && obc->obs.exists) {
2751 osd->logger->inc(l_osd_op_cache_hit);
2752 return cache_result_t::NOOP;
2753 }
2754 if (!is_primary()) {
2755 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2756 osd->reply_op_error(op, -EAGAIN);
2757 return cache_result_t::REPLIED_WITH_EAGAIN;
2758 }
2759
2760 if (missing_oid == hobject_t() && obc.get()) {
2761 missing_oid = obc->obs.oi.soid;
2762 }
2763
2764 auto m = op->get_req<MOSDOp>();
2765 const object_locator_t oloc = m->get_object_locator();
2766
2767 if (op->need_skip_handle_cache()) {
2768 return cache_result_t::NOOP;
2769 }
2770
2771 OpRequestRef promote_op;
2772
2773 switch (pool.info.cache_mode) {
2774 case pg_pool_t::CACHEMODE_WRITEBACK:
2775 if (agent_state &&
2776 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2777 if (!op->may_write() && !op->may_cache() &&
2778 !write_ordered && !must_promote) {
2779 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2780 do_proxy_read(op);
2781 return cache_result_t::HANDLED_PROXY;
2782 }
2783 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2784 block_write_on_full_cache(missing_oid, op);
2785 return cache_result_t::BLOCKED_FULL;
2786 }
2787
2788 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2789 promote_object(obc, missing_oid, oloc, op, promote_obc);
2790 return cache_result_t::BLOCKED_PROMOTE;
2791 }
2792
2793 if (op->may_write() || op->may_cache()) {
2794 do_proxy_write(op);
2795
2796 // Promote too?
2797 if (!op->need_skip_promote() &&
2798 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2799 pool.info.min_write_recency_for_promote,
2800 OpRequestRef(),
2801 promote_obc)) {
2802 return cache_result_t::BLOCKED_PROMOTE;
2803 }
2804 return cache_result_t::HANDLED_PROXY;
2805 } else {
2806 do_proxy_read(op);
2807
2808 // Avoid duplicate promotion
2809 if (obc.get() && obc->is_blocked()) {
2810 if (promote_obc)
2811 *promote_obc = obc;
2812 return cache_result_t::BLOCKED_PROMOTE;
2813 }
2814
2815 // Promote too?
2816 if (!op->need_skip_promote()) {
2817 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2818 pool.info.min_read_recency_for_promote,
2819 promote_op, promote_obc);
2820 }
2821
2822 return cache_result_t::HANDLED_PROXY;
2823 }
2824 ceph_abort_msg("unreachable");
2825 return cache_result_t::NOOP;
2826
2827 case pg_pool_t::CACHEMODE_READONLY:
2828 // TODO: clean this case up
2829 if (!obc.get() && r == -ENOENT) {
2830 // we don't have the object and op's a read
2831 promote_object(obc, missing_oid, oloc, op, promote_obc);
2832 return cache_result_t::BLOCKED_PROMOTE;
2833 }
2834 if (!r) { // it must be a write
2835 do_cache_redirect(op);
2836 return cache_result_t::HANDLED_REDIRECT;
2837 }
2838 // crap, there was a failure of some kind
2839 return cache_result_t::NOOP;
2840
2841 case pg_pool_t::CACHEMODE_FORWARD:
2842 // this mode is deprecated; proxy instead
2843 case pg_pool_t::CACHEMODE_PROXY:
2844 if (!must_promote) {
2845 if (op->may_write() || op->may_cache() || write_ordered) {
2846 do_proxy_write(op);
2847 return cache_result_t::HANDLED_PROXY;
2848 } else {
2849 do_proxy_read(op);
2850 return cache_result_t::HANDLED_PROXY;
2851 }
2852 }
2853 // ugh, we're forced to promote.
2854 if (agent_state &&
2855 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2856 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2857 block_write_on_full_cache(missing_oid, op);
2858 return cache_result_t::BLOCKED_FULL;
2859 }
2860 promote_object(obc, missing_oid, oloc, op, promote_obc);
2861 return cache_result_t::BLOCKED_PROMOTE;
2862
2863 case pg_pool_t::CACHEMODE_READFORWARD:
2864 // this mode is deprecated; proxy instead
2865 case pg_pool_t::CACHEMODE_READPROXY:
2866 // Do writeback to the cache tier for writes
2867 if (op->may_write() || write_ordered || must_promote) {
2868 if (agent_state &&
2869 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2870 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2871 block_write_on_full_cache(missing_oid, op);
2872 return cache_result_t::BLOCKED_FULL;
2873 }
2874 promote_object(obc, missing_oid, oloc, op, promote_obc);
2875 return cache_result_t::BLOCKED_PROMOTE;
2876 }
2877
2878 // It is a read; serve it by proxying to the base tier
2879 do_proxy_read(op);
2880 return cache_result_t::HANDLED_PROXY;
2881
2882 default:
2883 ceph_abort_msg("unrecognized cache_mode");
2884 }
2885 return cache_result_t::NOOP;
2886 }
2887
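/**
 * Decide whether to start a promotion. recency == 0 promotes
 * unconditionally; recency == 1 requires a hit in the current hit set;
 * larger values require hits in at least that many of the most recent
 * hit sets. Promotion may still be suppressed by the promote throttle.
 * Returns true iff a promotion was started.
 */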
2888 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2889 const hobject_t& missing_oid,
2890 const object_locator_t& oloc,
2891 bool in_hit_set,
2892 uint32_t recency,
2893 OpRequestRef promote_op,
2894 ObjectContextRef *promote_obc)
2895 {
2896 dout(20) << __func__ << " missing_oid " << missing_oid
2897 << " in_hit_set " << in_hit_set << dendl;
2898
2899 switch (recency) {
2900 case 0:
2901 break;
2902 case 1:
2903 // Check if in the current hit set
2904 if (in_hit_set) {
2905 break;
2906 } else {
2907 // not promoting
2908 return false;
2909 }
2910 break;
2911 default:
2912 {
2913 unsigned count = (int)in_hit_set;
2914 if (count) {
2915 // Check if in other hit sets
2916 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2917 for (map<time_t,HitSetRef>::reverse_iterator itor =
2918 agent_state->hit_set_map.rbegin();
2919 itor != agent_state->hit_set_map.rend();
2920 ++itor) {
2921 if (!itor->second->contains(oid)) {
2922 break;
2923 }
2924 ++count;
2925 if (count >= recency) {
2926 break;
2927 }
2928 }
2929 }
2930 if (count >= recency) {
2931 break;
2932 }
2933 return false; // not promoting
2934 }
2935 break;
2936 }
2937
2938 if (osd->promote_throttle()) {
2939 dout(10) << __func__ << " promote throttled" << dendl;
2940 return false;
2941 }
2942 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2943 return true;
2944 }
2945
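/**
 * Reply with -ENOENT plus a redirect hint pointing the client at the
 * base pool (tier_of); used by the READONLY cache mode for writes.
 */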
2946 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2947 {
2948 auto m = op->get_req<MOSDOp>();
2949 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2950 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2951 flags, false);
2952 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2953 reply->set_redirect(redir);
2954 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2955 << op << dendl;
2956 m->get_connection()->send_message(reply);
2957 return;
2958 }
2959
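/**
 * Completion for a proxied read: under the PG lock, and only if no
 * peering reset has intervened, hand the result to finish_proxy_read()
 * and record the tier read latency.
 */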
2960 struct C_ProxyRead : public Context {
2961 PrimaryLogPGRef pg;
2962 hobject_t oid;
2963 epoch_t last_peering_reset;
2964 ceph_tid_t tid;
2965 PrimaryLogPG::ProxyReadOpRef prdop;
2966 utime_t start;
2967 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2968 const PrimaryLogPG::ProxyReadOpRef& prd)
2969 : pg(p), oid(o), last_peering_reset(lpr),
2970 tid(0), prdop(prd), start(ceph_clock_now())
2971 {}
2972 void finish(int r) override {
2973 if (prdop->canceled)
2974 return;
2975 std::scoped_lock locker{*pg};
2976 if (prdop->canceled) {
2977 return;
2978 }
2979 if (last_peering_reset == pg->get_last_peering_reset()) {
2980 pg->finish_proxy_read(oid, tid, r);
2981 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2982 }
2983 }
2984 };
2985
2986 struct C_ProxyChunkRead : public Context {
2987 PrimaryLogPGRef pg;
2988 hobject_t oid;
2989 epoch_t last_peering_reset;
2990 ceph_tid_t tid;
2991 PrimaryLogPG::ProxyReadOpRef prdop;
2992 utime_t start;
2993 ObjectOperation *obj_op;
2994 int op_index = 0;
2995 uint64_t req_offset = 0;
2996 ObjectContextRef obc;
2997 uint64_t req_total_len = 0;
2998 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2999 const PrimaryLogPG::ProxyReadOpRef& prd)
3000 : pg(p), oid(o), last_peering_reset(lpr),
3001 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
3002 {}
3003 void finish(int r) override {
3004 if (prdop->canceled)
3005 return;
3006 std::scoped_lock locker{*pg};
3007 if (prdop->canceled) {
3008 return;
3009 }
3010 if (last_peering_reset == pg->get_last_peering_reset()) {
3011 if (r >= 0) {
3012 if (!prdop->ops[op_index].outdata.length()) {
3013 ceph_assert(req_total_len);
3014 bufferlist list;
3015 bufferptr bptr(req_total_len);
3016 list.push_back(std::move(bptr));
3017 prdop->ops[op_index].outdata.append(list);
3018 }
3019 ceph_assert(obj_op);
3020 uint64_t copy_offset;
3021 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
3022 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
3023 } else {
3024 copy_offset = 0;
3025 }
3026 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
3027 obj_op->ops[0].outdata.length(),
3028 obj_op->ops[0].outdata.c_str());
3029 }
3030
3031 pg->finish_proxy_read(oid, tid, r);
3032 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3033 if (obj_op) {
3034 delete obj_op;
3035 }
3036 }
3037 }
3038 };
3039
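/**
 * Proxy a read to the base tier (or, for a manifest redirect, to the
 * redirect target) without promoting the object. The in-flight op is
 * tracked in proxyread_ops and in_progress_proxy_ops so it can be
 * completed or canceled later.
 */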
3040 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3041 {
3042 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3043 // stash the result in the request's OSDOp vector
3044 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3045 object_locator_t oloc;
3046 hobject_t soid;
3047 /* extensible tier */
3048 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3049 switch (obc->obs.oi.manifest.type) {
3050 case object_manifest_t::TYPE_REDIRECT:
3051 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3052 soid = obc->obs.oi.manifest.redirect_target;
3053 break;
3054 default:
3055 ceph_abort_msg("unrecognized manifest type");
3056 }
3057 } else {
3058 /* proxy */
3059 soid = m->get_hobj();
3060 oloc = object_locator_t(m->get_object_locator());
3061 oloc.pool = pool.info.tier_of;
3062 }
3063 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3064
3065 // pass through some original flags that make sense.
3066 // - leave out redirection and balancing flags since we are
3067 // already proxying through the primary
3068 // - leave off read/write/exec flags that are derived from the op
3069 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3070 CEPH_OSD_FLAG_ORDERSNAP |
3071 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3072 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3073
3074 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3075
3076 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3077
3078 ObjectOperation obj_op;
3079 obj_op.dup(prdop->ops);
3080
3081 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3082 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3083 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3084 ceph_osd_op op = obj_op.ops[i].op;
3085 switch (op.op) {
3086 case CEPH_OSD_OP_READ:
3087 case CEPH_OSD_OP_SYNC_READ:
3088 case CEPH_OSD_OP_SPARSE_READ:
3089 case CEPH_OSD_OP_CHECKSUM:
3090 case CEPH_OSD_OP_CMPEXT:
3091 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3092 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3093 }
3094 }
3095 }
3096
3097 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3098 prdop);
3099 ceph_tid_t tid = osd->objecter->read(
3100 soid.oid, oloc, obj_op,
3101 m->get_snapid(), NULL,
3102 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3103 &prdop->user_version,
3104 &prdop->data_offset,
3105 m->get_features());
3106 fin->tid = tid;
3107 prdop->objecter_tid = tid;
3108 proxyread_ops[tid] = prdop;
3109 in_progress_proxy_ops[soid].push_back(op);
3110 }
3111
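/**
 * Completion for a proxied read: drop the tracking state (stale tids
 * or oids are ignored) and reply to the client via a lightweight
 * OpContext with ignore_log_op_stats set.
 */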
3112 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3113 {
3114 dout(10) << __func__ << " " << oid << " tid " << tid
3115 << " " << cpp_strerror(r) << dendl;
3116
3117 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3118 if (p == proxyread_ops.end()) {
3119 dout(10) << __func__ << " no proxyread_op found" << dendl;
3120 return;
3121 }
3122 ProxyReadOpRef prdop = p->second;
3123 if (tid != prdop->objecter_tid) {
3124 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3125 << " tid " << prdop->objecter_tid << dendl;
3126 return;
3127 }
3128 if (oid != prdop->soid) {
3129 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3130 << " soid " << prdop->soid << dendl;
3131 return;
3132 }
3133 proxyread_ops.erase(tid);
3134
3135 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3136 if (q == in_progress_proxy_ops.end()) {
3137 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3138 return;
3139 }
3140 ceph_assert(q->second.size());
3141 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3142 q->second.end(),
3143 prdop->op);
3144 ceph_assert(it != q->second.end());
3145 OpRequestRef op = *it;
3146 q->second.erase(it);
3147 if (q->second.size() == 0) {
3148 in_progress_proxy_ops.erase(oid);
3149 } else if (std::find(q->second.begin(),
3150 q->second.end(),
3151 prdop->op) != q->second.end()) {
3152 /* multiple read case */
3153 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3154 return;
3155 }
3156
3157 osd->logger->inc(l_osd_tier_proxy_read);
3158
3159 auto m = op->get_req<MOSDOp>();
3160 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3161 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3162 ctx->user_at_version = prdop->user_version;
3163 ctx->data_off = prdop->data_offset;
3164 ctx->ignore_log_op_stats = true;
3165 complete_read_ctx(r, ctx);
3166 }
3167
3168 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3169 {
3170 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3171 if (p == in_progress_proxy_ops.end())
3172 return;
3173
3174 list<OpRequestRef>& ls = p->second;
3175 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3176 requeue_ops(ls);
3177 in_progress_proxy_ops.erase(p);
3178 }
3179
3180 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3181 vector<ceph_tid_t> *tids)
3182 {
3183 dout(10) << __func__ << " " << prdop->soid << dendl;
3184 prdop->canceled = true;
3185
3186 // cancel objecter op, if we can
3187 if (prdop->objecter_tid) {
3188 tids->push_back(prdop->objecter_tid);
3189 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3190 prdop->ops[i].outdata.clear();
3191 }
3192 proxyread_ops.erase(prdop->objecter_tid);
3193 prdop->objecter_tid = 0;
3194 }
3195 }
3196
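/**
 * Cancel every in-flight proxy read and write, collecting their
 * objecter tids for the caller to cancel; optionally requeue the
 * client ops that were waiting on them.
 */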
3197 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3198 {
3199 dout(10) << __func__ << dendl;
3200
3201 // cancel proxy reads
3202 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3203 while (p != proxyread_ops.end()) {
3204 cancel_proxy_read((p++)->second, tids);
3205 }
3206
3207 // cancel proxy writes
3208 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3209 while (q != proxywrite_ops.end()) {
3210 cancel_proxy_write((q++)->second, tids);
3211 }
3212
3213 if (requeue) {
3214 map<hobject_t, list<OpRequestRef>>::iterator p =
3215 in_progress_proxy_ops.begin();
3216 while (p != in_progress_proxy_ops.end()) {
3217 list<OpRequestRef>& ls = p->second;
3218 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3219 << " requests" << dendl;
3220 requeue_ops(ls);
3221 in_progress_proxy_ops.erase(p++);
3222 }
3223 } else {
3224 in_progress_proxy_ops.clear();
3225 }
3226 }
3227
3228 struct C_ProxyWrite_Commit : public Context {
3229 PrimaryLogPGRef pg;
3230 hobject_t oid;
3231 epoch_t last_peering_reset;
3232 ceph_tid_t tid;
3233 PrimaryLogPG::ProxyWriteOpRef pwop;
3234 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3235 const PrimaryLogPG::ProxyWriteOpRef& pw)
3236 : pg(p), oid(o), last_peering_reset(lpr),
3237 tid(0), pwop(pw)
3238 {}
3239 void finish(int r) override {
3240 if (pwop->canceled)
3241 return;
3242 std::scoped_lock locker{*pg};
3243 if (pwop->canceled) {
3244 return;
3245 }
3246 if (last_peering_reset == pg->get_last_peering_reset()) {
3247 pg->finish_proxy_write(oid, tid, r);
3248 }
3249 }
3250 };
3251
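/**
 * Proxy a write to the base tier (or manifest redirect target). The
 * client reply is sent from finish_proxy_write() once the proxied
 * mutation commits.
 */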
3252 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3253 {
3254 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3255 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3256 object_locator_t oloc;
3257 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3258 hobject_t soid;
3259 /* extensible tier */
3260 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3261 switch (obc->obs.oi.manifest.type) {
3262 case object_manifest_t::TYPE_REDIRECT:
3263 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3264 soid = obc->obs.oi.manifest.redirect_target;
3265 break;
3266 default:
3267 ceph_abort_msg("unrecognized manifest type");
3268 }
3269 } else {
3270 /* proxy */
3271 soid = m->get_hobj();
3272 oloc = object_locator_t(m->get_object_locator());
3273 oloc.pool = pool.info.tier_of;
3274 }
3275
3276 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3277 if (!(op->may_write() || op->may_cache())) {
3278 flags |= CEPH_OSD_FLAG_RWORDERED;
3279 }
3280 if (op->allows_returnvec()) {
3281 flags |= CEPH_OSD_FLAG_RETURNVEC;
3282 }
3283
3284 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3285
3286 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3287 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3288 pwop->mtime = m->get_mtime();
3289
3290 ObjectOperation obj_op;
3291 obj_op.dup(pwop->ops);
3292
3293 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3294 this, soid, get_last_peering_reset(), pwop);
3295 ceph_tid_t tid = osd->objecter->mutate(
3296 soid.oid, oloc, obj_op, snapc,
3297 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3298 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3299 &pwop->user_version, pwop->reqid);
3300 fin->tid = tid;
3301 pwop->objecter_tid = tid;
3302 proxywrite_ops[tid] = pwop;
3303 in_progress_proxy_ops[soid].push_back(op);
3304 }
3305
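/**
 * Split a read over a chunked-manifest object into per-chunk proxied
 * reads: walk each op's extent, map [cursor, cursor + length) onto
 * entries of the chunk_map, and issue one do_proxy_chunked_read() per
 * covered chunk.
 */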
3306 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3307 ObjectContextRef obc, bool write_ordered)
3308 {
3309 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3310 OSDOp *osd_op = NULL;
3311 for (unsigned int i = 0; i < m->ops.size(); i++) {
3312 osd_op = &m->ops[i];
3313 uint64_t cursor = osd_op->op.extent.offset;
3314 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3315 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3316 object_manifest_t *manifest = &obc->obs.oi.manifest;
3317 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3318
3319 while (cursor < op_length) {
3320 chunk_index = 0;
3321 chunk_length = 0;
3322 /* find the right chunk position for cursor */
3323 for (auto &p : manifest->chunk_map) {
3324 if (p.first <= cursor && p.first + p.second.length > cursor) {
3325 chunk_length = p.second.length;
3326 chunk_index = p.first;
3327 break;
3328 }
3329 }
3330 /* cursor does not fall within any chunk */
3331 if (!chunk_index && !chunk_length) {
3332 if (cursor == osd_op->op.extent.offset) {
3333 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3334 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3335 ctx->data_off = osd_op->op.extent.offset;
3336 ctx->ignore_log_op_stats = true;
3337 complete_read_ctx(0, ctx);
3338 }
3339 break;
3340 }
3341 uint64_t next_length = chunk_length;
3342 /* the read must not run past the end of the requested extent:
3343 clamp next_length to the op length */
3344 if (cursor + next_length > op_length) {
3345 next_length = op_length - cursor;
3346 }
3347 /* the read must not run past the end of the current chunk:
3348 clamp next_length to the chunk boundary */
3349 if (cursor + next_length > chunk_index + chunk_length) {
3350 next_length = chunk_index + chunk_length - cursor;
3351 }
3352
3353 chunk_read[cursor] = {{chunk_index, next_length}};
3354 cursor += next_length;
3355 }
3356
3357 req_len = cursor - osd_op->op.extent.offset;
3358 for (auto &p : chunk_read) {
3359 auto chunks = p.second.begin();
3360 dout(20) << __func__ << " chunk_index: " << chunks->first
3361 << " next_length: " << chunks->second << " cursor: "
3362 << p.first << dendl;
3363 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3364 }
3365 }
3366 }
3367
3368 struct RefCountCallback : public Context {
3369 public:
3370 PrimaryLogPG::OpContext *ctx;
3371 OSDOp& osd_op;
3372 bool requeue = false;
3373
3374 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3375 : ctx(ctx), osd_op(osd_op) {}
3376 void finish(int r) override {
3377 // NB: caller must already have pg->lock held
3378 ctx->obc->stop_block();
3379 ctx->pg->kick_object_context_blocked(ctx->obc);
3380 if (r >= 0) {
3381 osd_op.rval = 0;
3382 ctx->pg->execute_ctx(ctx);
3383 } else {
3384 // on cancel simply toss op out,
3385 // or requeue as requested
3386 if (r != -ECANCELED) {
3387 if (ctx->op)
3388 ctx->pg->osd->reply_op_error(ctx->op, r);
3389 } else if (requeue) {
3390 if (ctx->op)
3391 ctx->pg->requeue_op(ctx->op);
3392 }
3393 ctx->pg->close_op_ctx(ctx);
3394 }
3395 }
3396 void set_requeue(bool rq) {
3397 requeue = rq;
3398 }
3399 };
3400
3401 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3402 OSDOp& osd_op;
3403
3404 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3405 }
3406
3407 int execute() override {
3408 return osd_op.rval;
3409 }
3410 };
3411
3412 struct C_SetManifestRefCountDone : public Context {
3413 RefCountCallback* cb;
3414 hobject_t soid;
3415 C_SetManifestRefCountDone(
3416 RefCountCallback* cb, hobject_t soid) : cb(cb), soid(soid) {}
3417 void finish(int r) override {
3418 if (r == -ECANCELED)
3419 return;
3420 auto pg = cb->ctx->pg;
3421 std::scoped_lock locker{*pg};
3422 auto it = pg->manifest_ops.find(soid);
3423 if (it == pg->manifest_ops.end()) {
3424 // raced with cancel_manifest_ops
3425 return;
3426 }
3427 pg->manifest_ops.erase(it);
3428 cb->complete(r);
3429 }
3430 };
3431
3432 void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3433 {
3434 dout(10) << __func__ << dendl;
3435 auto p = manifest_ops.begin();
3436 while (p != manifest_ops.end()) {
3437 auto mop = p->second;
3438 // cancel objecter op, if we can
3439 if (mop->objecter_tid) {
3440 tids->push_back(mop->objecter_tid);
3441 mop->objecter_tid = 0;
3442 }
3443 mop->cb->set_requeue(requeue);
3444 mop->cb->complete(-ECANCELED);
3445 manifest_ops.erase(p++);
3446 }
3447 }
3448
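/**
 * Adjust the refcount of a manifest chunk in the base pool via the
 * "cas" object class: chunk_get when get is true, chunk_put otherwise.
 * When a callback is supplied, the object is blocked until the
 * mutation completes and the ManifestOp is tracked in manifest_ops.
 */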
3449 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
3450 SnapContext snapc, bool get, RefCountCallback *cb, uint64_t offset)
3451 {
3452 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3453 CEPH_OSD_FLAG_RWORDERED;
3454
3455 dout(10) << __func__ << " Start refcount for " << soid << dendl;
3456
3457 ObjectOperation obj_op;
3458 bufferlist in;
3459 if (get) {
3460 cls_chunk_refcount_get_op call;
3461 call.source = obc->obs.oi.soid;
3462 ::encode(call, in);
3463 obj_op.call("cas", "chunk_get", in);
3464 } else {
3465 cls_chunk_refcount_put_op call;
3466 call.source = obc->obs.oi.soid;
3467 ::encode(call, in);
3468 obj_op.call("cas", "chunk_put", in);
3469 }
3470
3471 Context *c = nullptr;
3472 if (cb) {
3473 C_SetManifestRefCountDone *fin =
3474 new C_SetManifestRefCountDone(cb, obc->obs.oi.soid);
3475 c = new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard()));
3476 }
3477
3478 auto tid = osd->objecter->mutate(
3479 soid.oid, oloc, obj_op, snapc,
3480 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
3481 flags, c);
3482 if (cb) {
3483 manifest_ops[obc->obs.oi.soid] = std::make_shared<ManifestOp>(cb, tid);
3484 obc->start_block();
3485 }
3486 }
3487
3488 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3489 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3490 uint64_t req_total_len, bool write_ordered)
3491 {
3492 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3493 object_manifest_t *manifest = &obc->obs.oi.manifest;
3494 if (!manifest->chunk_map.count(chunk_index)) {
3495 return;
3496 }
3497 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3498 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3499 hobject_t ori_soid = m->get_hobj();
3500 object_locator_t oloc(soid);
3501 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3502 if (write_ordered) {
3503 flags |= CEPH_OSD_FLAG_RWORDERED;
3504 }
3505
3506 if (!chunk_length || soid == hobject_t()) {
3507 return;
3508 }
3509
3510 /* same as do_proxy_read() */
3511 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3512 CEPH_OSD_FLAG_ORDERSNAP |
3513 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3514 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3515
3516 dout(10) << __func__ << " Start chunked proxy read for " << *m
3517 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3518 << " req_length: " << req_length << dendl;
3519
3520 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3521
3522 ObjectOperation *pobj_op = new ObjectOperation;
3523 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3524
3525 if (chunk_index <= req_offset) {
3526 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3527 } else {
3528 ceph_abort_msg("chunk_index > req_offset");
3529 }
3530 osd_op.op.extent.length = req_length;
3531
3532 ObjectOperation obj_op;
3533 obj_op.dup(pobj_op->ops);
3534
3535 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3536 prdop);
3537 fin->obj_op = pobj_op;
3538 fin->op_index = op_index;
3539 fin->req_offset = req_offset;
3540 fin->obc = obc;
3541 fin->req_total_len = req_total_len;
3542
3543 ceph_tid_t tid = osd->objecter->read(
3544 soid.oid, oloc, obj_op,
3545 m->get_snapid(), NULL,
3546 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3547 &prdop->user_version,
3548 &prdop->data_offset,
3549 m->get_features());
3550 fin->tid = tid;
3551 prdop->objecter_tid = tid;
3552 proxyread_ops[tid] = prdop;
3553 in_progress_proxy_ops[ori_soid].push_back(op);
3554 }
3555
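/**
 * Return true when every READ/SYNC_READ extent in the op is fully
 * covered by chunk_map entries that are missing locally (i.e. the
 * whole request can be satisfied by proxying to the chunk objects);
 * any other op type disqualifies the request.
 */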
3556 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3557 {
3558 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3559 OSDOp *osd_op = NULL;
3560 bool ret = true;
3561 for (unsigned int i = 0; i < m->ops.size(); i++) {
3562 osd_op = &m->ops[i];
3563 ceph_osd_op op = osd_op->op;
3564 switch (op.op) {
3565 case CEPH_OSD_OP_READ:
3566 case CEPH_OSD_OP_SYNC_READ: {
3567 uint64_t cursor = osd_op->op.extent.offset;
3568 uint64_t remain = osd_op->op.extent.length;
3569
3570 /* do the requested chunks exist in the chunk_map? */
3571 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3572 if (p.first <= cursor && p.first + p.second.length > cursor) {
3573 if (!p.second.is_missing()) {
3574 return false;
3575 }
3576 if (p.second.length >= remain) {
3577 remain = 0;
3578 break;
3579 } else {
3580 remain = remain - p.second.length;
3581 }
3582 cursor += p.second.length;
3583 }
3584 }
3585
3586 if (remain) {
3587 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3588 return false;
3589 }
3590 continue;
3591 }
3592 default:
3593 return false;
3594 }
3595 }
3596 return ret;
3597 }
3598
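/**
 * Completion path for a proxied write: drop the tracking state, send
 * the commit reply to the client (claiming the proxied out-data), and
 * bump the tier proxy-write counter.
 */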
3599 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3600 {
3601 dout(10) << __func__ << " " << oid << " tid " << tid
3602 << " " << cpp_strerror(r) << dendl;
3603
3604 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3605 if (p == proxywrite_ops.end()) {
3606 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3607 return;
3608 }
3609 ProxyWriteOpRef pwop = p->second;
3610 ceph_assert(tid == pwop->objecter_tid);
3611 ceph_assert(oid == pwop->soid);
3612
3613 proxywrite_ops.erase(tid);
3614
3615 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3616 if (q == in_progress_proxy_ops.end()) {
3617 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3618 delete pwop->ctx;
3619 pwop->ctx = NULL;
3620 return;
3621 }
3622 list<OpRequestRef>& in_progress_op = q->second;
3623 ceph_assert(in_progress_op.size());
3624 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3625 in_progress_op.end(),
3626 pwop->op);
3627 ceph_assert(it != in_progress_op.end());
3628 in_progress_op.erase(it);
3629 if (in_progress_op.size() == 0) {
3630 in_progress_proxy_ops.erase(oid);
3631 } else if (std::find(in_progress_op.begin(),
3632 in_progress_op.end(),
3633 pwop->op) != in_progress_op.end()) {
3634 if (pwop->ctx)
3635 delete pwop->ctx;
3636 pwop->ctx = NULL;
3637 dout(20) << __func__ << " " << oid << " tid " << tid
3638 << " in_progress_op size: "
3639 << in_progress_op.size() << dendl;
3640 return;
3641 }
3642
3643 osd->logger->inc(l_osd_tier_proxy_write);
3644
3645 auto m = pwop->op->get_req<MOSDOp>();
3646 ceph_assert(m != NULL);
3647
3648 if (!pwop->sent_reply) {
3649 // send commit.
3650 ceph_assert(pwop->ctx->reply == nullptr);
3651 MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3652 true /* we claim it below */);
3653 reply->set_reply_versions(eversion_t(), pwop->user_version);
3654 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3655 reply->claim_op_out_data(pwop->ops);
3656 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3657 osd->send_message_osd_client(reply, m->get_connection());
3658 pwop->sent_reply = true;
3659 pwop->ctx->op->mark_commit_sent();
3660 }
3661
3662 delete pwop->ctx;
3663 pwop->ctx = NULL;
3664 }
3665
3666 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3667 vector<ceph_tid_t> *tids)
3668 {
3669 dout(10) << __func__ << " " << pwop->soid << dendl;
3670 pwop->canceled = true;
3671
3672 // cancel objecter op, if we can
3673 if (pwop->objecter_tid) {
3674 tids->push_back(pwop->objecter_tid);
3675 delete pwop->ctx;
3676 pwop->ctx = NULL;
3677 proxywrite_ops.erase(pwop->objecter_tid);
3678 pwop->objecter_tid = 0;
3679 }
3680 }
3681
3682 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3683 ObjectContextRef obc;
3684 PrimaryLogPG *pg;
3685 utime_t start;
3686 public:
3687 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3688 : obc(obc_),
3689 pg(pg_),
3690 start(ceph_clock_now()) {}
3691
3692 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3693 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3694 int r = results.get<0>();
3695 pg->finish_promote(r, results_data, obc);
3696 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3697 }
3698 };
3699
3700 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3701 ObjectContextRef obc;
3702 PrimaryLogPG *pg;
3703 utime_t start;
3704 PrimaryLogPG::OpContext *ctx;
3705 PrimaryLogPG::CopyCallbackResults promote_results;
3706 public:
3707 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3708 : obc(obc_),
3709 pg(pg_),
3710 start(ceph_clock_now()), ctx(ctx) {}
3711
3712 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3713 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3714 int r = results.get<0>();
3715 if (ctx) {
3716 promote_results = results;
3717 pg->execute_ctx(ctx);
3718 } else {
3719 pg->finish_promote_manifest(r, results_data, obc);
3720 }
3721 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3722 }
3723 friend struct PromoteFinisher;
3724 };
3725
3726 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3727 PromoteManifestCallback *promote_callback;
3728
3729 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3730 : promote_callback(promote_callback) {
3731 }
3732
3733 int execute() override {
3734 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3735 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3736 promote_callback->promote_results.get<1>(),
3737 promote_callback->obc);
3738 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3739 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3740 promote_callback->promote_results.get<1>(),
3741 promote_callback->obc);
3742 } else {
3743 ceph_abort_msg("unrecognized manifest type");
3744 }
3745 return 0;
3746 }
3747 };
3748
3749 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3750 const hobject_t& missing_oid,
3751 const object_locator_t& oloc,
3752 OpRequestRef op,
3753 ObjectContextRef *promote_obc)
3754 {
3755 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3756 ceph_assert(hoid != hobject_t());
3757 if (write_blocked_by_scrub(hoid)) {
3758 dout(10) << __func__ << " " << hoid
3759 << " blocked by scrub" << dendl;
3760 if (op) {
3761 waiting_for_scrub.push_back(op);
3762 op->mark_delayed("waiting for scrub");
3763 dout(10) << __func__ << " " << hoid
3764 << " placing op in waiting_for_scrub" << dendl;
3765 } else {
3766 dout(10) << __func__ << " " << hoid
3767 << " no op, dropping on the floor" << dendl;
3768 }
3769 return;
3770 }
3771 if (op && !check_laggy_requeue(op)) {
3772 return;
3773 }
3774 if (!obc) { // we need to create an ObjectContext
3775 ceph_assert(missing_oid != hobject_t());
3776 obc = get_object_context(missing_oid, true);
3777 }
3778 if (promote_obc)
3779 *promote_obc = obc;
3780
3781 /*
3782 * If there are proxy-reads in flight for this object while the promote
3783 * is still in progress, don't use DONTNEED: that data is about to be read.
3784 */
3785 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3786 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3787 if (q == in_progress_proxy_ops.end()) {
3788 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3789 }
3790
3791 CopyCallback *cb;
3792 object_locator_t my_oloc;
3793 hobject_t src_hoid;
3794 if (!obc->obs.oi.has_manifest()) {
3795 my_oloc = oloc;
3796 my_oloc.pool = pool.info.tier_of;
3797 src_hoid = obc->obs.oi.soid;
3798 cb = new PromoteCallback(obc, this);
3799 } else {
3800 if (obc->obs.oi.manifest.is_chunked()) {
3801 src_hoid = obc->obs.oi.soid;
3802 cb = new PromoteManifestCallback(obc, this);
3803 } else if (obc->obs.oi.manifest.is_redirect()) {
3804 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3805 my_oloc = src_oloc;
3806 src_hoid = obc->obs.oi.manifest.redirect_target;
3807 cb = new PromoteCallback(obc, this);
3808 } else {
3809 ceph_abort_msg("unrecognized manifest type");
3810 }
3811 }
3812
3813 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3814 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3815 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3816 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3817 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3818 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3819 src_fadvise_flags, 0);
3820
3821 ceph_assert(obc->is_blocked());
3822
3823 if (op)
3824 wait_for_blocked_object(obc->obs.oi.soid, op);
3825
3826 recovery_state.update_stats(
3827 [](auto &history, auto &stats) {
3828 stats.stats.sum.num_promote++;
3829 return false;
3830 });
3831 }
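/*
 * Rough sketch of the promote flow above: the copy source is the base-tier
 * object itself for a plain cache promotion (my_oloc.pool =
 * pool.info.tier_of), the redirect target for a redirect manifest, or the
 * object itself for a chunked manifest.  start_copy() leaves the obc
 * blocked; any client op that raced in is parked via
 * wait_for_blocked_object() and requeued once the PromoteCallback (or
 * PromoteManifestCallback) fires and finish_promote*() unblocks the object.
 */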
3832
3833 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3834 {
3835 FUNCTRACE(cct);
3836 dout(10) << __func__ << " " << ctx << dendl;
3837 ctx->reset_obs(ctx->obc);
3838 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3839 OpRequestRef op = ctx->op;
3840 auto m = op->get_req<MOSDOp>();
3841 ObjectContextRef obc = ctx->obc;
3842 const hobject_t& soid = obc->obs.oi.soid;
3843
3844 // this method must be idempotent since we may call it several times
3845 // before we finally apply the resulting transaction.
3846 ctx->op_t.reset(new PGTransaction);
3847
3848 if (op->may_write() || op->may_cache()) {
3849 // snap
3850 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3851 pool.info.is_pool_snaps_mode()) {
3852 // use pool's snapc
3853 ctx->snapc = pool.snapc;
3854 } else {
3855 // client specified snapc
3856 ctx->snapc.seq = m->get_snap_seq();
3857 ctx->snapc.snaps = m->get_snaps();
3858 filter_snapc(ctx->snapc.snaps);
3859 }
3860 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3861 ctx->snapc.seq < obc->ssc->snapset.seq) {
3862 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3863 << " < snapset seq " << obc->ssc->snapset.seq
3864 << " on " << obc->obs.oi.soid << dendl;
3865 reply_ctx(ctx, -EOLDSNAPC);
3866 return;
3867 }
3868
3869 // version
3870 ctx->at_version = get_next_version();
3871 ctx->mtime = m->get_mtime();
3872
3873 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3874 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3875 << " snapc " << ctx->snapc
3876 << " snapset " << obc->ssc->snapset
3877 << dendl;
3878 } else {
3879 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3880 << " ov " << obc->obs.oi.version
3881 << dendl;
3882 }
3883
3884 if (!ctx->user_at_version)
3885 ctx->user_at_version = obc->obs.oi.user_version;
3886 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3887
3888 {
3889 #ifdef WITH_LTTNG
3890 osd_reqid_t reqid = ctx->op->get_reqid();
3891 #endif
3892 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3893 reqid.name._num, reqid.tid, reqid.inc);
3894 }
3895
3896 int result = prepare_transaction(ctx);
3897
3898 {
3899 #ifdef WITH_LTTNG
3900 osd_reqid_t reqid = ctx->op->get_reqid();
3901 #endif
3902 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3903 reqid.name._num, reqid.tid, reqid.inc);
3904 }
3905
3906 bool pending_async_reads = !ctx->pending_async_reads.empty();
3907 if (result == -EINPROGRESS || pending_async_reads) {
3908 // come back later.
3909 if (pending_async_reads) {
3910 ceph_assert(pool.info.is_erasure());
3911 in_progress_async_reads.push_back(make_pair(op, ctx));
3912 ctx->start_async_reads(this);
3913 }
3914 return;
3915 }
3916
3917 if (result == -EAGAIN) {
3918 // clean up after the ctx
3919 close_op_ctx(ctx);
3920 return;
3921 }
3922
3923 bool ignore_out_data = false;
3924 if (!ctx->op_t->empty() &&
3925 op->may_write() &&
3926 result >= 0) {
3927 // successful update
3928 if (ctx->op->allows_returnvec()) {
3929 // enforce reasonable bound on the return buffer sizes
3930 for (auto& i : *ctx->ops) {
3931 if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
3932 dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
3933 result = -EOVERFLOW; // overall result is overflow
3934 i.rval = -EOVERFLOW;
3935 i.outdata.clear();
3936 }
3937 }
3938 } else {
3939 // legacy behavior -- zero result and return data etc.
3940 ignore_out_data = true;
3941 result = 0;
3942 }
3943 }
3944
3945 // prepare the reply
3946 ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
3947 ignore_out_data);
3948 dout(20) << __func__ << " alloc reply " << ctx->reply
3949 << " result " << result << dendl;
3950
3951 // read or error?
3952 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3953 // finish side-effects
3954 if (result >= 0)
3955 do_osd_op_effects(ctx, m->get_connection());
3956
3957 complete_read_ctx(result, ctx);
3958 return;
3959 }
3960
3961 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3962
3963 ceph_assert(op->may_write() || op->may_cache());
3964
3965 // trim log?
3966 recovery_state.update_trim_to();
3967
3968 // verify that we are doing this in order?
3969 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3970 !pool.info.is_tier() && !pool.info.has_tiers()) {
3971 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3972 ceph_tid_t t = m->get_tid();
3973 client_t n = m->get_source().num();
3974 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3975 if (p == cm.end()) {
3976 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3977 cm[n] = t;
3978 } else {
3979 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3980 if (p->second > t) {
3981 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3982 ceph_abort_msg("out of order op");
3983 }
3984 p->second = t;
3985 }
3986 }
3987
3988 if (ctx->update_log_only) {
3989 if (result >= 0)
3990 do_osd_op_effects(ctx, m->get_connection());
3991
3992 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3993 // save just what we need from ctx
3994 MOSDOpReply *reply = ctx->reply;
3995 ctx->reply = nullptr;
3996 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3997
3998 if (result == -ENOENT) {
3999 reply->set_enoent_reply_versions(info.last_update,
4000 info.last_user_version);
4001 }
4002 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4003 // append to pg log for dup detection - don't save buffers for now
4004 record_write_error(op, soid, reply, result,
4005 ctx->op->allows_returnvec() ? ctx : nullptr);
4006 close_op_ctx(ctx);
4007 return;
4008 }
4009
4010 // no need to capture PG ref, repop cancel will handle that
4011 // Can capture the ctx by pointer, it's owned by the repop
4012 ctx->register_on_commit(
4013 [m, ctx, this](){
4014 if (ctx->op)
4015 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4016
4017 if (m && !ctx->sent_reply) {
4018 MOSDOpReply *reply = ctx->reply;
4019 ctx->reply = nullptr;
4020 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4021 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4022 osd->send_message_osd_client(reply, m->get_connection());
4023 ctx->sent_reply = true;
4024 ctx->op->mark_commit_sent();
4025 }
4026 });
4027 ctx->register_on_success(
4028 [ctx, this]() {
4029 do_osd_op_effects(
4030 ctx,
4031 ctx->op ? ctx->op->get_req()->get_connection() :
4032 ConnectionRef());
4033 });
4034 ctx->register_on_finish(
4035 [ctx]() {
4036 delete ctx;
4037 });
4038
4039 // issue replica writes
4040 ceph_tid_t rep_tid = osd->get_tid();
4041
4042 RepGather *repop = new_repop(ctx, obc, rep_tid);
4043
4044 issue_repop(repop, ctx);
4045 eval_repop(repop);
4046 repop->put();
4047 }
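/*
 * Callback lifecycle sketch for the write path above: on_commit sends the
 * MOSDOpReply (ACK|ONDISK) once the repop commits, on_success applies the
 * side effects (watch/notify etc.) via do_osd_op_effects(), and on_finish
 * deletes the ctx.  In between, the RepGather owns the ctx, which is why
 * capturing it by raw pointer in the lambdas is safe.
 */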
4048
4049 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4050 release_object_locks(ctx->lock_manager);
4051
4052 ctx->op_t.reset();
4053
4054 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4055 ctx->on_finish.erase(p++)) {
4056 (*p)();
4057 }
4058 delete ctx;
4059 }
4060
4061 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4062 {
4063 if (ctx->op)
4064 osd->reply_op_error(ctx->op, r);
4065 close_op_ctx(ctx);
4066 }
4067
4068 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4069 const uint64_t inb,
4070 const uint64_t outb)
4071 {
4072 auto m = op.get_req<MOSDOp>();
4073 const utime_t now = ceph_clock_now();
4074
4075 const utime_t latency = now - m->get_recv_stamp();
4076 const utime_t process_latency = now - op.get_dequeued_time();
4077
4078 osd->logger->inc(l_osd_op);
4079
4080 osd->logger->inc(l_osd_op_outb, outb);
4081 osd->logger->inc(l_osd_op_inb, inb);
4082 osd->logger->tinc(l_osd_op_lat, latency);
4083 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4084
4085 if (op.may_read() && op.may_write()) {
4086 osd->logger->inc(l_osd_op_rw);
4087 osd->logger->inc(l_osd_op_rw_inb, inb);
4088 osd->logger->inc(l_osd_op_rw_outb, outb);
4089 osd->logger->tinc(l_osd_op_rw_lat, latency);
4090 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4091 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4092 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4093 } else if (op.may_read()) {
4094 osd->logger->inc(l_osd_op_r);
4095 osd->logger->inc(l_osd_op_r_outb, outb);
4096 osd->logger->tinc(l_osd_op_r_lat, latency);
4097 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4098 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4099 } else if (op.may_write() || op.may_cache()) {
4100 osd->logger->inc(l_osd_op_w);
4101 osd->logger->inc(l_osd_op_w_inb, inb);
4102 osd->logger->tinc(l_osd_op_w_lat, latency);
4103 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4104 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4105 } else {
4106 ceph_abort();
4107 }
4108
4109 dout(15) << "log_op_stats " << *m
4110 << " inb " << inb
4111 << " outb " << outb
4112 << " lat " << latency << dendl;
4113
4114 if (m_dynamic_perf_stats.is_enabled()) {
4115 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4116 }
4117 }
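/*
 * The two latencies above measure different spans: 'latency' runs from the
 * message receive stamp, 'process_latency' from when the op was dequeued
 * for processing.  E.g. an op received at t=0, dequeued at t=2ms and
 * completed at t=5ms logs lat=5ms and process_lat=3ms.
 */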
4118
4119 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4120 const std::list<OSDPerfMetricQuery> &queries)
4121 {
4122 m_dynamic_perf_stats.set_queries(queries);
4123 }
4124
4125 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4126 {
4127 std::swap(m_dynamic_perf_stats, *stats);
4128 }
4129
4130 void PrimaryLogPG::do_scan(
4131 OpRequestRef op,
4132 ThreadPool::TPHandle &handle)
4133 {
4134 auto m = op->get_req<MOSDPGScan>();
4135 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4136 dout(10) << "do_scan " << *m << dendl;
4137
4138 op->mark_started();
4139
4140 switch (m->op) {
4141 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4142 {
4143 auto dpp = get_dpp();
4144 if (osd->check_backfill_full(dpp)) {
4145 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4146 queue_peering_event(
4147 PGPeeringEventRef(
4148 std::make_shared<PGPeeringEvent>(
4149 get_osdmap_epoch(),
4150 get_osdmap_epoch(),
4151 PeeringState::BackfillTooFull())));
4152 return;
4153 }
4154
4155 BackfillInterval bi;
4156 bi.begin = m->begin;
4157 // No need to flush, there won't be any in-progress writes occurring
4158 // past m->begin
4159 scan_range(
4160 cct->_conf->osd_backfill_scan_min,
4161 cct->_conf->osd_backfill_scan_max,
4162 &bi,
4163 handle);
4164 MOSDPGScan *reply = new MOSDPGScan(
4165 MOSDPGScan::OP_SCAN_DIGEST,
4166 pg_whoami,
4167 get_osdmap_epoch(), m->query_epoch,
4168 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4169 encode(bi.objects, reply->get_data());
4170 osd->send_message_osd_cluster(reply, m->get_connection());
4171 }
4172 break;
4173
4174 case MOSDPGScan::OP_SCAN_DIGEST:
4175 {
4176 pg_shard_t from = m->from;
4177
4178 // Check that from is in backfill_targets vector
4179 ceph_assert(is_backfill_target(from));
4180
4181 BackfillInterval& bi = peer_backfill_info[from];
4182 bi.begin = m->begin;
4183 bi.end = m->end;
4184 auto p = m->get_data().cbegin();
4185
4186 // take care to preserve ordering!
4187 bi.clear_objects();
4188 ::decode_noclear(bi.objects, p);
4189
4190 if (waiting_on_backfill.erase(from)) {
4191 if (waiting_on_backfill.empty()) {
4192 ceph_assert(
4193 peer_backfill_info.size() ==
4194 get_backfill_targets().size());
4195 finish_recovery_op(hobject_t::get_max());
4196 }
4197 } else {
4198 // we canceled backfill for a while because the target was too full, and
4199 // this is an extra response from a non-too-full peer
4200 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4201 }
4202 }
4203 break;
4204 }
4205 }
4206
4207 void PrimaryLogPG::do_backfill(OpRequestRef op)
4208 {
4209 auto m = op->get_req<MOSDPGBackfill>();
4210 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4211 dout(10) << "do_backfill " << *m << dendl;
4212
4213 op->mark_started();
4214
4215 switch (m->op) {
4216 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4217 {
4218 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4219
4220 MOSDPGBackfill *reply = new MOSDPGBackfill(
4221 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4222 get_osdmap_epoch(),
4223 m->query_epoch,
4224 spg_t(info.pgid.pgid, get_primary().shard));
4225 reply->set_priority(get_recovery_op_priority());
4226 osd->send_message_osd_cluster(reply, m->get_connection());
4227 queue_peering_event(
4228 PGPeeringEventRef(
4229 std::make_shared<PGPeeringEvent>(
4230 get_osdmap_epoch(),
4231 get_osdmap_epoch(),
4232 RecoveryDone())));
4233 }
4234 // fall-thru
4235
4236 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4237 {
4238 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4239
4240 ObjectStore::Transaction t;
4241 recovery_state.update_backfill_progress(
4242 m->last_backfill,
4243 m->stats,
4244 m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4245 t);
4246
4247 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4248 ceph_assert(tr == 0);
4249 }
4250 break;
4251
4252 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4253 {
4254 ceph_assert(is_primary());
4255 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4256 finish_recovery_op(hobject_t::get_max());
4257 }
4258 break;
4259 }
4260 }
4261
4262 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4263 {
4264 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4265 op->get_req());
4266 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4267 dout(7) << __func__ << " " << m->ls << dendl;
4268
4269 op->mark_started();
4270
4271 ObjectStore::Transaction t;
4272 for (auto& p : m->ls) {
4273 if (is_remote_backfilling()) {
4274 struct stat st;
4275 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4276 pg_whoami.shard) , &st);
4277 if (r == 0) {
4278 sub_local_num_bytes(st.st_size);
4279 int64_t usersize;
4280 if (pool.info.is_erasure()) {
4281 bufferlist bv;
4282 int r = osd->store->getattr(
4283 ch,
4284 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4285 OI_ATTR,
4286 bv);
4287 if (r >= 0) {
4288 object_info_t oi(bv);
4289 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4290 } else {
4291 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4292 << " can't get object info" << dendl;
4293 usersize = 0;
4294 }
4295 } else {
4296 usersize = st.st_size;
4297 }
4298 sub_num_bytes(usersize);
4299 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4300 << " sub actual data by " << st.st_size
4301 << " sub num_bytes by " << usersize
4302 << dendl;
4303 }
4304 }
4305 remove_snap_mapped_object(t, p.first);
4306 }
4307 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4308 ceph_assert(r == 0);
4309 }
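/*
 * Numeric example for the stat adjustment above (hypothetical pool): on a
 * k=4, m=2 erasure-coded pool, a shard whose object_info records
 * oi.size = 256 KiB accounts for usersize = 256 KiB * 4 = 1 MiB of
 * user-visible bytes, whereas sub_local_num_bytes() uses the raw on-disk
 * shard size reported by stat().
 */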
4310
4311 int PrimaryLogPG::trim_object(
4312 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4313 PrimaryLogPG::OpContextUPtr *ctxp)
4314 {
4315 *ctxp = NULL;
4316
4317 // load clone info
4318 bufferlist bl;
4319 ObjectContextRef obc = get_object_context(coid, false, NULL);
4320 if (!obc || !obc->ssc || !obc->ssc->exists) {
4321 osd->clog->error() << __func__ << ": Cannot trim " << coid
4322 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4323 return -ENOENT;
4324 }
4325
4326 hobject_t head_oid = coid.get_head();
4327 ObjectContextRef head_obc = get_object_context(head_oid, false);
4328 if (!head_obc) {
4329 osd->clog->error() << __func__ << ": Cannot trim " << coid
4330 << " repair needed, no snapset obc for " << head_oid;
4331 return -ENOENT;
4332 }
4333
4334 SnapSet& snapset = obc->ssc->snapset;
4335
4336 object_info_t &coi = obc->obs.oi;
4337 auto citer = snapset.clone_snaps.find(coid.snap);
4338 if (citer == snapset.clone_snaps.end()) {
4339 osd->clog->error() << "No clone_snaps in snapset " << snapset
4340 << " for object " << coid << "\n";
4341 return -ENOENT;
4342 }
4343 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4344 if (old_snaps.empty()) {
4345 osd->clog->error() << "No object info snaps for object " << coid;
4346 return -ENOENT;
4347 }
4348
4349 dout(10) << coid << " old_snaps " << old_snaps
4350 << " old snapset " << snapset << dendl;
4351 if (snapset.seq == 0) {
4352 osd->clog->error() << "No snapset.seq for object " << coid;
4353 return -ENOENT;
4354 }
4355
4356 set<snapid_t> new_snaps;
4357 const OSDMapRef& osdmap = get_osdmap();
4358 for (set<snapid_t>::iterator i = old_snaps.begin();
4359 i != old_snaps.end();
4360 ++i) {
4361 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4362 *i != snap_to_trim) {
4363 new_snaps.insert(*i);
4364 }
4365 }
4366
4367 vector<snapid_t>::iterator p = snapset.clones.end();
4368
4369 if (new_snaps.empty()) {
4370 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4371 if (p == snapset.clones.end()) {
4372 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4373 return -ENOENT;
4374 }
4375 }
4376
4377 OpContextUPtr ctx = simple_opc_create(obc);
4378 ctx->head_obc = head_obc;
4379
4380 if (!ctx->lock_manager.get_snaptrimmer_write(
4381 coid,
4382 obc,
4383 first)) {
4384 close_op_ctx(ctx.release());
4385 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4386 return -ENOLCK;
4387 }
4388
4389 if (!ctx->lock_manager.get_snaptrimmer_write(
4390 head_oid,
4391 head_obc,
4392 first)) {
4393 close_op_ctx(ctx.release());
4394 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4395 return -ENOLCK;
4396 }
4397
4398 ctx->at_version = get_next_version();
4399
4400 PGTransaction *t = ctx->op_t.get();
4401
4402 if (new_snaps.empty()) {
4403 // remove clone
4404 dout(10) << coid << " snaps " << old_snaps << " -> "
4405 << new_snaps << " ... deleting" << dendl;
4406
4407 // ...from snapset
4408 ceph_assert(p != snapset.clones.end());
4409
4410 snapid_t last = coid.snap;
4411 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4412
4413 if (p != snapset.clones.begin()) {
4414 // not the oldest... merge overlap into next older clone
4415 vector<snapid_t>::iterator n = p - 1;
4416 hobject_t prev_coid = coid;
4417 prev_coid.snap = *n;
4418 bool adjust_prev_bytes = is_present_clone(prev_coid);
4419
4420 if (adjust_prev_bytes)
4421 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4422
4423 snapset.clone_overlap[*n].intersection_of(
4424 snapset.clone_overlap[*p]);
4425
4426 if (adjust_prev_bytes)
4427 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4428 }
4429 ctx->delta_stats.num_objects--;
4430 if (coi.is_dirty())
4431 ctx->delta_stats.num_objects_dirty--;
4432 if (coi.is_omap())
4433 ctx->delta_stats.num_objects_omap--;
4434 if (coi.is_whiteout()) {
4435 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4436 ctx->delta_stats.num_whiteouts--;
4437 }
4438 ctx->delta_stats.num_object_clones--;
4439 if (coi.is_cache_pinned())
4440 ctx->delta_stats.num_objects_pinned--;
4441 if (coi.has_manifest())
4442 ctx->delta_stats.num_objects_manifest--;
4443 obc->obs.exists = false;
4444
4445 snapset.clones.erase(p);
4446 snapset.clone_overlap.erase(last);
4447 snapset.clone_size.erase(last);
4448 snapset.clone_snaps.erase(last);
4449
4450 ctx->log.push_back(
4451 pg_log_entry_t(
4452 pg_log_entry_t::DELETE,
4453 coid,
4454 ctx->at_version,
4455 ctx->obs->oi.version,
4456 0,
4457 osd_reqid_t(),
4458 ctx->mtime,
4459 0)
4460 );
4461 t->remove(coid);
4462 t->update_snaps(
4463 coid,
4464 old_snaps,
4465 new_snaps);
4466
4467 coi = object_info_t(coid);
4468
4469 ctx->at_version.version++;
4470 } else {
4471 // save adjusted snaps for this object
4472 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4473 snapset.clone_snaps[coid.snap] =
4474 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4475 // we still do a 'modify' event on this object just to trigger a
4476 // snapmapper.update ... :(
4477
4478 coi.prior_version = coi.version;
4479 coi.version = ctx->at_version;
4480 bl.clear();
4481 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4482 t->setattr(coid, OI_ATTR, bl);
4483
4484 ctx->log.push_back(
4485 pg_log_entry_t(
4486 pg_log_entry_t::MODIFY,
4487 coid,
4488 coi.version,
4489 coi.prior_version,
4490 0,
4491 osd_reqid_t(),
4492 ctx->mtime,
4493 0)
4494 );
4495 ctx->at_version.version++;
4496
4497 t->update_snaps(
4498 coid,
4499 old_snaps,
4500 new_snaps);
4501 }
4502
4503 // save head snapset
4504 dout(10) << coid << " new snapset " << snapset << " on "
4505 << head_obc->obs.oi << dendl;
4506 if (snapset.clones.empty() &&
4507 (head_obc->obs.oi.is_whiteout() &&
4508 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4509 !head_obc->obs.oi.is_cache_pinned())) {
4510 // NOTE: this arguably constitutes minor interference with the
4511 // tiering agent if this is a cache tier since a snap trim event
4512 // is effectively evicting a whiteout we might otherwise want to
4513 // keep around.
4514 dout(10) << coid << " removing " << head_oid << dendl;
4515 ctx->log.push_back(
4516 pg_log_entry_t(
4517 pg_log_entry_t::DELETE,
4518 head_oid,
4519 ctx->at_version,
4520 head_obc->obs.oi.version,
4521 0,
4522 osd_reqid_t(),
4523 ctx->mtime,
4524 0)
4525 );
4526 derr << "removing snap head" << dendl;
4527 object_info_t& oi = head_obc->obs.oi;
4528 ctx->delta_stats.num_objects--;
4529 if (oi.is_dirty()) {
4530 ctx->delta_stats.num_objects_dirty--;
4531 }
4532 if (oi.is_omap())
4533 ctx->delta_stats.num_objects_omap--;
4534 if (oi.is_whiteout()) {
4535 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4536 ctx->delta_stats.num_whiteouts--;
4537 }
4538 if (oi.is_cache_pinned()) {
4539 ctx->delta_stats.num_objects_pinned--;
4540 }
4541 if (oi.has_manifest())
4542 ctx->delta_stats.num_objects_manifest--;
4543 head_obc->obs.exists = false;
4544 head_obc->obs.oi = object_info_t(head_oid);
4545 t->remove(head_oid);
4546 } else {
4547 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4548 // filter SnapSet::snaps for the benefit of pre-octopus
4549 // peers. This is perhaps overly conservative in that I'm not
4550 // certain they need this, but let's err on the safe side.
4551 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4552 snapset.filter(pool.info);
4553 } else {
4554 snapset.snaps.clear();
4555 }
4556 dout(10) << coid << " writing updated snapset on " << head_oid
4557 << ", snapset is " << snapset << dendl;
4558 ctx->log.push_back(
4559 pg_log_entry_t(
4560 pg_log_entry_t::MODIFY,
4561 head_oid,
4562 ctx->at_version,
4563 head_obc->obs.oi.version,
4564 0,
4565 osd_reqid_t(),
4566 ctx->mtime,
4567 0)
4568 );
4569
4570 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4571 head_obc->obs.oi.version = ctx->at_version;
4572
4573 map <string, bufferlist> attrs;
4574 bl.clear();
4575 encode(snapset, bl);
4576 attrs[SS_ATTR].claim(bl);
4577
4578 bl.clear();
4579 encode(head_obc->obs.oi, bl,
4580 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4581 attrs[OI_ATTR].claim(bl);
4582 t->setattrs(head_oid, attrs);
4583 }
4584
4585 *ctxp = std::move(ctx);
4586 return 0;
4587 }
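/*
 * Worked example of the two trim outcomes above (hypothetical snap ids):
 * a clone pinned by snaps {4,7} trimmed for snap 4 keeps new_snaps = {7}
 * and takes the MODIFY path (clone_snaps rewritten, snaps remapped);
 * trimming it again for snap 7 leaves new_snaps empty, so the clone is
 * deleted, its overlap merged into the next older clone, and, if no
 * clones remain and the head is a plain whiteout, the head object is
 * removed as well.
 */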
4588
4589 void PrimaryLogPG::kick_snap_trim()
4590 {
4591 ceph_assert(is_active());
4592 ceph_assert(is_primary());
4593 if (is_clean() &&
4594 !state_test(PG_STATE_PREMERGE) &&
4595 !snap_trimq.empty()) {
4596 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4597 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4598 } else {
4599 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4600 snap_trimmer_machine.process_event(KickTrim());
4601 }
4602 }
4603 }
4604
4605 void PrimaryLogPG::snap_trimmer_scrub_complete()
4606 {
4607 if (is_primary() && is_active() && is_clean()) {
4608 ceph_assert(!snap_trimq.empty());
4609 snap_trimmer_machine.process_event(ScrubComplete());
4610 }
4611 }
4612
4613 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4614 {
4615 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4616 return;
4617 }
4618
4619 ceph_assert(is_primary());
4620
4621 dout(10) << "snap_trimmer posting" << dendl;
4622 snap_trimmer_machine.process_event(DoSnapWork());
4623 dout(10) << "snap_trimmer complete" << dendl;
4624 return;
4625 }
4626
4627 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4628 {
4629 __u64 v2;
4630
4631 string v2s(xattr.c_str(), xattr.length());
4632 if (v2s.length())
4633 v2 = strtoull(v2s.c_str(), NULL, 10);
4634 else
4635 v2 = 0;
4636
4637 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4638
4639 switch (op) {
4640 case CEPH_OSD_CMPXATTR_OP_EQ:
4641 return (v1 == v2);
4642 case CEPH_OSD_CMPXATTR_OP_NE:
4643 return (v1 != v2);
4644 case CEPH_OSD_CMPXATTR_OP_GT:
4645 return (v1 > v2);
4646 case CEPH_OSD_CMPXATTR_OP_GTE:
4647 return (v1 >= v2);
4648 case CEPH_OSD_CMPXATTR_OP_LT:
4649 return (v1 < v2);
4650 case CEPH_OSD_CMPXATTR_OP_LTE:
4651 return (v1 <= v2);
4652 default:
4653 return -EINVAL;
4654 }
4655 }
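/*
 * Example: with xattr bytes "123" and v1 = 123, CEPH_OSD_CMPXATTR_OP_EQ
 * returns 1 (match).  An absent or empty xattr compares as 0, so v1 = 0
 * with OP_EQ also matches; anything non-numeric decodes per strtoull()
 * semantics.
 */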
4656
4657 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4658 {
4659 string v2s(xattr.c_str(), xattr.length());
4660
4661 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4662
4663 switch (op) {
4664 case CEPH_OSD_CMPXATTR_OP_EQ:
4665 return (v1s.compare(v2s) == 0);
4666 case CEPH_OSD_CMPXATTR_OP_NE:
4667 return (v1s.compare(v2s) != 0);
4668 case CEPH_OSD_CMPXATTR_OP_GT:
4669 return (v1s.compare(v2s) > 0);
4670 case CEPH_OSD_CMPXATTR_OP_GTE:
4671 return (v1s.compare(v2s) >= 0);
4672 case CEPH_OSD_CMPXATTR_OP_LT:
4673 return (v1s.compare(v2s) < 0);
4674 case CEPH_OSD_CMPXATTR_OP_LTE:
4675 return (v1s.compare(v2s) <= 0);
4676 default:
4677 return -EINVAL;
4678 }
4679 }
4680
4681 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4682 {
4683 ceph_osd_op& op = osd_op.op;
4684 vector<OSDOp> write_ops(1);
4685 OSDOp& write_op = write_ops[0];
4686 uint64_t write_length = op.writesame.length;
4687 int result = 0;
4688
4689 if (!write_length)
4690 return 0;
4691
4692 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4693 return -EINVAL;
4694
4695 if (op.writesame.data_length != osd_op.indata.length()) {
4696 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4697 return -EINVAL;
4698 }
4699
4700 while (write_length) {
4701 write_op.indata.append(osd_op.indata);
4702 write_length -= op.writesame.data_length;
4703 }
4704
4705 write_op.op.op = CEPH_OSD_OP_WRITE;
4706 write_op.op.extent.offset = op.writesame.offset;
4707 write_op.op.extent.length = op.writesame.length;
4708 result = do_osd_ops(ctx, write_ops);
4709 if (result < 0)
4710 derr << "do_writesame do_osd_ops failed " << result << dendl;
4711
4712 return result;
4713 }
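/*
 * Example: writesame{offset=0, length=4096, data_length=512} with a
 * 512-byte pattern in indata expands into a single CEPH_OSD_OP_WRITE of
 * 4096 bytes holding eight back-to-back copies of the pattern.  A length
 * that is not a multiple of data_length is rejected with -EINVAL above.
 */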
4714
4715 // ========================================================================
4716 // low level osd ops
4717
4718 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4719 {
4720 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4721 bufferlist header, vals;
4722 int r = _get_tmap(ctx, &header, &vals);
4723 if (r < 0) {
4724 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4725 r = 0;
4726 return r;
4727 }
4728
4729 vector<OSDOp> ops(3);
4730
4731 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4732 ops[0].op.extent.offset = 0;
4733 ops[0].op.extent.length = 0;
4734
4735 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4736 ops[1].indata.claim(header);
4737
4738 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4739 ops[2].indata.claim(vals);
4740
4741 return do_osd_ops(ctx, ops);
4742 }
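/*
 * In effect, the conversion above rewrites the object in place: the tmap
 * payload is read back, the object data is truncated to zero, the decoded
 * header becomes the omap header, and the decoded key/value map becomes
 * omap entries.  With CEPH_OSD_TMAP2OMAP_NULLOK set, an object that never
 * had a tmap (-ENODATA) is treated as a successful no-op.
 */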
4743
4744 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4745 OSDOp& osd_op, bufferlist& bl)
4746 {
4747 // decode
4748 bufferlist header;
4749 map<string, bufferlist> m;
4750 if (bl.length()) {
4751 auto p = bl.cbegin();
4752 decode(header, p);
4753 decode(m, p);
4754 ceph_assert(p.end());
4755 }
4756
4757 // do the update(s)
4758 while (!bp.end()) {
4759 __u8 op;
4760 string key;
4761 decode(op, bp);
4762
4763 switch (op) {
4764 case CEPH_OSD_TMAP_SET: // insert key
4765 {
4766 decode(key, bp);
4767 bufferlist data;
4768 decode(data, bp);
4769 m[key] = data;
4770 }
4771 break;
4772 case CEPH_OSD_TMAP_RM: // remove key
4773 decode(key, bp);
4774 if (!m.count(key)) {
4775 return -ENOENT;
4776 }
4777 m.erase(key);
4778 break;
4779 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4780 decode(key, bp);
4781 m.erase(key);
4782 break;
4783 case CEPH_OSD_TMAP_HDR: // update header
4784 {
4785 decode(header, bp);
4786 }
4787 break;
4788 default:
4789 return -EINVAL;
4790 }
4791 }
4792
4793 // reencode
4794 bufferlist obl;
4795 encode(header, obl);
4796 encode(m, obl);
4797
4798 // write it out
4799 vector<OSDOp> nops(1);
4800 OSDOp& newop = nops[0];
4801 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4802 newop.op.extent.offset = 0;
4803 newop.op.extent.length = obl.length();
4804 newop.indata = obl;
4805 do_osd_ops(ctx, nops);
4806 return 0;
4807 }
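/*
 * Tmap wire-format note: the payload decoded above is encode(header)
 * followed by encode(map<string,bufferlist>), i.e. a 32-bit key count and
 * then sorted key/value pairs.  That is why this slow path can round-trip
 * the blob with plain decode()/encode() plus a WRITEFULL, while
 * do_tmapup() below decodes the count separately and splices the pair
 * stream in place.
 */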
4808
4809 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4810 {
4811 bufferlist::const_iterator orig_bp = bp;
4812 int result = 0;
4813 if (bp.end()) {
4814 dout(10) << "tmapup is a no-op" << dendl;
4815 } else {
4816 // read the whole object
4817 vector<OSDOp> nops(1);
4818 OSDOp& newop = nops[0];
4819 newop.op.op = CEPH_OSD_OP_READ;
4820 newop.op.extent.offset = 0;
4821 newop.op.extent.length = 0;
4822 result = do_osd_ops(ctx, nops);
4823
4824 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4825
4826 dout(30) << " starting is \n";
4827 newop.outdata.hexdump(*_dout);
4828 *_dout << dendl;
4829
4830 auto ip = newop.outdata.cbegin();
4831 bufferlist obl;
4832
4833 dout(30) << "the update command is: \n";
4834 osd_op.indata.hexdump(*_dout);
4835 *_dout << dendl;
4836
4837 // header
4838 bufferlist header;
4839 __u32 nkeys = 0;
4840 if (newop.outdata.length()) {
4841 decode(header, ip);
4842 decode(nkeys, ip);
4843 }
4844 dout(10) << "tmapup header " << header.length() << dendl;
4845
4846 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4847 ++bp;
4848 decode(header, bp);
4849 dout(10) << "tmapup new header " << header.length() << dendl;
4850 }
4851
4852 encode(header, obl);
4853
4854 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4855
4856 // update keys
4857 bufferlist newkeydata;
4858 string nextkey, last_in_key;
4859 bufferlist nextval;
4860 bool have_next = false;
4861 if (!ip.end()) {
4862 have_next = true;
4863 decode(nextkey, ip);
4864 decode(nextval, ip);
4865 }
4866 while (!bp.end() && !result) {
4867 __u8 op;
4868 string key;
4869 try {
4870 decode(op, bp);
4871 decode(key, bp);
4872 }
4873 catch (buffer::error& e) {
4874 return -EINVAL;
4875 }
4876 if (key < last_in_key) {
4877 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4878 << "', falling back to an inefficient (unsorted) update" << dendl;
4879 bp = orig_bp;
4880 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4881 }
4882 last_in_key = key;
4883
4884 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4885
4886 // skip existing intervening keys
4887 bool key_exists = false;
4888 while (have_next && !key_exists) {
4889 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4890 if (nextkey > key)
4891 break;
4892 if (nextkey < key) {
4893 // copy untouched.
4894 encode(nextkey, newkeydata);
4895 encode(nextval, newkeydata);
4896 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4897 } else {
4898 // don't copy; discard old value. and stop.
4899 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4900 key_exists = true;
4901 nkeys--;
4902 }
4903 if (!ip.end()) {
4904 decode(nextkey, ip);
4905 decode(nextval, ip);
4906 } else {
4907 have_next = false;
4908 }
4909 }
4910
4911 if (op == CEPH_OSD_TMAP_SET) {
4912 bufferlist val;
4913 try {
4914 decode(val, bp);
4915 }
4916 catch (buffer::error& e) {
4917 return -EINVAL;
4918 }
4919 encode(key, newkeydata);
4920 encode(val, newkeydata);
4921 dout(20) << " set " << key << " " << val.length() << dendl;
4922 nkeys++;
4923 } else if (op == CEPH_OSD_TMAP_CREATE) {
4924 if (key_exists) {
4925 return -EEXIST;
4926 }
4927 bufferlist val;
4928 try {
4929 decode(val, bp);
4930 }
4931 catch (buffer::error& e) {
4932 return -EINVAL;
4933 }
4934 encode(key, newkeydata);
4935 encode(val, newkeydata);
4936 dout(20) << " create " << key << " " << val.length() << dendl;
4937 nkeys++;
4938 } else if (op == CEPH_OSD_TMAP_RM) {
4939 // do nothing.
4940 if (!key_exists) {
4941 return -ENOENT;
4942 }
4943 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4944 // do nothing
4945 } else {
4946 dout(10) << " invalid tmap op " << (int)op << dendl;
4947 return -EINVAL;
4948 }
4949 }
4950
4951 // copy remaining
4952 if (have_next) {
4953 encode(nextkey, newkeydata);
4954 encode(nextval, newkeydata);
4955 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4956 }
4957 if (!ip.end()) {
4958 bufferlist rest;
4959 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4960 dout(20) << " keep trailing " << rest.length()
4961 << " at " << newkeydata.length() << dendl;
4962 newkeydata.claim_append(rest);
4963 }
4964
4965 // encode final key count + key data
4966 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4967 encode(nkeys, obl);
4968 obl.claim_append(newkeydata);
4969
4970 if (0) {
4971 dout(30) << " final is \n";
4972 obl.hexdump(*_dout);
4973 *_dout << dendl;
4974
4975 // sanity check
4976 auto tp = obl.cbegin();
4977 bufferlist h;
4978 decode(h, tp);
4979 map<string,bufferlist> d;
4980 decode(d, tp);
4981 ceph_assert(tp.end());
4982 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4983 }
4984
4985 // write it out
4986 if (!result) {
4987 dout(20) << "tmapput write " << obl.length() << dendl;
4988 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4989 newop.op.extent.offset = 0;
4990 newop.op.extent.length = obl.length();
4991 newop.indata = obl;
4992 do_osd_ops(ctx, nops);
4993 }
4994 }
4995 return result;
4996 }
4997
4998 static int check_offset_and_length(uint64_t offset, uint64_t length,
4999 uint64_t max, DoutPrefixProvider *dpp)
5000 {
5001 if (offset >= max ||
5002 length > max ||
5003 offset + length > max) {
5004 ldpp_dout(dpp, 10) << __func__ << " "
5005 << "osd_max_object_size: " << max
5006 << "; Hard limit of object size is 4GB." << dendl;
5007 return -EFBIG;
5008 }
5009
5010 return 0;
5011 }
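/*
 * Example: with osd_max_object_size = 128 MiB, a write at offset 100 MiB
 * of length 50 MiB fails this check (150 MiB > 128 MiB) and the client
 * gets -EFBIG; offset and length are also rejected individually, which,
 * for sane values of the configured maximum, keeps the sum from wrapping.
 */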
5012
5013 struct FillInVerifyExtent : public Context {
5014 ceph_le64 *r;
5015 int32_t *rval;
5016 bufferlist *outdatap;
5017 std::optional<uint32_t> maybe_crc;
5018 uint64_t size;
5019 OSDService *osd;
5020 hobject_t soid;
5021 uint32_t flags;
5022 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5023 std::optional<uint32_t> mc, uint64_t size,
5024 OSDService *osd, hobject_t soid, uint32_t flags) :
5025 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5026 size(size), osd(osd), soid(soid), flags(flags) {}
5027 void finish(int len) override {
5028 *r = len;
5029 if (len < 0) {
5030 *rval = len;
5031 return;
5032 }
5033 *rval = 0;
5034
5035 // whole object? can we verify the checksum?
5036 if (maybe_crc && *r == size) {
5037 uint32_t crc = outdatap->crc32c(-1);
5038 if (maybe_crc != crc) {
5039 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5040 << " != expected 0x" << *maybe_crc
5041 << std::dec << " on " << soid;
5042 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5043 *rval = -EIO;
5044 *r = 0;
5045 }
5046 }
5047 }
5048 }
5049 };
5050
5051 struct ToSparseReadResult : public Context {
5052 int* result;
5053 bufferlist* data_bl;
5054 uint64_t data_offset;
5055 ceph_le64* len;
5056 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5057 ceph_le64* len)
5058 : result(result), data_bl(bl), data_offset(offset), len(len) {}
5059 void finish(int r) override {
5060 if (r < 0) {
5061 *result = r;
5062 return;
5063 }
5064 *result = 0;
5065 *len = r;
5066 bufferlist outdata;
5067 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5068 encode(extents, outdata);
5069 ::encode_destructively(*data_bl, outdata);
5070 data_bl->swap(outdata);
5071 }
5072 };
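/*
 * The translated result mimics a genuine sparse-read reply: outdata
 * becomes encode(map<uint64_t,uint64_t>{{data_offset, r}}) followed by the
 * encoded data bufferlist, i.e. a single-extent map covering exactly the
 * bytes that were read.
 */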
5073
5074 template<typename V>
5075 static string list_keys(const map<string, V>& m) {
5076 string s;
5077 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5078 if (!s.empty()) {
5079 s.push_back(',');
5080 }
5081 s.append(itr->first);
5082 }
5083 return s;
5084 }
5085
5086 template<typename T>
5087 static string list_entries(const T& m) {
5088 string s;
5089 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5090 if (!s.empty()) {
5091 s.push_back(',');
5092 }
5093 s.append(*itr);
5094 }
5095 return s;
5096 }
5097
5098 void PrimaryLogPG::maybe_create_new_object(
5099 OpContext *ctx,
5100 bool ignore_transaction)
5101 {
5102 ObjectState& obs = ctx->new_obs;
5103 if (!obs.exists) {
5104 ctx->delta_stats.num_objects++;
5105 obs.exists = true;
5106 ceph_assert(!obs.oi.is_whiteout());
5107 obs.oi.new_object();
5108 if (!ignore_transaction)
5109 ctx->op_t->create(obs.oi.soid);
5110 } else if (obs.oi.is_whiteout()) {
5111 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5112 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5113 --ctx->delta_stats.num_whiteouts;
5114 }
5115 }
5116
5117 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5118 OSDOp& osd_op;
5119
5120 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5121 }
5122
5123 int execute() override {
5124 return osd_op.rval;
5125 }
5126 };
5127
5128 struct C_ChecksumRead : public Context {
5129 PrimaryLogPG *primary_log_pg;
5130 OSDOp &osd_op;
5131 Checksummer::CSumType csum_type;
5132 bufferlist init_value_bl;
5133 ceph_le64 read_length;
5134 bufferlist read_bl;
5135 Context *fill_extent_ctx;
5136
5137 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5138 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5139 std::optional<uint32_t> maybe_crc, uint64_t size,
5140 OSDService *osd, hobject_t soid, uint32_t flags)
5141 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5142 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5143 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5144 &read_bl, maybe_crc, size,
5145 osd, soid, flags)) {
5146 }
5147 ~C_ChecksumRead() override {
5148 delete fill_extent_ctx;
5149 }
5150
5151 void finish(int r) override {
5152 fill_extent_ctx->complete(r);
5153 fill_extent_ctx = nullptr;
5154
5155 if (osd_op.rval >= 0) {
5156 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5157 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5158 &init_value_bl_it, read_bl);
5159 }
5160 }
5161 };
5162
5163 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5164 bufferlist::const_iterator *bl_it)
5165 {
5166 dout(20) << __func__ << dendl;
5167
5168 auto& op = osd_op.op;
5169 if (op.checksum.chunk_size > 0) {
5170 if (op.checksum.length == 0) {
5171 dout(10) << __func__ << ": length required when chunk size provided"
5172 << dendl;
5173 return -EINVAL;
5174 }
5175 if (op.checksum.length % op.checksum.chunk_size != 0) {
5176 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5177 return -EINVAL;
5178 }
5179 }
5180
5181 auto& oi = ctx->new_obs.oi;
5182 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5183 // zeroed offset+length implies checksum whole object
5184 op.checksum.length = oi.size;
5185 } else if (op.checksum.offset >= oi.size) {
5186 // read size was trimmed to zero, do nothing
5187 // see PrimaryLogPG::do_read
5188 return 0;
5189 } else if (op.extent.offset + op.extent.length > oi.size) {
5190 op.extent.length = oi.size - op.extent.offset;
5191 if (op.checksum.chunk_size > 0 &&
5192 op.checksum.length % op.checksum.chunk_size != 0) {
5193 dout(10) << __func__ << ": length (trimmed to 0x"
5194 << std::hex << op.checksum.length
5195 << ") not aligned to chunk size 0x"
5196 << op.checksum.chunk_size << std::dec
5197 << dendl;
5198 return -EINVAL;
5199 }
5200 }
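// Note: ceph_osd_op stores its per-op fields in a union, so
// op.extent.offset/length alias op.checksum.offset/length; the
// extent-based clamp above therefore trims the checksum range in place.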
5201
5202 Checksummer::CSumType csum_type;
5203 switch (op.checksum.type) {
5204 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5205 csum_type = Checksummer::CSUM_XXHASH32;
5206 break;
5207 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5208 csum_type = Checksummer::CSUM_XXHASH64;
5209 break;
5210 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5211 csum_type = Checksummer::CSUM_CRC32C;
5212 break;
5213 default:
5214 dout(10) << __func__ << ": unknown crc type ("
5215 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5216 return -EINVAL;
5217 }
5218
5219 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5220 if (bl_it->get_remaining() < csum_init_value_size) {
5221 dout(10) << __func__ << ": init value not provided" << dendl;
5222 return -EINVAL;
5223 }
5224
5225 bufferlist init_value_bl;
5226 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5227 csum_init_value_size);
5228 *bl_it += csum_init_value_size;
5229
5230 if (pool.info.is_erasure() && op.checksum.length > 0) {
5231 // If there is a data digest and it is possible we are reading
5232 // entire object, pass the digest.
5233 std::optional<uint32_t> maybe_crc;
5234 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5235 op.checksum.length >= oi.size) {
5236 maybe_crc = oi.data_digest;
5237 }
5238
5239 // async read
5240 auto& soid = oi.soid;
5241 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5242 std::move(init_value_bl), maybe_crc,
5243 oi.size, osd, soid, op.flags);
5244
5245 ctx->pending_async_reads.push_back({
5246 {op.checksum.offset, op.checksum.length, op.flags},
5247 {&checksum_ctx->read_bl, checksum_ctx}});
5248
5249 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5250 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5251 new ReadFinisher(osd_op));
5252 return -EINPROGRESS;
5253 }
5254
5255 // sync read
5256 std::vector<OSDOp> read_ops(1);
5257 auto& read_op = read_ops[0];
5258 if (op.checksum.length > 0) {
5259 read_op.op.op = CEPH_OSD_OP_READ;
5260 read_op.op.flags = op.flags;
5261 read_op.op.extent.offset = op.checksum.offset;
5262 read_op.op.extent.length = op.checksum.length;
5263 read_op.op.extent.truncate_size = 0;
5264 read_op.op.extent.truncate_seq = 0;
5265
5266 int r = do_osd_ops(ctx, read_ops);
5267 if (r < 0) {
5268 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5269 return r;
5270 }
5271 }
5272
5273 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5274 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5275 read_op.outdata);
5276 }
5277
5278 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5279 Checksummer::CSumType csum_type,
5280 bufferlist::const_iterator *init_value_bl_it,
5281 const bufferlist &read_bl) {
5282 dout(20) << __func__ << dendl;
5283
5284 auto& op = osd_op.op;
5285
5286 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5287 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5288 << op.checksum.length << dendl;
5289 return -EINVAL;
5290 }
5291
5292 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5293 op.checksum.chunk_size : read_bl.length());
5294 uint32_t csum_count = (csum_chunk_size > 0 ?
5295 read_bl.length() / csum_chunk_size : 0);
5296
5297 bufferlist csum;
5298 bufferptr csum_data;
5299 if (csum_count > 0) {
5300 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5301 csum_data = buffer::create(csum_value_size * csum_count);
5302 csum_data.zero();
5303 csum.append(csum_data);
5304
5305 switch (csum_type) {
5306 case Checksummer::CSUM_XXHASH32:
5307 {
5308 Checksummer::xxhash32::init_value_t init_value;
5309 decode(init_value, *init_value_bl_it);
5310 Checksummer::calculate<Checksummer::xxhash32>(
5311 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5312 &csum_data);
5313 }
5314 break;
5315 case Checksummer::CSUM_XXHASH64:
5316 {
5317 Checksummer::xxhash64::init_value_t init_value;
5318 decode(init_value, *init_value_bl_it);
5319 Checksummer::calculate<Checksummer::xxhash64>(
5320 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5321 &csum_data);
5322 }
5323 break;
5324 case Checksummer::CSUM_CRC32C:
5325 {
5326 Checksummer::crc32c::init_value_t init_value;
5327 decode(init_value, *init_value_bl_it);
5328 Checksummer::calculate<Checksummer::crc32c>(
5329 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5330 &csum_data);
5331 }
5332 break;
5333 default:
5334 break;
5335 }
5336 }
5337
5338 encode(csum_count, osd_op.outdata);
5339 osd_op.outdata.claim_append(csum);
5340 return 0;
5341 }
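/*
 * Example: a checksum op over 16 KiB with chunk_size = 4 KiB yields
 * csum_count = 4, one value per chunk; with chunk_size = 0 the whole
 * 16 KiB is a single chunk and csum_count = 1.  The reply encodes the
 * count followed by the packed checksum values.
 */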
5342
5343 struct C_ExtentCmpRead : public Context {
5344 PrimaryLogPG *primary_log_pg;
5345 OSDOp &osd_op;
5346 ceph_le64 read_length{};
5347 bufferlist read_bl;
5348 Context *fill_extent_ctx;
5349
5350 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5351 std::optional<uint32_t> maybe_crc, uint64_t size,
5352 OSDService *osd, hobject_t soid, uint32_t flags)
5353 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5354 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5355 &read_bl, maybe_crc, size,
5356 osd, soid, flags)) {
5357 }
5358 ~C_ExtentCmpRead() override {
5359 delete fill_extent_ctx;
5360 }
5361
5362 void finish(int r) override {
5363 if (r == -ENOENT) {
5364 osd_op.rval = 0;
5365 read_bl.clear();
5366 delete fill_extent_ctx;
5367 } else {
5368 fill_extent_ctx->complete(r);
5369 }
5370 fill_extent_ctx = nullptr;
5371
5372 if (osd_op.rval >= 0) {
5373 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5374 }
5375 }
5376 };
5377
5378 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5379 {
5380 dout(20) << __func__ << dendl;
5381 ceph_osd_op& op = osd_op.op;
5382
5383 auto& oi = ctx->new_obs.oi;
5384 uint64_t size = oi.size;
5385 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5386 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5387 size = op.extent.truncate_size;
5388 }
5389
5390 if (op.extent.offset >= size) {
5391 op.extent.length = 0;
5392 } else if (op.extent.offset + op.extent.length > size) {
5393 op.extent.length = size - op.extent.offset;
5394 }
5395
5396 if (op.extent.length == 0) {
5397 dout(20) << __func__ << " zero length extent" << dendl;
5398 return finish_extent_cmp(osd_op, bufferlist{});
5399 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5400 dout(20) << __func__ << " object DNE" << dendl;
5401 return finish_extent_cmp(osd_op, {});
5402 } else if (pool.info.is_erasure()) {
5403 // If there is a data digest and it is possible we are reading
5404 // entire object, pass the digest.
5405 std::optional<uint32_t> maybe_crc;
5406 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5407 op.checksum.length >= oi.size) {
5408 maybe_crc = oi.data_digest;
5409 }
5410
5411 // async read
5412 auto& soid = oi.soid;
5413 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5414 osd, soid, op.flags);
5415 ctx->pending_async_reads.push_back({
5416 {op.extent.offset, op.extent.length, op.flags},
5417 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5418
5419 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5420
5421 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5422 new ReadFinisher(osd_op));
5423 return -EINPROGRESS;
5424 }
5425
5426 // sync read
5427 vector<OSDOp> read_ops(1);
5428 OSDOp& read_op = read_ops[0];
5429
5430 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5431 read_op.op.extent.offset = op.extent.offset;
5432 read_op.op.extent.length = op.extent.length;
5433 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5434 read_op.op.extent.truncate_size = op.extent.truncate_size;
5435
5436 int result = do_osd_ops(ctx, read_ops);
5437 if (result < 0) {
5438 derr << __func__ << " failed " << result << dendl;
5439 return result;
5440 }
5441 return finish_extent_cmp(osd_op, read_op.outdata);
5442 }
5443
5444 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5445 {
5446 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5447 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5448 if (osd_op.indata[idx] != read_byte) {
5449 return (-MAX_ERRNO - idx);
5450 }
5451 }
5452
5453 return 0;
5454 }
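/*
 * Example of the return encoding above: if the byte at idx 4 of the
 * client-supplied buffer differs from what was read, the op fails with
 * -MAX_ERRNO - 4, so the client can recover the first mismatch offset by
 * negating the result and subtracting MAX_ERRNO.  Bytes beyond the end of
 * the on-disk data compare against zero.
 */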
5455
5456 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5457 dout(20) << __func__ << dendl;
5458 auto& op = osd_op.op;
5459 auto& oi = ctx->new_obs.oi;
5460 auto& soid = oi.soid;
5461 __u32 seq = oi.truncate_seq;
5462 uint64_t size = oi.size;
5463 bool trimmed_read = false;
5464
5465 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5466 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5467 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5468 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5469
5470 // are we beyond truncate_size?
5471 if ( (seq < op.extent.truncate_seq) &&
5472 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5473 (size > op.extent.truncate_size) )
5474 size = op.extent.truncate_size;
5475
5476 if (op.extent.length == 0) // a length of zero means read the whole object
5477 op.extent.length = size;
5478
5479 if (op.extent.offset >= size) {
5480 op.extent.length = 0;
5481 trimmed_read = true;
5482 } else if (op.extent.offset + op.extent.length > size) {
5483 op.extent.length = size - op.extent.offset;
5484 trimmed_read = true;
5485 }
5486
5487 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5488
5489 // read into a buffer
5490 int result = 0;
5491 if (trimmed_read && op.extent.length == 0) {
5492 // the read was explicitly trimmed to zero bytes, so do nothing.
5493 // Note that a zero-length read is otherwise *not* a no-op (it means
5494 // "read the whole object" above), which is why trimmed_read is needed.
5495 } else if (pool.info.is_erasure()) {
5496 // The initialisation below is required to silence a false positive
5497 // -Wmaybe-uninitialized warning
5498 std::optional<uint32_t> maybe_crc;
5499 // If there is a data digest and it is possible we are reading the
5500 // entire object, pass the digest. FillInVerifyExtent will
5501 // check oi.size again.
5502 if (oi.is_data_digest() && op.extent.offset == 0 &&
5503 op.extent.length >= oi.size)
5504 maybe_crc = oi.data_digest;
5505 ctx->pending_async_reads.push_back(
5506 make_pair(
5507 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5508 make_pair(&osd_op.outdata,
5509 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5510 &osd_op.outdata, maybe_crc, oi.size,
5511 osd, soid, op.flags))));
5512 dout(10) << " async_read noted for " << soid << dendl;
5513
5514 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5515 new ReadFinisher(osd_op));
5516 } else {
5517 int r = pgbackend->objects_read_sync(
5518 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5519 // whole object? can we verify the checksum?
5520 if (r >= 0 && op.extent.offset == 0 &&
5521 (uint64_t)r == oi.size && oi.is_data_digest()) {
5522 uint32_t crc = osd_op.outdata.crc32c(-1);
5523 if (oi.data_digest != crc) {
5524 osd->clog->error() << info.pgid << std::hex
5525 << " full-object read crc 0x" << crc
5526 << " != expected 0x" << oi.data_digest
5527 << std::dec << " on " << soid;
5528 r = -EIO; // try repair later
5529 }
5530 }
5531 if (r == -EIO) {
5532 r = rep_repair_primary_object(soid, ctx);
5533 }
5534 if (r >= 0)
5535 op.extent.length = r;
5536 else if (r == -EAGAIN) {
5537 result = -EAGAIN;
5538 } else {
5539 result = r;
5540 op.extent.length = 0;
5541 }
5542 dout(10) << " read got " << r << " / " << op.extent.length
5543 << " bytes from obj " << soid << dendl;
5544 }
5545 if (result >= 0) {
5546 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5547 ctx->delta_stats.num_rd++;
5548 }
5549 return result;
5550 }
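
// To summarize the paths above: a read trimmed to zero bytes is a
// deliberate no-op; on erasure-coded pools the read is queued on
// ctx->pending_async_reads and -EINPROGRESS is returned, with a
// ReadFinisher re-running the op once the async read completes; on
// replicated pools the read happens synchronously via pgbackend, and a
// full-object read of a digest-bearing object verifies crc32c, turning a
// mismatch into -EIO and an attempted repair.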
5551
5552 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5553 dout(20) << __func__ << dendl;
5554 auto& op = osd_op.op;
5555 auto& oi = ctx->new_obs.oi;
5556 auto& soid = oi.soid;
5557
5558 if (op.extent.truncate_seq) {
5559 dout(0) << "sparse_read does not support truncation sequence " << dendl;
5560 return -EINVAL;
5561 }
5562
5563 ++ctx->num_read;
5564 if (pool.info.is_erasure()) {
5565 // translate sparse read to a normal one if not supported
5566 uint64_t offset = op.extent.offset;
5567 uint64_t length = op.extent.length;
5568 if (offset > oi.size) {
5569 length = 0;
5570 } else if (offset + length > oi.size) {
5571 length = oi.size - offset;
5572 }
5573
5574 if (length > 0) {
5575 ctx->pending_async_reads.push_back(
5576 make_pair(
5577 boost::make_tuple(offset, length, op.flags),
5578 make_pair(
5579 &osd_op.outdata,
5580 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5581 &op.extent.length))));
5582 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5583
5584 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5585 new ReadFinisher(osd_op));
5586 } else {
5587 dout(10) << " sparse read ended up empty for " << soid << dendl;
5588 map<uint64_t, uint64_t> extents;
5589 encode(extents, osd_op.outdata);
5590 }
5591 } else {
5592 // read into a buffer
5593 map<uint64_t, uint64_t> m;
5594 uint32_t total_read = 0;
5595 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5596 info.pgid.shard),
5597 op.extent.offset, op.extent.length, m);
5598 if (r < 0) {
5599 return r;
5600 }
5601
5602 bufferlist data_bl;
5603 r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
5604 if (r == -EIO) {
5605 r = rep_repair_primary_object(soid, ctx);
5606 }
5607 if (r < 0) {
5608 return r;
5609 }
5610
5611 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5612 // and while few objects may be written in full at first, more and more
5613 // whole objects accumulate as the cluster is used, so verifying the
5614 // full-object checksum on sparse-read makes sense.
5615 if ((uint64_t)r == oi.size && oi.is_data_digest()) {
5616 uint32_t crc = data_bl.crc32c(-1);
5617 if (oi.data_digest != crc) {
5618 osd->clog->error() << info.pgid << std::hex
5619 << " full-object read crc 0x" << crc
5620 << " != expected 0x" << oi.data_digest
5621 << std::dec << " on " << soid;
5622 r = rep_repair_primary_object(soid, ctx);
5623 if (r < 0) {
5624 return r;
5625 }
5626 }
5627 }
5628
5629 op.extent.length = total_read;
5630
5631 encode(m, osd_op.outdata); // re-encode since it might be modified
5632 ::encode_destructively(data_bl, osd_op.outdata);
5633
5634 dout(10) << " sparse_read got " << r << " bytes from object "
5635 << soid << dendl;
5636 }
5637
5638 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5639 ctx->delta_stats.num_rd++;
5640 return 0;
5641 }
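
// The sparse-read reply is the encoded extent map followed by the packed
// extent data. A client could unpack it roughly like this (a sketch,
// assuming the usual bufferlist wire format; not code from this tree):
//
//   std::map<uint64_t, uint64_t> extents;  // offset -> length
//   ceph::buffer::list data;
//   auto p = reply_bl.cbegin();
//   decode(extents, p);
//   decode(data, p);  // bytes for each extent, concatenated in order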
5642
5643 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5644 {
5645 int result = 0;
5646 SnapSetContext *ssc = ctx->obc->ssc;
5647 ObjectState& obs = ctx->new_obs;
5648 object_info_t& oi = obs.oi;
5649 const hobject_t& soid = oi.soid;
5650 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5651 osd->osd_skip_data_digest;
5652
5653 PGTransaction* t = ctx->op_t.get();
5654
5655 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5656
5657 ctx->current_osd_subop_num = 0;
5658 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5659 OSDOp& osd_op = *p;
5660 ceph_osd_op& op = osd_op.op;
5661
5662 OpFinisher* op_finisher = nullptr;
5663 {
5664 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5665 if (op_finisher_it != ctx->op_finishers.end()) {
5666 op_finisher = op_finisher_it->second.get();
5667 }
5668 }
5669
5670 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5671 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5672 // but the code in this function seems to treat them as native-endian. What should the
5673 // tracepoints do?
5674 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5675
5676 dout(10) << "do_osd_op " << osd_op << dendl;
5677
5678 auto bp = osd_op.indata.cbegin();
5679
5680 // user-visible modification?
5681 switch (op.op) {
5682 // non user-visible modifications
5683 case CEPH_OSD_OP_WATCH:
5684 case CEPH_OSD_OP_CACHE_EVICT:
5685 case CEPH_OSD_OP_CACHE_FLUSH:
5686 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5687 case CEPH_OSD_OP_UNDIRTY:
5688 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5689 case CEPH_OSD_OP_COPY_FROM2:
5690 case CEPH_OSD_OP_CACHE_PIN:
5691 case CEPH_OSD_OP_CACHE_UNPIN:
5692 case CEPH_OSD_OP_SET_REDIRECT:
5693 case CEPH_OSD_OP_TIER_PROMOTE:
5694 case CEPH_OSD_OP_TIER_FLUSH:
5695 break;
5696 default:
5697 if (op.op & CEPH_OSD_OP_MODE_WR)
5698 ctx->user_modify = true;
5699 }
5700
5701 // munge -1 truncate to 0 truncate
5702 if (ceph_osd_op_uses_extent(op.op) &&
5703 op.extent.truncate_seq == 1 &&
5704 op.extent.truncate_size == (-1ULL)) {
5705 op.extent.truncate_size = 0;
5706 op.extent.truncate_seq = 0;
5707 }
5708
5709 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5710 if (op.op == CEPH_OSD_OP_ZERO &&
5711 obs.exists &&
5712 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5713 op.extent.length >= 1 &&
5714 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
5715 op.extent.offset + op.extent.length >= oi.size) {
5716 if (op.extent.offset >= oi.size) {
5717 // no-op
5718 goto fail;
5719 }
5720 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5721 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5722 op.op = CEPH_OSD_OP_TRUNCATE;
5723 }
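
// For example, a ZERO at offset 4096 whose length reaches the end of a
// 1 MiB object is rewritten as TRUNCATE to 4096: the tail data is
// released just the same, but the object and its attributes survive,
// which a munge to DELETE could not guarantee.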
5724
5725 switch (op.op) {
5726
5727 // --- READS ---
5728
5729 case CEPH_OSD_OP_CMPEXT:
5730 ++ctx->num_read;
5731 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5732 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5733 op.extent.length, op.extent.truncate_size,
5734 op.extent.truncate_seq);
5735
5736 if (op_finisher == nullptr) {
5737 result = do_extent_cmp(ctx, osd_op);
5738 } else {
5739 result = op_finisher->execute();
5740 }
5741 break;
5742
5743 case CEPH_OSD_OP_SYNC_READ:
5744 if (pool.info.is_erasure()) {
5745 result = -EOPNOTSUPP;
5746 break;
5747 }
5748 // fall through
5749 case CEPH_OSD_OP_READ:
5750 ++ctx->num_read;
5751 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5752 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5753 op.extent.length, op.extent.truncate_size,
5754 op.extent.truncate_seq);
5755 if (op_finisher == nullptr) {
5756 if (!ctx->data_off) {
5757 ctx->data_off = op.extent.offset;
5758 }
5759 result = do_read(ctx, osd_op);
5760 } else {
5761 result = op_finisher->execute();
5762 }
5763 break;
5764
5765 case CEPH_OSD_OP_CHECKSUM:
5766 ++ctx->num_read;
5767 {
5768 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5769 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5770 op.checksum.offset, op.checksum.length,
5771 op.checksum.chunk_size);
5772
5773 if (op_finisher == nullptr) {
5774 result = do_checksum(ctx, osd_op, &bp);
5775 } else {
5776 result = op_finisher->execute();
5777 }
5778 }
5779 break;
5780
5781 /* map extents */
5782 case CEPH_OSD_OP_MAPEXT:
5783 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5784 if (pool.info.is_erasure()) {
5785 result = -EOPNOTSUPP;
5786 break;
5787 }
5788 ++ctx->num_read;
5789 {
5790 // read into a buffer
5791 bufferlist bl;
5792 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5793 info.pgid.shard),
5794 op.extent.offset, op.extent.length, bl);
5795 osd_op.outdata.claim(bl);
5796 if (r < 0)
5797 result = r;
5798 else
5799 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
5800 ctx->delta_stats.num_rd++;
5801 dout(10) << " map_extents done on object " << soid << dendl;
5802 }
5803 break;
5804
5805 /* map extents */
5806 case CEPH_OSD_OP_SPARSE_READ:
5807 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5808 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5809 op.extent.length, op.extent.truncate_size,
5810 op.extent.truncate_seq);
5811 if (op_finisher == nullptr) {
5812 result = do_sparse_read(ctx, osd_op);
5813 } else {
5814 result = op_finisher->execute();
5815 }
5816 break;
5817
5818 case CEPH_OSD_OP_CALL:
5819 {
5820 string cname, mname;
5821 bufferlist indata;
5822 try {
5823 bp.copy(op.cls.class_len, cname);
5824 bp.copy(op.cls.method_len, mname);
5825 bp.copy(op.cls.indata_len, indata);
5826 } catch (buffer::error& e) {
5827 dout(10) << "call unable to decode class + method + indata" << dendl;
5828 dout(30) << "in dump: ";
5829 osd_op.indata.hexdump(*_dout);
5830 *_dout << dendl;
5831 result = -EINVAL;
5832 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5833 break;
5834 }
5835 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5836
5837 ClassHandler::ClassData *cls;
5838 result = ClassHandler::get_instance().open_class(cname, &cls);
5839 ceph_assert(result == 0); // init_op_flags() already verified this works.
5840
5841 ClassHandler::ClassMethod *method = cls->get_method(mname);
5842 if (!method) {
5843 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5844 result = -EOPNOTSUPP;
5845 break;
5846 }
5847
5848 int flags = method->get_flags();
5849 if (flags & CLS_METHOD_WR)
5850 ctx->user_modify = true;
5851
5852 bufferlist outdata;
5853 dout(10) << "call method " << cname << "." << mname << dendl;
5854 int prev_rd = ctx->num_read;
5855 int prev_wr = ctx->num_write;
5856 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5857
5858 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5859 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5860 result = -EIO;
5861 break;
5862 }
5863 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5864 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5865 result = -EIO;
5866 break;
5867 }
5868
5869 dout(10) << "method called response length=" << outdata.length() << dendl;
5870 op.extent.length = outdata.length();
5871 osd_op.outdata.claim_append(outdata);
5872 dout(30) << "out dump: ";
5873 osd_op.outdata.hexdump(*_dout);
5874 *_dout << dendl;
5875 }
5876 break;
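
// From the client side this is what librados IoCtx::exec() generates:
// class name, method name and input payload arrive in indata, and the
// method's output is appended to outdata. A usage sketch (object, class
// and method names are illustrative, assuming the standard C++ API):
//
//   ceph::buffer::list in, out;
//   int r = ioctx.exec("myobj", "hello", "say_hello", in, out);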
5877
5878 case CEPH_OSD_OP_STAT:
5879 // note: stat does not require RD
5880 {
5881 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5882
5883 if (obs.exists && !oi.is_whiteout()) {
5884 encode(oi.size, osd_op.outdata);
5885 encode(oi.mtime, osd_op.outdata);
5886 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5887 } else {
5888 result = -ENOENT;
5889 dout(10) << "stat oi object does not exist" << dendl;
5890 }
5891
5892 ctx->delta_stats.num_rd++;
5893 }
5894 break;
5895
5896 case CEPH_OSD_OP_ISDIRTY:
5897 ++ctx->num_read;
5898 {
5899 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5900 bool is_dirty = obs.oi.is_dirty();
5901 encode(is_dirty, osd_op.outdata);
5902 ctx->delta_stats.num_rd++;
5903 result = 0;
5904 }
5905 break;
5906
5907 case CEPH_OSD_OP_UNDIRTY:
5908 ++ctx->num_write;
5909 result = 0;
5910 {
5911 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5912 if (oi.is_dirty()) {
5913 ctx->undirty = true; // see make_writeable()
5914 ctx->modify = true;
5915 ctx->delta_stats.num_wr++;
5916 }
5917 }
5918 break;
5919
5920 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5921 ++ctx->num_write;
5922 result = 0;
5923 {
5924 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5925 if (ctx->lock_type != RWState::RWNONE) {
5926 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5927 result = -EINVAL;
5928 break;
5929 }
5930 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5931 result = -EINVAL;
5932 break;
5933 }
5934 if (!obs.exists) {
5935 result = 0;
5936 break;
5937 }
5938 if (oi.is_cache_pinned()) {
5939 dout(10) << "cache-try-flush on a pinned object, consider unpinning it first" << dendl;
5940 result = -EPERM;
5941 break;
5942 }
5943 if (oi.is_dirty()) {
5944 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
5945 if (result == -EINPROGRESS)
5946 result = -EAGAIN;
5947 } else {
5948 result = 0;
5949 }
5950 }
5951 break;
5952
5953 case CEPH_OSD_OP_CACHE_FLUSH:
5954 ++ctx->num_write;
5955 result = 0;
5956 {
5957 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5958 if (ctx->lock_type == RWState::RWNONE) {
5959 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5960 result = -EINVAL;
5961 break;
5962 }
5963 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5964 result = -EINVAL;
5965 break;
5966 }
5967 if (!obs.exists) {
5968 result = 0;
5969 break;
5970 }
5971 if (oi.is_cache_pinned()) {
5972 dout(10) << "cache-flush on a pinned object, consider unpinning it first" << dendl;
5973 result = -EPERM;
5974 break;
5975 }
5976 hobject_t missing;
5977 if (oi.is_dirty()) {
5978 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
5979 if (result == -EINPROGRESS)
5980 result = -EAGAIN;
5981 } else {
5982 result = 0;
5983 }
5984 // Check special return value which has set missing_return
5985 if (result == -ENOENT) {
5986 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5987 ceph_assert(!missing.is_min());
5988 wait_for_unreadable_object(missing, ctx->op);
5989 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5990 result = -EAGAIN;
5991 }
5992 }
5993 break;
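
// The two flush variants above differ mainly in locking: cache-try-flush
// requires the client to have set SKIPRWLOCKS (lock_type must be RWNONE)
// so it can fail fast rather than block, while cache-flush requires the
// rwlock to be held. Both translate start_flush()'s -EINPROGRESS into
// -EAGAIN so the client retries once the flush to the base tier completes.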
5994
5995 case CEPH_OSD_OP_CACHE_EVICT:
5996 ++ctx->num_write;
5997 result = 0;
5998 {
5999 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6000 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6001 result = -EINVAL;
6002 break;
6003 }
6004 if (!obs.exists) {
6005 result = 0;
6006 break;
6007 }
6008 if (oi.is_cache_pinned()) {
6009 dout(10) << "cache-evict on a pinned object, consider unpinning it first" << dendl;
6010 result = -EPERM;
6011 break;
6012 }
6013 if (oi.is_dirty()) {
6014 result = -EBUSY;
6015 break;
6016 }
6017 if (!oi.watchers.empty()) {
6018 result = -EBUSY;
6019 break;
6020 }
6021 if (soid.snap == CEPH_NOSNAP) {
6022 result = _verify_no_head_clones(soid, ssc->snapset);
6023 if (result < 0)
6024 break;
6025 }
6026 result = _delete_oid(ctx, true, false);
6027 if (result >= 0) {
6028 // mark that this is a cache eviction to avoid triggering normal
6029 // make_writeable() clone creation in finish_ctx()
6030 ctx->cache_evict = true;
6031 }
6032 osd->logger->inc(l_osd_tier_evict);
6033 }
6034 break;
6035
6036 case CEPH_OSD_OP_GETXATTR:
6037 ++ctx->num_read;
6038 {
6039 string aname;
6040 bp.copy(op.xattr.name_len, aname);
6041 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6042 string name = "_" + aname;
6043 int r = getattr_maybe_cache(
6044 ctx->obc,
6045 name,
6046 &(osd_op.outdata));
6047 if (r >= 0) {
6048 op.xattr.value_len = osd_op.outdata.length();
6049 result = 0;
6050 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6051 } else
6052 result = r;
6053
6054 ctx->delta_stats.num_rd++;
6055 }
6056 break;
6057
6058 case CEPH_OSD_OP_GETXATTRS:
6059 ++ctx->num_read;
6060 {
6061 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6062 map<string, bufferlist> out;
6063 result = getattrs_maybe_cache(
6064 ctx->obc,
6065 &out);
6066
6067 bufferlist bl;
6068 encode(out, bl);
6069 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6070 ctx->delta_stats.num_rd++;
6071 osd_op.outdata.claim_append(bl);
6072 }
6073 break;
6074
6075 case CEPH_OSD_OP_CMPXATTR:
6076 ++ctx->num_read;
6077 {
6078 string aname;
6079 bp.copy(op.xattr.name_len, aname);
6080 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6081 string name = "_" + aname;
6082 name[op.xattr.name_len + 1] = 0;
6083
6084 bufferlist xattr;
6085 result = getattr_maybe_cache(
6086 ctx->obc,
6087 name,
6088 &xattr);
6089 if (result < 0 && result != -EEXIST && result != -ENODATA)
6090 break;
6091
6092 ctx->delta_stats.num_rd++;
6093 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6094
6095 switch (op.xattr.cmp_mode) {
6096 case CEPH_OSD_CMPXATTR_MODE_STRING:
6097 {
6098 string val;
6099 bp.copy(op.xattr.value_len, val);
6100 val[op.xattr.value_len] = 0;
6101 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6102 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6103 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6104 }
6105 break;
6106
6107 case CEPH_OSD_CMPXATTR_MODE_U64:
6108 {
6109 uint64_t u64val;
6110 try {
6111 decode(u64val, bp);
6112 }
6113 catch (buffer::error& e) {
6114 result = -EINVAL;
6115 goto fail;
6116 }
6117 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6118 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6119 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6120 }
6121 break;
6122
6123 default:
6124 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6125 result = -EINVAL;
6126 }
6127
6128 if (!result) {
6129 dout(10) << "comparison returned false" << dendl;
6130 result = -ECANCELED;
6131 break;
6132 }
6133 if (result < 0) {
6134 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6135 break;
6136 }
6137
6138 dout(10) << "comparison returned true" << dendl;
6139 }
6140 break;
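
// CMPXATTR compares an xattr as a string or a u64 and reports a failed
// comparison as -ECANCELED, which aborts the rest of the compound op.
// A client-side sketch of the usual compare-then-write pattern (object
// and attribute names are illustrative, assuming the librados C++ API):
//
//   librados::ObjectWriteOperation wop;
//   ceph::buffer::list expected, updated;
//   expected.append("v1");
//   updated.append("v2");
//   wop.cmpxattr("myattr", LIBRADOS_CMPXATTR_OP_EQ, expected);
//   wop.setxattr("myattr", updated);  // runs only if the compare passed
//   int r = ioctx.operate("myobj", &wop);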
6141
6142 case CEPH_OSD_OP_ASSERT_VER:
6143 ++ctx->num_read;
6144 {
6145 uint64_t ver = op.assert_ver.ver;
6146 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6147 if (!ver)
6148 result = -EINVAL;
6149 else if (ver < oi.user_version)
6150 result = -ERANGE;
6151 else if (ver > oi.user_version)
6152 result = -EOVERFLOW;
6153 }
6154 break;
6155
6156 case CEPH_OSD_OP_LIST_WATCHERS:
6157 ++ctx->num_read;
6158 {
6159 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6160 obj_list_watch_response_t resp;
6161
6162 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6163 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6164 ++oi_iter) {
6165 dout(20) << "key cookie=" << oi_iter->first.first
6166 << " entity=" << oi_iter->first.second << " "
6167 << oi_iter->second << dendl;
6168 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6169 ceph_assert(oi_iter->first.second.is_client());
6170
6171 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6172 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6173 resp.entries.push_back(wi);
6174 }
6175
6176 resp.encode(osd_op.outdata, ctx->get_features());
6177 result = 0;
6178
6179 ctx->delta_stats.num_rd++;
6180 break;
6181 }
6182
6183 case CEPH_OSD_OP_LIST_SNAPS:
6184 ++ctx->num_read;
6185 {
6186 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6187 obj_list_snap_response_t resp;
6188
6189 if (!ssc) {
6190 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6191 }
6192 ceph_assert(ssc);
6193 dout(20) << " snapset " << ssc->snapset << dendl;
6194
6195 int clonecount = ssc->snapset.clones.size();
6196 clonecount++; // for head
6197 resp.clones.reserve(clonecount);
6198 for (auto clone_iter = ssc->snapset.clones.begin();
6199 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6200 clone_info ci;
6201 ci.cloneid = *clone_iter;
6202
6203 hobject_t clone_oid = soid;
6204 clone_oid.snap = *clone_iter;
6205
6206 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6207 if (p == ssc->snapset.clone_snaps.end()) {
6208 osd->clog->error() << "osd." << osd->whoami
6209 << ": inconsistent clone_snaps found for oid "
6210 << soid << " clone " << *clone_iter
6211 << " snapset " << ssc->snapset;
6212 result = -EINVAL;
6213 break;
6214 }
6215 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6216 ci.snaps.push_back(*q);
6217 }
6218
6219 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6220
6221 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6222 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6223 if (coi == ssc->snapset.clone_overlap.end()) {
6224 osd->clog->error() << "osd." << osd->whoami
6225 << ": inconsistent clone_overlap found for oid "
6226 << soid << " clone " << *clone_iter;
6227 result = -EINVAL;
6228 break;
6229 }
6230 const interval_set<uint64_t> &o = coi->second;
6231 ci.overlap.reserve(o.num_intervals());
6232 for (interval_set<uint64_t>::const_iterator r = o.begin();
6233 r != o.end(); ++r) {
6234 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6235 r.get_len()));
6236 }
6237
6238 map<snapid_t, uint64_t>::const_iterator si;
6239 si = ssc->snapset.clone_size.find(ci.cloneid);
6240 if (si == ssc->snapset.clone_size.end()) {
6241 osd->clog->error() << "osd." << osd->whoami
6242 << ": inconsistent clone_size found for oid "
6243 << soid << " clone " << *clone_iter;
6244 result = -EINVAL;
6245 break;
6246 }
6247 ci.size = si->second;
6248
6249 resp.clones.push_back(ci);
6250 }
6251 if (result < 0) {
6252 break;
6253 }
6254 if (!ctx->obc->obs.oi.is_whiteout()) {
6255 ceph_assert(obs.exists);
6256 clone_info ci;
6257 ci.cloneid = CEPH_NOSNAP;
6258
6259 // size for HEAD is oi.size
6260 ci.size = oi.size;
6261
6262 resp.clones.push_back(ci);
6263 }
6264 resp.seq = ssc->snapset.seq;
6265
6266 resp.encode(osd_op.outdata);
6267 result = 0;
6268
6269 ctx->delta_stats.num_rd++;
6270 break;
6271 }
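
// The LIST_SNAPS reply describes each clone (snap ids, overlap intervals
// and size) and then, unless the head is a whiteout, appends a pseudo
// entry with cloneid CEPH_NOSNAP for the head itself. Any disagreement
// between the snapset's clone_snaps/clone_overlap/clone_size maps and the
// clone list is reported to the cluster log and surfaces as -EINVAL.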
6272
6273 case CEPH_OSD_OP_NOTIFY:
6274 ++ctx->num_read;
6275 {
6276 uint32_t timeout;
6277 bufferlist bl;
6278
6279 try {
6280 uint32_t ver; // obsolete
6281 decode(ver, bp);
6282 decode(timeout, bp);
6283 decode(bl, bp);
6284 } catch (const buffer::error &e) {
6285 timeout = 0;
6286 }
6287 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6288 if (!timeout)
6289 timeout = cct->_conf->osd_default_notify_timeout;
6290
6291 notify_info_t n;
6292 n.timeout = timeout;
6293 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6294 n.cookie = op.notify.cookie;
6295 n.bl = bl;
6296 ctx->notifies.push_back(n);
6297
6298 // return our unique notify id to the client
6299 encode(n.notify_id, osd_op.outdata);
6300 }
6301 break;
6302
6303 case CEPH_OSD_OP_NOTIFY_ACK:
6304 ++ctx->num_read;
6305 {
6306 try {
6307 uint64_t notify_id = 0;
6308 uint64_t watch_cookie = 0;
6309 decode(notify_id, bp);
6310 decode(watch_cookie, bp);
6311 bufferlist reply_bl;
6312 if (!bp.end()) {
6313 decode(reply_bl, bp);
6314 }
6315 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6316 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6317 ctx->notify_acks.push_back(ack);
6318 } catch (const buffer::error &e) {
6319 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6320 OpContext::NotifyAck ack(
6321 // op.watch.cookie is actually the notify_id for historical reasons
6322 op.watch.cookie
6323 );
6324 ctx->notify_acks.push_back(ack);
6325 }
6326 }
6327 break;
6328
6329 case CEPH_OSD_OP_SETALLOCHINT:
6330 ++ctx->num_write;
6331 result = 0;
6332 {
6333 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6334 maybe_create_new_object(ctx);
6335 oi.expected_object_size = op.alloc_hint.expected_object_size;
6336 oi.expected_write_size = op.alloc_hint.expected_write_size;
6337 oi.alloc_hint_flags = op.alloc_hint.flags;
6338 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6339 op.alloc_hint.expected_write_size,
6340 op.alloc_hint.flags);
6341 }
6342 break;
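
// SETALLOCHINT only records the client's expectations so that the object
// store can pick a matching allocation strategy; it also creates the
// object if needed, which is why it counts as a write. Illustrative
// client-side use (object name and sizes are made up):
//
//   librados::ObjectWriteOperation wop;
//   wop.set_alloc_hint(4 << 20 /* expected object size */,
//                      1 << 20 /* expected write size */);
//   ioctx.operate("myobj", &wop);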
6343
6344
6345 // --- WRITES ---
6346
6347 // -- object data --
6348
6349 case CEPH_OSD_OP_WRITE:
6350 ++ctx->num_write;
6351 result = 0;
6352 { // write
6353 __u32 seq = oi.truncate_seq;
6354 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6355 if (op.extent.length != osd_op.indata.length()) {
6356 result = -EINVAL;
6357 break;
6358 }
6359
6360 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6361 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6362
6363 if (pool.info.requires_aligned_append() &&
6364 (op.extent.offset % pool.info.required_alignment() != 0)) {
6365 result = -EOPNOTSUPP;
6366 break;
6367 }
6368
6369 if (!obs.exists) {
6370 if (pool.info.requires_aligned_append() && op.extent.offset) {
6371 result = -EOPNOTSUPP;
6372 break;
6373 }
6374 } else if (op.extent.offset != oi.size &&
6375 pool.info.requires_aligned_append()) {
6376 result = -EOPNOTSUPP;
6377 break;
6378 }
6379
6380 if (seq && (seq > op.extent.truncate_seq) &&
6381 (op.extent.offset + op.extent.length > oi.size)) {
6382 // old write, arrived after trimtrunc
6383 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6384 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6385 << ", adjusting write length to " << op.extent.length << dendl;
6386 bufferlist t;
6387 t.substr_of(osd_op.indata, 0, op.extent.length);
6388 osd_op.indata.swap(t);
6389 }
6390 if (op.extent.truncate_seq > seq) {
6391 // write arrives before trimtrunc
6392 if (obs.exists && !oi.is_whiteout()) {
6393 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6394 << ", truncating to " << op.extent.truncate_size << dendl;
6395 t->truncate(soid, op.extent.truncate_size);
6396 oi.truncate_seq = op.extent.truncate_seq;
6397 oi.truncate_size = op.extent.truncate_size;
6398 if (oi.size > op.extent.truncate_size) {
6399 interval_set<uint64_t> trim;
6400 trim.insert(op.extent.truncate_size,
6401 oi.size - op.extent.truncate_size);
6402 ctx->modified_ranges.union_of(trim);
6403 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
6404 }
6405 if (op.extent.truncate_size != oi.size) {
6406 truncate_update_size_and_usage(ctx->delta_stats,
6407 oi,
6408 op.extent.truncate_size);
6409 }
6410 } else {
6411 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6412 << ", but object is new" << dendl;
6413 oi.truncate_seq = op.extent.truncate_seq;
6414 oi.truncate_size = op.extent.truncate_size;
6415 }
6416 }
6417 result = check_offset_and_length(
6418 op.extent.offset, op.extent.length,
6419 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6420 if (result < 0)
6421 break;
6422
6423 maybe_create_new_object(ctx);
6424
6425 if (op.extent.length == 0) {
6426 if (op.extent.offset > oi.size) {
6427 t->truncate(
6428 soid, op.extent.offset);
6429 truncate_update_size_and_usage(ctx->delta_stats, oi,
6430 op.extent.offset);
6431 } else {
6432 t->nop(soid);
6433 }
6434 } else {
6435 t->write(
6436 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6437 }
6438
6439 if (op.extent.offset == 0 && op.extent.length >= oi.size
6440 && !skip_data_digest) {
6441 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6442 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6443 if (skip_data_digest) {
6444 obs.oi.clear_data_digest();
6445 } else {
6446 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6447 }
6448 } else {
6449 obs.oi.clear_data_digest();
6450 }
6451 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6452 op.extent.offset, op.extent.length);
6453 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6454 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
6455 }
6456 break;
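
// Digest bookkeeping for WRITE, in short: a write covering the whole
// object from offset 0 re-seeds the digest with crc32c(-1); a pure append
// to a digest-bearing object chains the new bytes onto the existing crc
// via crc32c(old_digest); any other partial overwrite clears the digest,
// since the bytes that were not rewritten can no longer be accounted for.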
6457
6458 case CEPH_OSD_OP_WRITEFULL:
6459 ++ctx->num_write;
6460 result = 0;
6461 { // write full object
6462 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6463
6464 if (op.extent.length != osd_op.indata.length()) {
6465 result = -EINVAL;
6466 break;
6467 }
6468 result = check_offset_and_length(
6469 0, op.extent.length,
6470 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6471 if (result < 0)
6472 break;
6473
6474 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6475 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6476
6477 maybe_create_new_object(ctx);
6478 if (pool.info.is_erasure()) {
6479 t->truncate(soid, 0);
6480 } else if (obs.exists && op.extent.length < oi.size) {
6481 t->truncate(soid, op.extent.length);
6482 }
6483 if (op.extent.length) {
6484 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6485 }
6486 if (!skip_data_digest) {
6487 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6488 } else {
6489 obs.oi.clear_data_digest();
6490 }
6491 ctx->clean_regions.mark_data_region_dirty(0,
6492 std::max((uint64_t)op.extent.length, oi.size));
6493 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6494 0, op.extent.length, true);
6495 }
6496 break;
6497
6498 case CEPH_OSD_OP_WRITESAME:
6499 ++ctx->num_write;
6500 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6501 result = do_writesame(ctx, osd_op);
6502 break;
6503
6504 case CEPH_OSD_OP_ROLLBACK :
6505 ++ctx->num_write;
6506 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6507 result = _rollback_to(ctx, op);
6508 break;
6509
6510 case CEPH_OSD_OP_ZERO:
6511 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6512 if (pool.info.requires_aligned_append()) {
6513 result = -EOPNOTSUPP;
6514 break;
6515 }
6516 ++ctx->num_write;
6517 { // zero
6518 result = check_offset_and_length(
6519 op.extent.offset, op.extent.length,
6520 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6521 if (result < 0)
6522 break;
6523
6524 ceph_assert(op.extent.length);
6525 if (obs.exists && !oi.is_whiteout()) {
6526 t->zero(soid, op.extent.offset, op.extent.length);
6527 interval_set<uint64_t> ch;
6528 ch.insert(op.extent.offset, op.extent.length);
6529 ctx->modified_ranges.union_of(ch);
6530 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6531 ctx->delta_stats.num_wr++;
6532 oi.clear_data_digest();
6533 } else {
6534 // no-op
6535 }
6536 }
6537 break;
6538 case CEPH_OSD_OP_CREATE:
6539 ++ctx->num_write;
6540 result = 0;
6541 {
6542 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6543 if (obs.exists && !oi.is_whiteout() &&
6544 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
6545 result = -EEXIST; /* this is an exclusive create */
6546 } else {
6547 if (osd_op.indata.length()) {
6548 auto p = osd_op.indata.cbegin();
6549 string category;
6550 try {
6551 decode(category, p);
6552 }
6553 catch (buffer::error& e) {
6554 result = -EINVAL;
6555 goto fail;
6556 }
6557 // category is no longer implemented.
6558 }
6559 maybe_create_new_object(ctx);
6560 t->nop(soid);
6561 }
6562 }
6563 break;
6564
6565 case CEPH_OSD_OP_TRIMTRUNC:
6566 op.extent.offset = op.extent.truncate_size;
6567 // fall through
6568
6569 case CEPH_OSD_OP_TRUNCATE:
6570 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6571 if (pool.info.requires_aligned_append()) {
6572 result = -EOPNOTSUPP;
6573 break;
6574 }
6575 ++ctx->num_write;
6576 result = 0;
6577 {
6578 // truncate
6579 if (!obs.exists || oi.is_whiteout()) {
6580 dout(10) << " object dne, truncate is a no-op" << dendl;
6581 break;
6582 }
6583
6584 result = check_offset_and_length(
6585 op.extent.offset, op.extent.length,
6586 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6587 if (result < 0)
6588 break;
6589
6590 if (op.extent.truncate_seq) {
6591 ceph_assert(op.extent.offset == op.extent.truncate_size);
6592 if (op.extent.truncate_seq <= oi.truncate_seq) {
6593 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6594 << ", no-op" << dendl;
6595 break; // old
6596 }
6597 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6598 << ", truncating" << dendl;
6599 oi.truncate_seq = op.extent.truncate_seq;
6600 oi.truncate_size = op.extent.truncate_size;
6601 }
6602
6603 maybe_create_new_object(ctx);
6604 t->truncate(soid, op.extent.offset);
6605 if (oi.size > op.extent.offset) {
6606 interval_set<uint64_t> trim;
6607 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6608 ctx->modified_ranges.union_of(trim);
6609 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6610 } else if (oi.size < op.extent.offset) {
6611 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6612 }
6613 if (op.extent.offset != oi.size) {
6614 truncate_update_size_and_usage(ctx->delta_stats,
6615 oi,
6616 op.extent.offset);
6617 }
6618 ctx->delta_stats.num_wr++;
6619 // do not set exists, or we will break the ZERO -> TRUNCATE munging above.
6620
6621 oi.clear_data_digest();
6622 }
6623 break;
6624
6625 case CEPH_OSD_OP_DELETE:
6626 ++ctx->num_write;
6627 result = 0;
6628 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6629 {
6630 if (oi.has_manifest()) {
6631 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) {
6632 ctx->register_on_commit(
6633 [oi, ctx, this](){
6634 object_locator_t target_oloc(oi.manifest.redirect_target);
6635 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
6636 SnapContext(), false, NULL, 0);
6637 });
6638 } else if (oi.manifest.is_chunked()) {
6639 ctx->register_on_commit(
6640 [oi, ctx, this](){
6641 for (auto p : oi.manifest.chunk_map) {
6642 if (p.second.has_reference()) {
6643 object_locator_t target_oloc(p.second.oid);
6644 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
6645 SnapContext(), false, NULL, p.first);
6646 }
6647 }
6648 });
6649 }
6650 }
6651 result = _delete_oid(ctx, false, ctx->ignore_cache);
6652 }
6653 break;
6654
6655 case CEPH_OSD_OP_WATCH:
6656 ++ctx->num_write;
6657 result = 0;
6658 {
6659 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6660 op.watch.cookie, op.watch.op);
6661 if (!obs.exists) {
6662 result = -ENOENT;
6663 break;
6664 }
6665 result = 0;
6666 uint64_t cookie = op.watch.cookie;
6667 entity_name_t entity = ctx->reqid.name;
6668 ObjectContextRef obc = ctx->obc;
6669
6670 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6671 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6672 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6673 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6674 dout(10) << "watch: peer_addr="
6675 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6676
6677 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6678 if (op.watch.timeout != 0) {
6679 timeout = op.watch.timeout;
6680 }
6681
6682 watch_info_t w(cookie, timeout,
6683 ctx->op->get_req()->get_connection()->get_peer_addr());
6684 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6685 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6686 if (oi.watchers.count(make_pair(cookie, entity))) {
6687 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6688 } else {
6689 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6690 oi.watchers[make_pair(cookie, entity)] = w;
6691 t->nop(soid); // make sure we update the object_info on disk!
6692 }
6693 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6694 ctx->watch_connects.push_back(make_pair(w, will_ping));
6695 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6696 if (!oi.watchers.count(make_pair(cookie, entity))) {
6697 result = -ENOTCONN;
6698 break;
6699 }
6700 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6701 ctx->watch_connects.push_back(make_pair(w, true));
6702 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6703 /* Note: WATCH with PING doesn't cause may_write() to return true,
6704 * so if there is nothing else in the transaction, this is going
6705 * to run do_osd_op_effects, but not write out a log entry */
6706 if (!oi.watchers.count(make_pair(cookie, entity))) {
6707 result = -ENOTCONN;
6708 break;
6709 }
6710 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6711 obc->watchers.find(make_pair(cookie, entity));
6712 if (p == obc->watchers.end() ||
6713 !p->second->is_connected()) {
6714 // client needs to reconnect
6715 result = -ETIMEDOUT;
6716 break;
6717 }
6718 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6719 p->second->got_ping(ceph_clock_now());
6720 result = 0;
6721 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6722 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6723 oi.watchers.find(make_pair(cookie, entity));
6724 if (oi_iter != oi.watchers.end()) {
6725 dout(10) << " removed watch " << oi_iter->second << " by "
6726 << entity << dendl;
6727 oi.watchers.erase(oi_iter);
6728 t->nop(soid); // update oi on disk
6729 ctx->watch_disconnects.push_back(
6730 watch_disconnect_t(cookie, entity, false));
6731 } else {
6732 dout(10) << " can't remove: no watch by " << entity << dendl;
6733 }
6734 }
6735 }
6736 break;
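
// WATCH multiplexes five sub-ops on op.watch.op: WATCH and LEGACY_WATCH
// register a watcher (legacy watches do not ping), RECONNECT revalidates
// an existing registration, PING refreshes liveness and answers
// -ETIMEDOUT if the server side has already torn the session down, and
// UNWATCH removes the registration. In each mutating branch t->nop()
// forces the transaction through so the updated oi.watchers map is
// persisted with the object_info.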
6737
6738 case CEPH_OSD_OP_CACHE_PIN:
6739 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6740 if ((!pool.info.is_tier() ||
6741 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6742 result = -EINVAL;
6743 dout(10) << " object pinning is only allowed on the cache tier" << dendl;
6744 break;
6745 }
6746 ++ctx->num_write;
6747 result = 0;
6748 {
6749 if (!obs.exists || oi.is_whiteout()) {
6750 result = -ENOENT;
6751 break;
6752 }
6753
6754 if (!oi.is_cache_pinned()) {
6755 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6756 ctx->modify = true;
6757 ctx->delta_stats.num_objects_pinned++;
6758 ctx->delta_stats.num_wr++;
6759 }
6760 }
6761 break;
6762
6763 case CEPH_OSD_OP_CACHE_UNPIN:
6764 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6765 if ((!pool.info.is_tier() ||
6766 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6767 result = -EINVAL;
6768 dout(10) << " object unpinning is only allowed on the cache tier" << dendl;
6769 break;
6770 }
6771 ++ctx->num_write;
6772 result = 0;
6773 {
6774 if (!obs.exists || oi.is_whiteout()) {
6775 result = -ENOENT;
6776 break;
6777 }
6778
6779 if (oi.is_cache_pinned()) {
6780 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6781 ctx->modify = true;
6782 ctx->delta_stats.num_objects_pinned--;
6783 ctx->delta_stats.num_wr++;
6784 }
6785 }
6786 break;
6787
6788 case CEPH_OSD_OP_SET_REDIRECT:
6789 ++ctx->num_write;
6790 result = 0;
6791 {
6792 if (pool.info.is_tier()) {
6793 result = -EINVAL;
6794 break;
6795 }
6796 if (!obs.exists) {
6797 result = -ENOENT;
6798 break;
6799 }
6800 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6801 result = -EOPNOTSUPP;
6802 break;
6803 }
6804
6805 object_t target_name;
6806 object_locator_t target_oloc;
6807 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6808 version_t target_version = op.copy_from.src_version;
6809 try {
6810 decode(target_name, bp);
6811 decode(target_oloc, bp);
6812 }
6813 catch (buffer::error& e) {
6814 result = -EINVAL;
6815 goto fail;
6816 }
6817 pg_t raw_pg;
6818 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6819 hobject_t target(target_name, target_oloc.key, target_snapid,
6820 raw_pg.ps(), raw_pg.pool(),
6821 target_oloc.nspace);
6822 if (target == soid) {
6823 dout(20) << " set-redirect self is invalid" << dendl;
6824 result = -EINVAL;
6825 break;
6826 }
6827
6828 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6829 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6830 if (has_reference) {
6831 result = -EINVAL;
6832 dout(5) << " the object is already a manifest " << dendl;
6833 break;
6834 }
6835 if (op_finisher == nullptr && need_reference) {
6836 // start
6837 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6838 new SetManifestFinisher(osd_op));
6839 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6840 refcount_manifest(ctx->obc, target_oloc, target, SnapContext(),
6841 true, fin, 0);
6842 result = -EINPROGRESS;
6843 } else {
6844 // finish
6845 if (op_finisher) {
6846 result = op_finisher->execute();
6847 ceph_assert(result == 0);
6848 }
6849
6850 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6851 ctx->delta_stats.num_objects_manifest++;
6852
6853 oi.set_flag(object_info_t::FLAG_MANIFEST);
6854 oi.manifest.redirect_target = target;
6855 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6856 t->truncate(soid, 0);
6857 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
6858 if (oi.is_omap() && pool.info.supports_omap()) {
6859 t->omap_clear(soid);
6860 obs.oi.clear_omap_digest();
6861 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6862 ctx->clean_regions.mark_omap_dirty();
6863 }
6864 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6865 0, oi.size, false);
6866 ctx->delta_stats.num_bytes -= oi.size;
6867 oi.size = 0;
6868 oi.new_object();
6869 oi.user_version = target_version;
6870 ctx->user_at_version = target_version;
6871 /* rm_attrs */
6872 map<string,bufferlist> rmattrs;
6873 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
6874 if (result < 0) {
6875 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
6876 return result;
6877 }
6878 map<string, bufferlist>::iterator iter;
6879 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6880 const string& name = iter->first;
6881 t->rmattr(soid, name);
6882 }
6883 if (!has_reference && need_reference) {
6884 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6885 }
6886 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6887 if (op_finisher) {
6888 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6889 }
6890 }
6891 }
6892
6893 break;
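
// SET_REDIRECT with CEPH_OSD_OP_FLAG_WITH_REFERENCE is a two-pass op:
// the first pass installs a SetManifestFinisher, asks refcount_manifest()
// to take a reference on the target, and parks the op with -EINPROGRESS;
// when the refcount ack arrives the op re-executes, finds the finisher,
// and only then hollows the local object into a redirect stub (truncate
// to 0, drop omap and xattrs, adopt the target's user_version).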
6894
6895 case CEPH_OSD_OP_SET_CHUNK:
6896 ++ctx->num_write;
6897 result = 0;
6898 {
6899 if (pool.info.is_tier()) {
6900 result = -EINVAL;
6901 break;
6902 }
6903 if (!obs.exists) {
6904 result = -ENOENT;
6905 break;
6906 }
6907 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6908 result = -EOPNOTSUPP;
6909 break;
6910 }
6911
6912 object_locator_t tgt_oloc;
6913 uint64_t src_offset, src_length, tgt_offset;
6914 object_t tgt_name;
6915 try {
6916 decode(src_offset, bp);
6917 decode(src_length, bp);
6918 decode(tgt_oloc, bp);
6919 decode(tgt_name, bp);
6920 decode(tgt_offset, bp);
6921 }
6922 catch (buffer::error& e) {
6923 result = -EINVAL;
6924 goto fail;
6925 }
6926
6927 if (!src_length) {
6928 result = -EINVAL;
6929 goto fail;
6930 }
6931
6932 for (auto &p : oi.manifest.chunk_map) {
6933 if ((p.first <= src_offset && p.first + p.second.length > src_offset) ||
6934 (p.first > src_offset && p.first <= src_offset + src_length)) {
6935 dout(20) << __func__ << " source range overlaps an existing chunk; offset: " << src_offset << " length: " << src_length
6936 << " chunk_info: " << p << dendl;
6937 result = -EOPNOTSUPP;
6938 goto fail;
6939 }
6940 }
6941
6942 if (!oi.manifest.is_chunked()) {
6943 oi.manifest.clear();
6944 }
6945
6946 pg_t raw_pg;
6947 chunk_info_t chunk_info;
6948 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
6949 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
6950 raw_pg.ps(), raw_pg.pool(),
6951 tgt_oloc.nspace);
6952 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6953 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
6954 (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE);
6955 if (has_reference) {
6956 result = -EINVAL;
6957 dout(5) << " the object is already a manifest " << dendl;
6958 break;
6959 }
6960 if (op_finisher == nullptr && need_reference) {
6961 // start
6962 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6963 new SetManifestFinisher(osd_op));
6964 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6965 refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(),
6966 true, fin, src_offset);
6967 result = -EINPROGRESS;
6968 } else {
6969 if (op_finisher) {
6970 result = op_finisher->execute();
6971 ceph_assert(result == 0);
6972 }
6973
6974 chunk_info_t chunk_info;
6975 chunk_info.set_flag(chunk_info_t::FLAG_MISSING);
6976 chunk_info.oid = target;
6977 chunk_info.offset = tgt_offset;
6978 chunk_info.length= src_length;
6979 oi.manifest.chunk_map[src_offset] = chunk_info;
6980 if (!oi.has_manifest() && !oi.manifest.is_chunked())
6981 ctx->delta_stats.num_objects_manifest++;
6982 oi.set_flag(object_info_t::FLAG_MANIFEST);
6983 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
6984 if (!has_reference && need_reference) {
6985 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
6986 }
6987 if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) {
6988 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
6989 }
6990 ctx->modify = true;
6991
6992 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
6993 << " chunk_info: " << chunk_info << dendl;
6994 if (op_finisher) {
6995 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6996 }
6997 }
6998 }
6999
7000 break;
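
// SET_CHUNK follows the same two-pass reference handshake as SET_REDIRECT,
// but instead of replacing the whole object it records one entry in
// oi.manifest.chunk_map, keyed by source offset and pointing at
// (target oid, tgt_offset, length), initially flagged MISSING. A source
// range that overlaps an existing chunk is rejected with -EOPNOTSUPP.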
7001
7002 case CEPH_OSD_OP_TIER_PROMOTE:
7003 ++ctx->num_write;
7004 result = 0;
7005 {
7006 if (pool.info.is_tier()) {
7007 result = -EINVAL;
7008 break;
7009 }
7010 if (!obs.exists) {
7011 result = -ENOENT;
7012 break;
7013 }
7014 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7015 result = -EOPNOTSUPP;
7016 break;
7017 }
7018 if (!obs.oi.has_manifest()) {
7019 result = 0;
7020 break;
7021 }
7022
7023 if (op_finisher == nullptr) {
7024 PromoteManifestCallback *cb;
7025 object_locator_t my_oloc;
7026 hobject_t src_hoid;
7027
7028 if (obs.oi.manifest.is_chunked()) {
7029 src_hoid = obs.oi.soid;
7030 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7031 } else if (obs.oi.manifest.is_redirect()) {
7032 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7033 my_oloc = src_oloc;
7034 src_hoid = obs.oi.manifest.redirect_target;
7035 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7036 } else {
7037 ceph_abort_msg("unrecognized manifest type");
7038 }
7039 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7040 new PromoteFinisher(cb));
7041 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7042 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7043 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7044 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7045 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7046 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7047 obs.oi.soid.snap == CEPH_NOSNAP,
7048 src_fadvise_flags, 0);
7049
7050 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7051 result = -EINPROGRESS;
7052 } else {
7053 result = op_finisher->execute();
7054 ceph_assert(result == 0);
7055 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7056 }
7057 }
7058
7059 break;
7060
7061 case CEPH_OSD_OP_TIER_FLUSH:
7062 ++ctx->num_write;
7063 result = 0;
7064 {
7065 if (pool.info.is_tier()) {
7066 result = -EINVAL;
7067 break;
7068 }
7069 if (!obs.exists) {
7070 result = -ENOENT;
7071 break;
7072 }
7073 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7074 result = -EOPNOTSUPP;
7075 break;
7076 }
7077 if (!obs.oi.has_manifest()) {
7078 result = 0;
7079 break;
7080 }
7081
7082 hobject_t missing;
7083 bool is_dirty = false;
7084 for (auto& p : ctx->obc->obs.oi.manifest.chunk_map) {
7085 if (p.second.is_dirty()) {
7086 is_dirty = true;
7087 break;
7088 }
7089 }
7090
7091 if (is_dirty) {
7092 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7093 if (result == -EINPROGRESS)
7094 result = -EAGAIN;
7095 } else {
7096 result = 0;
7097 }
7098 }
7099
7100 break;
7101
7102 case CEPH_OSD_OP_UNSET_MANIFEST:
7103 ++ctx->num_write;
7104 result = 0;
7105 {
7106 if (pool.info.is_tier()) {
7107 result = -EINVAL;
7108 break;
7109 }
7110 if (!obs.exists) {
7111 result = -ENOENT;
7112 break;
7113 }
7114 if (!oi.has_manifest()) {
7115 result = -EOPNOTSUPP;
7116 break;
7117 }
7118 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7119 result = -EOPNOTSUPP;
7120 break;
7121 }
7122
7123 if (oi.manifest.is_redirect()) {
7124 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
7125 ctx->register_on_commit(
7126 [oi, ctx, this](){
7127 object_locator_t target_oloc(oi.manifest.redirect_target);
7128 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
7129 SnapContext(), false, NULL, 0);
7130 });
7131 }
7132 } else if (oi.manifest.is_chunked()) {
7133 ctx->register_on_commit(
7134 [oi, ctx, this](){
7135 for (auto p : oi.manifest.chunk_map) {
7136 if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) {
7137 object_locator_t target_oloc(p.second.oid);
7138 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
7139 SnapContext(), false, NULL, p.first);
7140 }
7141 }
7142 });
7143 } else {
7144 ceph_abort_msg("unrecognized manifest type");
7145 }
7146
7147 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7148 oi.manifest = object_manifest_t();
7149 ctx->delta_stats.num_objects_manifest--;
7150 ctx->delta_stats.num_wr++;
7151 ctx->modify = true;
7152 }
7153
7154 break;
7155
7156 // -- object attrs --
7157
7158 case CEPH_OSD_OP_SETXATTR:
7159 ++ctx->num_write;
7160 result = 0;
7161 {
7162 if (cct->_conf->osd_max_attr_size > 0 &&
7163 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7164 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7165 result = -EFBIG;
7166 break;
7167 }
7168 unsigned max_name_len =
7169 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7170 cct->_conf->osd_max_attr_name_len);
7171 if (op.xattr.name_len > max_name_len) {
7172 result = -ENAMETOOLONG;
7173 break;
7174 }
7175 maybe_create_new_object(ctx);
7176 string aname;
7177 bp.copy(op.xattr.name_len, aname);
7178 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7179 string name = "_" + aname;
7180 bufferlist bl;
7181 bp.copy(op.xattr.value_len, bl);
7182 t->setattr(soid, name, bl);
7183 ctx->delta_stats.num_wr++;
7184 }
7185 break;
7186
7187 case CEPH_OSD_OP_RMXATTR:
7188 ++ctx->num_write;
7189 result = 0;
7190 {
7191 string aname;
7192 bp.copy(op.xattr.name_len, aname);
7193 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7194 if (!obs.exists || oi.is_whiteout()) {
7195 result = -ENOENT;
7196 break;
7197 }
7198 string name = "_" + aname;
7199 t->rmattr(soid, name);
7200 ctx->delta_stats.num_wr++;
7201 }
7202 break;
7203
7204
7205 // -- fancy writers --
7206 case CEPH_OSD_OP_APPEND:
7207 {
7208 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7209 // just do it inline; this works because we are happy to execute
7210 // fancy ops on replicas as well.
7211 vector<OSDOp> nops(1);
7212 OSDOp& newop = nops[0];
7213 newop.op.op = CEPH_OSD_OP_WRITE;
7214 newop.op.extent.offset = oi.size;
7215 newop.op.extent.length = op.extent.length;
7216 newop.op.extent.truncate_seq = oi.truncate_seq;
7217 newop.indata = osd_op.indata;
7218 result = do_osd_ops(ctx, nops);
7219 osd_op.outdata.claim(newop.outdata);
7220 }
7221 break;
7222
7223 case CEPH_OSD_OP_STARTSYNC:
7224 result = 0;
7225 t->nop(soid);
7226 break;
7227
7228 // -- trivial map --
7229 case CEPH_OSD_OP_TMAPGET:
7230 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7231 if (pool.info.is_erasure()) {
7232 result = -EOPNOTSUPP;
7233 break;
7234 }
7235 {
7236 vector<OSDOp> nops(1);
7237 OSDOp& newop = nops[0];
7238 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7239 newop.op.extent.offset = 0;
7240 newop.op.extent.length = 0;
7241 result = do_osd_ops(ctx, nops);
7242 osd_op.outdata.claim(newop.outdata);
7243 }
7244 break;
7245
7246 case CEPH_OSD_OP_TMAPPUT:
7247 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7248 if (pool.info.is_erasure()) {
7249 result = -EOPNOTSUPP;
7250 break;
7251 }
7252 {
7256
7257 // verify sort order
7258 bool unsorted = false;
7259 if (true) {
7260 bufferlist header;
7261 decode(header, bp);
7262 uint32_t n;
7263 decode(n, bp);
7264 string last_key;
7265 while (n--) {
7266 string key;
7267 decode(key, bp);
7268 dout(10) << "tmapput key " << key << dendl;
7269 bufferlist val;
7270 decode(val, bp);
7271 if (key < last_key) {
7272 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7273 unsorted = true;
7274 break;
7275 }
7276 last_key = key;
7277 }
7278 }
7279
7280 // write it
7281 vector<OSDOp> nops(1);
7282 OSDOp& newop = nops[0];
7283 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7284 newop.op.extent.offset = 0;
7285 newop.op.extent.length = osd_op.indata.length();
7286 newop.indata = osd_op.indata;
7287
7288 if (unsorted) {
7289 bp = osd_op.indata.begin();
7290 bufferlist header;
7291 map<string, bufferlist> m;
7292 decode(header, bp);
7293 decode(m, bp);
7294 ceph_assert(bp.end());
7295 bufferlist newbl;
7296 encode(header, newbl);
7297 encode(m, newbl);
7298 newop.indata = newbl;
7299 }
7300 result = do_osd_ops(ctx, nops);
7301 ceph_assert(result == 0);
7302 }
7303 break;
7304
7305 case CEPH_OSD_OP_TMAPUP:
7306 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7307 if (pool.info.is_erasure()) {
7308 result = -EOPNOTSUPP;
7309 break;
7310 }
7311 ++ctx->num_write;
7312 result = do_tmapup(ctx, bp, osd_op);
7313 break;
7314
7315 case CEPH_OSD_OP_TMAP2OMAP:
7316 ++ctx->num_write;
7317 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7318 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7319 break;
7320
7321 // OMAP Read ops
7322 case CEPH_OSD_OP_OMAPGETKEYS:
7323 ++ctx->num_read;
7324 {
7325 string start_after;
7326 uint64_t max_return;
7327 try {
7328 decode(start_after, bp);
7329 decode(max_return, bp);
7330 }
7331 catch (buffer::error& e) {
7332 result = -EINVAL;
7333 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7334 goto fail;
7335 }
7336 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7337 max_return = cct->_conf->osd_max_omap_entries_per_request;
7338 }
7339 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7340
7341 bufferlist bl;
7342 uint32_t num = 0;
7343 bool truncated = false;
7344 if (oi.is_omap()) {
7345 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7346 ch, ghobject_t(soid)
7347 );
7348 ceph_assert(iter);
7349 iter->upper_bound(start_after);
7350 for (num = 0; iter->valid(); ++num, iter->next()) {
7351 if (num >= max_return ||
7352 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7353 truncated = true;
7354 break;
7355 }
7356 encode(iter->key(), bl);
7357 }
7358 } // else return empty out_set
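// reply format: key count, the encoded keys, then a truncated flag
// telling the client whether another request is needed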
7359 encode(num, osd_op.outdata);
7360 osd_op.outdata.claim_append(bl);
7361 encode(truncated, osd_op.outdata);
7362 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7363 ctx->delta_stats.num_rd++;
7364 }
7365 break;
7366
7367 case CEPH_OSD_OP_OMAPGETVALS:
7368 ++ctx->num_read;
7369 {
7370 string start_after;
7371 uint64_t max_return;
7372 string filter_prefix;
7373 try {
7374 decode(start_after, bp);
7375 decode(max_return, bp);
7376 decode(filter_prefix, bp);
7377 }
7378 catch (buffer::error& e) {
7379 result = -EINVAL;
7380 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7381 goto fail;
7382 }
7383 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7384 max_return = cct->_conf->osd_max_omap_entries_per_request;
7385 }
7386 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7387
7388 uint32_t num = 0;
7389 bool truncated = false;
7390 bufferlist bl;
7391 if (oi.is_omap()) {
7392 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7393 ch, ghobject_t(soid)
7394 );
7395 if (!iter) {
7396 result = -ENOENT;
7397 goto fail;
7398 }
7399 iter->upper_bound(start_after);
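// if the prefix sorts after start_after, jump straight to the first
// key with that prefix instead of scanning from start_after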
7400 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7401 for (num = 0;
7402 iter->valid() &&
7403 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7404 ++num, iter->next()) {
7405 dout(20) << "Found key " << iter->key() << dendl;
7406 if (num >= max_return ||
7407 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7408 truncated = true;
7409 break;
7410 }
7411 encode(iter->key(), bl);
7412 encode(iter->value(), bl);
7413 }
7414 } // else return empty out_set
7415 encode(num, osd_op.outdata);
7416 osd_op.outdata.claim_append(bl);
7417 encode(truncated, osd_op.outdata);
7418 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7419 ctx->delta_stats.num_rd++;
7420 }
7421 break;
7422
7423 case CEPH_OSD_OP_OMAPGETHEADER:
7424 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7425 if (!oi.is_omap()) {
7426 // return empty header
7427 break;
7428 }
7429 ++ctx->num_read;
7430 {
7431 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7432 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7433 ctx->delta_stats.num_rd++;
7434 }
7435 break;
7436
7437 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7438 ++ctx->num_read;
7439 {
7440 set<string> keys_to_get;
7441 try {
7442 decode(keys_to_get, bp);
7443 }
7444 catch (buffer::error& e) {
7445 result = -EINVAL;
7446 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7447 goto fail;
7448 }
7449 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7450 map<string, bufferlist> out;
7451 if (oi.is_omap()) {
7452 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7453 } // else return empty omap entries
7454 encode(out, osd_op.outdata);
7455 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7456 ctx->delta_stats.num_rd++;
7457 }
7458 break;
7459
7460 case CEPH_OSD_OP_OMAP_CMP:
7461 ++ctx->num_read;
7462 {
7463 if (!obs.exists || oi.is_whiteout()) {
7464 result = -ENOENT;
7465 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7466 break;
7467 }
7468 map<string, pair<bufferlist, int> > assertions;
7469 try {
7470 decode(assertions, bp);
7471 }
7472 catch (buffer::error& e) {
7473 result = -EINVAL;
7474 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7475 goto fail;
7476 }
7477 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7478
7479 map<string, bufferlist> out;
7480
7481 if (oi.is_omap()) {
7482 set<string> to_get;
7483 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7484 i != assertions.end();
7485 ++i)
7486 to_get.insert(i->first);
7487 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7488 to_get, &out);
7489 if (r < 0) {
7490 result = r;
7491 break;
7492 }
7493 } // else leave out empty
7494
7495 // should set num_rd_kb based on the encoded length of the map
7496 ctx->delta_stats.num_rd++;
7497
7498 int r = 0;
7499 bufferlist empty;
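// compare each asserted value against the stored one; a key absent
// from the object compares as an empty bufferlist, and the first
// failed assertion aborts the op with -ECANCELED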
7500 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7501 i != assertions.end();
7502 ++i) {
7503 auto out_entry = out.find(i->first);
7504 bufferlist &bl = (out_entry != out.end()) ?
7505 out_entry->second : empty;
7506 switch (i->second.second) {
7507 case CEPH_OSD_CMPXATTR_OP_EQ:
7508 if (!(bl == i->second.first)) {
7509 r = -ECANCELED;
7510 }
7511 break;
7512 case CEPH_OSD_CMPXATTR_OP_LT:
7513 if (!(bl < i->second.first)) {
7514 r = -ECANCELED;
7515 }
7516 break;
7517 case CEPH_OSD_CMPXATTR_OP_GT:
7518 if (!(bl > i->second.first)) {
7519 r = -ECANCELED;
7520 }
7521 break;
7522 default:
7523 r = -EINVAL;
7524 break;
7525 }
7526 if (r < 0)
7527 break;
7528 }
7529 if (r < 0) {
7530 result = r;
7531 }
7532 }
7533 break;
7534
7535 // OMAP Write ops
7536 case CEPH_OSD_OP_OMAPSETVALS:
7537 if (!pool.info.supports_omap()) {
7538 result = -EOPNOTSUPP;
7539 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7540 break;
7541 }
7542 ++ctx->num_write;
7543 result = 0;
7544 {
7545 maybe_create_new_object(ctx);
7546 bufferlist to_set_bl;
7547 try {
7548 decode_str_str_map_to_bl(bp, &to_set_bl);
7549 }
7550 catch (buffer::error& e) {
7551 result = -EINVAL;
7552 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7553 goto fail;
7554 }
7555 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
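// decode the map a second time purely for logging, and only when
// debug level 20 is actually being gathered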
7556 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7557 dout(20) << "setting vals: " << dendl;
7558 map<string,bufferlist> to_set;
7559 bufferlist::const_iterator pt = to_set_bl.begin();
7560 decode(to_set, pt);
7561 for (map<string, bufferlist>::iterator i = to_set.begin();
7562 i != to_set.end();
7563 ++i) {
7564 dout(20) << "\t" << i->first << dendl;
7565 }
7566 }
7567 t->omap_setkeys(soid, to_set_bl);
7568 ctx->clean_regions.mark_omap_dirty();
7569 ctx->delta_stats.num_wr++;
7570 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7571 }
7572 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7573 obs.oi.clear_omap_digest();
7574 break;
7575
7576 case CEPH_OSD_OP_OMAPSETHEADER:
7577 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7578 if (!pool.info.supports_omap()) {
7579 result = -EOPNOTSUPP;
7580 break;
7581 }
7582 ++ctx->num_write;
7583 result = 0;
7584 {
7585 maybe_create_new_object(ctx);
7586 t->omap_setheader(soid, osd_op.indata);
7587 ctx->clean_regions.mark_omap_dirty();
7588 ctx->delta_stats.num_wr++;
7589 }
7590 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7591 obs.oi.clear_omap_digest();
7592 break;
7593
7594 case CEPH_OSD_OP_OMAPCLEAR:
7595 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7596 if (!pool.info.supports_omap()) {
7597 result = -EOPNOTSUPP;
7598 break;
7599 }
7600 ++ctx->num_write;
7601 result = 0;
7602 {
7603 if (!obs.exists || oi.is_whiteout()) {
7604 result = -ENOENT;
7605 break;
7606 }
7607 if (oi.is_omap()) {
7608 t->omap_clear(soid);
7609 ctx->clean_regions.mark_omap_dirty();
7610 ctx->delta_stats.num_wr++;
7611 obs.oi.clear_omap_digest();
7612 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7613 }
7614 }
7615 break;
7616
7617 case CEPH_OSD_OP_OMAPRMKEYS:
7618 if (!pool.info.supports_omap()) {
7619 result = -EOPNOTSUPP;
7620 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7621 break;
7622 }
7623 ++ctx->num_write;
7624 result = 0;
7625 {
7626 if (!obs.exists || oi.is_whiteout()) {
7627 result = -ENOENT;
7628 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7629 break;
7630 }
7631 bufferlist to_rm_bl;
7632 try {
7633 decode_str_set_to_bl(bp, &to_rm_bl);
7634 }
7635 catch (buffer::error& e) {
7636 result = -EINVAL;
7637 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7638 goto fail;
7639 }
7640 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7641 t->omap_rmkeys(soid, to_rm_bl);
7642 ctx->clean_regions.mark_omap_dirty();
7643 ctx->delta_stats.num_wr++;
7644 }
7645 obs.oi.clear_omap_digest();
7646 break;
7647
7648 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7649 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7650 if (!pool.info.supports_omap()) {
7651 result = -EOPNOTSUPP;
7652 break;
7653 }
7654 ++ctx->num_write;
7655 result = 0;
7656 {
7657 if (!obs.exists || oi.is_whiteout()) {
7658 result = -ENOENT;
7659 break;
7660 }
7661 std::string key_begin, key_end;
7662 try {
7663 decode(key_begin, bp);
7664 decode(key_end, bp);
7665 } catch (buffer::error& e) {
7666 result = -EINVAL;
7667 goto fail;
7668 }
7669 t->omap_rmkeyrange(soid, key_begin, key_end);
7670 ctx->delta_stats.num_wr++;
7671 }
7672 obs.oi.clear_omap_digest();
7673 break;
7674
7675 case CEPH_OSD_OP_COPY_GET:
7676 ++ctx->num_read;
7677 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7678 soid.snap.val);
7679 if (op_finisher == nullptr) {
7680 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7681 } else {
7682 result = op_finisher->execute();
7683 }
7684 break;
7685
7686 case CEPH_OSD_OP_COPY_FROM:
7687 case CEPH_OSD_OP_COPY_FROM2:
7688 ++ctx->num_write;
7689 result = 0;
7690 {
7691 object_t src_name;
7692 object_locator_t src_oloc;
7693 uint32_t truncate_seq = 0;
7694 uint64_t truncate_size = 0;
7695 bool have_truncate = false;
7696 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7697 version_t src_version = op.copy_from.src_version;
7698
7699 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7700 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7701 dout(20) << "invalid copy-from2 flags 0x"
7702 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
7703 result = -EINVAL;
7704 break;
7705 }
7706 try {
7707 decode(src_name, bp);
7708 decode(src_oloc, bp);
7709 // check if client sent us truncate_seq and truncate_size
7710 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7711 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
7712 decode(truncate_seq, bp);
7713 decode(truncate_size, bp);
7714 have_truncate = true;
7715 }
7716 }
7717 catch (buffer::error& e) {
7718 result = -EINVAL;
7719 tracepoint(osd,
7720 do_osd_op_pre_copy_from,
7721 soid.oid.name.c_str(),
7722 soid.snap.val,
7723 "???",
7724 0,
7725 "???",
7726 "???",
7727 0,
7728 src_snapid,
7729 src_version);
7730 goto fail;
7731 }
7732 tracepoint(osd,
7733 do_osd_op_pre_copy_from,
7734 soid.oid.name.c_str(),
7735 soid.snap.val,
7736 src_name.name.c_str(),
7737 src_oloc.pool,
7738 src_oloc.key.c_str(),
7739 src_oloc.nspace.c_str(),
7740 src_oloc.hash,
7741 src_snapid,
7742 src_version);
7743 if (op_finisher == nullptr) {
7744 // start
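// first pass: kick off the async copy and return -EINPROGRESS; the
// op is requeued when the copy completes and re-enters this case
// with an op_finisher set, taking the "finish" branch below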
7745 pg_t raw_pg;
7746 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7747 hobject_t src(src_name, src_oloc.key, src_snapid,
7748 raw_pg.ps(), raw_pg.pool(),
7749 src_oloc.nspace);
7750 if (src == soid) {
7751 dout(20) << " copy from self is invalid" << dendl;
7752 result = -EINVAL;
7753 break;
7754 }
7755 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7756 if (have_truncate)
7757 cb->set_truncate(truncate_seq, truncate_size);
7758 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7759 new CopyFromFinisher(cb));
7760 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7761 op.copy_from.flags,
7762 false,
7763 op.copy_from.src_fadvise_flags,
7764 op.flags);
7765 result = -EINPROGRESS;
7766 } else {
7767 // finish
7768 result = op_finisher->execute();
7769 ceph_assert(result == 0);
7770
7771 // COPY_FROM cannot be executed multiple times -- it must restart
7772 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7773 }
7774 }
7775 break;
7776
7777 default:
7778 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7779 dout(1) << "unrecognized osd op " << op.op
7780 << " " << ceph_osd_op_name(op.op)
7781 << dendl;
7782 result = -EOPNOTSUPP;
7783 }
7784
7785 fail:
7786 osd_op.rval = result;
7787 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7788 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7789 result != -EAGAIN && result != -EINPROGRESS)
7790 result = 0;
7791
7792 if (result < 0)
7793 break;
7794 }
7795 if (result < 0) {
7796 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7797 }
7798 return result;
7799 }
7800
7801 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7802 {
7803 if (ctx->new_obs.oi.size == 0) {
7804 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7805 return -ENODATA;
7806 }
7807 vector<OSDOp> nops(1);
7808 OSDOp &newop = nops[0];
7809 newop.op.op = CEPH_OSD_OP_TMAPGET;
7810 do_osd_ops(ctx, nops);
7811 try {
7812 bufferlist::const_iterator i = newop.outdata.begin();
7813 decode(*header, i);
7814 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7815 } catch (...) {
7816 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7817 << dendl;
7818 return -EINVAL;
7819 }
7820 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7821 << dendl;
7822 return 0;
7823 }
7824
7825 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7826 const SnapSet& ss)
7827 {
7828 // verify that all clones have been evicted
7829 dout(20) << __func__ << " verifying clones are absent "
7830 << ss << dendl;
7831 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7832 p != ss.clones.end();
7833 ++p) {
7834 hobject_t clone_oid = soid;
7835 clone_oid.snap = *p;
7836 if (is_missing_object(clone_oid))
7837 return -EBUSY;
7838 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
7839 if (clone_obc && clone_obc->obs.exists) {
7840 dout(10) << __func__ << " cannot evict head before clone "
7841 << clone_oid << dendl;
7842 return -EBUSY;
7843 }
7844 if (copy_ops.count(clone_oid)) {
7845 dout(10) << __func__ << " cannot evict head, pending promote on clone "
7846 << clone_oid << dendl;
7847 return -EBUSY;
7848 }
7849 }
7850 return 0;
7851 }
7852
7853 inline int PrimaryLogPG::_delete_oid(
7854 OpContext *ctx,
7855 bool no_whiteout, // no whiteouts, no matter what.
7856 bool try_no_whiteout) // try not to whiteout
7857 {
7858 SnapSet& snapset = ctx->new_snapset;
7859 ObjectState& obs = ctx->new_obs;
7860 object_info_t& oi = obs.oi;
7861 const hobject_t& soid = oi.soid;
7862 PGTransaction* t = ctx->op_t.get();
7863
7864 // cache: set whiteout on delete?
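// a whiteout leaves a stub object with FLAG_WHITEOUT set, so a cache
// tier can record a deletion without immediately propagating it to
// the base tier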
7865 bool whiteout = false;
7866 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
7867 && !no_whiteout
7868 && !try_no_whiteout) {
7869 whiteout = true;
7870 }
7871
7872 // in luminous or later, we can't delete the head if there are
7873 // clones. we trust the caller passing no_whiteout has already
7874 // verified they don't exist.
7875 if (!snapset.clones.empty() ||
7876 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
7877 if (no_whiteout) {
7878 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
7879 << dendl;
7880 } else {
7881 dout(20) << __func__ << " has or will have clones; will whiteout"
7882 << dendl;
7883 whiteout = true;
7884 }
7885 }
7886 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
7887 << " no_whiteout=" << (int)no_whiteout
7888 << " try_no_whiteout=" << (int)try_no_whiteout
7889 << dendl;
7890 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
7891 return -ENOENT;
7892
7893 t->remove(soid);
7894
7895 if (oi.size > 0) {
7896 interval_set<uint64_t> ch;
7897 ch.insert(0, oi.size);
7898 ctx->modified_ranges.union_of(ch);
7899 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7900 }
7901
7902 ctx->clean_regions.mark_omap_dirty();
7903 ctx->delta_stats.num_wr++;
7904 if (soid.is_snap()) {
7905 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7906 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
7907 } else {
7908 ctx->delta_stats.num_bytes -= oi.size;
7909 }
7910 oi.size = 0;
7911 oi.new_object();
7912
7913 // disconnect all watchers
7914 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
7915 oi.watchers.begin();
7916 p != oi.watchers.end();
7917 ++p) {
7918 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
7919 ctx->watch_disconnects.push_back(
7920 watch_disconnect_t(p->first.first, p->first.second, true));
7921 }
7922 oi.watchers.clear();
7923
7924 if (whiteout) {
7925 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
7926 oi.set_flag(object_info_t::FLAG_WHITEOUT);
7927 ctx->delta_stats.num_whiteouts++;
7928 t->create(soid);
7929 osd->logger->inc(l_osd_tier_whiteout);
7930 return 0;
7931 }
7932
7933 // delete the head
7934 ctx->delta_stats.num_objects--;
7935 if (soid.is_snap())
7936 ctx->delta_stats.num_object_clones--;
7937 if (oi.is_whiteout()) {
7938 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
7939 ctx->delta_stats.num_whiteouts--;
7940 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
7941 }
7942 if (oi.is_cache_pinned()) {
7943 ctx->delta_stats.num_objects_pinned--;
7944 }
7945 if (oi.has_manifest()) {
7946 ctx->delta_stats.num_objects_manifest--;
7947 }
7948 obs.exists = false;
7949 return 0;
7950 }
7951
7952 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7953 {
7954 SnapSet& snapset = ctx->new_snapset;
7955 ObjectState& obs = ctx->new_obs;
7956 object_info_t& oi = obs.oi;
7957 const hobject_t& soid = oi.soid;
7958 PGTransaction* t = ctx->op_t.get();
7959 snapid_t snapid = (uint64_t)op.snap.snapid;
7960 hobject_t missing_oid;
7961
7962 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7963
7964 ObjectContextRef rollback_to;
7965
7966 int ret = find_object_context(
7967 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7968 soid.get_namespace()),
7969 &rollback_to, false, false, &missing_oid);
7970 if (ret == -EAGAIN) {
7971 /* clone must be missing */
7972 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
7973 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7974 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7975 block_write_on_degraded_snap(missing_oid, ctx->op);
7976 return ret;
7977 }
7978 {
7979 ObjectContextRef promote_obc;
7980 cache_result_t tier_mode_result;
7981 if (obs.exists && obs.oi.has_manifest()) {
7982 tier_mode_result =
7983 maybe_handle_manifest_detail(
7984 ctx->op,
7985 true,
7986 rollback_to);
7987 } else {
7988 tier_mode_result =
7989 maybe_handle_cache_detail(
7990 ctx->op,
7991 true,
7992 rollback_to,
7993 ret,
7994 missing_oid,
7995 true,
7996 false,
7997 &promote_obc);
7998 }
7999 switch (tier_mode_result) {
8000 case cache_result_t::NOOP:
8001 break;
8002 case cache_result_t::BLOCKED_PROMOTE:
8003 ceph_assert(promote_obc);
8004 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8005 return -EAGAIN;
8006 case cache_result_t::BLOCKED_FULL:
8007 block_write_on_full_cache(soid, ctx->op);
8008 return -EAGAIN;
8009 case cache_result_t::REPLIED_WITH_EAGAIN:
8010 ceph_abort_msg("this can't happen, no rollback on replica");
8011 default:
8012 ceph_abort_msg("must promote was set, other values are not valid");
8013 return -EAGAIN;
8014 }
8015 }
8016
8017 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8018 // there's no snapshot here, or there's no object.
8019 // if there's no snapshot, we delete the object; otherwise, do nothing.
8020 dout(20) << "_rollback_to deleting head on " << soid.oid
8021 << " because got ENOENT|whiteout on find_object_context" << dendl;
8022 if (ctx->obc->obs.oi.watchers.size()) {
8023 // Cannot delete an object with watchers
8024 ret = -EBUSY;
8025 } else {
8026 _delete_oid(ctx, false, false);
8027 ret = 0;
8028 }
8029 } else if (ret) {
8030 // huh? find_object_context *can't* return anything else at time of writing.
8031 ceph_abort_msg("unexpected error code in _rollback_to");
8032 } else { //we got our context, let's use it to do the rollback!
8033 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8034 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8035 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8036 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8037 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8038 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8039 ret = -EAGAIN;
8040 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8041 // rolling back to the head; we just need to clone it.
8042 ctx->modify = true;
8043 } else {
8044 /* 1) Delete current head
8045 * 2) Clone correct snapshot into head
8046 * 3) Calculate clone_overlaps by following overlaps
8047 * forward from rollback snapshot */
8048 dout(10) << "_rollback_to deleting " << soid.oid
8049 << " and rolling back to old snap" << dendl;
8050
8051 if (obs.exists) {
8052 t->remove(soid);
8053 }
8054 t->clone(soid, rollback_to_sobject);
8055 t->add_obc(rollback_to);
8056
8057 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8058 snapset.clone_overlap.lower_bound(snapid);
8059 ceph_assert(iter != snapset.clone_overlap.end());
8060 interval_set<uint64_t> overlaps = iter->second;
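// intersect the overlaps of every clone from the rollback target
// forward; what survives is the set of extents that are identical
// between the rollback snap and the current head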
8061 for ( ;
8062 iter != snapset.clone_overlap.end();
8063 ++iter)
8064 overlaps.intersection_of(iter->second);
8065
8066 if (obs.oi.size > 0) {
8067 interval_set<uint64_t> modified;
8068 modified.insert(0, obs.oi.size);
8069 overlaps.intersection_of(modified);
8070 modified.subtract(overlaps);
8071 ctx->modified_ranges.union_of(modified);
8072 }
8073
8074 // Adjust the cached objectcontext
8075 maybe_create_new_object(ctx, true);
8076 ctx->delta_stats.num_bytes -= obs.oi.size;
8077 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8078 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8079 ctx->clean_regions.mark_omap_dirty();
8080 obs.oi.size = rollback_to->obs.oi.size;
8081 if (rollback_to->obs.oi.is_data_digest())
8082 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8083 else
8084 obs.oi.clear_data_digest();
8085 if (rollback_to->obs.oi.is_omap_digest())
8086 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8087 else
8088 obs.oi.clear_omap_digest();
8089
8090 if (rollback_to->obs.oi.is_omap()) {
8091 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8092 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8093 } else {
8094 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8095 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8096 }
8097 }
8098 }
8099 return ret;
8100 }
8101
8102 void PrimaryLogPG::_make_clone(
8103 OpContext *ctx,
8104 PGTransaction* t,
8105 ObjectContextRef obc,
8106 const hobject_t& head, const hobject_t& coid,
8107 object_info_t *poi)
8108 {
8109 bufferlist bv;
8110 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8111
8112 t->clone(coid, head);
8113 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8114 rmattr_maybe_cache(obc, t, SS_ATTR);
8115 }
8116
8117 void PrimaryLogPG::make_writeable(OpContext *ctx)
8118 {
8119 const hobject_t& soid = ctx->obs->oi.soid;
8120 SnapContext& snapc = ctx->snapc;
8121
8122 // clone?
8123 ceph_assert(soid.snap == CEPH_NOSNAP);
8124 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8125 << " snapc=" << snapc << dendl;
8126
8127 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8128 if (ctx->new_obs.exists) {
8129 // we will mark the object dirty
8130 if (ctx->undirty && was_dirty) {
8131 dout(20) << " clearing DIRTY flag" << dendl;
8132 ceph_assert(ctx->new_obs.oi.is_dirty());
8133 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8134 --ctx->delta_stats.num_objects_dirty;
8135 osd->logger->inc(l_osd_tier_clean);
8136 } else if (!was_dirty && !ctx->undirty) {
8137 dout(20) << " setting DIRTY flag" << dendl;
8138 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8139 ++ctx->delta_stats.num_objects_dirty;
8140 osd->logger->inc(l_osd_tier_dirty);
8141 }
8142 } else {
8143 if (was_dirty) {
8144 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8145 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8146 --ctx->delta_stats.num_objects_dirty;
8147 }
8148 }
8149
8150 if ((ctx->new_obs.exists &&
8151 ctx->new_obs.oi.is_omap()) &&
8152 (!ctx->obc->obs.exists ||
8153 !ctx->obc->obs.oi.is_omap())) {
8154 ++ctx->delta_stats.num_objects_omap;
8155 }
8156 if ((!ctx->new_obs.exists ||
8157 !ctx->new_obs.oi.is_omap()) &&
8158 (ctx->obc->obs.exists &&
8159 ctx->obc->obs.oi.is_omap())) {
8160 --ctx->delta_stats.num_objects_omap;
8161 }
8162
8163 if (ctx->new_snapset.seq > snapc.seq) {
8164 dout(10) << " op snapset is old" << dendl;
8165 }
8166
8167 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8168 snapc.snaps.size() && // there are snaps
8169 !ctx->cache_evict &&
8170 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8171 // clone
8172 hobject_t coid = soid;
8173 coid.snap = snapc.seq;
8174
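// count how many snaps in the snap context are newer than the
// snapset's seq; exactly those snap ids belong to the new clone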
8175 unsigned l;
8176 for (l = 1;
8177 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8178 l++) ;
8179
8180 vector<snapid_t> snaps(l);
8181 for (unsigned i=0; i<l; i++)
8182 snaps[i] = snapc.snaps[i];
8183
8184 // prepare clone
8185 object_info_t static_snap_oi(coid);
8186 object_info_t *snap_oi;
8187 if (is_primary()) {
8188 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8189 ctx->clone_obc->destructor_callback =
8190 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8191 ctx->clone_obc->obs.oi = static_snap_oi;
8192 ctx->clone_obc->obs.exists = true;
8193 ctx->clone_obc->ssc = ctx->obc->ssc;
8194 ctx->clone_obc->ssc->ref++;
8195 if (pool.info.is_erasure())
8196 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8197 snap_oi = &ctx->clone_obc->obs.oi;
8198 bool got = ctx->lock_manager.get_write_greedy(
8199 coid,
8200 ctx->clone_obc,
8201 ctx->op);
8202 ceph_assert(got);
8203 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8204 } else {
8205 snap_oi = &static_snap_oi;
8206 }
8207 snap_oi->version = ctx->at_version;
8208 snap_oi->prior_version = ctx->obs->oi.version;
8209 snap_oi->copy_user_bits(ctx->obs->oi);
8210
8211 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8212
8213 ctx->delta_stats.num_objects++;
8214 if (snap_oi->is_dirty()) {
8215 ctx->delta_stats.num_objects_dirty++;
8216 osd->logger->inc(l_osd_tier_dirty);
8217 }
8218 if (snap_oi->is_omap())
8219 ctx->delta_stats.num_objects_omap++;
8220 if (snap_oi->is_cache_pinned())
8221 ctx->delta_stats.num_objects_pinned++;
8222 if (snap_oi->has_manifest())
8223 ctx->delta_stats.num_objects_manifest++;
8224 ctx->delta_stats.num_object_clones++;
8225 ctx->new_snapset.clones.push_back(coid.snap);
8226 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8227 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8228
8229 // clone_overlap should contain an entry for each clone
8230 // (an empty interval_set if there is no overlap)
8231 ctx->new_snapset.clone_overlap[coid.snap];
8232 if (ctx->obs->oi.size)
8233 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8234
8235 // log clone
8236 dout(10) << " cloning v " << ctx->obs->oi.version
8237 << " to " << coid << " v " << ctx->at_version
8238 << " snaps=" << snaps
8239 << " snapset=" << ctx->new_snapset << dendl;
8240 ctx->log.push_back(pg_log_entry_t(
8241 pg_log_entry_t::CLONE, coid, ctx->at_version,
8242 ctx->obs->oi.version,
8243 ctx->obs->oi.user_version,
8244 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8245 encode(snaps, ctx->log.back().snaps);
8246
8247 ctx->at_version.version++;
8248 }
8249
8250 // update most recent clone_overlap and usage stats
8251 if (ctx->new_snapset.clones.size() > 0) {
8252 // the clone_overlap is the difference in range between the head and
8253 // the clones. we need to check whether the most recent clone exists:
8254 // if it has been evicted it is not counted in the stats, but its
8255 // clone_overlap still exists in the snapset, so we must update the
8256 // clone_overlap to keep it consistent.
8257 hobject_t last_clone_oid = soid;
8258 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8259 interval_set<uint64_t> &newest_overlap =
8260 ctx->new_snapset.clone_overlap.rbegin()->second;
8261 ctx->modified_ranges.intersection_of(newest_overlap);
8262 if (is_present_clone(last_clone_oid)) {
8263 // modified_ranges is still in use by the clone
8264 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8265 }
8266 newest_overlap.subtract(ctx->modified_ranges);
8267 }
8268
8269 if (snapc.seq > ctx->new_snapset.seq) {
8270 // update snapset with latest snap context
8271 ctx->new_snapset.seq = snapc.seq;
8272 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8273 ctx->new_snapset.snaps = snapc.snaps;
8274 } else {
8275 ctx->new_snapset.snaps.clear();
8276 }
8277 }
8278 dout(20) << "make_writeable " << soid
8279 << " done, snapset=" << ctx->new_snapset << dendl;
8280 }
8281
8282
8283 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8284 interval_set<uint64_t>& modified, uint64_t offset,
8285 uint64_t length, bool write_full)
8286 {
8287 interval_set<uint64_t> ch;
8288 if (write_full) {
8289 if (oi.size)
8290 ch.insert(0, oi.size);
8291 } else if (length)
8292 ch.insert(offset, length);
8293 modified.union_of(ch);
8294 if (write_full ||
8295 (offset + length > oi.size && length)) {
8296 uint64_t new_size = offset + length;
8297 delta_stats.num_bytes -= oi.size;
8298 delta_stats.num_bytes += new_size;
8299 oi.size = new_size;
8300 }
8301
8302 if (oi.has_manifest() && oi.manifest.is_chunked()) {
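// any chunk overlapping the written range is no longer missing and
// is now dirty, so a later flush knows it must be written back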
8303 for (auto &p : oi.manifest.chunk_map) {
8304 if ((p.first <= offset && p.first + p.second.length > offset) ||
8305 (p.first > offset && p.first < offset + length)) {
8306 p.second.clear_flag(chunk_info_t::FLAG_MISSING);
8307 p.second.set_flag(chunk_info_t::FLAG_DIRTY);
8308 }
8309 }
8310 }
8311 delta_stats.num_wr++;
8312 delta_stats.num_wr_kb += shift_round_up(length, 10);
8313 }
8314
8315 void PrimaryLogPG::truncate_update_size_and_usage(
8316 object_stat_sum_t& delta_stats,
8317 object_info_t& oi,
8318 uint64_t truncate_size)
8319 {
8320 if (oi.size != truncate_size) {
8321 delta_stats.num_bytes -= oi.size;
8322 delta_stats.num_bytes += truncate_size;
8323 oi.size = truncate_size;
8324 }
8325 }
8326
8327 void PrimaryLogPG::complete_disconnect_watches(
8328 ObjectContextRef obc,
8329 const list<watch_disconnect_t> &to_disconnect)
8330 {
8331 for (list<watch_disconnect_t>::const_iterator i =
8332 to_disconnect.begin();
8333 i != to_disconnect.end();
8334 ++i) {
8335 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8336 auto watchers_entry = obc->watchers.find(watcher);
8337 if (watchers_entry != obc->watchers.end()) {
8338 WatchRef watch = watchers_entry->second;
8339 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8340 obc->watchers.erase(watcher);
8341 watch->remove(i->send_disconnect);
8342 } else {
8343 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8344 << watcher << dendl;
8345 }
8346 }
8347 }
8348
8349 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8350 {
8351 entity_name_t entity = ctx->reqid.name;
8352 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8353
8354 // disconnects first
8355 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8356
8357 ceph_assert(conn);
8358
8359 auto session = conn->get_priv();
8360 if (!session)
8361 return;
8362
8363 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8364 i != ctx->watch_connects.end();
8365 ++i) {
8366 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8367 dout(15) << "do_osd_op_effects applying watch connect on session "
8368 << session.get() << " watcher " << watcher << dendl;
8369 WatchRef watch;
8370 if (ctx->obc->watchers.count(watcher)) {
8371 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8372 << dendl;
8373 watch = ctx->obc->watchers[watcher];
8374 } else {
8375 dout(15) << "do_osd_op_effects new watcher " << watcher
8376 << dendl;
8377 watch = Watch::makeWatchRef(
8378 this, osd, ctx->obc, i->first.timeout_seconds,
8379 i->first.cookie, entity, conn->get_peer_addr());
8380 ctx->obc->watchers.insert(
8381 make_pair(
8382 watcher,
8383 watch));
8384 }
8385 watch->connect(conn, i->second);
8386 }
8387
8388 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8389 p != ctx->notifies.end();
8390 ++p) {
8391 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8392 ConnectionRef conn(ctx->op->get_req()->get_connection());
8393 NotifyRef notif(
8394 Notify::makeNotifyRef(
8395 conn,
8396 ctx->reqid.name.num(),
8397 p->bl,
8398 p->timeout,
8399 p->cookie,
8400 p->notify_id,
8401 ctx->obc->obs.oi.user_version,
8402 osd));
8403 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8404 ctx->obc->watchers.begin();
8405 i != ctx->obc->watchers.end();
8406 ++i) {
8407 dout(10) << "starting notify on watch " << i->first << dendl;
8408 i->second->start_notify(notif);
8409 }
8410 notif->init();
8411 }
8412
8413 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8414 p != ctx->notify_acks.end();
8415 ++p) {
8416 if (p->watch_cookie)
8417 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8418 else
8419 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8420 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8421 ctx->obc->watchers.begin();
8422 i != ctx->obc->watchers.end();
8423 ++i) {
8424 if (i->first.second != entity) continue;
8425 if (p->watch_cookie &&
8426 *(p->watch_cookie) != i->first.first) continue;
8427 dout(10) << "acking notify on watch " << i->first << dendl;
8428 i->second->notify_ack(p->notify_id, p->reply_bl);
8429 }
8430 }
8431 }
8432
8433 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8434 {
8435 ostringstream ss;
8436 ss << "temp_" << info.pgid << "_" << get_role()
8437 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8438 hobject_t hoid = target.make_temp_hobject(ss.str());
8439 dout(20) << __func__ << " " << hoid << dendl;
8440 return hoid;
8441 }
8442
8443 hobject_t PrimaryLogPG::get_temp_recovery_object(
8444 const hobject_t& target,
8445 eversion_t version)
8446 {
8447 ostringstream ss;
8448 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8449 << "_" << version
8450 << "_" << info.history.same_interval_since
8451 << "_" << target.snap;
8452 // pgid + version + interval + snapid is unique, and short
8453 hobject_t hoid = target.make_temp_hobject(ss.str());
8454 dout(20) << __func__ << " " << hoid << dendl;
8455 return hoid;
8456 }
8457
8458 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8459 {
8460 ceph_assert(!ctx->ops->empty());
8461
8462 // valid snap context?
8463 if (!ctx->snapc.is_valid()) {
8464 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8465 return -EINVAL;
8466 }
8467
8468 // prepare the actual mutation
8469 int result = do_osd_ops(ctx, *ctx->ops);
8470 if (result < 0) {
8471 if (ctx->op->may_write() &&
8472 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8473 // need to save the error code in the pg log, to detect dup ops,
8474 // but do nothing else
8475 ctx->update_log_only = true;
8476 }
8477 return result;
8478 }
8479
8480 // read-op? write-op noop? done?
8481 if (ctx->op_t->empty() && !ctx->modify) {
8482 if (ctx->pending_async_reads.empty())
8483 unstable_stats.add(ctx->delta_stats);
8484 if (ctx->op->may_write() &&
8485 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8486 ctx->update_log_only = true;
8487 }
8488 return result;
8489 }
8490
8491 // check for full
8492 if ((ctx->delta_stats.num_bytes > 0 ||
8493 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8494 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8495 auto m = ctx->op->get_req<MOSDOp>();
8496 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8497 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8498 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8499 << dendl;
8500 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8501 // they tried, they failed.
8502 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8503 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8504 } else {
8505 // drop request
8506 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8507 return -EAGAIN;
8508 }
8509 }
8510
8511 const hobject_t& soid = ctx->obs->oi.soid;
8512 // clone, if necessary
8513 if (soid.snap == CEPH_NOSNAP)
8514 make_writeable(ctx);
8515
8516 finish_ctx(ctx,
8517 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8518 pg_log_entry_t::DELETE,
8519 result);
8520
8521 return result;
8522 }
8523
8524 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8525 {
8526 const hobject_t& soid = ctx->obs->oi.soid;
8527 dout(20) << __func__ << " " << soid << " " << ctx
8528 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8529 << dendl;
8530 utime_t now = ceph_clock_now();
8531
8532 // finish and log the op.
8533 if (ctx->user_modify) {
8534 // update the user_version for any modify ops, except for the watch op
8535 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8536 /* In order for new clients and old clients to interoperate properly
8537 * when exchanging versions, we need to lower bound the user_version
8538 * (which our new clients pay proper attention to)
8539 * by the at_version (which is all the old clients can ever see). */
8540 if (ctx->at_version.version > ctx->user_at_version)
8541 ctx->user_at_version = ctx->at_version.version;
8542 ctx->new_obs.oi.user_version = ctx->user_at_version;
8543 }
8544 ctx->bytes_written = ctx->op_t->get_bytes_written();
8545
8546 if (ctx->new_obs.exists) {
8547 ctx->new_obs.oi.version = ctx->at_version;
8548 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8549 ctx->new_obs.oi.last_reqid = ctx->reqid;
8550 if (ctx->mtime != utime_t()) {
8551 ctx->new_obs.oi.mtime = ctx->mtime;
8552 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8553 ctx->new_obs.oi.local_mtime = now;
8554 } else {
8555 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8556 }
8557
8558 // object_info_t
8559 map <string, bufferlist> attrs;
8560 bufferlist bv(sizeof(ctx->new_obs.oi));
8561 encode(ctx->new_obs.oi, bv,
8562 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8563 attrs[OI_ATTR].claim(bv);
8564
8565 // snapset
8566 if (soid.snap == CEPH_NOSNAP) {
8567 dout(10) << " final snapset " << ctx->new_snapset
8568 << " in " << soid << dendl;
8569 bufferlist bss;
8570 encode(ctx->new_snapset, bss);
8571 attrs[SS_ATTR].claim(bss);
8572 } else {
8573 dout(10) << " no snapset (this is a clone)" << dendl;
8574 }
8575 ctx->op_t->setattrs(soid, attrs);
8576 } else {
8577 // reset cached oi
8578 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8579 }
8580
8581 // append to log
8582 ctx->log.push_back(
8583 pg_log_entry_t(log_op_type, soid, ctx->at_version,
8584 ctx->obs->oi.version,
8585 ctx->user_at_version, ctx->reqid,
8586 ctx->mtime,
8587 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8588 if (ctx->op && ctx->op->allows_returnvec()) {
8589 // also the per-op values
8590 ctx->log.back().set_op_returns(*ctx->ops);
8591 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8592 << dendl;
8593 }
8594
8595 ctx->log.back().clean_regions = ctx->clean_regions;
8596 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8597
8598 if (soid.snap < CEPH_NOSNAP) {
8599 switch (log_op_type) {
8600 case pg_log_entry_t::MODIFY:
8601 case pg_log_entry_t::PROMOTE:
8602 case pg_log_entry_t::CLEAN:
8603 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8604 << dendl;
8605 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8606 break;
8607 default:
8608 break;
8609 }
8610 }
8611
8612 if (!ctx->extra_reqids.empty()) {
8613 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
8614 << ctx->extra_reqid_return_codes << dendl;
8615 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
8616 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
8617 }
8618
8619 // apply new object state.
8620 ctx->obc->obs = ctx->new_obs;
8621
8622 if (soid.is_head() && !ctx->obc->obs.exists) {
8623 ctx->obc->ssc->exists = false;
8624 ctx->obc->ssc->snapset = SnapSet();
8625 } else {
8626 ctx->obc->ssc->exists = true;
8627 ctx->obc->ssc->snapset = ctx->new_snapset;
8628 }
8629 }
8630
8631 void PrimaryLogPG::apply_stats(
8632 const hobject_t &soid,
8633 const object_stat_sum_t &delta_stats) {
8634
8635 recovery_state.apply_op_stats(soid, delta_stats);
8636 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
8637 i != get_backfill_targets().end();
8638 ++i) {
8639 pg_shard_t bt = *i;
8640 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
8641 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
8642 pending_backfill_updates[soid].stats.add(delta_stats);
8643 }
8644 }
8645
8646 if (is_primary() && scrubber.active) {
8647 if (soid < scrubber.start) {
8648 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
8649 << "," << scrubber.end << ")" << dendl;
8650 scrub_cstat.add(delta_stats);
8651 } else {
8652 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
8653 << "," << scrubber.end << ")" << dendl;
8654 }
8655 }
8656 }
8657
8658 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8659 {
8660 auto m = ctx->op->get_req<MOSDOp>();
8661 ceph_assert(ctx->async_reads_complete());
8662
8663 for (vector<OSDOp>::iterator p = ctx->ops->begin();
8664 p != ctx->ops->end() && result >= 0; ++p) {
8665 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8666 result = p->rval;
8667 break;
8668 }
8669 ctx->bytes_read += p->outdata.length();
8670 }
8671 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
8672
8673 MOSDOpReply *reply = ctx->reply;
8674 ctx->reply = nullptr;
8675
8676 if (result >= 0) {
8677 if (!ctx->ignore_log_op_stats) {
8678 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8679
8680 publish_stats_to_osd();
8681 }
8682
8683 // on read, return the current object version
8684 if (ctx->obs) {
8685 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8686 } else {
8687 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8688 }
8689 } else if (result == -ENOENT) {
8690 // on ENOENT, set a floor for what the next user version will be.
8691 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8692 }
8693
8694 reply->set_result(result);
8695 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8696 osd->send_message_osd_client(reply, m->get_connection());
8697 close_op_ctx(ctx);
8698 }
8699
8700 // ========================================================================
8701 // copyfrom
8702
8703 struct C_Copyfrom : public Context {
8704 PrimaryLogPGRef pg;
8705 hobject_t oid;
8706 epoch_t last_peering_reset;
8707 ceph_tid_t tid;
8708 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8709 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8710 const PrimaryLogPG::CopyOpRef& c)
8711 : pg(p), oid(o), last_peering_reset(lpr),
8712 tid(0), cop(c)
8713 {}
8714 void finish(int r) override {
8715 if (r == -ECANCELED)
8716 return;
8717 std::scoped_lock l{*pg};
8718 if (last_peering_reset == pg->get_last_peering_reset()) {
8719 pg->process_copy_chunk(oid, tid, r);
8720 cop.reset();
8721 }
8722 }
8723 };
8724
8725 struct C_CopyFrom_AsyncReadCb : public Context {
8726 OSDOp *osd_op;
8727 object_copy_data_t reply_obj;
8728 uint64_t features;
8729 size_t len;
8730 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8731 osd_op(osd_op), features(features), len(0) {}
8732 void finish(int r) override {
8733 osd_op->rval = r;
8734 if (r < 0) {
8735 return;
8736 }
8737
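// trim the async read result down to the length actually requested
// before encoding the reply object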
8738 ceph_assert(len > 0);
8739 ceph_assert(len <= reply_obj.data.length());
8740 bufferlist bl;
8741 bl.substr_of(reply_obj.data, 0, len);
8742 reply_obj.data.swap(bl);
8743 encode(reply_obj, osd_op->outdata, features);
8744 }
8745 };
8746
8747 struct C_CopyChunk : public Context {
8748 PrimaryLogPGRef pg;
8749 hobject_t oid;
8750 epoch_t last_peering_reset;
8751 ceph_tid_t tid;
8752 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8753 uint64_t offset = 0;
8754 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8755 const PrimaryLogPG::CopyOpRef& c)
8756 : pg(p), oid(o), last_peering_reset(lpr),
8757 tid(0), cop(c)
8758 {}
8759 void finish(int r) override {
8760 if (r == -ECANCELED)
8761 return;
8762 std::scoped_lock l{*pg};
8763 if (last_peering_reset == pg->get_last_peering_reset()) {
8764 pg->process_copy_chunk_manifest(oid, tid, r, offset);
8765 cop.reset();
8766 }
8767 }
8768 };
8769
8770 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
8771 OSDOp& osd_op, ObjectContextRef &obc)
8772 {
8773 object_info_t& oi = obc->obs.oi;
8774 hobject_t& soid = oi.soid;
8775 int result = 0;
8776 object_copy_cursor_t cursor;
8777 uint64_t out_max;
8778 try {
8779 decode(cursor, bp);
8780 decode(out_max, bp);
8781 }
8782 catch (buffer::error& e) {
8783 result = -EINVAL;
8784 return result;
8785 }
8786
8787 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8788 uint64_t features = op->get_features();
8789
8790 bool async_read_started = false;
8791 object_copy_data_t _reply_obj;
8792 C_CopyFrom_AsyncReadCb *cb = nullptr;
8793 if (pool.info.is_erasure()) {
8794 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8795 }
8796 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8797 // size, mtime
8798 reply_obj.size = oi.size;
8799 reply_obj.mtime = oi.mtime;
8800 ceph_assert(obc->ssc);
8801 if (soid.snap < CEPH_NOSNAP) {
8802 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8803 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8804 reply_obj.snaps = p->second;
8805 } else {
8806 reply_obj.snap_seq = obc->ssc->snapset.seq;
8807 }
8808 if (oi.is_data_digest()) {
8809 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8810 reply_obj.data_digest = oi.data_digest;
8811 }
8812 if (oi.is_omap_digest()) {
8813 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8814 reply_obj.omap_digest = oi.omap_digest;
8815 }
8816 reply_obj.truncate_seq = oi.truncate_seq;
8817 reply_obj.truncate_size = oi.truncate_size;
8818
8819 // attrs
8820 map<string,bufferlist>& out_attrs = reply_obj.attrs;
8821 if (!cursor.attr_complete) {
8822 result = getattrs_maybe_cache(
8823 ctx->obc,
8824 &out_attrs);
8825 if (result < 0) {
8826 if (cb) {
8827 delete cb;
8828 }
8829 return result;
8830 }
8831 cursor.attr_complete = true;
8832 dout(20) << " got attrs" << dendl;
8833 }
8834
8835 int64_t left = out_max - osd_op.outdata.length();
8836
8837 // data
8838 bufferlist& bl = reply_obj.data;
8839 if (left > 0 && !cursor.data_complete) {
8840 if (cursor.data_offset < oi.size) {
8841 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
8842 if (cb) {
8843 async_read_started = true;
8844 ctx->pending_async_reads.push_back(
8845 make_pair(
8846 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
8847 make_pair(&bl, cb)));
8848 cb->len = max_read;
8849
8850 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8851 new ReadFinisher(osd_op));
8852 result = -EINPROGRESS;
8853
8854 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
8855 } else {
8856 result = pgbackend->objects_read_sync(
8857 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
8858 if (result < 0)
8859 return result;
8860 }
8861 left -= max_read;
8862 cursor.data_offset += max_read;
8863 }
8864 if (cursor.data_offset == oi.size) {
8865 cursor.data_complete = true;
8866 dout(20) << " got data" << dendl;
8867 }
8868 ceph_assert(cursor.data_offset <= oi.size);
8869 }
8870
8871 // omap
8872 uint32_t omap_keys = 0;
8873 if (!pool.info.supports_omap() || !oi.is_omap()) {
8874 cursor.omap_complete = true;
8875 } else {
8876 if (left > 0 && !cursor.omap_complete) {
8877 ceph_assert(cursor.data_complete);
8878 if (cursor.omap_offset.empty()) {
8879 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
8880 &reply_obj.omap_header);
8881 }
8882 bufferlist omap_data;
8883 ObjectMap::ObjectMapIterator iter =
8884 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
8885 ceph_assert(iter);
8886 iter->upper_bound(cursor.omap_offset);
8887 for (; iter->valid(); iter->next()) {
8888 ++omap_keys;
8889 encode(iter->key(), omap_data);
8890 encode(iter->value(), omap_data);
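// account for the 4-byte length headers that encode() prepends to
// each key and value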
8891 left -= iter->key().length() + 4 + iter->value().length() + 4;
8892 if (left <= 0)
8893 break;
8894 }
8895 if (omap_keys) {
8896 encode(omap_keys, reply_obj.omap_data);
8897 reply_obj.omap_data.claim_append(omap_data);
8898 }
8899 if (iter->valid()) {
8900 cursor.omap_offset = iter->key();
8901 } else {
8902 cursor.omap_complete = true;
8903 dout(20) << " got omap" << dendl;
8904 }
8905 }
8906 }
8907
8908 if (cursor.is_complete()) {
8909 // include reqids only in the final step. this is a bit fragile
8910 // but it works...
8911 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
8912 &reply_obj.reqids,
8913 &reply_obj.reqid_return_codes);
8914 dout(20) << " got reqids" << dendl;
8915 }
8916
8917 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8918 << " " << out_attrs.size() << " attrs"
8919 << " " << bl.length() << " bytes"
8920 << " " << reply_obj.omap_header.length() << " omap header bytes"
8921 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8922 << omap_keys << " keys"
8923 << " " << reply_obj.reqids.size() << " reqids"
8924 << dendl;
8925 reply_obj.cursor = cursor;
8926 if (!async_read_started) {
8927 encode(reply_obj, osd_op.outdata, features);
8928 }
8929 if (cb && !async_read_started) {
8930 delete cb;
8931 }
8932
8933 if (result > 0) {
8934 result = 0;
8935 }
8936 return result;
8937 }
8938
8939 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8940 OSDOp& osd_op)
8941 {
8942 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
8943 uint64_t features = m->get_features();
8944 object_copy_data_t reply_obj;
8945
8946 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
8947 &reply_obj.reqid_return_codes);
8948 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8949 encode(reply_obj, osd_op.outdata, features);
8950 osd_op.rval = -ENOENT;
8951 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
8952 reply->set_result(-ENOENT);
8953 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8954 osd->send_message_osd_client(reply, m->get_connection());
8955 }
8956
8957 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8958 hobject_t src, object_locator_t oloc,
8959 version_t version, unsigned flags,
8960 bool mirror_snapset,
8961 unsigned src_obj_fadvise_flags,
8962 unsigned dest_obj_fadvise_flags)
8963 {
8964 const hobject_t& dest = obc->obs.oi.soid;
8965 dout(10) << __func__ << " " << dest
8966 << " from " << src << " " << oloc << " v" << version
8967 << " flags " << flags
8968 << (mirror_snapset ? " mirror_snapset" : "")
8969 << dendl;
8970
8971 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
8972
8973 // cancel a previous in-progress copy?
8974 if (copy_ops.count(dest)) {
8975 // FIXME: if the src etc match, we could avoid restarting from the
8976 // beginning.
8977 CopyOpRef cop = copy_ops[dest];
8978 vector<ceph_tid_t> tids;
8979 cancel_copy(cop, false, &tids);
8980 osd->objecter->op_cancel(tids, -ECANCELED);
8981 }
8982
8983 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8984 mirror_snapset, src_obj_fadvise_flags,
8985 dest_obj_fadvise_flags));
8986 copy_ops[dest] = cop;
8987 obc->start_block();
8988
8989 if (!obc->obs.oi.has_manifest()) {
8990 _copy_some(obc, cop);
8991 } else {
8992 if (obc->obs.oi.manifest.is_redirect()) {
8993 _copy_some(obc, cop);
8994 } else if (obc->obs.oi.manifest.is_chunked()) {
8995 auto p = obc->obs.oi.manifest.chunk_map.begin();
8996 _copy_some_manifest(obc, cop, p->first);
8997 } else {
8998 ceph_abort_msg("unrecognized manifest type");
8999 }
9000 }
9001 }
9002
9003 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9004 {
9005 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9006
9007 unsigned flags = 0;
9008 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9009 flags |= CEPH_OSD_FLAG_FLUSH;
9010 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9011 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9012 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9013 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9014 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9015 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9016 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9017 flags |= CEPH_OSD_FLAG_RWORDERED;
9018
9019 C_GatherBuilder gather(cct);
9020
9021 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9022 // list snaps too.
9023 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9024 ObjectOperation op;
9025 op.list_snaps(&cop->results.snapset, NULL);
9026 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9027 CEPH_SNAPDIR, NULL,
9028 flags, gather.new_sub(), NULL);
9029 cop->objecter_tid2 = tid;
9030 }
9031
9032 ObjectOperation op;
9033 if (cop->results.user_version) {
9034 op.assert_version(cop->results.user_version);
9035 } else {
9036 // we should learn the version after the first chunk, if we didn't know
9037 // it already!
9038 ceph_assert(cop->cursor.is_initial());
9039 }
9040 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9041 &cop->results.object_size, &cop->results.mtime,
9042 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9043 &cop->results.snaps, &cop->results.snap_seq,
9044 &cop->results.flags,
9045 &cop->results.source_data_digest,
9046 &cop->results.source_omap_digest,
9047 &cop->results.reqids,
9048 &cop->results.reqid_return_codes,
9049 &cop->results.truncate_seq,
9050 &cop->results.truncate_size,
9051 &cop->rval);
9052 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9053
9054 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9055 get_last_peering_reset(), cop);
9056 gather.set_finisher(new C_OnFinisher(fin,
9057 osd->get_objecter_finisher(get_pg_shard())));
9058
9059 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9060 cop->src.snap, NULL,
9061 flags,
9062 gather.new_sub(),
9063 // discover the object version if we don't know it yet
9064 cop->results.user_version ? NULL : &cop->results.user_version);
9065 fin->tid = tid;
9066 cop->objecter_tid = tid;
9067 gather.activate();
9068 }
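
// Worked example (sizes assumed for illustration): with a copy chunk
// size of 8 MiB and a 20 MiB source object, the copy-get above fetches
// the first 8 MiB; process_copy_chunk() stages it and calls back into
// _copy_some(), so the whole object arrives after three round trips
// (8 + 8 + 4 MiB), with cop->cursor carrying the resume position.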
9069
9070 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9071 {
9072 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9073
9074 unsigned flags = 0;
9075 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9076 flags |= CEPH_OSD_FLAG_FLUSH;
9077 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9078 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9079 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9080 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9081 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9082 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9083 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9084 flags |= CEPH_OSD_FLAG_RWORDERED;
9085
9086 int num_chunks = 0;
9087 uint64_t last_offset = 0, chunks_size = 0;
9088 object_manifest_t *manifest = &obc->obs.oi.manifest;
9089 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9090 for (;iter != manifest->chunk_map.end(); ++iter) {
9091 num_chunks++;
9092 chunks_size += iter->second.length;
9093 last_offset = iter->first;
9094 if (get_copy_chunk_size() < chunks_size) {
9095 break;
9096 }
9097 }
9098
9099 cop->num_chunk = num_chunks;
9100 cop->start_offset = start_offset;
9101 cop->last_offset = last_offset;
9102 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9103 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9104 << " last_offset: " << last_offset << dendl;
9105
9106 iter = manifest->chunk_map.find(start_offset);
9107 for (;iter != manifest->chunk_map.end(); ++iter) {
9108 uint64_t obj_offset = iter->first;
9109 uint64_t length = manifest->chunk_map[iter->first].length;
9110 hobject_t soid = manifest->chunk_map[iter->first].oid;
9111 object_locator_t oloc(soid);
9112 CopyCallback * cb = NULL;
9113 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9114 cop->results.user_version, cop->flags, cop->mirror_snapset,
9115 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9116 sub_cop->cursor.data_offset = obj_offset;
9117 cop->chunk_cops[obj_offset] = sub_cop;
9118
9119 int s = sub_cop->chunk_ops.size();
9120 sub_cop->chunk_ops.resize(s+1);
9121 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9122 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9123 sub_cop->chunk_ops[s].op.extent.length = length;
9124
9125 ObjectOperation op;
9126 op.dup(sub_cop->chunk_ops);
9127
9128 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9129 << manifest->chunk_map[iter->first].offset
9130 << " length: " << length << " pool id: " << oloc.pool << dendl;
9131
9132 if (cop->results.user_version) {
9133 op.assert_version(cop->results.user_version);
9134 } else {
9135 // we should learn the version after the first chunk, if we didn't know
9136 // it already!
9137 ceph_assert(cop->cursor.is_initial());
9138 }
9139 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9140
9141 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9142 get_last_peering_reset(), cop);
9143 fin->offset = obj_offset;
9144
9145 ceph_tid_t tid = osd->objecter->read(
9146 soid.oid, oloc, op,
9147 sub_cop->src.snap, NULL,
9148 flags,
9149 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9150 // discover the object version if we don't know it yet
9151 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9152 fin->tid = tid;
9153 sub_cop->objecter_tid = tid;
9154 if (last_offset < iter->first) {
9155 break;
9156 }
9157 }
9158 }
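
// Worked example (values assumed for illustration): for a chunk_map of
// three 1 MiB chunks at offsets 0, 1M and 2M with a copy chunk size of
// 2 MiB, the sizing loop above counts all three before chunks_size
// (3 MiB) exceeds the limit, giving num_chunks = 3 and last_offset = 2M,
// and one objecter read is issued per chunk. On a larger map,
// process_copy_chunk_manifest() starts another pass at the first offset
// past last_offset once all reads of this pass have completed.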
9159
9160 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9161 {
9162 dout(10) << __func__ << " " << oid << " tid " << tid
9163 << " " << cpp_strerror(r) << dendl;
9164 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9165 if (p == copy_ops.end()) {
9166 dout(10) << __func__ << " no copy_op found" << dendl;
9167 return;
9168 }
9169 CopyOpRef cop = p->second;
9170 if (tid != cop->objecter_tid) {
9171 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9172 << " tid " << cop->objecter_tid << dendl;
9173 return;
9174 }
9175
9176 if (cop->omap_data.length() || cop->omap_header.length())
9177 cop->results.has_omap = true;
9178
9179 if (r >= 0 && !pool.info.supports_omap() &&
9180 (cop->omap_data.length() || cop->omap_header.length())) {
9181 r = -EOPNOTSUPP;
9182 }
9183 cop->objecter_tid = 0;
9184 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9185 ObjectContextRef& cobc = cop->obc;
9186
9187 if (r < 0)
9188 goto out;
9189
9190 ceph_assert(cop->rval >= 0);
9191
9192 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9193 // verify snap hasn't been deleted
9194 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9195 while (p != cop->results.snaps.end()) {
9196 // make best effort to sanitize snaps/clones.
9197 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
9198 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9199 << dendl;
9200 for (vector<snapid_t>::iterator q = p + 1;
9201 q != cop->results.snaps.end();
9202 ++q)
9203 *(q - 1) = *q;
9204 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9205 } else {
9206 ++p;
9207 }
9208 }
9209 if (cop->results.snaps.empty()) {
9210 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9211 r = -ENOENT;
9212 goto out;
9213 }
9214 }
9215
9216 ceph_assert(cop->rval >= 0);
9217
9218 if (!cop->temp_cursor.data_complete) {
9219 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9220 }
9221 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9222 if (cop->omap_header.length()) {
9223 cop->results.omap_digest =
9224 cop->omap_header.crc32c(cop->results.omap_digest);
9225 }
9226 if (cop->omap_data.length()) {
9227 bufferlist keys;
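// skip the leading 4 bytes of omap_data -- assumed here to be the u32
// entry count -- so the crc covers only the encoded keys and values,
// matching how the source computed its digest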
9228 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9229 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9230 }
9231 }
9232
9233 if (!cop->temp_cursor.attr_complete) {
9234 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9235 p != cop->attrs.end();
9236 ++p) {
9237 cop->results.attrs[string("_") + p->first] = p->second;
9238 }
9239 cop->attrs.clear();
9240 }
9241
9242 if (!cop->cursor.is_complete()) {
9243 // write out what we have so far
9244 if (cop->temp_cursor.is_initial()) {
9245 ceph_assert(!cop->results.started_temp_obj);
9246 cop->results.started_temp_obj = true;
9247 cop->results.temp_oid = generate_temp_object(oid);
9248 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9249 }
9250 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9251 OpContextUPtr ctx = simple_opc_create(tempobc);
9252 if (cop->temp_cursor.is_initial()) {
9253 ctx->new_temp_oid = cop->results.temp_oid;
9254 }
9255 _write_copy_chunk(cop, ctx->op_t.get());
9256 simple_opc_submit(std::move(ctx));
9257 dout(10) << __func__ << " fetching more" << dendl;
9258 _copy_some(cobc, cop);
9259 return;
9260 }
9261
9262 // verify digests?
9263 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9264 dout(20) << __func__ << std::hex
9265 << " got digest: rx data 0x" << cop->results.data_digest
9266 << " omap 0x" << cop->results.omap_digest
9267 << ", source: data 0x" << cop->results.source_data_digest
9268 << " omap 0x" << cop->results.source_omap_digest
9269 << std::dec
9270 << " flags " << cop->results.flags
9271 << dendl;
9272 }
9273 if (cop->results.is_data_digest() &&
9274 cop->results.data_digest != cop->results.source_data_digest) {
9275 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9276 << " != source 0x" << cop->results.source_data_digest << std::dec
9277 << dendl;
9278 osd->clog->error() << info.pgid << " copy from " << cop->src
9279 << " to " << cop->obc->obs.oi.soid << std::hex
9280 << " data digest 0x" << cop->results.data_digest
9281 << " != source 0x" << cop->results.source_data_digest
9282 << std::dec;
9283 r = -EIO;
9284 goto out;
9285 }
9286 if (cop->results.is_omap_digest() &&
9287 cop->results.omap_digest != cop->results.source_omap_digest) {
9288 derr << __func__ << std::hex
9289 << " omap digest 0x" << cop->results.omap_digest
9290 << " != source 0x" << cop->results.source_omap_digest
9291 << std::dec << dendl;
9292 osd->clog->error() << info.pgid << " copy from " << cop->src
9293 << " to " << cop->obc->obs.oi.soid << std::hex
9294 << " omap digest 0x" << cop->results.omap_digest
9295 << " != source 0x" << cop->results.source_omap_digest
9296 << std::dec;
9297 r = -EIO;
9298 goto out;
9299 }
9300 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9301 derr << __func__ << " injecting copyfrom failure" << dendl;
9302 r = -EIO;
9303 goto out;
9304 }
9305
9306 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9307 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9308 ObjectState& obs = cop->obc->obs;
9309 if (cop->temp_cursor.is_initial()) {
9310 dout(20) << "fill_in_final_tx: writing "
9311 << "directly to final object" << dendl;
9312 // write directly to final object
9313 cop->results.temp_oid = obs.oi.soid;
9314 _write_copy_chunk(cop, t);
9315 } else {
9316 // finish writing to temp object, then move into place
9317 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9318 _write_copy_chunk(cop, t);
9319 t->rename(obs.oi.soid, cop->results.temp_oid);
9320 }
9321 t->setattrs(obs.oi.soid, cop->results.attrs);
9322 });
9323
9324 dout(20) << __func__ << " success; committing" << dendl;
9325
9326 out:
9327 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9328 CopyCallbackResults results(r, &cop->results);
9329 cop->cb->complete(results);
9330
9331 copy_ops.erase(cobc->obs.oi.soid);
9332 cobc->stop_block();
9333
9334 if (r < 0 && cop->results.started_temp_obj) {
9335 dout(10) << __func__ << " deleting partial temp object "
9336 << cop->results.temp_oid << dendl;
9337 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9338 OpContextUPtr ctx = simple_opc_create(tempobc);
9339 ctx->op_t->remove(cop->results.temp_oid);
9340 ctx->discard_temp_oid = cop->results.temp_oid;
9341 simple_opc_submit(std::move(ctx));
9342 }
9343
9344 // cancel and requeue proxy ops on this object
9345 if (!r) {
9346 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9347 }
9348
9349 kick_object_context_blocked(cobc);
9350 }
9351
9352 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9353 {
9354 dout(10) << __func__ << " " << oid << " tid " << tid
9355 << " " << cpp_strerror(r) << dendl;
9356 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9357 if (p == copy_ops.end()) {
9358 dout(10) << __func__ << " no copy_op found" << dendl;
9359 return;
9360 }
9361 CopyOpRef obj_cop = p->second;
9362 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9363
9364 if (tid != chunk_cop->objecter_tid) {
9365 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9366 << " tid " << chunk_cop->objecter_tid << dendl;
9367 return;
9368 }
9369
9370 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9371 r = -EOPNOTSUPP;
9372 }
9373
9374 chunk_cop->objecter_tid = 0;
9375 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9376 ObjectContextRef& cobc = obj_cop->obc;
9377 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9378
9379 if (r < 0) {
9380 obj_cop->failed = true;
9381 goto out;
9382 }
9383
9384 if (obj_cop->failed) {
9385 return;
9386 }
9387 if (!chunk_data.outdata.length()) {
9388 r = -EIO;
9389 obj_cop->failed = true;
9390 goto out;
9391 }
9392
9393 obj_cop->num_chunk--;
9394
9395 /* check that all of the copy ops have completed */
9396 if (obj_cop->num_chunk) {
9397 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9398 return;
9399 }
9400
9401 {
9402 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9403 if (!ctx->lock_manager.take_write_lock(
9404 obj_cop->obc->obs.oi.soid,
9405 obj_cop->obc)) {
9406 // a recovery op can take the read lock,
9407 // so we need to wait for recovery to complete
9408 r = -EAGAIN;
9409 obj_cop->failed = true;
9410 close_op_ctx(ctx.release());
9411 goto out;
9412 }
9413 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9414
9415 PGTransaction *t = ctx->op_t.get();
9416 ObjectState& obs = ctx->new_obs;
9417 for (auto p : obj_cop->chunk_cops) {
9418 OSDOp &sub_chunk = p.second->chunk_ops[0];
9419 t->write(cobc->obs.oi.soid,
9420 p.second->cursor.data_offset,
9421 sub_chunk.outdata.length(),
9422 sub_chunk.outdata,
9423 p.second->dest_obj_fadvise_flags);
9424 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9425 << " length: " << sub_chunk.outdata.length() << dendl;
9426 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9427 p.second->cursor.data_offset, sub_chunk.outdata.length());
9428 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY);
9429 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9430 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
9431 sub_chunk.outdata.clear();
9432 }
9433 obs.oi.clear_data_digest();
9434 ctx->at_version = get_next_version();
9435 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9436 simple_opc_submit(std::move(ctx));
9437
9438 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9439 /* check remaining work */
9440 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9441 if (obj_cop->last_offset >= p->first + p->second.length) {
9442 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9443 if (obj_cop->last_offset < en.first) {
9444 _copy_some_manifest(cobc, obj_cop, en.first);
9445 return;
9446 }
9447 }
9448 }
9449 }
9450 }
9451
9452 out:
9453 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9454 CopyCallbackResults results(r, &obj_cop->results);
9455 obj_cop->cb->complete(results);
9456
9457 copy_ops.erase(cobc->obs.oi.soid);
9458 cobc->stop_block();
9459
9460 // cancel and requeue proxy ops on this object
9461 if (!r) {
9462 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9463 }
9464
9465 kick_object_context_blocked(cobc);
9466 }
9467
9468 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9469 vector<ceph_tid_t> tids;
9470 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9471 it != proxyread_ops.end();) {
9472 if (it->second->soid == oid) {
9473 cancel_proxy_read((it++)->second, &tids);
9474 } else {
9475 ++it;
9476 }
9477 }
9478 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9479 it != proxywrite_ops.end();) {
9480 if (it->second->soid == oid) {
9481 cancel_proxy_write((it++)->second, &tids);
9482 } else {
9483 ++it;
9484 }
9485 }
9486 osd->objecter->op_cancel(tids, -ECANCELED);
9487 kick_proxy_ops_blocked(oid);
9488 }
9489
9490 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9491 {
9492 dout(20) << __func__ << " " << cop
9493 << " " << cop->attrs.size() << " attrs"
9494 << " " << cop->data.length() << " bytes"
9495 << " " << cop->omap_header.length() << " omap header bytes"
9496 << " " << cop->omap_data.length() << " omap data bytes"
9497 << dendl;
9498 if (!cop->temp_cursor.attr_complete) {
9499 t->create(cop->results.temp_oid);
9500 }
9501 if (!cop->temp_cursor.data_complete) {
9502 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9503 cop->cursor.data_offset);
9504 if (pool.info.required_alignment() &&
9505 !cop->cursor.data_complete) {
9506 /**
9507 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
9508 * to pick it up on the next pass.
9509 */
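/*
* Worked example (numbers assumed for illustration): with a required
* alignment of 64 KiB and cop->data.length() == 100 KiB, to_trim is
* 100 KiB % 64 KiB = 36 KiB; we keep the first 64 KiB and rewind
* cursor.data_offset by 36 KiB so the next copy-get re-reads the
* trimmed tail.
*/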
9510 ceph_assert(cop->temp_cursor.data_offset %
9511 pool.info.required_alignment() == 0);
9512 if (cop->data.length() % pool.info.required_alignment() != 0) {
9513 uint64_t to_trim =
9514 cop->data.length() % pool.info.required_alignment();
9515 bufferlist bl;
9516 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9517 cop->data.swap(bl);
9518 cop->cursor.data_offset -= to_trim;
9519 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9520 cop->cursor.data_offset);
9521 }
9522 }
9523 if (cop->data.length()) {
9524 t->write(
9525 cop->results.temp_oid,
9526 cop->temp_cursor.data_offset,
9527 cop->data.length(),
9528 cop->data,
9529 cop->dest_obj_fadvise_flags);
9530 }
9531 cop->data.clear();
9532 }
9533 if (pool.info.supports_omap()) {
9534 if (!cop->temp_cursor.omap_complete) {
9535 if (cop->omap_header.length()) {
9536 t->omap_setheader(
9537 cop->results.temp_oid,
9538 cop->omap_header);
9539 cop->omap_header.clear();
9540 }
9541 if (cop->omap_data.length()) {
9542 map<string,bufferlist> omap;
9543 bufferlist::const_iterator p = cop->omap_data.begin();
9544 decode(omap, p);
9545 t->omap_setkeys(cop->results.temp_oid, omap);
9546 cop->omap_data.clear();
9547 }
9548 }
9549 } else {
9550 ceph_assert(cop->omap_header.length() == 0);
9551 ceph_assert(cop->omap_data.length() == 0);
9552 }
9553 cop->temp_cursor = cop->cursor;
9554 }
9555
9556 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9557 {
9558 OpContext *ctx = cb->ctx;
9559 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9560
9561 ObjectState& obs = ctx->new_obs;
9562 if (obs.exists) {
9563 dout(20) << __func__ << ": exists, removing" << dendl;
9564 ctx->op_t->remove(obs.oi.soid);
9565 } else {
9566 ctx->delta_stats.num_objects++;
9567 obs.exists = true;
9568 }
9569 if (cb->is_temp_obj_used()) {
9570 ctx->discard_temp_oid = cb->results->temp_oid;
9571 }
9572 cb->results->fill_in_final_tx(ctx->op_t.get());
9573
9574 // CopyFromCallback fills this in for us
9575 obs.oi.user_version = ctx->user_at_version;
9576
9577 if (cb->results->is_data_digest()) {
9578 obs.oi.set_data_digest(cb->results->data_digest);
9579 } else {
9580 obs.oi.clear_data_digest();
9581 }
9582 if (cb->results->is_omap_digest()) {
9583 obs.oi.set_omap_digest(cb->results->omap_digest);
9584 } else {
9585 obs.oi.clear_omap_digest();
9586 }
9587
9588 obs.oi.truncate_seq = cb->truncate_seq;
9589 obs.oi.truncate_size = cb->truncate_size;
9590
9591 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9592 ctx->mtime = utime_t();
9593
9594 ctx->extra_reqids = cb->results->reqids;
9595 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9596
9597 // cache: clear whiteout?
9598 if (obs.oi.is_whiteout()) {
9599 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9600 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9601 --ctx->delta_stats.num_whiteouts;
9602 }
9603
9604 if (cb->results->has_omap) {
9605 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9606 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9607 ctx->clean_regions.mark_omap_dirty();
9608 } else {
9609 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9610 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9611 }
9612
9613 interval_set<uint64_t> ch;
9614 if (obs.oi.size > 0)
9615 ch.insert(0, obs.oi.size);
9616 ctx->modified_ranges.union_of(ch);
9617 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
9618
9619 if (cb->get_data_size() != obs.oi.size) {
9620 ctx->delta_stats.num_bytes -= obs.oi.size;
9621 obs.oi.size = cb->get_data_size();
9622 ctx->delta_stats.num_bytes += obs.oi.size;
9623 }
9624 ctx->delta_stats.num_wr++;
9625 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
9626
9627 osd->logger->inc(l_osd_copyfrom);
9628 }
9629
9630 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9631 ObjectContextRef obc)
9632 {
9633 const hobject_t& soid = obc->obs.oi.soid;
9634 dout(10) << __func__ << " " << soid << " r=" << r
9635 << " uv" << results->user_version << dendl;
9636
9637 if (r == -ECANCELED) {
9638 return;
9639 }
9640
9641 if (r != -ENOENT && soid.is_snap()) {
9642 if (results->snaps.empty()) {
9643 // we must have read "snap" content from the head object in the
9644 // base pool. use snap_seq to construct what snaps should be
9645 // for this clone (what it was before we evicted the clean clone
9646 // from this pool, and what it will be when we flush and the
9647 // clone eventually happens in the base pool). we want to use
9648 // snaps in (results->snap_seq,soid.snap]
9649 SnapSet& snapset = obc->ssc->snapset;
9650 for (auto p = snapset.clone_snaps.rbegin();
9651 p != snapset.clone_snaps.rend();
9652 ++p) {
9653 for (auto snap : p->second) {
9654 if (snap > soid.snap) {
9655 continue;
9656 }
9657 if (snap <= results->snap_seq) {
9658 break;
9659 }
9660 results->snaps.push_back(snap);
9661 }
9662 }
9663 }
9664
9665 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9666 filter_snapc(results->snaps);
9667
9668 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9669 if (results->snaps.empty()) {
9670 dout(20) << __func__
9671 << " snaps are empty, clone is invalid,"
9672 << " setting r to ENOENT" << dendl;
9673 r = -ENOENT;
9674 }
9675 }
9676
9677 if (r < 0 && results->started_temp_obj) {
9678 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9679 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
9680 ceph_assert(tempobc);
9681 OpContextUPtr ctx = simple_opc_create(tempobc);
9682 ctx->op_t->remove(results->temp_oid);
9683 simple_opc_submit(std::move(ctx));
9684 results->started_temp_obj = false;
9685 }
9686
9687 if (r == -ENOENT && soid.is_snap()) {
9688 dout(10) << __func__
9689 << ": enoent while trying to promote clone, " << soid
9690 << " must have been trimmed, removing from snapset"
9691 << dendl;
9692 hobject_t head(soid.get_head());
9693 ObjectContextRef obc = get_object_context(head, false);
9694 ceph_assert(obc);
9695
9696 OpContextUPtr tctx = simple_opc_create(obc);
9697 tctx->at_version = get_next_version();
9698 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
9699 filter_snapc(tctx->new_snapset.snaps);
9700 } else {
9701 tctx->new_snapset.snaps.clear();
9702 }
9703 vector<snapid_t> new_clones;
9704 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9705 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9706 i != tctx->new_snapset.clones.end();
9707 ++i) {
9708 if (*i != soid.snap) {
9709 new_clones.push_back(*i);
9710 auto p = tctx->new_snapset.clone_snaps.find(*i);
9711 if (p != tctx->new_snapset.clone_snaps.end()) {
9712 new_clone_snaps[*i] = p->second;
9713 }
9714 }
9715 }
9716 tctx->new_snapset.clones.swap(new_clones);
9717 tctx->new_snapset.clone_overlap.erase(soid.snap);
9718 tctx->new_snapset.clone_size.erase(soid.snap);
9719 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9720
9721 // take RWWRITE lock for duration of our local write. ignore starvation.
9722 if (!tctx->lock_manager.take_write_lock(
9723 head,
9724 obc)) {
9725 ceph_abort_msg("problem!");
9726 }
9727 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9728
9729 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9730
9731 simple_opc_submit(std::move(tctx));
9732 return;
9733 }
9734
9735 bool whiteout = false;
9736 if (r == -ENOENT) {
9737 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
9738 dout(10) << __func__ << " whiteout " << soid << dendl;
9739 whiteout = true;
9740 }
9741
9742 if (r < 0 && !whiteout) {
9743 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9744 // pass error to everyone blocked on this object
9745 // FIXME: this is pretty sloppy, but at this point we got
9746 // something unexpected and don't have many other options.
9747 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9748 waiting_for_blocked_object.find(soid);
9749 if (blocked_iter != waiting_for_blocked_object.end()) {
9750 while (!blocked_iter->second.empty()) {
9751 osd->reply_op_error(blocked_iter->second.front(), r);
9752 blocked_iter->second.pop_front();
9753 }
9754 waiting_for_blocked_object.erase(blocked_iter);
9755 }
9756 return;
9757 }
9758
9759 osd->promote_finish(results->object_size);
9760
9761 OpContextUPtr tctx = simple_opc_create(obc);
9762 tctx->at_version = get_next_version();
9763
9764 if (!obc->obs.oi.has_manifest()) {
9765 ++tctx->delta_stats.num_objects;
9766 }
9767 if (soid.snap < CEPH_NOSNAP)
9768 ++tctx->delta_stats.num_object_clones;
9769 tctx->new_obs.exists = true;
9770
9771 tctx->extra_reqids = results->reqids;
9772 tctx->extra_reqid_return_codes = results->reqid_return_codes;
9773
9774 if (whiteout) {
9775 // create a whiteout
9776 tctx->op_t->create(soid);
9777 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9778 ++tctx->delta_stats.num_whiteouts;
9779 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9780 osd->logger->inc(l_osd_tier_whiteout);
9781 } else {
9782 if (results->has_omap) {
9783 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9784 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9785 ++tctx->delta_stats.num_objects_omap;
9786 }
9787
9788 results->fill_in_final_tx(tctx->op_t.get());
9789 if (results->started_temp_obj) {
9790 tctx->discard_temp_oid = results->temp_oid;
9791 }
9792 tctx->new_obs.oi.size = results->object_size;
9793 tctx->new_obs.oi.user_version = results->user_version;
9794 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
9795 tctx->mtime = utime_t();
9796 if (results->is_data_digest()) {
9797 tctx->new_obs.oi.set_data_digest(results->data_digest);
9798 } else {
9799 tctx->new_obs.oi.clear_data_digest();
9800 }
9801 if (results->object_size)
9802 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
9803 if (results->is_omap_digest()) {
9804 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
9805 } else {
9806 tctx->new_obs.oi.clear_omap_digest();
9807 }
9808 if (results->has_omap)
9809 tctx->clean_regions.mark_omap_dirty();
9810 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
9811 tctx->new_obs.oi.truncate_size = results->truncate_size;
9812
9813 if (soid.snap != CEPH_NOSNAP) {
9814 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
9815 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
9816 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
9817 results->object_size);
9818 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
9819
9820 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
9821 } else {
9822 tctx->delta_stats.num_bytes += results->object_size;
9823 }
9824 }
9825
9826 if (results->mirror_snapset) {
9827 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
9828 tctx->new_snapset.from_snap_set(
9829 results->snapset,
9830 get_osdmap()->require_osd_release < ceph_release_t::luminous);
9831 }
9832 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
9833
9834 // take RWWRITE lock for duration of our local write. ignore starvation.
9835 if (!tctx->lock_manager.take_write_lock(
9836 obc->obs.oi.soid,
9837 obc)) {
9838 ceph_abort_msg("problem!");
9839 }
9840 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9841
9842 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9843
9844 simple_opc_submit(std::move(tctx));
9845
9846 osd->logger->inc(l_osd_tier_promote);
9847
9848 if (agent_state &&
9849 agent_state->is_idle())
9850 agent_choose_mode();
9851 }
9852
9853 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
9854 ObjectContextRef obc)
9855 {
9856 const hobject_t& soid = obc->obs.oi.soid;
9857 dout(10) << __func__ << " " << soid << " r=" << r
9858 << " uv" << results->user_version << dendl;
9859
9860 if (r == -ECANCELED || r == -EAGAIN) {
9861 return;
9862 }
9863
9864 if (r < 0) {
9865 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9866 // pass error to everyone blocked on this object
9867 // FIXME: this is pretty sloppy, but at this point we got
9868 // something unexpected and don't have many other options.
9869 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9870 waiting_for_blocked_object.find(soid);
9871 if (blocked_iter != waiting_for_blocked_object.end()) {
9872 while (!blocked_iter->second.empty()) {
9873 osd->reply_op_error(blocked_iter->second.front(), r);
9874 blocked_iter->second.pop_front();
9875 }
9876 waiting_for_blocked_object.erase(blocked_iter);
9877 }
9878 return;
9879 }
9880
9881 osd->promote_finish(results->object_size);
9882 osd->logger->inc(l_osd_tier_promote);
9883
9884 if (agent_state &&
9885 agent_state->is_idle())
9886 agent_choose_mode();
9887 }
9888
9889 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
9890 vector<ceph_tid_t> *tids)
9891 {
9892 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
9893 << " from " << cop->src << " " << cop->oloc
9894 << " v" << cop->results.user_version << dendl;
9895
9896 // cancel objecter op, if we can
9897 if (cop->objecter_tid) {
9898 tids->push_back(cop->objecter_tid);
9899 cop->objecter_tid = 0;
9900 if (cop->objecter_tid2) {
9901 tids->push_back(cop->objecter_tid2);
9902 cop->objecter_tid2 = 0;
9903 }
9904 }
9905
9906 copy_ops.erase(cop->obc->obs.oi.soid);
9907 cop->obc->stop_block();
9908
9909 kick_object_context_blocked(cop->obc);
9910 cop->results.should_requeue = requeue;
9911 CopyCallbackResults result(-ECANCELED, &cop->results);
9912 cop->cb->complete(result);
9913
9914 // There may still be an objecter callback referencing this copy op.
9915 // That callback will not need the obc since it's been canceled, and
9916 // we need the obc reference to go away prior to flush.
9917 cop->obc = ObjectContextRef();
9918 }
9919
9920 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
9921 {
9922 dout(10) << __func__ << dendl;
9923 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
9924 while (p != copy_ops.end()) {
9925 // requeue this op? can I queue up all of them?
9926 cancel_copy((p++)->second, requeue, tids);
9927 }
9928 }
9929
9930
9931 // ========================================================================
9932 // flush
9933 //
9934 // Flush a dirty object in the cache tier by writing it back to the
9935 // base tier. The sequence looks like:
9936 //
9937 // * send a copy-from operation to the base tier to copy the current
9938 // version of the object
9939 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9940 // * on completion, we check if the object has been modified. if so,
9941 // just reply with -EAGAIN.
9942 // * try to take a write lock so we can clear the dirty flag. if this
9943 // fails, wait and retry
9944 // * start a repop that clears the bit.
9945 //
9946 // If we have to wait, we will retry by coming back through the
9947 // start_flush method. We check if a flush is already in progress
9948 // and, if so, try to finish it by rechecking the version and trying
9949 // to clear the dirty bit.
9950 //
9951 // In order for the cache-flush (a write op) to not block the copy-get
9952 // from reading the object, the client *must* set the SKIPRWLOCKS
9953 // flag.
9954 //
9955 // NOTE: normally writes are strictly ordered for the client, but
9956 // flushes are special in that they can be reordered with respect to
9957 // other writes. In particular, we can't have a flush request block
9958 // an update to the cache pool object!
9959
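// A sketch of the contract start_flush() implements for its callers
// (summarizing the code below; not an exhaustive list of outcomes):
//
//   -EINPROGRESS  flush submitted; on_flush runs when it settles
//   -ENOENT       an older clone is missing (*pmissing names it)
//   -EBUSY        an older clone is still dirty and must flush first
//   -EAGAIN       the op joined an already in-progress flush and will
//                 be retried when that flush completes
//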
9960 struct C_Flush : public Context {
9961 PrimaryLogPGRef pg;
9962 hobject_t oid;
9963 epoch_t last_peering_reset;
9964 ceph_tid_t tid;
9965 utime_t start;
9966 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
9967 : pg(p), oid(o), last_peering_reset(lpr),
9968 tid(0), start(ceph_clock_now())
9969 {}
9970 void finish(int r) override {
9971 if (r == -ECANCELED)
9972 return;
9973 std::scoped_lock locker{*pg};
9974 if (last_peering_reset == pg->get_last_peering_reset()) {
9975 pg->finish_flush(oid, tid, r);
9976 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
9977 }
9978 }
9979 };
9980
9981 int PrimaryLogPG::start_flush(
9982 OpRequestRef op, ObjectContextRef obc,
9983 bool blocking, hobject_t *pmissing,
9984 std::optional<std::function<void()>> &&on_flush)
9985 {
9986 const object_info_t& oi = obc->obs.oi;
9987 const hobject_t& soid = oi.soid;
9988 dout(10) << __func__ << " " << soid
9989 << " v" << oi.version
9990 << " uv" << oi.user_version
9991 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
9992 << dendl;
9993
9994 bool preoctopus_compat =
9995 get_osdmap()->require_osd_release < ceph_release_t::octopus;
9996 SnapSet snapset;
9997 if (preoctopus_compat) {
9998 // for pre-octopus compatibility, filter SnapSet::snaps. not
9999 // certain we need this, but let's be conservative.
10000 snapset = obc->ssc->snapset.get_filtered(pool.info);
10001 } else {
10002 // NOTE: change this to a const ref when we remove this compat code
10003 snapset = obc->ssc->snapset;
10004 }
10005
10006 // verify there are no older dirty clones
10007 {
10008 dout(20) << " snapset " << snapset << dendl;
10009 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10010 while (p != snapset.clones.rend() && *p >= soid.snap)
10011 ++p;
10012 if (p != snapset.clones.rend()) {
10013 hobject_t next = soid;
10014 next.snap = *p;
10015 ceph_assert(next.snap < soid.snap);
10016 if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
10017 dout(10) << __func__ << " missing clone is " << next << dendl;
10018 if (pmissing)
10019 *pmissing = next;
10020 return -ENOENT;
10021 }
10022 ObjectContextRef older_obc = get_object_context(next, false);
10023 if (older_obc) {
10024 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10025 << dendl;
10026 if (older_obc->obs.oi.is_dirty()) {
10027 dout(10) << __func__ << " next oldest clone is dirty: "
10028 << older_obc->obs.oi << dendl;
10029 return -EBUSY;
10030 }
10031 } else {
10032 dout(20) << __func__ << " next oldest clone " << next
10033 << " is not present; implicitly clean" << dendl;
10034 }
10035 } else {
10036 dout(20) << __func__ << " no older clones" << dendl;
10037 }
10038 }
10039
10040 if (blocking)
10041 obc->start_block();
10042
10043 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10044 if (p != flush_ops.end()) {
10045 FlushOpRef fop = p->second;
10046 if (fop->op == op) {
10047 // we couldn't take the write lock on a cache-try-flush before;
10048 // now we are trying again for the lock.
10049 return try_flush_mark_clean(fop);
10050 }
10051 if (fop->flushed_version == obc->obs.oi.user_version &&
10052 (fop->blocking || !blocking)) {
10053 // nonblocking can join anything
10054 // blocking can only join a blocking flush
10055 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10056 if (op)
10057 fop->dup_ops.push_back(op);
10058 return -EAGAIN; // clean up this ctx; op will retry later
10059 }
10060
10061 // cancel current flush since it will fail anyway, or because we
10062 // are blocking and the existing flush is nonblocking.
10063 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10064 if (fop->op)
10065 osd->reply_op_error(fop->op, -EBUSY);
10066 while (!fop->dup_ops.empty()) {
10067 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10068 fop->dup_ops.pop_front();
10069 }
10070 vector<ceph_tid_t> tids;
10071 cancel_flush(fop, false, &tids);
10072 osd->objecter->op_cancel(tids, -ECANCELED);
10073 }
10074
10075 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10076 int r = start_manifest_flush(op, obc, blocking, std::move(on_flush));
10077 if (r != -EINPROGRESS) {
10078 if (blocking)
10079 obc->stop_block();
10080 }
10081 return r;
10082 }
10083
10084 /**
10085 * In general, we need to send a delete and a copyfrom.
10086 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10087 * where 4 is marked as clean. To flush 10, we have to:
10088 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10089 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10090 *
10091 * There is a complicating case. Suppose there had been a clone 7
10092 * for snaps [7, 6] which has been trimmed because those snaps no longer exist.
10093 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10094 * the delete, the snap will be promoted to 5, and the head will become
10095 * a whiteout. When the copy-from goes through, we'll end up with
10096 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10097 *
10098 * Another complication is the case where there is an interval change
10099 * after doing the delete and the flush but before marking the object
10100 * clean. We'll happily delete head and then recreate it at the same
10101 * sequence number, which works out ok.
10102 */
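/*
* Continuing the example above: to flush clone 10 (clone_snaps
* 10 -> [10, 9]) the min included snap is 9, so
* snapc = get_ssc_as_of(8) = 8:[8,4,3,2]; the next older clone is 4,
* so dsnapc = get_ssc_as_of(4) = 4:[4,3,2]. dsnapc.seq (4) < snapc.seq
* (8), so we first submit the delete with dsnapc and then the copy-from
* with snapc -- exactly the two steps listed above.
*/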
10103
10104 SnapContext snapc, dsnapc;
10105 if (snapset.seq != 0) {
10106 if (soid.snap == CEPH_NOSNAP) {
10107 snapc = snapset.get_ssc_as_of(snapset.seq);
10108 } else {
10109 snapid_t min_included_snap;
10110 auto p = snapset.clone_snaps.find(soid.snap);
10111 ceph_assert(p != snapset.clone_snaps.end());
10112 min_included_snap = p->second.back();
10113 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10114 }
10115
10116 snapid_t prev_snapc = 0;
10117 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10118 citer != snapset.clones.rend();
10119 ++citer) {
10120 if (*citer < soid.snap) {
10121 prev_snapc = *citer;
10122 break;
10123 }
10124 }
10125
10126 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10127 }
10128
10129 object_locator_t base_oloc(soid);
10130 base_oloc.pool = pool.info.tier_of;
10131
10132 if (dsnapc.seq < snapc.seq) {
10133 ObjectOperation o;
10134 o.remove();
10135 osd->objecter->mutate(
10136 soid.oid,
10137 base_oloc,
10138 o,
10139 dsnapc,
10140 ceph::real_clock::from_ceph_timespec(oi.mtime),
10141 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10142 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10143 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10144 }
10145
10146 FlushOpRef fop(std::make_shared<FlushOp>());
10147 fop->obc = obc;
10148 fop->flushed_version = oi.user_version;
10149 fop->blocking = blocking;
10150 fop->on_flush = std::move(on_flush);
10151 fop->op = op;
10152
10153 ObjectOperation o;
10154 if (oi.is_whiteout()) {
10155 fop->removal = true;
10156 o.remove();
10157 } else {
10158 object_locator_t oloc(soid);
10159 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10160 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10161 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10162 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10163 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10164 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10165
10166 // hint that the base tier need not cache the data after this
10167 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10168 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10169 }
10170 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10171
10172 ceph_tid_t tid = osd->objecter->mutate(
10173 soid.oid, base_oloc, o, snapc,
10174 ceph::real_clock::from_ceph_timespec(oi.mtime),
10175 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10176 new C_OnFinisher(fin,
10177 osd->get_objecter_finisher(get_pg_shard())));
10178 /* we're under the pg lock and fin->finish() is grabbing that */
10179 fin->tid = tid;
10180 fop->objecter_tid = tid;
10181
10182 flush_ops[soid] = fop;
10183
10184 recovery_state.update_stats(
10185 [&oi](auto &history, auto &stats) {
10186 stats.stats.sum.num_flush++;
10187 stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10188 return false;
10189 });
10190 return -EINPROGRESS;
10191 }
10192
10193 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10194 {
10195 dout(10) << __func__ << " " << oid << " tid " << tid
10196 << " " << cpp_strerror(r) << dendl;
10197 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10198 if (p == flush_ops.end()) {
10199 dout(10) << __func__ << " no flush_op found" << dendl;
10200 return;
10201 }
10202 FlushOpRef fop = p->second;
10203 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10204 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10205 << " tid " << fop->objecter_tid << dendl;
10206 return;
10207 }
10208 ObjectContextRef obc = fop->obc;
10209 fop->objecter_tid = 0;
10210
10211 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10212 if (fop->op)
10213 osd->reply_op_error(fop->op, -EBUSY);
10214 if (fop->blocking) {
10215 obc->stop_block();
10216 kick_object_context_blocked(obc);
10217 }
10218
10219 if (!fop->dup_ops.empty()) {
10220 dout(20) << __func__ << " requeueing dups" << dendl;
10221 requeue_ops(fop->dup_ops);
10222 }
10223 if (fop->on_flush) {
10224 (*(fop->on_flush))();
10225 fop->on_flush = std::nullopt;
10226 }
10227 flush_ops.erase(oid);
10228 return;
10229 }
10230
10231 r = try_flush_mark_clean(fop);
10232 if (r == -EBUSY && fop->op) {
10233 osd->reply_op_error(fop->op, r);
10234 }
10235 }
10236
10237 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
10238 {
10239 ObjectContextRef obc = fop->obc;
10240 const hobject_t& oid = obc->obs.oi.soid;
10241
10242 if (fop->blocking) {
10243 obc->stop_block();
10244 kick_object_context_blocked(obc);
10245 }
10246
10247 if (fop->flushed_version != obc->obs.oi.user_version ||
10248 !obc->obs.exists) {
10249 if (obc->obs.exists)
10250 dout(10) << __func__ << " flushed_version " << fop->flushed_version
10251 << " != current " << obc->obs.oi.user_version
10252 << dendl;
10253 else
10254 dout(10) << __func__ << " object no longer exists" << dendl;
10255
10256 if (!fop->dup_ops.empty()) {
10257 dout(20) << __func__ << " requeueing dups" << dendl;
10258 requeue_ops(fop->dup_ops);
10259 }
10260 if (fop->on_flush) {
10261 (*(fop->on_flush))();
10262 fop->on_flush = std::nullopt;
10263 }
10264 flush_ops.erase(oid);
10265 if (fop->blocking)
10266 osd->logger->inc(l_osd_tier_flush_fail);
10267 else
10268 osd->logger->inc(l_osd_tier_try_flush_fail);
10269 return -EBUSY;
10270 }
10271
10272 if (!fop->blocking &&
10273 write_blocked_by_scrub(oid)) {
10274 if (fop->op) {
10275 dout(10) << __func__ << " blocked by scrub" << dendl;
10276 requeue_op(fop->op);
10277 requeue_ops(fop->dup_ops);
10278 return -EAGAIN; // will retry
10279 } else {
10280 osd->logger->inc(l_osd_tier_try_flush_fail);
10281 vector<ceph_tid_t> tids;
10282 cancel_flush(fop, false, &tids);
10283 osd->objecter->op_cancel(tids, -ECANCELED);
10284 return -ECANCELED;
10285 }
10286 }
10287
10288 // successfully flushed, can we evict this object?
10289 if (!obc->obs.oi.has_manifest() && !fop->op &&
10290 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
10291 agent_maybe_evict(obc, true)) {
10292 osd->logger->inc(l_osd_tier_clean);
10293 if (fop->on_flush) {
10294 (*(fop->on_flush))();
10295 fop->on_flush = std::nullopt;
10296 }
10297 flush_ops.erase(oid);
10298 return 0;
10299 }
10300
10301 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
10302 OpContextUPtr ctx = simple_opc_create(fop->obc);
10303
10304 // successfully flushed; can we clear the dirty bit?
10305 // try to take the write lock manually, since this internally
10306 // created ctx does not hold any locks yet.
10307 if (ctx->lock_manager.get_lock_type(
10308 RWState::RWWRITE,
10309 oid,
10310 obc,
10311 fop->op)) {
10312 dout(20) << __func__ << " took write lock" << dendl;
10313 } else if (fop->op) {
10314 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
10315 << fop->dup_ops << dendl;
10316 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10317 for (auto op : fop->dup_ops) {
10318 bool locked = ctx->lock_manager.get_lock_type(
10319 RWState::RWWRITE,
10320 oid,
10321 obc,
10322 op);
10323 ceph_assert(!locked);
10324 }
10325 close_op_ctx(ctx.release());
10326 return -EAGAIN; // will retry
10327 } else {
10328 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
10329 close_op_ctx(ctx.release());
10330 osd->logger->inc(l_osd_tier_try_flush_fail);
10331 vector<ceph_tid_t> tids;
10332 cancel_flush(fop, false, &tids);
10333 osd->objecter->op_cancel(tids, -ECANCELED);
10334 return -ECANCELED;
10335 }
10336
10337 if (fop->on_flush) {
10338 ctx->register_on_finish(*(fop->on_flush));
10339 fop->on_flush = std::nullopt;
10340 }
10341
10342 ctx->at_version = get_next_version();
10343
10344 ctx->new_obs = obc->obs;
10345 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10346 --ctx->delta_stats.num_objects_dirty;
10347 if (fop->obc->obs.oi.has_manifest()) {
10348 ceph_assert(obc->obs.oi.manifest.is_chunked());
10349 PGTransaction* t = ctx->op_t.get();
10350 uint64_t chunks_size = 0;
10351 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10352 chunks_size += p.second.length;
10353 }
10354 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
10355 t->omap_clear(oid);
10356 ctx->new_obs.oi.clear_omap_digest();
10357 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10358 ctx->clean_regions.mark_omap_dirty();
10359 }
10360 if (obc->obs.oi.size == chunks_size) {
10361 t->truncate(oid, 0);
10362 interval_set<uint64_t> trim;
10363 trim.insert(0, ctx->new_obs.oi.size);
10364 ctx->modified_ranges.union_of(trim);
10365 truncate_update_size_and_usage(ctx->delta_stats,
10366 ctx->new_obs.oi,
10367 0);
10368 ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
10369 ctx->new_obs.oi.new_object();
10370 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10371 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10372 p.second.set_flag(chunk_info_t::FLAG_MISSING);
10373 }
10374 } else {
10375 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10376 if (p.second.is_dirty()) {
10377 dout(20) << __func__ << " offset: " << p.second.offset
10378 << " length: " << p.second.length << dendl;
10379 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10380 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
10381 }
10382 }
10383 }
10384 }
10385
10386 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10387
10388 osd->logger->inc(l_osd_tier_clean);
10389
10390 if (!fop->dup_ops.empty() || fop->op) {
10391 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
10392 list<OpRequestRef> ls;
10393 if (fop->op)
10394 ls.push_back(fop->op);
10395 ls.splice(ls.end(), fop->dup_ops);
10396 requeue_ops(ls);
10397 }
10398
10399 simple_opc_submit(std::move(ctx));
10400
10401 flush_ops.erase(oid);
10402
10403 if (fop->blocking)
10404 osd->logger->inc(l_osd_tier_flush);
10405 else
10406 osd->logger->inc(l_osd_tier_try_flush);
10407
10408 return -EINPROGRESS;
10409 }
10410
10411 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10412 vector<ceph_tid_t> *tids)
10413 {
10414 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10415 << fop->objecter_tid << dendl;
10416 if (fop->objecter_tid) {
10417 tids->push_back(fop->objecter_tid);
10418 fop->objecter_tid = 0;
10419 }
10420 if (fop->io_tids.size()) {
10421 for (auto &p : fop->io_tids) {
10422 tids->push_back(p.second);
10423 p.second = 0;
10424 }
10425 }
10426 if (fop->blocking && fop->obc->is_blocked()) {
10427 fop->obc->stop_block();
10428 kick_object_context_blocked(fop->obc);
10429 }
10430 if (requeue) {
10431 if (fop->op)
10432 requeue_op(fop->op);
10433 requeue_ops(fop->dup_ops);
10434 }
10435 if (fop->on_flush) {
10436 (*(fop->on_flush))();
10437 fop->on_flush = std::nullopt;
10438 }
10439 flush_ops.erase(fop->obc->obs.oi.soid);
10440 }
10441
10442 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
10443 {
10444 dout(10) << __func__ << dendl;
10445 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10446 while (p != flush_ops.end()) {
10447 cancel_flush((p++)->second, requeue, tids);
10448 }
10449 }
10450
10451 bool PrimaryLogPG::is_present_clone(hobject_t coid)
10452 {
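// Note the inverted-looking logic below: if the pool does not allow
// incomplete clones, every clone is presumed present, and a clone that
// is merely missing (not yet recovered) also counts as present.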
10453 if (!pool.info.allow_incomplete_clones())
10454 return true;
10455 if (is_missing_object(coid))
10456 return true;
10457 ObjectContextRef obc = get_object_context(coid, false);
10458 return obc && obc->obs.exists;
10459 }
10460
10461 // ========================================================================
10462 // rep op gather
10463
10464 class C_OSD_RepopCommit : public Context {
10465 PrimaryLogPGRef pg;
10466 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
10467 public:
10468 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
10469 : pg(pg), repop(repop) {}
10470 void finish(int) override {
10471 pg->repop_all_committed(repop.get());
10472 }
10473 };
10474
10475 void PrimaryLogPG::repop_all_committed(RepGather *repop)
10476 {
10477 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10478 << dendl;
10479 repop->all_committed = true;
10480 if (!repop->rep_aborted) {
10481 if (repop->v != eversion_t()) {
10482 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
10483 }
10484 eval_repop(repop);
10485 }
10486 }
10487
10488 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
10489 {
10490 dout(10) << "op_applied version " << applied_version << dendl;
10491 ceph_assert(applied_version != eversion_t());
10492 ceph_assert(applied_version <= info.last_update);
10493 recovery_state.local_write_applied(applied_version);
10494 if (is_primary()) {
10495 if (scrubber.active) {
10496 if (recovery_state.get_last_update_applied() >=
10497 scrubber.subset_last_update) {
10498 requeue_scrub(ops_blocked_by_scrub());
10499 }
10500 } else {
10501 ceph_assert(scrubber.start == scrubber.end);
10502 }
10503 }
10504 }
10505
10506 void PrimaryLogPG::eval_repop(RepGather *repop)
10507 {
10508 dout(10) << "eval_repop " << *repop
10509 << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
10510
10511 // ondisk?
10512 if (repop->all_committed) {
10513 dout(10) << " commit: " << *repop << dendl;
10514 for (auto p = repop->on_committed.begin();
10515 p != repop->on_committed.end();
10516 repop->on_committed.erase(p++)) {
10517 (*p)();
10518 }
10519 // send dup commits, in order
10520 auto it = waiting_for_ondisk.find(repop->v);
10521 if (it != waiting_for_ondisk.end()) {
10522 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
10523 for (auto& i : it->second) {
10524 int return_code = repop->r;
10525 if (return_code >= 0) {
10526 return_code = std::get<2>(i);
10527 }
10528 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
10529 std::get<1>(i), std::get<3>(i));
10530 }
10531 waiting_for_ondisk.erase(it);
10532 }
10533
10534 publish_stats_to_osd();
10535
10536 dout(10) << " removing " << *repop << dendl;
10537 ceph_assert(!repop_queue.empty());
10538 dout(20) << " q front is " << *repop_queue.front() << dendl;
10539 if (repop_queue.front() == repop) {
10540 RepGather *to_remove = nullptr;
10541 while (!repop_queue.empty() &&
10542 (to_remove = repop_queue.front())->all_committed) {
10543 repop_queue.pop_front();
10544 for (auto p = to_remove->on_success.begin();
10545 p != to_remove->on_success.end();
10546 to_remove->on_success.erase(p++)) {
10547 (*p)();
10548 }
10549 remove_repop(to_remove);
10550 }
10551 }
10552 }
10553 }
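
// Note: completed repops are only reaped from the front of repop_queue,
// so a repop that commits out of order merely sits there with
// all_committed set until everything ahead of it has committed too;
// this keeps completions in submission order.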
10554
10555 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
10556 {
10557 FUNCTRACE(cct);
10558 const hobject_t& soid = ctx->obs->oi.soid;
10559 dout(7) << "issue_repop rep_tid " << repop->rep_tid
10560 << " o " << soid
10561 << dendl;
10562
10563 repop->v = ctx->at_version;
10564
10565 ctx->op_t->add_obc(ctx->obc);
10566 if (ctx->clone_obc) {
10567 ctx->op_t->add_obc(ctx->clone_obc);
10568 }
10569 if (ctx->head_obc) {
10570 ctx->op_t->add_obc(ctx->head_obc);
10571 }
10572
10573 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
10574 if (!(ctx->log.empty())) {
10575 ceph_assert(ctx->at_version >= projected_last_update);
10576 projected_last_update = ctx->at_version;
10577 }
10578 for (auto &&entry: ctx->log) {
10579 projected_log.add(entry);
10580 }
10581
10582 recovery_state.pre_submit_op(
10583 soid,
10584 ctx->log,
10585 ctx->at_version);
10586 pgbackend->submit_transaction(
10587 soid,
10588 ctx->delta_stats,
10589 ctx->at_version,
10590 std::move(ctx->op_t),
10591 recovery_state.get_pg_trim_to(),
10592 recovery_state.get_min_last_complete_ondisk(),
10593 ctx->log,
10594 ctx->updated_hset_history,
10595 on_all_commit,
10596 repop->rep_tid,
10597 ctx->reqid,
10598 ctx->op);
10599 }
10600
10601 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
10602 OpContext *ctx, ObjectContextRef obc,
10603 ceph_tid_t rep_tid)
10604 {
10605 if (ctx->op)
10606 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
10607 else
10608 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
10609
10610 RepGather *repop = new RepGather(
10611 ctx, rep_tid, info.last_complete);
10612
10613 repop->start = ceph_clock_now();
10614
10615 repop_queue.push_back(&repop->queue_item);
10616 repop->get();
10617
10618 osd->logger->inc(l_osd_op_wip);
10619
10620 dout(10) << __func__ << ": " << *repop << dendl;
10621 return repop;
10622 }
10623
10624 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
10625 eversion_t version,
10626 int r,
10627 ObcLockManager &&manager,
10628 OpRequestRef &&op,
10629 std::optional<std::function<void(void)> > &&on_complete)
10630 {
10631 RepGather *repop = new RepGather(
10632 std::move(manager),
10633 std::move(op),
10634 std::move(on_complete),
10635 osd->get_tid(),
10636 info.last_complete,
10637 r);
10638 repop->v = version;
10639
10640 repop->start = ceph_clock_now();
10641
10642 repop_queue.push_back(&repop->queue_item);
10643
10644 osd->logger->inc(l_osd_op_wip);
10645
10646 dout(10) << __func__ << ": " << *repop << dendl;
10647 return boost::intrusive_ptr<RepGather>(repop);
10648 }
10649
10650 void PrimaryLogPG::remove_repop(RepGather *repop)
10651 {
10652 dout(20) << __func__ << " " << *repop << dendl;
10653
10654 for (auto p = repop->on_finish.begin();
10655 p != repop->on_finish.end();
10656 repop->on_finish.erase(p++)) {
10657 (*p)();
10658 }
10659
10660 release_object_locks(
10661 repop->lock_manager);
10662 repop->put();
10663
10664 osd->logger->dec(l_osd_op_wip);
10665 }
10666
10667 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
10668 {
10669 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
10670 ceph_tid_t rep_tid = osd->get_tid();
10671 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
10672 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
10673 ctx->op_t.reset(new PGTransaction());
10674 ctx->mtime = ceph_clock_now();
10675 return ctx;
10676 }
10677
10678 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
10679 {
10680 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
10681 dout(20) << __func__ << " " << repop << dendl;
10682 issue_repop(repop, ctx.get());
10683 eval_repop(repop);
10684 recovery_state.update_trim_to();
10685 repop->put();
10686 }
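
// Typical usage of the pair above for internal (non-client) writes, as
// seen throughout this file -- e.g. removing a partial temp object:
//
//   OpContextUPtr ctx = simple_opc_create(tempobc);
//   ctx->op_t->remove(temp_oid);            // stage transaction ops
//   ctx->at_version = get_next_version();   // when a log entry is wanted
//   simple_opc_submit(std::move(ctx));      // issue_repop + eval_repop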
10687
10688
10689 void PrimaryLogPG::submit_log_entries(
10690 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
10691 ObcLockManager &&manager,
10692 std::optional<std::function<void(void)> > &&_on_complete,
10693 OpRequestRef op,
10694 int r)
10695 {
10696 dout(10) << __func__ << " " << entries << dendl;
10697 ceph_assert(is_primary());
10698
10699 eversion_t version;
10700 if (!entries.empty()) {
10701 ceph_assert(entries.rbegin()->version >= projected_last_update);
10702 version = projected_last_update = entries.rbegin()->version;
10703 }
10704
10705 boost::intrusive_ptr<RepGather> repop;
10706 std::optional<std::function<void(void)> > on_complete;
10707 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
10708 repop = new_repop(
10709 version,
10710 r,
10711 std::move(manager),
10712 std::move(op),
10713 std::move(_on_complete));
10714 } else {
10715 on_complete = std::move(_on_complete);
10716 }
10717
10718 pgbackend->call_write_ordered(
10719 [this, entries, repop, on_complete]() {
10720 ObjectStore::Transaction t;
10721 eversion_t old_last_update = info.last_update;
10722 recovery_state.merge_new_log_entries(
10723 entries, t, recovery_state.get_pg_trim_to(),
10724 recovery_state.get_min_last_complete_ondisk());
10725
10726 set<pg_shard_t> waiting_on;
10727 for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
10728 i != get_acting_recovery_backfill().end();
10729 ++i) {
10730 pg_shard_t peer(*i);
10731 if (peer == pg_whoami) continue;
10732 ceph_assert(recovery_state.get_peer_missing().count(peer));
10733 ceph_assert(recovery_state.has_peer_info(peer));
10734 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
10735 ceph_assert(repop);
10736 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
10737 entries,
10738 spg_t(info.pgid.pgid, i->shard),
10739 pg_whoami.shard,
10740 get_osdmap_epoch(),
10741 get_last_peering_reset(),
10742 repop->rep_tid,
10743 recovery_state.get_pg_trim_to(),
10744 recovery_state.get_min_last_complete_ondisk());
10745 osd->send_message_osd_cluster(
10746 peer.osd, m, get_osdmap_epoch());
10747 waiting_on.insert(peer);
10748 } else {
10749 MOSDPGLog *m = new MOSDPGLog(
10750 peer.shard, pg_whoami.shard,
10751 info.last_update.epoch,
10752 info, get_last_peering_reset());
10753 m->log.log = entries;
10754 m->log.tail = old_last_update;
10755 m->log.head = info.last_update;
10756 osd->send_message_osd_cluster(
10757 peer.osd, m, get_osdmap_epoch());
10758 }
10759 }
10760 ceph_tid_t rep_tid = repop->rep_tid;
10761 waiting_on.insert(pg_whoami);
10762 log_entry_update_waiting_on.insert(
10763 make_pair(
10764 rep_tid,
10765 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
10766 ));
10767 struct OnComplete : public Context {
10768 PrimaryLogPGRef pg;
10769 ceph_tid_t rep_tid;
10770 epoch_t epoch;
10771 OnComplete(
10772 PrimaryLogPGRef pg,
10773 ceph_tid_t rep_tid,
10774 epoch_t epoch)
10775 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
10776 void finish(int) override {
10777 std::scoped_lock l{*pg};
10778 if (!pg->pg_has_reset_since(epoch)) {
10779 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
10780 ceph_assert(it != pg->log_entry_update_waiting_on.end());
10781 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
10782 ceph_assert(it2 != it->second.waiting_on.end());
10783 it->second.waiting_on.erase(it2);
10784 if (it->second.waiting_on.empty()) {
10785 pg->repop_all_committed(it->second.repop.get());
10786 pg->log_entry_update_waiting_on.erase(it);
10787 }
10788 }
10789 }
10790 };
10791 t.register_on_commit(
10792 new OnComplete{this, rep_tid, get_osdmap_epoch()});
10793 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
10794 ceph_assert(r == 0);
10795 op_applied(info.last_update);
10796 });
10797
10798 recovery_state.update_trim_to();
10799 }
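
// Commit bookkeeping for submit_log_entries(), in brief: the primary
// inserts itself plus every peer it messaged into waiting_on, keyed by
// the repop's rep_tid. The local commit (OnComplete above) and each
// MOSDPGUpdateLogMissingReply then erase one shard; whichever erasure
// empties the set calls repop_all_committed(). Condensed:
//
//   waiting_on = { pg_whoami, peer_1, ..., peer_n };
//   on commit/reply from shard s:
//     waiting_on.erase(s);
//     if (waiting_on.empty()) repop_all_committed(repop);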
10800
10801 void PrimaryLogPG::cancel_log_updates()
10802 {
10803 // get rid of all the LogUpdateCtx so their references to repops are
10804 // dropped
10805 log_entry_update_waiting_on.clear();
10806 }
10807
10808 // -------------------------------------------------------
10809
10810 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
10811 {
10812 std::scoped_lock l{*this};
10813 pair<hobject_t, ObjectContextRef> i;
10814 while (object_contexts.get_next(i.first, &i)) {
10815 ObjectContextRef obc(i.second);
10816 get_obc_watchers(obc, *ls);
10817 }
10818 }
10819
10820 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
10821 {
10822 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10823 obc->watchers.begin();
10824 j != obc->watchers.end();
10825 ++j) {
10826 obj_watch_item_t owi;
10827
10828 owi.obj = obc->obs.oi.soid;
10829 owi.wi.addr = j->second->get_peer_addr();
10830 owi.wi.name = j->second->get_entity();
10831 owi.wi.cookie = j->second->get_cookie();
10832 owi.wi.timeout_seconds = j->second->get_timeout();
10833
10834 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
10835 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
10836
10837 pg_watchers.push_back(owi);
10838 }
10839 }
10840
10841 void PrimaryLogPG::check_blacklisted_watchers()
10842 {
10843 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
10844 pair<hobject_t, ObjectContextRef> i;
10845 while (object_contexts.get_next(i.first, &i))
10846 check_blacklisted_obc_watchers(i.second);
10847 }
10848
10849 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
10850 {
10851 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
10852 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
10853 obc->watchers.begin();
10854 k != obc->watchers.end();
10855 ) {
10856 // Advance the iterator now so handle_watch_timeout() can erase the element
10857 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
10858 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
10859 entity_addr_t ea = j->second->get_peer_addr();
10860 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
10861 if (get_osdmap()->is_blacklisted(ea)) {
10862 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
10863 ceph_assert(j->second->get_pg() == this);
10864 j->second->unregister_cb();
10865 handle_watch_timeout(j->second);
10866 }
10867 }
10868 }
10869
10870 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
10871 {
10872 ceph_assert(is_active());
10873 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
10874 ceph_assert((recovering.count(obc->obs.oi.soid) ||
10875 !is_missing_object(obc->obs.oi.soid)) ||
10876 (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
10877 it_objects->second->op ==
10878 pg_log_entry_t::LOST_REVERT &&
10879 it_objects->second->reverting_to ==
10880 obc->obs.oi.version));
10881
10882 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
10883 ceph_assert(obc->watchers.empty());
10884 // populate watchers from on-disk watch_info; they start disconnected
10885 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
10886 obc->obs.oi.watchers.begin();
10887 p != obc->obs.oi.watchers.end();
10888 ++p) {
10889 utime_t expire = info.stats.last_became_active;
10890 expire += p->second.timeout_seconds;
10891 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
10892 WatchRef watch(
10893 Watch::makeWatchRef(
10894 this, osd, obc, p->second.timeout_seconds, p->first.first,
10895 p->first.second, p->second.addr));
10896 watch->disconnect();
10897 obc->watchers.insert(
10898 make_pair(
10899 make_pair(p->first.first, p->first.second),
10900 watch));
10901 }
10902 // Look for watchers from blacklisted clients and drop
10903 check_blacklisted_obc_watchers(obc);
10904 }
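
// Expiry arithmetic for the unconnected watchers above, with assumed
// numbers: if the PG last became active at t = 1000s and a watch was
// registered with timeout_seconds = 30, the logged expiry is
// expire = 1000 + 30 = 1030s; watch->disconnect() arms the timer that
// actually enforces it.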
10905
10906 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
10907 {
10908 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
10909 dout(10) << "handle_watch_timeout obc " << obc << dendl;
10910
10911 if (!is_active()) {
10912 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
10913 return;
10914 }
10915 if (!obc->obs.exists) {
10916 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
10917 return;
10918 }
10919 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
10920 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
10921 watch->get_delayed_cb()
10922 );
10923 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10924 << obc->obs.oi.soid
10925 << dendl;
10926 return;
10927 }
10928
10929 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
10930 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10931 << obc->obs.oi.soid
10932 << dendl;
10933 scrubber.add_callback(
10934 watch->get_delayed_cb() // requeue this timeout check once scrub unblocks writes
10935 );
10936 return;
10937 }
10938
10939 OpContextUPtr ctx = simple_opc_create(obc);
10940 ctx->at_version = get_next_version();
10941
10942 object_info_t& oi = ctx->new_obs.oi;
10943 oi.watchers.erase(make_pair(watch->get_cookie(),
10944 watch->get_entity()));
10945
10946 list<watch_disconnect_t> watch_disconnects = {
10947 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
10948 };
10949 ctx->register_on_success(
10950 [this, obc, watch_disconnects]() {
10951 complete_disconnect_watches(obc, watch_disconnects);
10952 });
10953
10954
10955 PGTransaction *t = ctx->op_t.get();
10956 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
10957 ctx->at_version,
10958 oi.version,
10959 0,
10960 osd_reqid_t(), ctx->mtime, 0));
10961
10962 oi.prior_version = obc->obs.oi.version;
10963 oi.version = ctx->at_version;
10964 bufferlist bl;
10965 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
10966 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
10967
10968 // apply new object state.
10969 ctx->obc->obs = ctx->new_obs;
10970
10971 // no ctx->delta_stats
10972 simple_opc_submit(std::move(ctx));
10973 }
10974
10975 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
10976 SnapSetContext *ssc)
10977 {
10978 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
10979 ceph_assert(obc->destructor_callback == NULL);
10980 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
10981 obc->obs.oi = oi;
10982 obc->obs.exists = false;
10983 obc->ssc = ssc;
10984 if (ssc)
10985 register_snapset_context(ssc);
10986 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
10987 if (is_active())
10988 populate_obc_watchers(obc);
10989 return obc;
10990 }
10991
10992 ObjectContextRef PrimaryLogPG::get_object_context(
10993 const hobject_t& soid,
10994 bool can_create,
10995 const map<string, bufferlist> *attrs)
10996 {
10997 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
10998 ceph_assert(
10999 attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
11000 // or this is a revert... see recover_primary()
11001 (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11002 it_objects->second->op ==
11003 pg_log_entry_t::LOST_REVERT));
11004 ObjectContextRef obc = object_contexts.lookup(soid);
11005 osd->logger->inc(l_osd_object_ctx_cache_total);
11006 if (obc) {
11007 osd->logger->inc(l_osd_object_ctx_cache_hit);
11008 dout(10) << __func__ << ": found obc in cache: " << obc
11009 << dendl;
11010 } else {
11011 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11012 // check disk
11013 bufferlist bv;
11014 if (attrs) {
11015 auto it_oi = attrs->find(OI_ATTR);
11016 ceph_assert(it_oi != attrs->end());
11017 bv = it_oi->second;
11018 } else {
11019 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11020 if (r < 0) {
11021 if (!can_create) {
11022 dout(10) << __func__ << ": no obc for soid "
11023 << soid << " and !can_create"
11024 << dendl;
11025 return ObjectContextRef(); // -ENOENT!
11026 }
11027
11028 dout(10) << __func__ << ": no obc for soid "
11029 << soid << " but can_create"
11030 << dendl;
11031 // new object.
11032 object_info_t oi(soid);
11033 SnapSetContext *ssc = get_snapset_context(
11034 soid, true, 0, false);
11035 ceph_assert(ssc);
11036 obc = create_object_context(oi, ssc);
11037 dout(10) << __func__ << ": " << obc << " " << soid
11038 << " " << obc->rwstate
11039 << " oi: " << obc->obs.oi
11040 << " ssc: " << obc->ssc
11041 << " snapset: " << obc->ssc->snapset << dendl;
11042 return obc;
11043 }
11044 }
11045
11046 object_info_t oi;
11047 try {
11048 bufferlist::const_iterator bliter = bv.begin();
11049 decode(oi, bliter);
11050 } catch (...) {
11051 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11052 return ObjectContextRef(); // -ENOENT!
11053 }
11054
11055 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11056
11057 obc = object_contexts.lookup_or_create(oi.soid);
11058 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11059 obc->obs.oi = oi;
11060 obc->obs.exists = true;
11061
11062 obc->ssc = get_snapset_context(
11063 soid, true,
11064 soid.has_snapset() ? attrs : 0);
11065
11066 if (is_active())
11067 populate_obc_watchers(obc);
11068
11069 if (pool.info.is_erasure()) {
11070 if (attrs) {
11071 obc->attr_cache = *attrs;
11072 } else {
11073 int r = pgbackend->objects_get_attrs(
11074 soid,
11075 &obc->attr_cache);
11076 ceph_assert(r == 0);
11077 }
11078 }
11079
11080 dout(10) << __func__ << ": creating obc from disk: " << obc
11081 << dendl;
11082 }
11083
11084 // XXX: Caller doesn't expect this
11085 if (obc->ssc == NULL) {
11086 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11087 return ObjectContextRef(); // -ENOENT!
11088 }
11089
11090 dout(10) << __func__ << ": " << obc << " " << soid
11091 << " " << obc->rwstate
11092 << " oi: " << obc->obs.oi
11093 << " exists: " << (int)obc->obs.exists
11094 << " ssc: " << obc->ssc
11095 << " snapset: " << obc->ssc->snapset << dendl;
11096 return obc;
11097 }
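
// Resolution order above, summarized: (1) the in-memory obc cache,
// (2) the caller-supplied attrs map (OI_ATTR must be present there),
// (3) an OI_ATTR read from the backend, and only then, with can_create,
// a brand-new context. A guarded caller looks like:
//
//   ObjectContextRef obc = get_object_context(soid, false);
//   if (!obc)
//     return -ENOENT;  // missing, corrupt, or no snapset context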
11098
11099 void PrimaryLogPG::context_registry_on_change()
11100 {
11101 pair<hobject_t, ObjectContextRef> i;
11102 while (object_contexts.get_next(i.first, &i)) {
11103 ObjectContextRef obc(i.second);
11104 if (obc) {
11105 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11106 obc->watchers.begin();
11107 j != obc->watchers.end();
11108 obc->watchers.erase(j++)) {
11109 j->second->discard();
11110 }
11111 }
11112 }
11113 }
11114
11115
11116 /*
11117 * If we return an error, and set *pmissing, then promoting that
11118 * object may help.
11119 *
11120 * If we return -EAGAIN, we will always set *pmissing to the missing
11121 * object to wait for.
11122 *
11123 * If we return an error but do not set *pmissing, then we know the
11124 * object does not exist.
11125 */
11126 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11127 ObjectContextRef *pobc,
11128 bool can_create,
11129 bool map_snapid_to_clone,
11130 hobject_t *pmissing)
11131 {
11132 FUNCTRACE(cct);
11133 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11134 // want the head?
11135 if (oid.snap == CEPH_NOSNAP) {
11136 ObjectContextRef obc = get_object_context(oid, can_create);
11137 if (!obc) {
11138 if (pmissing)
11139 *pmissing = oid;
11140 return -ENOENT;
11141 }
11142 dout(10) << __func__ << " " << oid
11143 << " @" << oid.snap
11144 << " oi=" << obc->obs.oi
11145 << dendl;
11146 *pobc = obc;
11147
11148 return 0;
11149 }
11150
11151 // we want a snap
11152
11153 hobject_t head = oid.get_head();
11154 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11155 if (!ssc || !(ssc->exists || can_create)) {
11156 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11157 if (pmissing)
11158 *pmissing = head; // start by getting the head
11159 if (ssc)
11160 put_snapset_context(ssc);
11161 return -ENOENT;
11162 }
11163
11164 if (map_snapid_to_clone) {
11165 dout(10) << __func__ << " " << oid << " @" << oid.snap
11166 << " snapset " << ssc->snapset
11167 << " map_snapid_to_clone=true" << dendl;
11168 if (oid.snap > ssc->snapset.seq) {
11169 // must already be readable
11170 ObjectContextRef obc = get_object_context(head, false);
11171 dout(10) << __func__ << " " << oid << " @" << oid.snap
11172 << " snapset " << ssc->snapset
11173 << " maps to head" << dendl;
11174 *pobc = obc;
11175 put_snapset_context(ssc);
11176 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11177 } else {
11178 vector<snapid_t>::const_iterator citer = std::find(
11179 ssc->snapset.clones.begin(),
11180 ssc->snapset.clones.end(),
11181 oid.snap);
11182 if (citer == ssc->snapset.clones.end()) {
11183 dout(10) << __func__ << " " << oid << " @" << oid.snap
11184 << " snapset " << ssc->snapset
11185 << " maps to nothing" << dendl;
11186 put_snapset_context(ssc);
11187 return -ENOENT;
11188 }
11189
11190 dout(10) << __func__ << " " << oid << " @" << oid.snap
11191 << " snapset " << ssc->snapset
11192 << " maps to " << oid << dendl;
11193
11194 if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11195 dout(10) << __func__ << " " << oid << " @" << oid.snap
11196 << " snapset " << ssc->snapset
11197 << " " << oid << " is missing" << dendl;
11198 if (pmissing)
11199 *pmissing = oid;
11200 put_snapset_context(ssc);
11201 return -EAGAIN;
11202 }
11203
11204 ObjectContextRef obc = get_object_context(oid, false);
11205 if (!obc || !obc->obs.exists) {
11206 dout(10) << __func__ << " " << oid << " @" << oid.snap
11207 << " snapset " << ssc->snapset
11208 << " " << oid << " is not present" << dendl;
11209 if (pmissing)
11210 *pmissing = oid;
11211 put_snapset_context(ssc);
11212 return -ENOENT;
11213 }
11214 dout(10) << __func__ << " " << oid << " @" << oid.snap
11215 << " snapset " << ssc->snapset
11216 << " " << oid << " HIT" << dendl;
11217 *pobc = obc;
11218 put_snapset_context(ssc);
11219 return 0;
11220 }
11221 ceph_abort(); //unreachable
11222 }
11223
11224 dout(10) << __func__ << " " << oid << " @" << oid.snap
11225 << " snapset " << ssc->snapset << dendl;
11226
11227 // head?
11228 if (oid.snap > ssc->snapset.seq) {
11229 ObjectContextRef obc = get_object_context(head, false);
11230 dout(10) << __func__ << " " << head
11231 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11232 << " -- HIT " << obc->obs
11233 << dendl;
11234 if (!obc->ssc)
11235 obc->ssc = ssc;
11236 else {
11237 ceph_assert(ssc == obc->ssc);
11238 put_snapset_context(ssc);
11239 }
11240 *pobc = obc;
11241 return 0;
11242 }
11243
11244 // which clone would it be?
11245 unsigned k = 0;
11246 while (k < ssc->snapset.clones.size() &&
11247 ssc->snapset.clones[k] < oid.snap)
11248 k++;
11249 if (k == ssc->snapset.clones.size()) {
11250 dout(10) << __func__ << " no clones with last >= oid.snap "
11251 << oid.snap << " -- DNE" << dendl;
11252 put_snapset_context(ssc);
11253 return -ENOENT;
11254 }
11255 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
11256 info.pgid.pool(), oid.get_namespace());
11257
11258 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
11259 dout(20) << __func__ << " " << soid << " missing, try again later"
11260 << dendl;
11261 if (pmissing)
11262 *pmissing = soid;
11263 put_snapset_context(ssc);
11264 return -EAGAIN;
11265 }
11266
11267 ObjectContextRef obc = get_object_context(soid, false);
11268 if (!obc || !obc->obs.exists) {
11269 if (pmissing)
11270 *pmissing = soid;
11271 put_snapset_context(ssc);
11272 if (is_primary()) {
11273 if (is_degraded_or_backfilling_object(soid)) {
11274 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
11275 return -EAGAIN;
11276 } else if (is_degraded_on_async_recovery_target(soid)) {
11277 dout(20) << __func__ << " clone is recovering " << soid << dendl;
11278 return -EAGAIN;
11279 } else {
11280 dout(20) << __func__ << " missing clone " << soid << dendl;
11281 return -ENOENT;
11282 }
11283 } else {
11284 dout(20) << __func__ << " replica missing clone " << soid << dendl;
11285 return -ENOENT;
11286 }
11287 }
11288
11289 if (!obc->ssc) {
11290 obc->ssc = ssc;
11291 } else {
11292 ceph_assert(obc->ssc == ssc);
11293 put_snapset_context(ssc);
11294 }
11295 ssc = 0;
11296
11297 // clone
11298 dout(20) << __func__ << " " << soid
11299 << " snapset " << obc->ssc->snapset
11300 << dendl;
11301 snapid_t first, last;
11302 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
11303 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
11304 if (p->second.empty()) {
11305 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
11306 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
11307 return -ENOENT;
11308 }
11309 if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
11310 p->second.end()) {
11311 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11312 << " does not contain " << oid.snap << " -- DNE" << dendl;
11313 return -ENOENT;
11314 }
11315 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
11316 dout(20) << __func__ << " " << soid << " snap " << oid.snap
11317 << " in removed_snaps_queue" << " -- DNE" << dendl;
11318 return -ENOENT;
11319 }
11320 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11321 << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
11322 *pobc = obc;
11323 return 0;
11324 }
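
// Driving the contract above from a caller -- a condensed sketch
// (the waiter helper shown is illustrative):
//
//   hobject_t missing;
//   int r = find_object_context(oid, &obc, can_create, false, &missing);
//   if (r == -EAGAIN) {
//     wait_for_unreadable_object(missing, op);  // retry once recovered
//   } else if (r < 0) {
//     // object/clone does not exist; reply with the error
//   }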
11325
11326 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11327 {
11328 if (obc->ssc)
11329 put_snapset_context(obc->ssc);
11330 }
11331
11332 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
11333 {
11334 object_info_t& oi = obc->obs.oi;
11335
11336 dout(10) << __func__ << " " << oi.soid << dendl;
11337 ceph_assert(!oi.soid.is_snapdir());
11338
11339 object_stat_sum_t stat;
11340 stat.num_objects++;
11341 if (oi.is_dirty())
11342 stat.num_objects_dirty++;
11343 if (oi.is_whiteout())
11344 stat.num_whiteouts++;
11345 if (oi.is_omap())
11346 stat.num_objects_omap++;
11347 if (oi.is_cache_pinned())
11348 stat.num_objects_pinned++;
11349 if (oi.has_manifest())
11350 stat.num_objects_manifest++;
11351
11352 if (oi.soid.is_snap()) {
11353 stat.num_object_clones++;
11354
11355 if (!obc->ssc)
11356 obc->ssc = get_snapset_context(oi.soid, false);
11357 ceph_assert(obc->ssc);
11358 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
11359 } else {
11360 stat.num_bytes += oi.size;
11361 }
11362
11363 // add it in
11364 pgstat->stats.sum.add(stat);
11365 }
11366
11367 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
11368 {
11369 const hobject_t& soid = obc->obs.oi.soid;
11370 if (obc->is_blocked()) {
11371 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
11372 return;
11373 }
11374
11375 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
11376 if (p != waiting_for_blocked_object.end()) {
11377 list<OpRequestRef>& ls = p->second;
11378 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
11379 requeue_ops(ls);
11380 waiting_for_blocked_object.erase(p);
11381 }
11382
11383 map<hobject_t, ObjectContextRef>::iterator i =
11384 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
11385 if (i != objects_blocked_on_snap_promotion.end()) {
11386 ceph_assert(i->second == obc);
11387 objects_blocked_on_snap_promotion.erase(i);
11388 }
11389
11390 if (obc->requeue_scrub_on_unblock) {
11391 obc->requeue_scrub_on_unblock = false;
11392 // only requeue if we are still active: we may be unblocking
11393 // because we are resetting for a new peering interval
11394 if (is_active()) {
11395 requeue_scrub();
11396 }
11397 }
11398 }
11399
11400 SnapSetContext *PrimaryLogPG::get_snapset_context(
11401 const hobject_t& oid,
11402 bool can_create,
11403 const map<string, bufferlist> *attrs,
11404 bool oid_existed)
11405 {
11406 std::lock_guard l(snapset_contexts_lock);
11407 SnapSetContext *ssc;
11408 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
11409 oid.get_snapdir());
11410 if (p != snapset_contexts.end()) {
11411 if (can_create || p->second->exists) {
11412 ssc = p->second;
11413 } else {
11414 return NULL;
11415 }
11416 } else {
11417 bufferlist bv;
11418 if (!attrs) {
11419 int r = -ENOENT;
11420 if (!(oid.is_head() && !oid_existed)) {
11421 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
11422 }
11423 if (r < 0 && !can_create)
11424 return NULL;
11425 } else {
11426 auto it_ss = attrs->find(SS_ATTR);
11427 ceph_assert(it_ss != attrs->end());
11428 bv = it_ss->second;
11429 }
11430 ssc = new SnapSetContext(oid.get_snapdir());
11431 _register_snapset_context(ssc);
11432 if (bv.length()) {
11433 bufferlist::const_iterator bvp = bv.begin();
11434 try {
11435 ssc->snapset.decode(bvp);
11436 } catch (buffer::error& e) {
11437 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
11438 return NULL;
11439 }
11440 ssc->exists = true;
11441 } else {
11442 ssc->exists = false;
11443 }
11444 }
11445 ceph_assert(ssc);
11446 ssc->ref++;
11447 return ssc;
11448 }
11449
11450 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11451 {
11452 std::lock_guard l(snapset_contexts_lock);
11453 --ssc->ref;
11454 if (ssc->ref == 0) {
11455 if (ssc->registered)
11456 snapset_contexts.erase(ssc->oid);
11457 delete ssc;
11458 }
11459 }
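
// SnapSetContext is manually refcounted: every successful
// get_snapset_context() must be balanced by put_snapset_context(),
// unless the ref is handed to an obc (obc->ssc), in which case
// object_context_destructor_callback() releases it. Sketch:
//
//   SnapSetContext *ssc = get_snapset_context(oid, false);
//   if (!ssc)
//     return -ENOENT;
//   // ... inspect ssc->snapset ...
//   put_snapset_context(ssc);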
11460
11461 /*
11462 * Return values:
11463 * NONE - didn't pull anything
11464 * YES - pulled what the caller wanted
11465 * HEAD - needed to pull head first
11466 */
11467 enum { PULL_NONE, PULL_HEAD, PULL_YES };
11468
11469 int PrimaryLogPG::recover_missing(
11470 const hobject_t &soid, eversion_t v,
11471 int priority,
11472 PGBackend::RecoveryHandle *h)
11473 {
11474 if (recovery_state.get_missing_loc().is_unfound(soid)) {
11475 dout(7) << __func__ << " " << soid
11476 << " v " << v
11477 << " but it is unfound" << dendl;
11478 return PULL_NONE;
11479 }
11480
11481 if (recovery_state.get_missing_loc().is_deleted(soid)) {
11482 start_recovery_op(soid);
11483 ceph_assert(!recovering.count(soid));
11484 recovering.insert(make_pair(soid, ObjectContextRef()));
11485 epoch_t cur_epoch = get_osdmap_epoch();
11486 remove_missing_object(soid, v, new LambdaContext(
11487 [=](int) {
11488 std::scoped_lock locker{*this};
11489 if (!pg_has_reset_since(cur_epoch)) {
11490 bool object_missing = false;
11491 for (const auto& shard : get_acting_recovery_backfill()) {
11492 if (shard == pg_whoami)
11493 continue;
11494 if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
11495 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
11496 object_missing = true;
11497 break;
11498 }
11499 }
11500 if (!object_missing) {
11501 object_stat_sum_t stat_diff;
11502 stat_diff.num_objects_recovered = 1;
11503 if (scrub_after_recovery)
11504 stat_diff.num_objects_repaired = 1;
11505 on_global_recover(soid, stat_diff, true);
11506 } else {
11507 auto recovery_handle = pgbackend->open_recovery_op();
11508 pgbackend->recover_delete_object(soid, v, recovery_handle);
11509 pgbackend->run_recovery_op(recovery_handle, priority);
11510 }
11511 }
11512 }));
11513 return PULL_YES;
11514 }
11515
11516 // Is this a snapped object? If so, consult the snapset; we may not need the entire object.
11517 ObjectContextRef obc;
11518 ObjectContextRef head_obc;
11519 if (soid.snap && soid.snap < CEPH_NOSNAP) {
11520 // do we have the head?
11521 hobject_t head = soid.get_head();
11522 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
11523 if (recovering.count(head)) {
11524 dout(10) << " missing but already recovering head " << head << dendl;
11525 return PULL_NONE;
11526 } else {
11527 int r = recover_missing(
11528 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
11529 h);
11530 if (r != PULL_NONE)
11531 return PULL_HEAD;
11532 return PULL_NONE;
11533 }
11534 }
11535 head_obc = get_object_context(
11536 head,
11537 false,
11538 0);
11539 ceph_assert(head_obc);
11540 }
11541 start_recovery_op(soid);
11542 ceph_assert(!recovering.count(soid));
11543 recovering.insert(make_pair(soid, obc));
11544 int r = pgbackend->recover_object(
11545 soid,
11546 v,
11547 head_obc,
11548 obc,
11549 h);
11550 // This is only a pull, which shouldn't return an error
11551 ceph_assert(r >= 0);
11552 return PULL_YES;
11553 }
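
// Caller-side meaning of the PULL_* results above: PULL_YES means
// recovery (or delete replay) of the requested object is now in flight;
// PULL_HEAD means the head had to be pulled first, so the clone must be
// retried once the head arrives; PULL_NONE means nothing was started
// (object unfound, or its head is already recovering).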
11554
11555 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
11556 eversion_t v, Context *on_complete)
11557 {
11558 dout(20) << __func__ << " " << soid << " " << v << dendl;
11559 ceph_assert(on_complete != nullptr);
11560 // delete locally
11561 ObjectStore::Transaction t;
11562 remove_snap_mapped_object(t, soid);
11563
11564 ObjectRecoveryInfo recovery_info;
11565 recovery_info.soid = soid;
11566 recovery_info.version = v;
11567
11568 epoch_t cur_epoch = get_osdmap_epoch();
11569 t.register_on_complete(new LambdaContext(
11570 [=](int) {
11571 std::unique_lock locker{*this};
11572 if (!pg_has_reset_since(cur_epoch)) {
11573 ObjectStore::Transaction t2;
11574 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
11575 t2.register_on_complete(on_complete);
11576 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
11577 ceph_assert(r == 0);
11578 locker.unlock();
11579 } else {
11580 locker.unlock();
11581 on_complete->complete(-EAGAIN);
11582 }
11583 }));
11584 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
11585 ceph_assert(r == 0);
11586 }
11587
11588 void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
11589 {
11590 dout(10) << __func__ << " " << oid << dendl;
11591 if (callbacks_for_degraded_object.count(oid)) {
11592 list<Context*> contexts;
11593 contexts.swap(callbacks_for_degraded_object[oid]);
11594 callbacks_for_degraded_object.erase(oid);
11595 for (list<Context*>::iterator i = contexts.begin();
11596 i != contexts.end();
11597 ++i) {
11598 (*i)->complete(0);
11599 }
11600 }
11601 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
11602 oid.get_head());
11603 if (i != objects_blocked_on_degraded_snap.end() &&
11604 i->second == oid.snap)
11605 objects_blocked_on_degraded_snap.erase(i);
11606 }
11607
11608 void PrimaryLogPG::_committed_pushed_object(
11609 epoch_t epoch, eversion_t last_complete)
11610 {
11611 std::scoped_lock locker{*this};
11612 if (!pg_has_reset_since(epoch)) {
11613 recovery_state.recovery_committed_to(last_complete);
11614 } else {
11615 dout(10) << __func__
11616 << " pg has changed, not touching last_complete_ondisk" << dendl;
11617 }
11618 }
11619
11620 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
11621 {
11622 dout(20) << __func__ << dendl;
11623 if (obc) {
11624 dout(20) << "obc = " << *obc << dendl;
11625 }
11626 ceph_assert(active_pushes >= 1);
11627 --active_pushes;
11628
11629 // requeue an active chunky scrub waiting on recovery ops
11630 if (!recovery_state.is_deleting() && active_pushes == 0
11631 && scrubber.is_chunky_scrub_active()) {
11632 requeue_scrub(ops_blocked_by_scrub());
11633 }
11634 }
11635
11636 void PrimaryLogPG::_applied_recovered_object_replica()
11637 {
11638 dout(20) << __func__ << dendl;
11639 ceph_assert(active_pushes >= 1);
11640 --active_pushes;
11641
11642 // requeue an active chunky scrub waiting on recovery ops
11643 if (!recovery_state.is_deleting() && active_pushes == 0 &&
11644 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
11645 scrubber.active_rep_scrub->get_req())->chunky) {
11646 auto& op = scrubber.active_rep_scrub;
11647 osd->enqueue_back(
11648 OpSchedulerItem(
11649 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
11650 op->get_req()->get_cost(),
11651 op->get_req()->get_priority(),
11652 op->get_req()->get_recv_stamp(),
11653 op->get_req()->get_source().num(),
11654 get_osdmap_epoch()));
11655 scrubber.active_rep_scrub.reset();
11656 }
11657 }
11658
11659 void PrimaryLogPG::on_failed_pull(
11660 const set<pg_shard_t> &from,
11661 const hobject_t &soid,
11662 const eversion_t &v)
11663 {
11664 dout(20) << __func__ << ": " << soid << dendl;
11665 ceph_assert(recovering.count(soid));
11666 auto obc = recovering[soid];
11667 if (obc) {
11668 list<OpRequestRef> blocked_ops;
11669 obc->drop_recovery_read(&blocked_ops);
11670 requeue_ops(blocked_ops);
11671 }
11672 recovering.erase(soid);
11673 for (auto&& i : from) {
11674 if (i != pg_whoami) { // we'll get it below in primary_error
11675 recovery_state.force_object_missing(i, soid, v);
11676 }
11677 }
11678
11679 dout(0) << __func__ << " " << soid << " from shard " << from
11680 << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
11681 << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
11682 << dendl;
11683 finish_recovery_op(soid); // close out this attempt
11684 finish_degraded_object(soid);
11685
11686 if (from.count(pg_whoami)) {
11687 dout(0) << " primary missing oid " << soid << " version " << v << dendl;
11688 primary_error(soid, v);
11689 backfills_in_flight.erase(soid);
11690 }
11691 }
11692
11693 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
11694 {
11695 eversion_t v;
11696 pg_missing_item pmi;
11697 bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
11698 ceph_assert(is_missing);
11699 v = pmi.have;
11700 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
11701
11702 ceph_assert(!get_acting_recovery_backfill().empty());
11703 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
11704 i != get_acting_recovery_backfill().end();
11705 ++i) {
11706 if (*i == get_primary()) continue;
11707 pg_shard_t peer = *i;
11708 if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
11709 continue;
11710 }
11711 eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
11712 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
11713 if (h > v)
11714 v = h;
11715 }
11716
11717 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
11718 return v;
11719 }
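
// Worked example with assumed versions: if the primary's missing entry
// records have = 40'98 while two peers' missing entries record
// have = 40'97 and have = 40'102 for the same oid, the function returns
// 40'102 -- the newest version still held by any acting/backfill shard,
// which is what LOST_REVERT reverts to.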
11720
11721 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
11722 {
11723 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
11724 op->get_req());
11725 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
11726 ObjectStore::Transaction t;
11727 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
11728 if (m->pg_trim_to != eversion_t())
11729 op_trim_to = m->pg_trim_to;
11730 if (m->pg_roll_forward_to != eversion_t())
11731 op_roll_forward_to = m->pg_roll_forward_to;
11732
11733 dout(20) << __func__
11734 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
11735
11736 recovery_state.append_log_entries_update_missing(
11737 m->entries, t, op_trim_to, op_roll_forward_to);
11738 eversion_t new_lcod = info.last_complete;
11739
11740 Context *complete = new LambdaContext(
11741 [=](int) {
11742 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
11743 op->get_req());
11744 std::scoped_lock locker{*this};
11745 if (!pg_has_reset_since(msg->get_epoch())) {
11746 update_last_complete_ondisk(new_lcod);
11747 MOSDPGUpdateLogMissingReply *reply =
11748 new MOSDPGUpdateLogMissingReply(
11749 spg_t(info.pgid.pgid, primary_shard().shard),
11750 pg_whoami.shard,
11751 msg->get_epoch(),
11752 msg->min_epoch,
11753 msg->get_tid(),
11754 new_lcod);
11755 reply->set_priority(CEPH_MSG_PRIO_HIGH);
11756 msg->get_connection()->send_message(reply);
11757 }
11758 });
11759
11760 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
11761 t.register_on_commit(complete);
11762 } else {
11763 /* Hack to work around the fact that ReplicatedBackend sends
11764 * ack+commit if commit happens first
11765 *
11766 * This behavior is no longer necessary, but we preserve it so old
11767 * primaries can keep their repops in order */
11768 if (pool.info.is_erasure()) {
11769 t.register_on_complete(complete);
11770 } else {
11771 t.register_on_commit(complete);
11772 }
11773 }
11774 int tr = osd->store->queue_transaction(
11775 ch,
11776 std::move(t),
11777 nullptr);
11778 ceph_assert(tr == 0);
11779 op_applied(info.last_update);
11780 }
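
// This is the replica half of submit_log_entries(): apply the entries
// locally, then ack on commit (on complete for pre-kraken EC pools, per
// the hack above) with our new last_complete_ondisk so the primary can
// advance min_last_complete_ondisk. The reply is sent at
// CEPH_MSG_PRIO_HIGH to keep the primary's repop queue draining.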
11781
11782 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
11783 {
11784 const MOSDPGUpdateLogMissingReply *m =
11785 static_cast<const MOSDPGUpdateLogMissingReply*>(
11786 op->get_req());
11787 dout(20) << __func__ << " got reply from "
11788 << m->get_from() << dendl;
11789
11790 auto it = log_entry_update_waiting_on.find(m->get_tid());
11791 if (it != log_entry_update_waiting_on.end()) {
11792 if (it->second.waiting_on.count(m->get_from())) {
11793 it->second.waiting_on.erase(m->get_from());
11794 if (m->last_complete_ondisk != eversion_t()) {
11795 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
11796 }
11797 } else {
11798 osd->clog->error()
11799 << info.pgid << " got reply "
11800 << *m << " from shard we are not waiting for "
11801 << m->get_from();
11802 }
11803
11804 if (it->second.waiting_on.empty()) {
11805 repop_all_committed(it->second.repop.get());
11806 log_entry_update_waiting_on.erase(it);
11807 }
11808 } else {
11809 osd->clog->error()
11810 << info.pgid << " got reply "
11811 << *m << " on unknown tid " << m->get_tid();
11812 }
11813 }
11814
11815 /* Mark all unfound objects as lost.
11816 */
11817 void PrimaryLogPG::mark_all_unfound_lost(
11818 int what,
11819 std::function<void(int,const std::string&,bufferlist&)> on_finish)
11820 {
11821 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
11822 list<hobject_t> oids;
11823
11824 dout(30) << __func__ << ": log before:\n";
11825 recovery_state.get_pg_log().get_log().print(*_dout);
11826 *_dout << dendl;
11827
11828 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
11829
11830 utime_t mtime = ceph_clock_now();
11831 map<hobject_t, pg_missing_item>::const_iterator m =
11832 recovery_state.get_missing_loc().get_needs_recovery().begin();
11833 map<hobject_t, pg_missing_item>::const_iterator mend =
11834 recovery_state.get_missing_loc().get_needs_recovery().end();
11835
11836 ObcLockManager manager;
11837 eversion_t v = get_next_version();
11838 v.epoch = get_osdmap_epoch();
11839 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
11840 while (m != mend) {
11841 const hobject_t &oid(m->first);
11842 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
11843 // We only care about unfound objects
11844 ++m;
11845 continue;
11846 }
11847
11848 ObjectContextRef obc;
11849 eversion_t prev;
11850
11851 switch (what) {
11852 case pg_log_entry_t::LOST_MARK:
11853 ceph_abort_msg("actually, not implemented yet!");
11854 break;
11855
11856 case pg_log_entry_t::LOST_REVERT:
11857 prev = pick_newest_available(oid);
11858 if (prev > eversion_t()) {
11859 // log it
11860 pg_log_entry_t e(
11861 pg_log_entry_t::LOST_REVERT, oid, v,
11862 m->second.need, 0, osd_reqid_t(), mtime, 0);
11863 e.reverting_to = prev;
11864 e.mark_unrollbackable();
11865 log_entries.push_back(e);
11866 dout(10) << e << dendl;
11867
11868 // we are now missing the new version; recovery code will sort it out.
11869 ++v.version;
11870 ++m;
11871 break;
11872 }
11873
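// fall through to LOST_DELETE: nothing available to revert to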
11874 case pg_log_entry_t::LOST_DELETE:
11875 {
11876 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
11877 0, osd_reqid_t(), mtime, 0);
11878 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11879 if (pool.info.require_rollback()) {
11880 e.mod_desc.try_rmobject(v.version);
11881 } else {
11882 e.mark_unrollbackable();
11883 }
11884 } // otherwise, just do what we used to do
11885 dout(10) << e << dendl;
11886 log_entries.push_back(e);
11887 oids.push_back(oid);
11888
11889 // If a context is found, mark the object as deleted so we don't
11890 // race with a new creation. This can happen if the object was
11891 // lost and the primary hit EIO.
11892 obc = object_contexts.lookup(oid);
11893 if (obc)
11894 obc->obs.exists = false;
11895
11896 ++v.version;
11897 ++m;
11898 }
11899 break;
11900
11901 default:
11902 ceph_abort();
11903 }
11904 }
11905
11906 recovery_state.update_stats(
11907 [](auto &history, auto &stats) {
11908 stats.stats_invalid = true;
11909 return false;
11910 });
11911
11912 submit_log_entries(
11913 log_entries,
11914 std::move(manager),
11915 std::optional<std::function<void(void)> >(
11916 [this, oids, num_unfound, on_finish]() {
11917 if (recovery_state.perform_deletes_during_peering()) {
11918 for (auto oid : oids) {
11919 // clear old locations - merge_new_log_entries will have
11920 // handled rebuilding missing_loc for each of these
11921 // objects if we have the RECOVERY_DELETES flag
11922 recovery_state.object_recovered(oid, object_stat_sum_t());
11923 }
11924 }
11925
11926 if (is_recovery_unfound()) {
11927 queue_peering_event(
11928 PGPeeringEventRef(
11929 std::make_shared<PGPeeringEvent>(
11930 get_osdmap_epoch(),
11931 get_osdmap_epoch(),
11932 PeeringState::DoRecovery())));
11933 } else if (is_backfill_unfound()) {
11934 queue_peering_event(
11935 PGPeeringEventRef(
11936 std::make_shared<PGPeeringEvent>(
11937 get_osdmap_epoch(),
11938 get_osdmap_epoch(),
11939 PeeringState::RequestBackfill())));
11940 } else {
11941 queue_recovery();
11942 }
11943
11944 stringstream ss;
11945 ss << "pg has " << num_unfound
11946 << " objects unfound and apparently lost marking";
11947 string rs = ss.str();
11948 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
11949 osd->clog->info() << rs;
11950 bufferlist empty;
11951 on_finish(0, rs, empty);
11952 }),
11953 OpRequestRef());
11954 }
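
// Per-object decision table for mark_all_unfound_lost():
//   LOST_REVERT: if pick_newest_available() found a version, log a
//                revert to it; otherwise fall through and delete.
//   LOST_DELETE: log a LOST_DELETE (rollback-aware when the pool
//                requires rollback).
//   LOST_MARK:   not implemented; aborts.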
11955
11956 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
11957 {
11958 ceph_assert(repop_queue.empty());
11959 }
11960
11961 /*
11962 * pg status change notification
11963 */
11964
11965 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11966 {
11967 list<OpRequestRef> rq;
11968
11969 // apply all repops
11970 while (!repop_queue.empty()) {
11971 RepGather *repop = repop_queue.front();
11972 repop_queue.pop_front();
11973 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11974 repop->rep_aborted = true;
11975 repop->on_committed.clear();
11976 repop->on_success.clear();
11977
11978 if (requeue) {
11979 if (repop->op) {
11980 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11981 rq.push_back(repop->op);
11982 repop->op = OpRequestRef();
11983 }
11984
11985 // also requeue any dups, interleaved into position
11986 auto p = waiting_for_ondisk.find(repop->v);
11987 if (p != waiting_for_ondisk.end()) {
11988 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11989 for (auto& i : p->second) {
11990 rq.push_back(std::get<0>(i));
11991 }
11992 waiting_for_ondisk.erase(p);
11993 }
11994 }
11995
11996 remove_repop(repop);
11997 }
11998
11999 ceph_assert(repop_queue.empty());
12000
12001 if (requeue) {
12002 requeue_ops(rq);
12003 if (!waiting_for_ondisk.empty()) {
12004 for (auto& i : waiting_for_ondisk) {
12005 for (auto& j : i.second) {
12006 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12007 << " waiting on " << i.first << dendl;
12008 }
12009 }
12010 ceph_assert(waiting_for_ondisk.empty());
12011 }
12012 }
12013
12014 waiting_for_ondisk.clear();
12015 }
12016
12017 void PrimaryLogPG::on_flushed()
12018 {
12019 requeue_ops(waiting_for_flush);
12020 if (!is_peered() || !is_primary()) {
12021 pair<hobject_t, ObjectContextRef> i;
12022 while (object_contexts.get_next(i.first, &i)) {
12023 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12024 }
12025 ceph_assert(object_contexts.empty());
12026 }
12027 }
12028
12029 void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
12030 {
12031 dout(10) << __func__ << dendl;
12032
12033 on_shutdown();
12034
12035 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12036 }
12037
12038 void PrimaryLogPG::clear_async_reads()
12039 {
12040 dout(10) << __func__ << dendl;
12041 for(auto& i : in_progress_async_reads) {
12042 dout(10) << "clear ctx: "
12043 << "OpRequestRef " << i.first
12044 << " OpContext " << i.second
12045 << dendl;
12046 close_op_ctx(i.second);
12047 }
12048 }
12049
12050 void PrimaryLogPG::clear_cache()
12051 {
12052 object_contexts.clear();
12053 }
12054
12055 void PrimaryLogPG::on_shutdown()
12056 {
12057 dout(10) << __func__ << dendl;
12058
12059 if (recovery_queued) {
12060 recovery_queued = false;
12061 osd->clear_queued_recovery(this);
12062 }
12063
12064 clear_scrub_reserved();
12065 scrub_clear_state();
12066
12067 unreg_next_scrub();
12068
12069 vector<ceph_tid_t> tids;
12070 cancel_copy_ops(false, &tids);
12071 cancel_flush_ops(false, &tids);
12072 cancel_proxy_ops(false, &tids);
12073 cancel_manifest_ops(false, &tids);
12074 osd->objecter->op_cancel(tids, -ECANCELED);
12075
12076 apply_and_flush_repops(false);
12077 cancel_log_updates();
12078 // we must remove PGRefs, so do this prior to the release_backoffs() callers
12079 clear_backoffs();
12080 // clean up snap trim references
12081 snap_trimmer_machine.process_event(Reset());
12082
12083 pgbackend->on_change();
12084
12085 context_registry_on_change();
12086 object_contexts.clear();
12087
12088 clear_async_reads();
12089
12090 osd->remote_reserver.cancel_reservation(info.pgid);
12091 osd->local_reserver.cancel_reservation(info.pgid);
12092
12093 clear_primary_state();
12094 cancel_recovery();
12095
12096 if (is_primary()) {
12097 osd->clear_ready_to_merge(this);
12098 }
12099 }
12100
12101 void PrimaryLogPG::on_activate_complete()
12102 {
12103 check_local();
12104 // waiters
12105 if (!recovery_state.needs_flush()) {
12106 requeue_ops(waiting_for_peered);
12107 } else if (!waiting_for_peered.empty()) {
12108 dout(10) << __func__ << " flushes in progress, moving "
12109 << waiting_for_peered.size()
12110 << " items to waiting_for_flush"
12111 << dendl;
12112 ceph_assert(waiting_for_flush.empty());
12113 waiting_for_flush.swap(waiting_for_peered);
12114 }
12115
12116
12117 // all clean?
12118 if (needs_recovery()) {
12119 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12120 queue_peering_event(
12121 PGPeeringEventRef(
12122 std::make_shared<PGPeeringEvent>(
12123 get_osdmap_epoch(),
12124 get_osdmap_epoch(),
12125 PeeringState::DoRecovery())));
12126 } else if (needs_backfill()) {
12127 dout(10) << "activate queueing backfill" << dendl;
12128 queue_peering_event(
12129 PGPeeringEventRef(
12130 std::make_shared<PGPeeringEvent>(
12131 get_osdmap_epoch(),
12132 get_osdmap_epoch(),
12133 PeeringState::RequestBackfill())));
12134 } else {
12135 dout(10) << "activate all replicas clean, no recovery" << dendl;
12136 eio_errors_to_process = false;
12137 queue_peering_event(
12138 PGPeeringEventRef(
12139 std::make_shared<PGPeeringEvent>(
12140 get_osdmap_epoch(),
12141 get_osdmap_epoch(),
12142 PeeringState::AllReplicasRecovered())));
12143 }
12144
12145 publish_stats_to_osd();
12146
12147 if (get_backfill_targets().size()) {
12148 last_backfill_started = earliest_backfill();
12149 new_backfill = true;
12150 ceph_assert(!last_backfill_started.is_max());
12151 dout(5) << __func__ << ": bft=" << get_backfill_targets()
12152 << " from " << last_backfill_started << dendl;
12153 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12154 i != get_backfill_targets().end();
12155 ++i) {
12156 dout(5) << "target shard " << *i
12157 << " from " << recovery_state.get_peer_info(*i).last_backfill
12158 << dendl;
12159 }
12160 }
12161
12162 hit_set_setup();
12163 agent_setup();
12164 }
12165
12166 void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
12167 {
12168 dout(10) << __func__ << dendl;
12169
12170 if (hit_set && hit_set->insert_count() == 0) {
12171 dout(20) << " discarding empty hit_set" << dendl;
12172 hit_set_clear();
12173 }
12174
12175 if (recovery_queued) {
12176 recovery_queued = false;
12177 osd->clear_queued_recovery(this);
12178 }
12179
12180 // requeue everything in the reverse order they should be
12181 // reexamined.
12182 requeue_ops(waiting_for_peered);
12183 requeue_ops(waiting_for_flush);
12184 requeue_ops(waiting_for_active);
12185 requeue_ops(waiting_for_readable);
12186
12187 clear_scrub_reserved();
12188
12189 vector<ceph_tid_t> tids;
12190 cancel_copy_ops(is_primary(), &tids);
12191 cancel_flush_ops(is_primary(), &tids);
12192 cancel_proxy_ops(is_primary(), &tids);
12193 cancel_manifest_ops(is_primary(), &tids);
12194 osd->objecter->op_cancel(tids, -ECANCELED);
12195
12196 // requeue object waiters
12197 for (auto& p : waiting_for_unreadable_object) {
12198 release_backoffs(p.first);
12199 }
12200 if (is_primary()) {
12201 requeue_object_waiters(waiting_for_unreadable_object);
12202 } else {
12203 waiting_for_unreadable_object.clear();
12204 }
12205 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12206 p != waiting_for_degraded_object.end();
12207 waiting_for_degraded_object.erase(p++)) {
12208 release_backoffs(p->first);
12209 if (is_primary())
12210 requeue_ops(p->second);
12211 else
12212 p->second.clear();
12213 finish_degraded_object(p->first);
12214 }
12215
12216 // requeues waiting_for_scrub
12217 scrub_clear_state();
12218
12219 for (auto p = waiting_for_blocked_object.begin();
12220 p != waiting_for_blocked_object.end();
12221 waiting_for_blocked_object.erase(p++)) {
12222 if (is_primary())
12223 requeue_ops(p->second);
12224 else
12225 p->second.clear();
12226 }
12227 for (auto i = callbacks_for_degraded_object.begin();
12228 i != callbacks_for_degraded_object.end();
12229 ) {
12230 finish_degraded_object((i++)->first);
12231 }
12232 ceph_assert(callbacks_for_degraded_object.empty());
12233
12234 if (is_primary()) {
12235 requeue_ops(waiting_for_cache_not_full);
12236 } else {
12237 waiting_for_cache_not_full.clear();
12238 }
12239 objects_blocked_on_cache_full.clear();
12240
12241 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12242 in_progress_async_reads.begin();
12243 i != in_progress_async_reads.end();
12244 in_progress_async_reads.erase(i++)) {
12245 close_op_ctx(i->second);
12246 if (is_primary())
12247 requeue_op(i->first);
12248 }
12249
12250 // this will requeue ops we were working on but didn't finish, and
12251 // any dups
12252 apply_and_flush_repops(is_primary());
12253 cancel_log_updates();
12254
12255 // do this *after* apply_and_flush_repops so that we catch any newly
12256 // registered watches.
12257 context_registry_on_change();
12258
12259 pgbackend->on_change_cleanup(&t);
12260 scrubber.cleanup_store(&t);
12261 pgbackend->on_change();
12262
12263 // clear snap_trimmer state
12264 snap_trimmer_machine.process_event(Reset());
12265
12266 debug_op_order.clear();
12267 unstable_stats.clear();
12268
12269 // we don't want to cache object_contexts through the interval change
12270 // NOTE: we actually assert that all currently live references are dead
12271 // by the time the flush for the next interval completes.
12272 object_contexts.clear();
12273
12274 // should have been cleared above by finishing all of the degraded objects
12275 ceph_assert(objects_blocked_on_degraded_snap.empty());
12276 }
12277
12278 void PrimaryLogPG::plpg_on_role_change()
12279 {
12280 dout(10) << __func__ << dendl;
12281 if (get_role() != 0 && hit_set) {
12282 dout(10) << " clearing hit set" << dendl;
12283 hit_set_clear();
12284 }
12285 }
12286
12287 void PrimaryLogPG::plpg_on_pool_change()
12288 {
12289 dout(10) << __func__ << dendl;
12290 // requeue cache full waiters just in case the cache_mode is
12291 // changing away from writeback mode. note that if we are not
12292 // active the normal requeuing machinery is sufficient (and properly
12293 // ordered).
12294 if (is_active() &&
12295 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12296 !waiting_for_cache_not_full.empty()) {
12297 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12298 << dendl;
12299 requeue_ops(waiting_for_cache_not_full);
12300 objects_blocked_on_cache_full.clear();
12301 }
12302 hit_set_setup();
12303 agent_setup();
12304 }
12305
12306 // clear state. called on recovery completion AND cancellation.
12307 void PrimaryLogPG::_clear_recovery_state()
12308 {
12309 #ifdef DEBUG_RECOVERY_OIDS
12310 recovering_oids.clear();
12311 #endif
12312 last_backfill_started = hobject_t();
12313 set<hobject_t>::iterator i = backfills_in_flight.begin();
12314 while (i != backfills_in_flight.end()) {
12315 ceph_assert(recovering.count(*i));
12316 backfills_in_flight.erase(i++);
12317 }
12318
12319 list<OpRequestRef> blocked_ops;
12320 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12321 i != recovering.end();
12322 recovering.erase(i++)) {
12323 if (i->second) {
12324 i->second->drop_recovery_read(&blocked_ops);
12325 requeue_ops(blocked_ops);
12326 }
12327 }
12328 ceph_assert(backfills_in_flight.empty());
12329 pending_backfill_updates.clear();
12330 ceph_assert(recovering.empty());
12331 pgbackend->clear_recovery_state();
12332 }
12333
12334 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12335 {
12336 dout(20) << __func__ << ": " << soid << dendl;
12337 ceph_assert(recovering.count(soid));
12338 ObjectContextRef obc = recovering[soid];
12339 if (obc) {
12340 list<OpRequestRef> blocked_ops;
12341 obc->drop_recovery_read(&blocked_ops);
12342 requeue_ops(blocked_ops);
12343 }
12344 recovering.erase(soid);
12345 finish_recovery_op(soid);
12346 release_backoffs(soid);
12347 if (waiting_for_degraded_object.count(soid)) {
12348 dout(20) << " kicking degraded waiters on " << soid << dendl;
12349 requeue_ops(waiting_for_degraded_object[soid]);
12350 waiting_for_degraded_object.erase(soid);
12351 }
12352 if (waiting_for_unreadable_object.count(soid)) {
12353 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12354 requeue_ops(waiting_for_unreadable_object[soid]);
12355 waiting_for_unreadable_object.erase(soid);
12356 }
12357 if (is_missing_object(soid))
12358 recovery_state.set_last_requested(0);
12359 finish_degraded_object(soid);
12360 }
12361
12362 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12363 {
12364 pgbackend->check_recovery_sources(osdmap);
12365 }
12366
12367 bool PrimaryLogPG::start_recovery_ops(
12368 uint64_t max,
12369 ThreadPool::TPHandle &handle,
12370 uint64_t *ops_started)
12371 {
12372 uint64_t& started = *ops_started;
12373 started = 0;
12374 bool work_in_progress = false;
12375 bool recovery_started = false;
12376 ceph_assert(is_primary());
12377 ceph_assert(is_peered());
12378 ceph_assert(!recovery_state.is_deleting());
12379
12380 ceph_assert(recovery_queued);
12381 recovery_queued = false;
12382
12383 if (!state_test(PG_STATE_RECOVERING) &&
12384 !state_test(PG_STATE_BACKFILLING)) {
12385 /* TODO: I think this case is broken and will make do_recovery()
12386 * unhappy since we're returning false */
12387 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
12388 return have_unfound();
12389 }
12390
12391 const auto &missing = recovery_state.get_pg_log().get_missing();
12392
12393 uint64_t num_unfound = get_num_unfound();
12394
12395 if (!recovery_state.have_missing()) {
12396 recovery_state.local_recovery_complete();
12397 }
12398
12399 if (!missing.have_missing() || // Primary does not have missing
12400 // or all of the missing objects are unfound.
12401 recovery_state.all_missing_unfound()) {
12402 // Recover the replicas.
12403 started = recover_replicas(max, handle, &recovery_started);
12404 }
12405 if (!started) {
12406 // We still have missing objects that we should grab from replicas.
12407 started += recover_primary(max, handle);
12408 }
12409 if (!started && num_unfound != get_num_unfound()) {
12410 // second chance to recover replicas
12411 started = recover_replicas(max, handle, &recovery_started);
12412 }
12413
12414 if (started || recovery_started)
12415 work_in_progress = true;
12416
12417 bool deferred_backfill = false;
12418 if (recovering.empty() &&
12419 state_test(PG_STATE_BACKFILLING) &&
12420 !get_backfill_targets().empty() && started < max &&
12421 missing.num_missing() == 0 &&
12422 waiting_on_backfill.empty()) {
12423 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12424 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12425 deferred_backfill = true;
12426 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12427 !is_degraded()) {
12428 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12429 deferred_backfill = true;
12430 } else if (!recovery_state.is_backfill_reserved()) {
12431 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12432 if (!backfill_reserving) {
12433 dout(10) << "queueing RequestBackfill" << dendl;
12434 backfill_reserving = true;
12435 queue_peering_event(
12436 PGPeeringEventRef(
12437 std::make_shared<PGPeeringEvent>(
12438 get_osdmap_epoch(),
12439 get_osdmap_epoch(),
12440 PeeringState::RequestBackfill())));
12441 }
12442 deferred_backfill = true;
12443 } else {
12444 started += recover_backfill(max - started, handle, &work_in_progress);
12445 }
12446 }
12447
12448 dout(10) << " started " << started << dendl;
12449 osd->logger->inc(l_osd_rop, started);
12450
12451 if (!recovering.empty() ||
12452 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
12453 return !work_in_progress && have_unfound();
12454
12455 ceph_assert(recovering.empty());
12456 ceph_assert(recovery_ops_active == 0);
12457
12458 dout(10) << __func__ << " needs_recovery: "
12459 << recovery_state.get_missing_loc().get_needs_recovery()
12460 << dendl;
12461 dout(10) << __func__ << " missing_loc: "
12462 << recovery_state.get_missing_loc().get_missing_locs()
12463 << dendl;
12464 int unfound = get_num_unfound();
12465 if (unfound) {
12466 dout(10) << " still have " << unfound << " unfound" << dendl;
12467 return true;
12468 }
12469
12470 if (missing.num_missing() > 0) {
12471 // this shouldn't happen!
12472 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12473 << missing.num_missing() << " missing objects: " << missing.get_items();
12474 return false;
12475 }
12476
12477 if (needs_recovery()) {
12478 // this shouldn't happen!
12479 // We already checked num_missing() so we must have missing replicas
12480 osd->clog->error() << info.pgid
12481 << " Unexpected Error: recovery ending with missing replicas";
12482 return false;
12483 }
12484
12485 if (state_test(PG_STATE_RECOVERING)) {
12486 state_clear(PG_STATE_RECOVERING);
12487 state_clear(PG_STATE_FORCED_RECOVERY);
12488 if (needs_backfill()) {
12489 dout(10) << "recovery done, queuing backfill" << dendl;
12490 queue_peering_event(
12491 PGPeeringEventRef(
12492 std::make_shared<PGPeeringEvent>(
12493 get_osdmap_epoch(),
12494 get_osdmap_epoch(),
12495 PeeringState::RequestBackfill())));
12496 } else {
12497 dout(10) << "recovery done, no backfill" << dendl;
12498 eio_errors_to_process = false;
12499 state_clear(PG_STATE_FORCED_BACKFILL);
12500 queue_peering_event(
12501 PGPeeringEventRef(
12502 std::make_shared<PGPeeringEvent>(
12503 get_osdmap_epoch(),
12504 get_osdmap_epoch(),
12505 PeeringState::AllReplicasRecovered())));
12506 }
12507 } else { // backfilling
12508 state_clear(PG_STATE_BACKFILLING);
12509 state_clear(PG_STATE_FORCED_BACKFILL);
12510 state_clear(PG_STATE_FORCED_RECOVERY);
12511 dout(10) << "recovery done, backfill done" << dendl;
12512 eio_errors_to_process = false;
12513 queue_peering_event(
12514 PGPeeringEventRef(
12515 std::make_shared<PGPeeringEvent>(
12516 get_osdmap_epoch(),
12517 get_osdmap_epoch(),
12518 PeeringState::Backfilled())));
12519 }
12520
12521 return false;
12522 }
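/*
 * Caller-side contract, in brief (sketch; the invocation shape is
 * illustrative, not the actual call site):
 *
 *   uint64_t started = 0;
 *   bool unfound_blocking = pg->start_recovery_ops(max, handle, &started);
 *
 * 'started' counts the ops handed to the PGBackend by this call; a true
 * return does not mean "more work queued" but rather "no further progress
 * is possible right now because unfound objects remain".
 */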
12523
12524 /**
12525 * Start up to max recovery operations for objects missing on the primary.
12526 * Returns the number of operations started.
12527 */
12528 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12529 {
12530 ceph_assert(is_primary());
12531
12532 const auto &missing = recovery_state.get_pg_log().get_missing();
12533
12534 dout(10) << __func__ << " recovering " << recovering.size()
12535 << " in pg,"
12536 << " missing " << missing << dendl;
12537
12538 dout(25) << __func__ << " " << missing.get_items() << dendl;
12539
12540 // look at log!
12541 pg_log_entry_t *latest = 0;
12542 unsigned started = 0;
12543 int skipped = 0;
12544
12545 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12546 map<version_t, hobject_t>::const_iterator p =
12547 missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
12548 while (p != missing.get_rmissing().end()) {
12549 handle.reset_tp_timeout();
12550 hobject_t soid;
12551 version_t v = p->first;
12552
12553 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
12554 if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
12555 latest = it_objects->second;
12556 ceph_assert(latest->is_update() || latest->is_delete());
12557 soid = latest->soid;
12558 } else {
12559 latest = 0;
12560 soid = p->second;
12561 }
12562 const pg_missing_item& item = missing.get_items().find(p->second)->second;
12563 ++p;
12564
12565 hobject_t head = soid.get_head();
12566
12567 eversion_t need = item.need;
12568
12569 dout(10) << __func__ << " "
12570 << soid << " " << item.need
12571 << (missing.is_missing(soid) ? " (missing)":"")
12572 << (missing.is_missing(head) ? " (missing head)":"")
12573 << (recovering.count(soid) ? " (recovering)":"")
12574 << (recovering.count(head) ? " (recovering head)":"")
12575 << dendl;
12576
12577 if (latest) {
12578 switch (latest->op) {
12579 case pg_log_entry_t::CLONE:
12580 /*
12581 * Handling for this special case removed for now, until we
12582 * can correctly construct an accurate SnapSet from the old
12583 * one.
12584 */
12585 break;
12586
12587 case pg_log_entry_t::LOST_REVERT:
12588 {
12589 if (item.have == latest->reverting_to) {
12590 ObjectContextRef obc = get_object_context(soid, true);
12591
12592 if (obc->obs.oi.version == latest->version) {
12593 // I'm already reverting
12594 dout(10) << " already reverting " << soid << dendl;
12595 } else {
12596 dout(10) << " reverting " << soid << " to " << latest->reverting_to << dendl;
12597 obc->obs.oi.version = latest->version;
12598
12599 ObjectStore::Transaction t;
12600 bufferlist b2;
12601 obc->obs.oi.encode(
12602 b2,
12603 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12604 ceph_assert(!pool.info.require_rollback());
12605 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
12606
12607 recovery_state.recover_got(
12608 soid,
12609 latest->version,
12610 false,
12611 t);
12612
12613 ++active_pushes;
12614
12615 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
12616 t.register_on_commit(new C_OSD_CommittedPushedObject(
12617 this,
12618 get_osdmap_epoch(),
12619 info.last_complete));
12620 osd->store->queue_transaction(ch, std::move(t));
12621 continue;
12622 }
12623 } else {
12624 /*
12625 * Pull the old version of the object. Update missing_loc here to have the location
12626 * of the version we want.
12627 *
12628 * This doesn't use the usual missing_loc paths, but that's okay:
12629 * - if we have it locally, we hit the case above, and go from there.
12630 * - if we don't, we always pass through this case during recovery and set up the location
12631 * properly.
12632 * - this way we don't need to mangle the missing code to be general about needing an old
12633 * version...
12634 */
12635 eversion_t alternate_need = latest->reverting_to;
12636 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
12637
12638 set<pg_shard_t> good_peers;
12639 for (auto p = recovery_state.get_peer_missing().begin();
12640 p != recovery_state.get_peer_missing().end();
12641 ++p) {
12642 if (p->second.is_missing(soid, need) &&
12643 p->second.get_items().at(soid).have == alternate_need) {
12644 good_peers.insert(p->first);
12645 }
12646 }
12647 recovery_state.set_revert_with_targets(
12648 soid,
12649 good_peers);
12650 dout(10) << " will pull " << alternate_need << " or " << need
12651 << " from one of "
12652 << recovery_state.get_missing_loc().get_locations(soid)
12653 << dendl;
12654 }
12655 }
12656 break;
12657 }
12658 }
12659
12660 if (!recovering.count(soid)) {
12661 if (recovering.count(head)) {
12662 ++skipped;
12663 } else {
12664 int r = recover_missing(
12665 soid, need, get_recovery_op_priority(), h);
12666 switch (r) {
12667 case PULL_YES:
12668 ++started;
12669 break;
12670 case PULL_HEAD:
12671 ++started; // fall through: head pull started; the clone itself counts as skipped
12672 case PULL_NONE:
12673 ++skipped;
12674 break;
12675 default:
12676 ceph_abort();
12677 }
12678 if (started >= max)
12679 break;
12680 }
12681 }
12682
12683 // only advance last_requested if we haven't skipped anything
12684 if (!skipped)
12685 recovery_state.set_last_requested(v);
12686 }
12687
12688 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12689 return started;
12690 }
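/*
 * A note on the cursor above: get_rmissing() is keyed by version, so
 * lower_bound(last_requested) resumes the scan exactly where the
 * previous call left off.  last_requested only advances while nothing
 * has been skipped; once anything is skipped (e.g. a clone whose head
 * is still being recovered), the cursor stays put so skipped entries
 * are revisited on the next pass.
 */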
12691
12692 bool PrimaryLogPG::primary_error(
12693 const hobject_t& soid, eversion_t v)
12694 {
12695 recovery_state.force_object_missing(pg_whoami, soid, v);
12696 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
12697 if (uhoh)
12698 osd->clog->error() << info.pgid << " missing primary copy of "
12699 << soid << ", unfound";
12700 else
12701 osd->clog->error() << info.pgid << " missing primary copy of "
12702 << soid
12703 << ", will try copies on "
12704 << recovery_state.get_missing_loc().get_locations(soid);
12705 return uhoh;
12706 }
12707
12708 int PrimaryLogPG::prep_object_replica_deletes(
12709 const hobject_t& soid, eversion_t v,
12710 PGBackend::RecoveryHandle *h,
12711 bool *work_started)
12712 {
12713 ceph_assert(is_primary());
12714 dout(10) << __func__ << ": on " << soid << dendl;
12715
12716 ObjectContextRef obc = get_object_context(soid, false);
12717 if (obc) {
12718 if (!obc->get_recovery_read()) {
12719 dout(20) << "replica delete delayed on " << soid
12720 << "; could not get rw_manager lock" << dendl;
12721 *work_started = true;
12722 return 0;
12723 } else {
12724 dout(20) << "replica delete got recovery read lock on " << soid
12725 << dendl;
12726 }
12727 }
12728
12729 start_recovery_op(soid);
12730 ceph_assert(!recovering.count(soid));
12731 if (!obc)
12732 recovering.insert(make_pair(soid, ObjectContextRef()));
12733 else
12734 recovering.insert(make_pair(soid, obc));
12735
12736 pgbackend->recover_delete_object(soid, v, h);
12737 return 1;
12738 }
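/*
 * The null-obc path above is expected: the object has already been
 * deleted on the primary, so there may be no local context to pin;
 * the replica still holds a stale copy, and recover_delete_object()
 * only needs the oid and version to remove it remotely.
 */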
12739
12740 int PrimaryLogPG::prep_object_replica_pushes(
12741 const hobject_t& soid, eversion_t v,
12742 PGBackend::RecoveryHandle *h,
12743 bool *work_started)
12744 {
12745 ceph_assert(is_primary());
12746 dout(10) << __func__ << ": on " << soid << dendl;
12747
12748 if (soid.snap && soid.snap < CEPH_NOSNAP) {
12749 // do we have the head and/or snapdir?
12750 hobject_t head = soid.get_head();
12751 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
12752 if (recovering.count(head)) {
12753 dout(10) << " missing but already recovering head " << head << dendl;
12754 return 0;
12755 } else {
12756 int r = recover_missing(
12757 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
12758 get_recovery_op_priority(), h);
12759 if (r != PULL_NONE)
12760 return 1;
12761 return 0;
12762 }
12763 }
12764 }
12765
12766 // NOTE: we know we will get a valid oloc off of disk here.
12767 ObjectContextRef obc = get_object_context(soid, false);
12768 if (!obc) {
12769 primary_error(soid, v);
12770 return 0;
12771 }
12772
12773 if (!obc->get_recovery_read()) {
12774 dout(20) << "recovery delayed on " << soid
12775 << "; could not get rw_manager lock" << dendl;
12776 *work_started = true;
12777 return 0;
12778 } else {
12779 dout(20) << "recovery got recovery read lock on " << soid
12780 << dendl;
12781 }
12782
12783 start_recovery_op(soid);
12784 ceph_assert(!recovering.count(soid));
12785 recovering.insert(make_pair(soid, obc));
12786
12787 int r = pgbackend->recover_object(
12788 soid,
12789 v,
12790 ObjectContextRef(),
12791 obc, // has snapset context
12792 h);
12793 if (r < 0) {
12794 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
12795 on_failed_pull({ pg_whoami }, soid, v);
12796 return 0;
12797 }
12798 return 1;
12799 }
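/*
 * Ordering note: for a clone (soid.snap < CEPH_NOSNAP) the head is
 * pulled first, since the authoritative SnapSet lives on the head
 * object; in that case the head pull is what gets reported (return 1)
 * and the clone push is retried on a later pass, once the head is
 * complete.
 */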
12800
12801 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
12802 bool *work_started)
12803 {
12804 dout(10) << __func__ << "(" << max << ")" << dendl;
12805 uint64_t started = 0;
12806
12807 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12808
12809 // this is FAR from an optimal recovery order. pretty lame, really.
12810 ceph_assert(!get_acting_recovery_backfill().empty());
12811 // choose replicas to recover, replica has the shortest missing list first
12812 // so we can bring it back to normal ASAP
12813 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
12814 async_by_num_missing;
12815 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
12816 for (auto &p: get_acting_recovery_backfill()) {
12817 if (p == get_primary()) {
12818 continue;
12819 }
12820 auto pm = recovery_state.get_peer_missing().find(p);
12821 ceph_assert(pm != recovery_state.get_peer_missing().end());
12822 auto nm = pm->second.num_missing();
12823 if (nm != 0) {
12824 if (is_async_recovery_target(p)) {
12825 async_by_num_missing.push_back(make_pair(nm, p));
12826 } else {
12827 replicas_by_num_missing.push_back(make_pair(nm, p));
12828 }
12829 }
12830 }
12831 // sort by number of missing objects, in ascending order.
12832 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
12833 const std::pair<unsigned int, pg_shard_t> &rhs) {
12834 return lhs.first < rhs.first;
12835 };
12836 // acting goes first
12837 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
12838 // then async_recovery_targets
12839 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
12840 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
12841 async_by_num_missing.begin(), async_by_num_missing.end());
12842 for (auto &replica: replicas_by_num_missing) {
12843 pg_shard_t &peer = replica.second;
12844 ceph_assert(peer != get_primary());
12845 auto pm = recovery_state.get_peer_missing().find(peer);
12846 ceph_assert(pm != recovery_state.get_peer_missing().end());
12847 size_t m_sz = pm->second.num_missing();
12848
12849 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
12850 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
12851
12852 // oldest first!
12853 const pg_missing_t &m(pm->second);
12854 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
12855 p != m.get_rmissing().end() && started < max;
12856 ++p) {
12857 handle.reset_tp_timeout();
12858 const hobject_t soid(p->second);
12859
12860 if (recovery_state.get_missing_loc().is_unfound(soid)) {
12861 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
12862 continue;
12863 }
12864
12865 const pg_info_t &pi = recovery_state.get_peer_info(peer);
12866 if (soid > pi.last_backfill) {
12867 if (!recovering.count(soid)) {
12868 derr << __func__ << ": object " << soid << " last_backfill "
12869 << pi.last_backfill << dendl;
12870 derr << __func__ << ": object added to missing set for backfill, but "
12871 << "is not in recovering, error!" << dendl;
12872 ceph_abort();
12873 }
12874 continue;
12875 }
12876
12877 if (recovering.count(soid)) {
12878 dout(10) << __func__ << ": already recovering " << soid << dendl;
12879 continue;
12880 }
12881
12882 if (recovery_state.get_missing_loc().is_deleted(soid)) {
12883 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
12884 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12885 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
12886 continue;
12887 }
12888
12889 if (soid.is_snap() &&
12890 recovery_state.get_pg_log().get_missing().is_missing(
12891 soid.get_head())) {
12892 dout(10) << __func__ << ": " << soid.get_head()
12893 << " still missing on primary" << dendl;
12894 continue;
12895 }
12896
12897 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
12898 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
12899 continue;
12900 }
12901
12902 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
12903 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12904 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
12905 }
12906 }
12907
12908 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12909 return started;
12910 }
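/*
 * Ordering example (counts hypothetical): with acting replicas missing
 * {osd.3: 7, osd.5: 2} and async recovery target {osd.9: 4}, shards are
 * visited as osd.5, osd.3, osd.9 -- acting shards are always drained
 * before async_recovery_targets, and within each group the shard with
 * the fewest missing objects goes first so it returns to normal ASAP.
 */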
12911
12912 hobject_t PrimaryLogPG::earliest_peer_backfill() const
12913 {
12914 hobject_t e = hobject_t::get_max();
12915 for (const pg_shard_t& peer : get_backfill_targets()) {
12916 const auto iter = peer_backfill_info.find(peer);
12917 ceph_assert(iter != peer_backfill_info.end());
12918 e = std::min(e, iter->second.begin);
12919 }
12920 return e;
12921 }
12922
12923 bool PrimaryLogPG::all_peer_done() const
12924 {
12925 // Primary hasn't got any more objects
12926 ceph_assert(backfill_info.empty());
12927
12928 for (const pg_shard_t& bt : get_backfill_targets()) {
12929 const auto piter = peer_backfill_info.find(bt);
12930 ceph_assert(piter != peer_backfill_info.end());
12931 const BackfillInterval& pbi = piter->second;
12932 // See if peer has more to process
12933 if (!pbi.extends_to_end() || !pbi.empty())
12934 return false;
12935 }
12936 return true;
12937 }
12938
12939 /**
12940 * recover_backfill
12941 *
12942 * Invariants:
12943 *
12944 * backfilled: fully pushed to replica or present in replica's missing set (both
12945 * our copy and theirs).
12946 *
12947 * All objects on a backfill_target in
12948 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12949 * objects have been actually deleted and all logically-valid objects are replicated.
12950 * There may be PG objects in this interval yet to be backfilled.
12951 *
12952 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12953 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12954 *
12955 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
12956 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12957 * interval remain on the backfill target.
12958 *
12959 * For a backfill target, all objects <= peer_info[target].last_backfill
12960 * have been backfilled to target
12961 *
12962 * There *MAY* be missing/outdated objects between last_backfill_started and
12963 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12964 * io created objects since the last scan. For this reason, we call
12965 * update_range() again before continuing backfill.
12966 */
12967 uint64_t PrimaryLogPG::recover_backfill(
12968 uint64_t max,
12969 ThreadPool::TPHandle &handle, bool *work_started)
12970 {
12971 dout(10) << __func__ << " (" << max << ")"
12972 << " bft=" << get_backfill_targets()
12973 << " last_backfill_started " << last_backfill_started
12974 << (new_backfill ? " new_backfill":"")
12975 << dendl;
12976 ceph_assert(!get_backfill_targets().empty());
12977
12978 // Initialize from prior backfill state
12979 if (new_backfill) {
12980 // on_activate() was called prior to getting here
12981 ceph_assert(last_backfill_started == earliest_backfill());
12982 new_backfill = false;
12983
12984 // initialize BackfillIntervals
12985 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12986 i != get_backfill_targets().end();
12987 ++i) {
12988 peer_backfill_info[*i].reset(
12989 recovery_state.get_peer_info(*i).last_backfill);
12990 }
12991 backfill_info.reset(last_backfill_started);
12992
12993 backfills_in_flight.clear();
12994 pending_backfill_updates.clear();
12995 }
12996
12997 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12998 i != get_backfill_targets().end();
12999 ++i) {
13000 dout(10) << "peer osd." << *i
13001 << " info " << recovery_state.get_peer_info(*i)
13002 << " interval " << peer_backfill_info[*i].begin
13003 << "-" << peer_backfill_info[*i].end
13004 << " " << peer_backfill_info[*i].objects.size() << " objects"
13005 << dendl;
13006 }
13007
13008 // update our local interval to cope with recent changes
13009 backfill_info.begin = last_backfill_started;
13010 update_range(&backfill_info, handle);
13011
13012 unsigned ops = 0;
13013 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13014 set<hobject_t> add_to_stat;
13015
13016 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13017 i != get_backfill_targets().end();
13018 ++i) {
13019 peer_backfill_info[*i].trim_to(
13020 std::max(
13021 recovery_state.get_peer_info(*i).last_backfill,
13022 last_backfill_started));
13023 }
13024 backfill_info.trim_to(last_backfill_started);
13025
13026 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13027 while (ops < max) {
13028 if (backfill_info.begin <= earliest_peer_backfill() &&
13029 !backfill_info.extends_to_end() && backfill_info.empty()) {
13030 hobject_t next = backfill_info.end;
13031 backfill_info.reset(next);
13032 backfill_info.end = hobject_t::get_max();
13033 update_range(&backfill_info, handle);
13034 backfill_info.trim();
13035 }
13036
13037 dout(20) << " my backfill interval " << backfill_info << dendl;
13038
13039 bool sent_scan = false;
13040 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13041 i != get_backfill_targets().end();
13042 ++i) {
13043 pg_shard_t bt = *i;
13044 BackfillInterval& pbi = peer_backfill_info[bt];
13045
13046 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13047 if (pbi.begin <= backfill_info.begin &&
13048 !pbi.extends_to_end() && pbi.empty()) {
13049 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13050 epoch_t e = get_osdmap_epoch();
13051 MOSDPGScan *m = new MOSDPGScan(
13052 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
13053 spg_t(info.pgid.pgid, bt.shard),
13054 pbi.end, hobject_t());
13055 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13056 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13057 waiting_on_backfill.insert(bt);
13058 sent_scan = true;
13059 }
13060 }
13061
13062 // Count simultaneous scans as a single op and let those complete
13063 if (sent_scan) {
13064 ops++;
13065 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13066 break;
13067 }
13068
13069 if (backfill_info.empty() && all_peer_done()) {
13070 dout(10) << " reached end for both local and all peers" << dendl;
13071 break;
13072 }
13073
13074 // Get object within set of peers to operate on and
13075 // the set of targets for which that object applies.
13076 hobject_t check = earliest_peer_backfill();
13077
13078 if (check < backfill_info.begin) {
13079
13080 set<pg_shard_t> check_targets;
13081 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13082 i != get_backfill_targets().end();
13083 ++i) {
13084 pg_shard_t bt = *i;
13085 BackfillInterval& pbi = peer_backfill_info[bt];
13086 if (pbi.begin == check)
13087 check_targets.insert(bt);
13088 }
13089 ceph_assert(!check_targets.empty());
13090
13091 dout(20) << " BACKFILL removing " << check
13092 << " from peers " << check_targets << dendl;
13093 for (set<pg_shard_t>::iterator i = check_targets.begin();
13094 i != check_targets.end();
13095 ++i) {
13096 pg_shard_t bt = *i;
13097 BackfillInterval& pbi = peer_backfill_info[bt];
13098 ceph_assert(pbi.begin == check);
13099
13100 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13101 pbi.pop_front();
13102 }
13103
13104 last_backfill_started = check;
13105
13106 // Don't increment ops here because deletions
13107 // are cheap and, unlike real recovery_ops, not replied to,
13108 // and we can't increment ops without requeueing ourselves
13109 // for recovery.
13110 } else {
13111 eversion_t& obj_v = backfill_info.objects.begin()->second;
13112
13113 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13114 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13115 i != get_backfill_targets().end();
13116 ++i) {
13117 pg_shard_t bt = *i;
13118 BackfillInterval& pbi = peer_backfill_info[bt];
13119 // Find all check peers that have the wrong version
13120 if (check == backfill_info.begin && check == pbi.begin) {
13121 if (pbi.objects.begin()->second != obj_v) {
13122 need_ver_targs.push_back(bt);
13123 } else {
13124 keep_ver_targs.push_back(bt);
13125 }
13126 } else {
13127 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13128
13129 // Only include peers whose backfill line we've caught up to;
13130 // otherwise they only appear to be missing this object
13131 // because their pbi.begin > backfill_info.begin.
13132 if (backfill_info.begin > pinfo.last_backfill)
13133 missing_targs.push_back(bt);
13134 else
13135 skip_targs.push_back(bt);
13136 }
13137 }
13138
13139 if (!keep_ver_targs.empty()) {
13140 // These peers have version obj_v
13141 dout(20) << " BACKFILL keeping " << check
13142 << " with ver " << obj_v
13143 << " on peers " << keep_ver_targs << dendl;
13144 //assert(!waiting_for_degraded_object.count(check));
13145 }
13146 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13147 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13148 ceph_assert(obc);
13149 if (obc->get_recovery_read()) {
13150 if (!need_ver_targs.empty()) {
13151 dout(20) << " BACKFILL replacing " << check
13152 << " with ver " << obj_v
13153 << " to peers " << need_ver_targs << dendl;
13154 }
13155 if (!missing_targs.empty()) {
13156 dout(20) << " BACKFILL pushing " << backfill_info.begin
13157 << " with ver " << obj_v
13158 << " to peers " << missing_targs << dendl;
13159 }
13160 vector<pg_shard_t> all_push = need_ver_targs;
13161 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13162
13163 handle.reset_tp_timeout();
13164 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13165 if (r < 0) {
13166 *work_started = true;
13167 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13168 break;
13169 }
13170 ops++;
13171 } else {
13172 *work_started = true;
13173 dout(20) << "backfill blocking on " << backfill_info.begin
13174 << "; could not get rw_manager lock" << dendl;
13175 break;
13176 }
13177 }
13178 dout(20) << "need_ver_targs=" << need_ver_targs
13179 << " keep_ver_targs=" << keep_ver_targs << dendl;
13180 dout(20) << "backfill_targets=" << get_backfill_targets()
13181 << " missing_targs=" << missing_targs
13182 << " skip_targs=" << skip_targs << dendl;
13183
13184 last_backfill_started = backfill_info.begin;
13185 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13186 backfill_info.pop_front();
13187 vector<pg_shard_t> check_targets = need_ver_targs;
13188 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13189 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13190 i != check_targets.end();
13191 ++i) {
13192 pg_shard_t bt = *i;
13193 BackfillInterval& pbi = peer_backfill_info[bt];
13194 pbi.pop_front();
13195 }
13196 }
13197 }
13198
13199 hobject_t backfill_pos =
13200 std::min(backfill_info.begin, earliest_peer_backfill());
13201
13202 for (set<hobject_t>::iterator i = add_to_stat.begin();
13203 i != add_to_stat.end();
13204 ++i) {
13205 ObjectContextRef obc = get_object_context(*i, false);
13206 ceph_assert(obc);
13207 pg_stat_t stat;
13208 add_object_context_to_pg_stat(obc, &stat);
13209 pending_backfill_updates[*i] = stat;
13210 }
13211 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13212 for (unsigned i = 0; i < to_remove.size(); ++i) {
13213 handle.reset_tp_timeout();
13214 const hobject_t& oid = to_remove[i].get<0>();
13215 eversion_t v = to_remove[i].get<1>();
13216 pg_shard_t peer = to_remove[i].get<2>();
13217 MOSDPGBackfillRemove *m;
13218 auto it = reqs.find(peer);
13219 if (it != reqs.end()) {
13220 m = it->second;
13221 } else {
13222 m = reqs[peer] = new MOSDPGBackfillRemove(
13223 spg_t(info.pgid.pgid, peer.shard),
13224 get_osdmap_epoch());
13225 }
13226 m->ls.push_back(make_pair(oid, v));
13227
13228 if (oid <= last_backfill_started)
13229 pending_backfill_updates[oid]; // add empty stat!
13230 }
13231 for (auto p : reqs) {
13232 osd->send_message_osd_cluster(p.first.osd, p.second,
13233 get_osdmap_epoch());
13234 }
13235
13236 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13237
13238 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13239 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13240 i != backfills_in_flight.end();
13241 ++i) {
13242 dout(20) << *i << " is still in flight" << dendl;
13243 }
13244
13245 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13246 backfill_pos : *(backfills_in_flight.begin());
13247 hobject_t new_last_backfill = earliest_backfill();
13248 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13249 for (map<hobject_t, pg_stat_t>::iterator i =
13250 pending_backfill_updates.begin();
13251 i != pending_backfill_updates.end() &&
13252 i->first < next_backfill_to_complete;
13253 pending_backfill_updates.erase(i++)) {
13254 dout(20) << " pending_backfill_update " << i->first << dendl;
13255 ceph_assert(i->first > new_last_backfill);
13256 recovery_state.update_complete_backfill_object_stats(
13257 i->first,
13258 i->second);
13259 new_last_backfill = i->first;
13260 }
13261 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13262
13263 ceph_assert(!pending_backfill_updates.empty() ||
13264 new_last_backfill == last_backfill_started);
13265 if (pending_backfill_updates.empty() &&
13266 backfill_pos.is_max()) {
13267 ceph_assert(backfills_in_flight.empty());
13268 new_last_backfill = backfill_pos;
13269 last_backfill_started = backfill_pos;
13270 }
13271 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13272
13273 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13274 // all the backfill targets. Otherwise, we will move last_backfill up on
13275 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
13276 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13277 i != get_backfill_targets().end();
13278 ++i) {
13279 pg_shard_t bt = *i;
13280 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13281
13282 if (new_last_backfill > pinfo.last_backfill) {
13283 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
13284 epoch_t e = get_osdmap_epoch();
13285 MOSDPGBackfill *m = NULL;
13286 if (pinfo.last_backfill.is_max()) {
13287 m = new MOSDPGBackfill(
13288 MOSDPGBackfill::OP_BACKFILL_FINISH,
13289 e,
13290 get_last_peering_reset(),
13291 spg_t(info.pgid.pgid, bt.shard));
13292 // Use default priority here, must match sub_op priority
13293 start_recovery_op(hobject_t::get_max());
13294 } else {
13295 m = new MOSDPGBackfill(
13296 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13297 e,
13298 get_last_peering_reset(),
13299 spg_t(info.pgid.pgid, bt.shard));
13300 // Use default priority here, must match sub_op priority
13301 }
13302 m->last_backfill = pinfo.last_backfill;
13303 m->stats = pinfo.stats;
13304 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13305 dout(10) << " peer " << bt
13306 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13307 << " / " << info.stats.stats.sum.num_objects << dendl;
13308 }
13309 }
13310
13311 if (ops)
13312 *work_started = true;
13313 return ops;
13314 }
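/*
 * The loop above is effectively a merge over sorted object streams: the
 * primary's backfill_info plus one peer_backfill_info per target, all
 * ordered by hobject_t.  A compressed sketch of the dispatch rule:
 *
 *   hobject_t check = earliest_peer_backfill();
 *   if (check < backfill_info.begin)
 *     // object exists only on some targets: queue a remove on every
 *     // target whose interval begins at 'check'
 *   else
 *     // object exists locally: push it to targets that lack it or hold
 *     // a stale version; targets already at obj_v just pop and move on
 */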
13315
13316 int PrimaryLogPG::prep_backfill_object_push(
13317 hobject_t oid, eversion_t v,
13318 ObjectContextRef obc,
13319 vector<pg_shard_t> peers,
13320 PGBackend::RecoveryHandle *h)
13321 {
13322 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
13323 ceph_assert(!peers.empty());
13324
13325 backfills_in_flight.insert(oid);
13326 recovery_state.prepare_backfill_for_missing(oid, v, peers);
13327
13328 ceph_assert(!recovering.count(oid));
13329
13330 start_recovery_op(oid);
13331 recovering.insert(make_pair(oid, obc));
13332
13333 int r = pgbackend->recover_object(
13334 oid,
13335 v,
13336 ObjectContextRef(),
13337 obc,
13338 h);
13339 if (r < 0) {
13340 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
13341 on_failed_pull({ pg_whoami }, oid, v);
13342 }
13343 return r;
13344 }
13345
13346 void PrimaryLogPG::update_range(
13347 BackfillInterval *bi,
13348 ThreadPool::TPHandle &handle)
13349 {
13350 int local_min = cct->_conf->osd_backfill_scan_min;
13351 int local_max = cct->_conf->osd_backfill_scan_max;
13352
13353 if (bi->version < info.log_tail) {
13354 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13355 << dendl;
13356 bi->version = info.last_update;
13357 scan_range(local_min, local_max, bi, handle);
13358 }
13359
13360 if (bi->version >= projected_last_update) {
13361 dout(10) << __func__<< ": bi is current " << dendl;
13362 ceph_assert(bi->version == projected_last_update);
13363 } else if (bi->version >= info.log_tail) {
13364 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
13365 /* Because we don't move log_tail on split, the log might be
13366 * empty even if log_tail != last_update. However, the only
13367 * way to get here with an empty log is if log_tail is actually
13368 * eversion_t(), because otherwise the entry which changed
13369 * last_update since the last scan would have to be present.
13370 */
13371 ceph_assert(bi->version == eversion_t());
13372 return;
13373 }
13374
13375 dout(10) << __func__<< ": bi is old, (" << bi->version
13376 << ") can be updated with log to projected_last_update "
13377 << projected_last_update << dendl;
13378
13379 auto func = [&](const pg_log_entry_t &e) {
13380 dout(10) << __func__ << ": updating from version " << e.version
13381 << dendl;
13382 const hobject_t &soid = e.soid;
13383 if (soid >= bi->begin &&
13384 soid < bi->end) {
13385 if (e.is_update()) {
13386 dout(10) << __func__ << ": " << e.soid << " updated to version "
13387 << e.version << dendl;
13388 bi->objects.erase(e.soid);
13389 bi->objects.insert(
13390 make_pair(
13391 e.soid,
13392 e.version));
13393 } else if (e.is_delete()) {
13394 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13395 bi->objects.erase(e.soid);
13396 }
13397 }
13398 };
13399 dout(10) << "scanning pg log first" << dendl;
13400 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
13401 dout(10) << "scanning projected log" << dendl;
13402 projected_log.scan_log_after(bi->version, func);
13403 bi->version = projected_last_update;
13404 } else {
13405 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13406 }
13407 }
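// Replay order matters here: the persisted pg log is scanned before
// projected_log, whose entries are newer or still in flight; applying
// them last lets an uncommitted update or delete override whatever the
// older log recorded for the same object.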
13408
13409 void PrimaryLogPG::scan_range(
13410 int min, int max, BackfillInterval *bi,
13411 ThreadPool::TPHandle &handle)
13412 {
13413 ceph_assert(is_locked());
13414 dout(10) << "scan_range from " << bi->begin << dendl;
13415 bi->clear_objects();
13416
13417 vector<hobject_t> ls;
13418 ls.reserve(max);
13419 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
13420 ceph_assert(r >= 0);
13421 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13422 dout(20) << ls << dendl;
13423
13424 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13425 handle.reset_tp_timeout();
13426 ObjectContextRef obc;
13427 if (is_primary())
13428 obc = object_contexts.lookup(*p);
13429 if (obc) {
13430 if (!obc->obs.exists) {
13431 /* If the object does not exist here, it must have been removed
13432 * between the collection_list_partial and here. This can happen
13433 * for the first item in the range, which is usually last_backfill.
13434 */
13435 continue;
13436 }
13437 bi->objects[*p] = obc->obs.oi.version;
13438 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13439 } else {
13440 bufferlist bl;
13441 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
13442 /* If the object does not exist here, it must have been removed
13443 * between the collection_list_partial and here. This can happen
13444 * for the first item in the range, which is usually last_backfill.
13445 */
13446 if (r == -ENOENT)
13447 continue;
13448
13449 ceph_assert(r >= 0);
13450 object_info_t oi(bl);
13451 bi->objects[*p] = oi.version;
13452 dout(20) << " " << *p << " " << oi.version << dendl;
13453 }
13454 }
13455 }
13456
13457
13458 /** check_local
13459 *
13460 * verifies that stray objects have been deleted
13461 */
13462 void PrimaryLogPG::check_local()
13463 {
13464 dout(10) << __func__ << dendl;
13465
13466 ceph_assert(
13467 info.last_update >=
13468 recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
13469
13470 if (!cct->_conf->osd_debug_verify_stray_on_activate)
13471 return;
13472
13473 // just scan the log.
13474 set<hobject_t> did;
13475 for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
13476 p != recovery_state.get_pg_log().get_log().log.rend();
13477 ++p) {
13478 if (did.count(p->soid))
13479 continue;
13480 did.insert(p->soid);
13481
13482 if (p->is_delete() && !is_missing_object(p->soid)) {
13483 dout(10) << " checking " << p->soid
13484 << " at " << p->version << dendl;
13485 struct stat st;
13486 int r = osd->store->stat(
13487 ch,
13488 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13489 &st);
13490 if (r != -ENOENT) {
13491 derr << __func__ << " " << p->soid << " exists, but should have been "
13492 << "deleted" << dendl;
13493 ceph_abort_msg("erroneously present object");
13494 }
13495 } else {
13496 // ignore old(+missing) objects
13497 }
13498 }
13499 }
13500
13501
13502
13503 // ===========================
13504 // hit sets
13505
13506 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13507 {
13508 ostringstream ss;
13509 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13510 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13511 info.pgid.ps(), info.pgid.pool(),
13512 cct->_conf->osd_hit_set_namespace);
13513 dout(20) << __func__ << " " << hoid << dendl;
13514 return hoid;
13515 }
13516
13517 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13518 utime_t end,
13519 bool using_gmt)
13520 {
13521 ostringstream ss;
13522 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13523 if (using_gmt) {
13524 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
13525 end.gmtime(ss, true /* legacy pre-octopus form */);
13526 } else {
13527 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
13528 end.localtime(ss, true /* legacy pre-octopus form */);
13529 }
13530 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13531 info.pgid.ps(), info.pgid.pool(),
13532 cct->_conf->osd_hit_set_namespace);
13533 dout(20) << __func__ << " " << hoid << dendl;
13534 return hoid;
13535 }
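// The generated name has the shape
//   hit_set_<pgid>_archive_<start>_<end>
// with the stamps rendered in the legacy pre-octopus form; that
// rendering must stay byte-stable across releases, since the same
// history entry has to map to the same archive object name on every OSD.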
13536
13537 void PrimaryLogPG::hit_set_clear()
13538 {
13539 dout(20) << __func__ << dendl;
13540 hit_set.reset();
13541 hit_set_start_stamp = utime_t();
13542 }
13543
13544 void PrimaryLogPG::hit_set_setup()
13545 {
13546 if (!is_active() ||
13547 !is_primary()) {
13548 hit_set_clear();
13549 return;
13550 }
13551
13552 if (is_active() && is_primary() &&
13553 (!pool.info.hit_set_count ||
13554 !pool.info.hit_set_period ||
13555 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
13556 hit_set_clear();
13557
13558 // only primary is allowed to remove all the hit set objects
13559 hit_set_remove_all();
13560 return;
13561 }
13562
13563 // FIXME: discard any previous data for now
13564 hit_set_create();
13565
13566 // include any writes we know about from the pg log. this doesn't
13567 // capture reads, but it is better than nothing!
13568 hit_set_apply_log();
13569 }
13570
13571 void PrimaryLogPG::hit_set_remove_all()
13572 {
13573 // If any archives are degraded we skip this
13574 for (auto p = info.hit_set.history.begin();
13575 p != info.hit_set.history.end();
13576 ++p) {
13577 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13578
13579 // Once we hit a degraded object just skip
13580 if (is_degraded_or_backfilling_object(aoid))
13581 return;
13582 if (write_blocked_by_scrub(aoid))
13583 return;
13584 }
13585
13586 if (!info.hit_set.history.empty()) {
13587 auto p = info.hit_set.history.rbegin();
13588 ceph_assert(p != info.hit_set.history.rend());
13589 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13590 ceph_assert(!is_degraded_or_backfilling_object(oid));
13591 ObjectContextRef obc = get_object_context(oid, false);
13592 ceph_assert(obc);
13593
13594 OpContextUPtr ctx = simple_opc_create(obc);
13595 ctx->at_version = get_next_version();
13596 ctx->updated_hset_history = info.hit_set;
13597 utime_t now = ceph_clock_now();
13598 ctx->mtime = now;
13599 hit_set_trim(ctx, 0);
13600 simple_opc_submit(std::move(ctx));
13601 }
13602
13603 recovery_state.update_hset(pg_hit_set_history_t());
13604 if (agent_state) {
13605 agent_state->discard_hit_sets();
13606 }
13607 }
13608
13609 void PrimaryLogPG::hit_set_create()
13610 {
13611 utime_t now = ceph_clock_now();
13612 // make a copy of the params to modify
13613 HitSet::Params params(pool.info.hit_set_params);
13614
13615 dout(20) << __func__ << " " << params << dendl;
13616 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
13617 BloomHitSet::Params *p =
13618 static_cast<BloomHitSet::Params*>(params.impl.get());
13619
13620 // convert false positive rate so it holds up across the full period
13621 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
13622 if (p->get_fpp() <= 0.0)
13623 p->set_fpp(.01); // fpp cannot be zero!
13624
13625 // if we don't have specified size, estimate target size based on the
13626 // previous bin!
13627 if (p->target_size == 0 && hit_set) {
13628 utime_t dur = now - hit_set_start_stamp;
13629 unsigned unique = hit_set->approx_unique_insert_count();
13630 dout(20) << __func__ << " previous set had approx " << unique
13631 << " unique items over " << dur << " seconds" << dendl;
13632 p->target_size = (double)unique * (double)pool.info.hit_set_period
13633 / (double)dur;
13634 }
13635 if (p->target_size <
13636 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
13637 p->target_size = cct->_conf->osd_hit_set_min_size;
13638
13639 if (p->target_size
13640 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
13641 p->target_size = cct->_conf->osd_hit_set_max_size;
13642
13643 p->seed = now.sec();
13644
13645 dout(10) << __func__ << " target_size " << p->target_size
13646 << " fpp " << p->get_fpp() << dendl;
13647 }
13648 hit_set.reset(new HitSet(params));
13649 hit_set_start_stamp = now;
13650 }
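// fpp scaling example (numbers hypothetical): with hit_set_count = 4
// and a configured false-positive rate of 0.04, each per-period bloom
// filter is created with fpp = 0.01, so by the union bound the chance
// of a false positive across the whole retained history stays near the
// configured 4%.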
13651
13652 /**
13653 * apply log entries to set
13654 *
13655 * this would only happen after peering, to at least capture writes
13656 * during an interval that was potentially lost.
13657 */
13658 bool PrimaryLogPG::hit_set_apply_log()
13659 {
13660 if (!hit_set)
13661 return false;
13662
13663 eversion_t to = info.last_update;
13664 eversion_t from = info.hit_set.current_last_update;
13665 if (to <= from) {
13666 dout(20) << __func__ << " no update" << dendl;
13667 return false;
13668 }
13669
13670 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
13671 list<pg_log_entry_t>::const_reverse_iterator p =
13672 recovery_state.get_pg_log().get_log().log.rbegin();
13673 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
13674 ++p;
13675 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
13676 hit_set->insert(p->soid);
13677 ++p;
13678 }
13679
13680 return true;
13681 }
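// The log is walked newest-to-oldest: entries newer than 'to' are
// skipped (defensive; 'to' is info.last_update, so there should be
// none), then every object touched in (from, to] is inserted.  Reads
// never reach the pg log, which is why this is only a partial
// reconstruction of the hit set.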
13682
13683 void PrimaryLogPG::hit_set_persist()
13684 {
13685 dout(10) << __func__ << dendl;
13686 bufferlist bl;
13687 unsigned max = pool.info.hit_set_count;
13688
13689 utime_t now = ceph_clock_now();
13690 hobject_t oid;
13691
13692 // If any archives are degraded we skip this persist request
13693 // account for the additional entry being added below
13694 for (auto p = info.hit_set.history.begin();
13695 p != info.hit_set.history.end();
13696 ++p) {
13697 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13698
13699 // Once we hit a degraded object just skip further trim
13700 if (is_degraded_or_backfilling_object(aoid))
13701 return;
13702 if (write_blocked_by_scrub(aoid))
13703 return;
13704 }
13705
13706 // If backfill is in progress and we could possibly overlap with the
13707 // hit_set_* objects, back off. Since these all have
13708 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13709 // look just at that. This is necessary because our transactions
13710 // may include a modify of the new hit_set *and* a delete of the
13711 // old one, and this may span the backfill boundary.
13712 for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
13713 p != get_backfill_targets().end();
13714 ++p) {
13715 const pg_info_t& pi = recovery_state.get_peer_info(*p);
13716 if (pi.last_backfill == hobject_t() ||
13717 pi.last_backfill.get_hash() == info.pgid.ps()) {
13718 dout(10) << __func__ << " backfill target osd." << *p
13719 << " last_backfill has not progressed past pgid ps"
13720 << dendl;
13721 return;
13722 }
13723 }
13724
13725
13726 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
13727 new_hset.begin = hit_set_start_stamp;
13728 new_hset.end = now;
13729 oid = get_hit_set_archive_object(
13730 new_hset.begin,
13731 new_hset.end,
13732 new_hset.using_gmt);
13733
13734 // If writing the new archive object would collide with scrub we skip this persist request
13735 if (write_blocked_by_scrub(oid))
13736 return;
13737
13738 hit_set->seal();
13739 encode(*hit_set, bl);
13740 dout(20) << __func__ << " archive " << oid << dendl;
13741
13742 if (agent_state) {
13743 agent_state->add_hit_set(new_hset.begin, hit_set);
13744 uint32_t size = agent_state->hit_set_map.size();
13745 if (size >= pool.info.hit_set_count) {
13746 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
13747 }
13748 hit_set_in_memory_trim(size);
13749 }
13750
13751 ObjectContextRef obc = get_object_context(oid, true);
13752 OpContextUPtr ctx = simple_opc_create(obc);
13753
13754 ctx->at_version = get_next_version();
13755 ctx->updated_hset_history = info.hit_set;
13756 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
13757
13758 updated_hit_set_hist.current_last_update = info.last_update;
13759 new_hset.version = ctx->at_version;
13760
13761 updated_hit_set_hist.history.push_back(new_hset);
13762 hit_set_create();
13763
13764 // fabricate an object_info_t and SnapSet
13765 obc->obs.oi.version = ctx->at_version;
13766 obc->obs.oi.mtime = now;
13767 obc->obs.oi.size = bl.length();
13768 obc->obs.exists = true;
13769 obc->obs.oi.set_data_digest(bl.crc32c(-1));
13770
13771 ctx->new_obs = obc->obs;
13772
13773 ctx->new_snapset = obc->ssc->snapset;
13774
13775 ctx->delta_stats.num_objects++;
13776 ctx->delta_stats.num_objects_hit_set_archive++;
13777
13778 ctx->delta_stats.num_bytes += bl.length();
13779 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
13780
13781 bufferlist bss;
13782 encode(ctx->new_snapset, bss);
13783 bufferlist boi(sizeof(ctx->new_obs.oi));
13784 encode(ctx->new_obs.oi, boi,
13785 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13786
13787 ctx->op_t->create(oid);
13788 if (bl.length()) {
13789 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
13790 write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
13791 0, bl.length());
13792 ctx->clean_regions.mark_data_region_dirty(0, bl.length());
13793 }
13794 map <string, bufferlist> attrs;
13795 attrs[OI_ATTR].claim(boi);
13796 attrs[SS_ATTR].claim(bss);
13797 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
13798 ctx->log.push_back(
13799 pg_log_entry_t(
13800 pg_log_entry_t::MODIFY,
13801 oid,
13802 ctx->at_version,
13803 eversion_t(),
13804 0,
13805 osd_reqid_t(),
13806 ctx->mtime,
13807 0)
13808 );
13809 ctx->log.back().clean_regions = ctx->clean_regions;
13810
13811 hit_set_trim(ctx, max);
13812
13813 simple_opc_submit(std::move(ctx));
13814 }
13815
13816 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
13817 {
13818 ceph_assert(ctx->updated_hset_history);
13819 pg_hit_set_history_t &updated_hit_set_hist =
13820 *(ctx->updated_hset_history);
13821 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
13822 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
13823 ceph_assert(p != updated_hit_set_hist.history.end());
13824 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13825
13826 ceph_assert(!is_degraded_or_backfilling_object(oid));
13827
13828 dout(20) << __func__ << " removing " << oid << dendl;
13829 ++ctx->at_version.version;
13830 ctx->log.push_back(
13831 pg_log_entry_t(pg_log_entry_t::DELETE,
13832 oid,
13833 ctx->at_version,
13834 p->version,
13835 0,
13836 osd_reqid_t(),
13837 ctx->mtime,
13838 0));
13839
13840 ctx->op_t->remove(oid);
13841 updated_hit_set_hist.history.pop_front();
13842
13843 ObjectContextRef obc = get_object_context(oid, false);
13844 ceph_assert(obc);
13845 --ctx->delta_stats.num_objects;
13846 --ctx->delta_stats.num_objects_hit_set_archive;
13847 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
13848 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
13849 }
13850 }
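// Each trimmed archive gets its own DELETE log entry, and
// ctx->at_version.version is bumped per removal so every entry carries
// a distinct version while staying within the same epoch as the op
// that triggered the trim.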
13851
13852 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
13853 {
13854 while (agent_state->hit_set_map.size() > max_in_memory) {
13855 agent_state->remove_oldest_hit_set();
13856 }
13857 }
13858
13859
13860 // =======================================
13861 // cache agent
13862
13863 void PrimaryLogPG::agent_setup()
13864 {
13865 ceph_assert(is_locked());
13866 if (!is_active() ||
13867 !is_primary() ||
13868 state_test(PG_STATE_PREMERGE) ||
13869 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13870 pool.info.tier_of < 0 ||
13871 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13872 agent_clear();
13873 return;
13874 }
13875 if (!agent_state) {
13876 agent_state.reset(new TierAgentState);
13877
13878 // choose random starting position
13879 agent_state->position = hobject_t();
13880 agent_state->position.pool = info.pgid.pool();
13881 agent_state->position.set_hash(pool.info.get_random_pg_position(
13882 info.pgid.pgid,
13883 rand()));
13884 agent_state->start = agent_state->position;
13885
13886 dout(10) << __func__ << " allocated new state, position "
13887 << agent_state->position << dendl;
13888 } else {
13889 dout(10) << __func__ << " keeping existing state" << dendl;
13890 }
13891
13892 if (info.stats.stats_invalid) {
13893 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13894 }
13895
13896 agent_choose_mode();
13897 }
13898
13899 void PrimaryLogPG::agent_clear()
13900 {
13901 agent_stop();
13902 agent_state.reset(NULL);
13903 }
13904
13905 // Return false if no objects were operated on since the start of a full pass over the object hash space
13906 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13907 {
13908 std::scoped_lock locker{*this};
13909 if (!agent_state) {
13910 dout(10) << __func__ << " no agent state, stopping" << dendl;
13911 return true;
13912 }
13913
13914 ceph_assert(!recovery_state.is_deleting());
13915
13916 if (agent_state->is_idle()) {
13917 dout(10) << __func__ << " idle, stopping" << dendl;
13918 return true;
13919 }
13920
13921 osd->logger->inc(l_osd_agent_wake);
13922
13923 dout(10) << __func__
13924 << " max " << start_max
13925 << ", flush " << agent_state->get_flush_mode_name()
13926 << ", evict " << agent_state->get_evict_mode_name()
13927 << ", pos " << agent_state->position
13928 << dendl;
13929 ceph_assert(is_primary());
13930 ceph_assert(is_active());
13931
13932 agent_load_hit_sets();
13933
13934 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13935 ceph_assert(base_pool);
13936
13937 int ls_min = 1;
13938 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13939
13940 // list some objects. this conveniently lists clones (oldest to
13941 // newest) before heads... the same order we want to flush in.
13942 //
13943 // NOTE: do not flush the Sequencer. we will assume that the
13944 // listing we get back is imprecise.
13945 vector<hobject_t> ls;
13946 hobject_t next;
13947 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13948 &ls, &next);
13949 ceph_assert(r >= 0);
13950 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13951 int started = 0;
13952 for (vector<hobject_t>::iterator p = ls.begin();
13953 p != ls.end();
13954 ++p) {
13955 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13956 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13957 osd->logger->inc(l_osd_agent_skip);
13958 continue;
13959 }
13960 if (is_degraded_or_backfilling_object(*p)) {
13961 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13962 osd->logger->inc(l_osd_agent_skip);
13963 continue;
13964 }
13965 if (is_missing_object(p->get_head())) {
13966 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13967 osd->logger->inc(l_osd_agent_skip);
13968 continue;
13969 }
13970 ObjectContextRef obc = get_object_context(*p, false, NULL);
13971 if (!obc) {
13972 // we didn't flush; we may miss something here.
13973 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13974 osd->logger->inc(l_osd_agent_skip);
13975 continue;
13976 }
13977 if (!obc->obs.exists) {
13978 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13979 osd->logger->inc(l_osd_agent_skip);
13980 continue;
13981 }
13982 if (range_intersects_scrub(obc->obs.oi.soid,
13983 obc->obs.oi.soid.get_head())) {
13984 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13985 osd->logger->inc(l_osd_agent_skip);
13986 continue;
13987 }
13988 if (obc->is_blocked()) {
13989 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13990 osd->logger->inc(l_osd_agent_skip);
13991 continue;
13992 }
13993 if (obc->is_request_pending()) {
13994 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13995 osd->logger->inc(l_osd_agent_skip);
13996 continue;
13997 }
13998
13999 // be careful flushing omap to an EC pool.
14000 if (!base_pool->supports_omap() &&
14001 obc->obs.oi.is_omap()) {
14002 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14003 osd->logger->inc(l_osd_agent_skip);
14004 continue;
14005 }
14006
14007 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14008 agent_maybe_evict(obc, false))
14009 ++started;
14010 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14011 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14012 ++started;
14013 --agent_flush_quota;
14014 }
14015 if (started >= start_max) {
14016 // If finishing early, set "next" to the next object
14017 if (++p != ls.end())
14018 next = *p;
14019 break;
14020 }
14021 }
14022
14023 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14024 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14025 agent_state->hist_age = 0;
14026 agent_state->temp_hist.decay();
14027 }
14028
14029 // Total objects operated on so far
14030 int total_started = agent_state->started + started;
14031 bool need_delay = false;
14032
14033 dout(20) << __func__ << " start pos " << agent_state->position
14034 << " next start pos " << next
14035 << " started " << total_started << dendl;
14036
14037 // See if we've made a full pass over the object hash space
14038 // This might check at most ls_max objects a second time to notice that
14039 // we've checked every object at least once.
14040 if (agent_state->position < agent_state->start &&
14041 next >= agent_state->start) {
14042 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14043 if (total_started == 0)
14044 need_delay = true;
14045 else
14046 total_started = 0;
14047 agent_state->start = next;
14048 }
14049 agent_state->started = total_started;
14050
14051 // See if we are starting again from the beginning
14052 if (next.is_max())
14053 agent_state->position = hobject_t();
14054 else
14055 agent_state->position = next;
14056
14057 // Discard old in memory HitSets
14058 hit_set_in_memory_trim(pool.info.hit_set_count);
14059
14060 if (need_delay) {
14061 ceph_assert(agent_state->delaying == false);
14062 agent_delay();
14063 return false;
14064 }
14065 agent_choose_mode();
14066 return true;
14067 }
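/*
 * Wrap-around bookkeeping, in brief: 'start' marks where the current
 * pass over the hash space began.  When 'position' steps past 'start'
 * without a single flush or evict having been started during the whole
 * pass, the agent delays instead of spinning; otherwise the counter
 * resets and a new pass begins at 'next'.
 */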
14068
14069 void PrimaryLogPG::agent_load_hit_sets()
14070 {
14071 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14072 return;
14073 }
14074
14075 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14076 dout(10) << __func__ << dendl;
14077 for (auto p = info.hit_set.history.begin();
14078 p != info.hit_set.history.end(); ++p) {
14079 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14080 dout(10) << __func__ << " loading " << p->begin << "-"
14081 << p->end << dendl;
14082 if (!pool.info.is_replicated()) {
14083 // FIXME: EC not supported here yet
14084 derr << __func__ << " on non-replicated pool" << dendl;
14085 break;
14086 }
14087
14088 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14089 if (is_unreadable_object(oid)) {
14090 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14091 break;
14092 }
14093
14094 ObjectContextRef obc = get_object_context(oid, false);
14095 if (!obc) {
14096 derr << __func__ << ": could not load hitset " << oid << dendl;
14097 break;
14098 }
14099
14100 bufferlist bl;
14101 {
14102 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14103 ceph_assert(r >= 0);
14104 }
14105 HitSetRef hs(new HitSet);
14106 bufferlist::const_iterator pbl = bl.begin();
14107 decode(*hs, pbl);
14108 agent_state->add_hit_set(p->begin.sec(), hs);
14109 }
14110 }
14111 }
14112 }
14113
14114 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14115 {
14116 if (!obc->obs.oi.is_dirty()) {
14117 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14118 osd->logger->inc(l_osd_agent_skip);
14119 return false;
14120 }
14121 if (obc->obs.oi.is_cache_pinned()) {
14122 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14123 osd->logger->inc(l_osd_agent_skip);
14124 return false;
14125 }
14126
14127 utime_t now = ceph_clock_now();
14128 utime_t ob_local_mtime;
14129 if (obc->obs.oi.local_mtime != utime_t()) {
14130 ob_local_mtime = obc->obs.oi.local_mtime;
14131 } else {
14132 ob_local_mtime = obc->obs.oi.mtime;
14133 }
14134 bool evict_mode_full =
14135 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14136 if (!evict_mode_full &&
14137 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14138 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14139 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14140 osd->logger->inc(l_osd_agent_skip);
14141 return false;
14142 }
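  // Illustrative example (hypothetical values): with cache_min_flush_age =
  // 600s and an object whose local_mtime is only 200s in the past,
  // ob_local_mtime + 600s > now, so the object is "too young" and skipped
  // -- unless evict mode is FULL, in which case age is ignored.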
14143
14144 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14145 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14146 osd->logger->inc(l_osd_agent_skip);
14147 return false;
14148 }
14149
14150 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14151
14152 // FIXME: flush anything dirty, regardless of what distribution of
14153 // ages we expect.
14154
14155 hobject_t oid = obc->obs.oi.soid;
14156 osd->agent_start_op(oid);
14157 // no need to capture a pg ref, can't outlive fop or ctx
14158 std::function<void()> on_flush = [this, oid]() {
14159 osd->agent_finish_op(oid);
14160 };
14161
14162 int result = start_flush(
14163 OpRequestRef(), obc, false, NULL,
14164 on_flush);
14165 if (result != -EINPROGRESS) {
14166 on_flush();
14167 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14168 << " with " << result << dendl;
14169 osd->logger->inc(l_osd_agent_skip);
14170 return false;
14171 }
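  // Note: reaching this point means start_flush() returned -EINPROGRESS,
  // which is the success path here -- the flush was queued and on_flush
  // will run when it completes.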
14172
14173 osd->logger->inc(l_osd_agent_flush);
14174 return true;
14175 }
14176
14177 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14178 {
14179 const hobject_t& soid = obc->obs.oi.soid;
14180 if (!after_flush && obc->obs.oi.is_dirty()) {
14181 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14182 return false;
14183 }
14184 // This is already checked by agent_work() which passes after_flush = false
14185 if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
14186 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14187 return false;
14188 }
14189 if (!obc->obs.oi.watchers.empty()) {
14190 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14191 return false;
14192 }
14193 if (obc->is_blocked()) {
14194 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14195 return false;
14196 }
14197 if (obc->obs.oi.is_cache_pinned()) {
14198 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14199 return false;
14200 }
14201
14202 if (soid.snap == CEPH_NOSNAP) {
14203 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14204 if (result < 0) {
14205 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14206 return false;
14207 }
14208 }
14209
14210 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14211 // is this object older than cache_min_evict_age?
14212 utime_t now = ceph_clock_now();
14213 utime_t ob_local_mtime;
14214 if (obc->obs.oi.local_mtime != utime_t()) {
14215 ob_local_mtime = obc->obs.oi.local_mtime;
14216 } else {
14217 ob_local_mtime = obc->obs.oi.mtime;
14218 }
14219 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14220 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14221 osd->logger->inc(l_osd_agent_skip);
14222 return false;
14223 }
14224 // is this object old and/or cold enough?
14225 int temp = 0;
14226 uint64_t temp_upper = 0, temp_lower = 0;
14227 if (hit_set)
14228 agent_estimate_temp(soid, &temp);
14229 agent_state->temp_hist.add(temp);
14230 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14231
14232 dout(20) << __func__
14233 << " temp " << temp
14234 << " pos " << temp_lower << "-" << temp_upper
14235 << ", evict_effort " << agent_state->evict_effort
14236 << dendl;
14237 dout(30) << "agent_state:\n";
14238 Formatter *f = Formatter::create("");
14239 f->open_object_section("agent_state");
14240 agent_state->dump(f);
14241 f->close_section();
14242 f->flush(*_dout);
14243 delete f;
14244 *_dout << dendl;
14245
14246 if (1000000 - temp_upper >= agent_state->evict_effort)
14247 return false;
14248 }
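  // Illustrative arithmetic (hypothetical values): temp_upper is the
  // cumulative fraction, in millionths, of recorded temperatures at or
  // below this object's. With temp_upper = 650000 the object survives any
  // evict_effort <= 350000 and is only evicted once the effort rises above
  // 1000000 - 650000.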
14249
14250 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14251 OpContextUPtr ctx = simple_opc_create(obc);
14252
14253 auto null_op_req = OpRequestRef();
14254 if (!ctx->lock_manager.get_lock_type(
14255 RWState::RWWRITE,
14256 obc->obs.oi.soid,
14257 obc,
14258 null_op_req)) {
14259 close_op_ctx(ctx.release());
14260 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14261 return false;
14262 }
14263
14264 osd->agent_start_evict_op();
14265 ctx->register_on_finish(
14266 [this]() {
14267 osd->agent_finish_evict_op();
14268 });
14269
14270 ctx->at_version = get_next_version();
14271 ceph_assert(ctx->new_obs.exists);
14272 int r = _delete_oid(ctx.get(), true, false);
14273 if (obc->obs.oi.is_omap())
14274 ctx->delta_stats.num_objects_omap--;
14275 ctx->delta_stats.num_evict++;
14276 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
14277 if (obc->obs.oi.is_dirty())
14278 --ctx->delta_stats.num_objects_dirty;
14279 ceph_assert(r == 0);
14280 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
14281 simple_opc_submit(std::move(ctx));
14282 osd->logger->inc(l_osd_tier_evict);
14283 osd->logger->inc(l_osd_agent_evict);
14284 return true;
14285 }
14286
14287 void PrimaryLogPG::agent_stop()
14288 {
14289 dout(20) << __func__ << dendl;
14290 if (agent_state && !agent_state->is_idle()) {
14291 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14292 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14293 osd->agent_disable_pg(this, agent_state->evict_effort);
14294 }
14295 }
14296
14297 void PrimaryLogPG::agent_delay()
14298 {
14299 dout(20) << __func__ << dendl;
14300 if (agent_state && !agent_state->is_idle()) {
14301 ceph_assert(agent_state->delaying == false);
14302 agent_state->delaying = true;
14303 osd->agent_disable_pg(this, agent_state->evict_effort);
14304 }
14305 }
14306
14307 void PrimaryLogPG::agent_choose_mode_restart()
14308 {
14309 dout(20) << __func__ << dendl;
14310 std::scoped_lock locker{*this};
14311 if (agent_state && agent_state->delaying) {
14312 agent_state->delaying = false;
14313 agent_choose_mode(true);
14314 }
14315 }
14316
14317 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14318 {
14319 bool requeued = false;
14320 // Let delay play out
14321 if (agent_state->delaying) {
14322 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
14323 return requeued;
14324 }
14325
14326 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14327 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14328 unsigned evict_effort = 0;
14329
14330 if (info.stats.stats_invalid) {
14331 // idle; stats can't be trusted until we scrub.
14332 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14333 goto skip_calc;
14334 }
14335
14336 {
14337 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
14338 ceph_assert(divisor > 0);
14339
14340 // adjust (effective) user objects down based on the number
14341 // of HitSet objects, which should not count toward our total since
14342 // they cannot be flushed.
14343 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14344
14345 // also exclude omap objects if the backing pool does not support omap (e.g. EC)
14346 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14347 ceph_assert(base_pool);
14348 if (!base_pool->supports_omap())
14349 unflushable += info.stats.stats.sum.num_objects_omap;
14350
14351 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14352 if (num_user_objects > unflushable)
14353 num_user_objects -= unflushable;
14354 else
14355 num_user_objects = 0;
14356
14357 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14358 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14359 num_user_bytes -= unflushable_bytes;
14360 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14361 num_user_bytes += num_overhead_bytes;
14362
14363 // also reduce the num_dirty by num_objects_omap
14364 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14365 if (!base_pool->supports_omap()) {
14366 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14367 num_dirty -= info.stats.stats.sum.num_objects_omap;
14368 else
14369 num_dirty = 0;
14370 }
14371
14372 dout(10) << __func__
14373 << " flush_mode: "
14374 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14375 << " evict_mode: "
14376 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14377 << " num_objects: " << info.stats.stats.sum.num_objects
14378 << " num_bytes: " << info.stats.stats.sum.num_bytes
14379 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14380 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14381 << " num_dirty: " << num_dirty
14382 << " num_user_objects: " << num_user_objects
14383 << " num_user_bytes: " << num_user_bytes
14384 << " num_overhead_bytes: " << num_overhead_bytes
14385 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14386 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14387 << dendl;
14388
14389 // get dirty, full ratios
14390 uint64_t dirty_micro = 0;
14391 uint64_t full_micro = 0;
14392 if (pool.info.target_max_bytes && num_user_objects > 0) {
14393 uint64_t avg_size = num_user_bytes / num_user_objects;
14394 dirty_micro =
14395 num_dirty * avg_size * 1000000 /
14396 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14397 full_micro =
14398 num_user_objects * avg_size * 1000000 /
14399 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14400 }
14401 if (pool.info.target_max_objects > 0) {
14402 uint64_t dirty_objects_micro =
14403 num_dirty * 1000000 /
14404 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14405 if (dirty_objects_micro > dirty_micro)
14406 dirty_micro = dirty_objects_micro;
14407 uint64_t full_objects_micro =
14408 num_user_objects * 1000000 /
14409 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14410 if (full_objects_micro > full_micro)
14411 full_micro = full_objects_micro;
14412 }
14413 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14414 << " full " << ((float)full_micro / 1000000.0)
14415 << dendl;
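    // Worked example (hypothetical values): target_max_bytes = 1 TiB over
    // divisor = 256 PGs gives this PG a 4 GiB budget. With
    // num_user_objects = 1000, num_user_bytes = 2 GiB (avg_size = 2 MiB)
    // and num_dirty = 500:
    //   dirty_micro = 500 * 2 MiB * 1e6 / 4 GiB  = 250000   (25% dirty)
    //   full_micro  = 1000 * 2 MiB * 1e6 / 4 GiB = 500000   (50% full)
    // The object-count ratios below are computed the same way; the larger
    // estimate wins.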
14416
14417 // flush mode
14418 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14419 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14420 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14421 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14422 flush_target += flush_slop;
14423 flush_high_target += flush_slop;
14424 } else {
14425 flush_target -= std::min(flush_target, flush_slop);
14426 flush_high_target -= std::min(flush_high_target, flush_slop);
14427 }
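    // Worked example (hypothetical values): flush_target = 400000 (40%)
    // and osd_agent_slop = 0.02 give flush_slop = 8000. While idle the
    // effective target is 408000, so flushing starts only above 40.8%
    // dirty; once active it drops to 392000, so flushing continues until
    // dirty falls below 39.2%. The +/- slop is hysteresis against mode
    // flapping; the evict_target below gets the same treatment.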
14428
14429 if (dirty_micro > flush_high_target) {
14430 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
14431 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
14432 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14433 }
14434
14435 // evict mode
14436 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14437 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14438 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14439 evict_target += evict_slop;
14440 else
14441 evict_target -= std::min(evict_target, evict_slop);
14442
14443 if (full_micro > 1000000) {
14444 // evict anything clean
14445 evict_mode = TierAgentState::EVICT_MODE_FULL;
14446 evict_effort = 1000000;
14447 } else if (full_micro > evict_target) {
14448 // set effort in [0..1] range based on where we are between evict_target and 100% full
14449 evict_mode = TierAgentState::EVICT_MODE_SOME;
14450 uint64_t over = full_micro - evict_target;
14451 uint64_t span = 1000000 - evict_target;
14452 evict_effort = std::max(over * 1000000 / span,
14453 uint64_t(1000000.0 *
14454 cct->_conf->osd_agent_min_evict_effort));
14455
14456 // quantize effort to avoid too much reordering in the agent_queue.
14457 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
14458 ceph_assert(inc > 0);
14459 uint64_t was = evict_effort;
14460 evict_effort -= evict_effort % inc;
14461 if (evict_effort < inc)
14462 evict_effort = inc;
14463 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
14464 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
14465 }
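    // Worked example (hypothetical values): evict_target = 800000 and
    // full_micro = 870000 give over = 70000 and span = 200000, so the raw
    // effort is 70000 * 1000000 / 200000 = 350000. With
    // osd_agent_quantize_effort = 0.1, inc = 100000 and the effort is
    // rounded down to 300000; anything below one increment is clamped up
    // to inc so a nonzero effort always remains.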
14466 }
14467
14468 skip_calc:
14469 bool old_idle = agent_state->is_idle();
14470 if (flush_mode != agent_state->flush_mode) {
14471 dout(5) << __func__ << " flush_mode "
14472 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14473 << " -> "
14474 << TierAgentState::get_flush_mode_name(flush_mode)
14475 << dendl;
14476 recovery_state.update_stats(
14477 [=](auto &history, auto &stats) {
14478 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14479 osd->agent_inc_high_count();
14480 stats.stats.sum.num_flush_mode_high = 1;
14481 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14482 stats.stats.sum.num_flush_mode_low = 1;
14483 }
14484 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14485 osd->agent_dec_high_count();
14486 stats.stats.sum.num_flush_mode_high = 0;
14487 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14488 stats.stats.sum.num_flush_mode_low = 0;
14489 }
14490 return false;
14491 });
14492 agent_state->flush_mode = flush_mode;
14493 }
14494 if (evict_mode != agent_state->evict_mode) {
14495 dout(5) << __func__ << " evict_mode "
14496 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14497 << " -> "
14498 << TierAgentState::get_evict_mode_name(evict_mode)
14499 << dendl;
14500 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14501 is_active()) {
14502 if (op)
14503 requeue_op(op);
14504 requeue_ops(waiting_for_flush);
14505 requeue_ops(waiting_for_active);
14506 requeue_ops(waiting_for_readable);
14507 requeue_ops(waiting_for_scrub);
14508 requeue_ops(waiting_for_cache_not_full);
14509 objects_blocked_on_cache_full.clear();
14510 requeued = true;
14511 }
14512 recovery_state.update_stats(
14513 [=](auto &history, auto &stats) {
14514 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14515 stats.stats.sum.num_evict_mode_some = 1;
14516 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14517 stats.stats.sum.num_evict_mode_full = 1;
14518 }
14519 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14520 stats.stats.sum.num_evict_mode_some = 0;
14521 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14522 stats.stats.sum.num_evict_mode_full = 0;
14523 }
14524 return false;
14525 });
14526 agent_state->evict_mode = evict_mode;
14527 }
14528 uint64_t old_effort = agent_state->evict_effort;
14529 if (evict_effort != agent_state->evict_effort) {
14530 dout(5) << __func__ << " evict_effort "
14531 << ((float)agent_state->evict_effort / 1000000.0)
14532 << " -> "
14533 << ((float)evict_effort / 1000000.0)
14534 << dendl;
14535 agent_state->evict_effort = evict_effort;
14536 }
14537
14538 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14539 // (including flush). This is probably fine (they should be
14540 // correlated) but it is not precisely correct.
14541 if (agent_state->is_idle()) {
14542 if (!restart && !old_idle) {
14543 osd->agent_disable_pg(this, old_effort);
14544 }
14545 } else {
14546 if (restart || old_idle) {
14547 osd->agent_enable_pg(this, agent_state->evict_effort);
14548 } else if (old_effort != agent_state->evict_effort) {
14549 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
14550 }
14551 }
14552 return requeued;
14553 }
14554
14555 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
14556 {
14557 ceph_assert(hit_set);
14558 ceph_assert(temp);
14559 *temp = 0;
14560 if (hit_set->contains(oid))
14561 *temp = 1000000;
14562 unsigned i = 0;
14563 int last_n = pool.info.hit_set_search_last_n;
14564 for (map<time_t,HitSetRef>::reverse_iterator p =
14565 agent_state->hit_set_map.rbegin(); last_n > 0 &&
14566 p != agent_state->hit_set_map.rend(); ++p, ++i) {
14567 if (p->second->contains(oid)) {
14568 *temp += pool.info.get_grade(i);
14569 --last_n;
14570 }
14571 }
14572 }
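// Illustrative example (hypothetical grades): an object found in the
// current hit set starts at 1000000. If it also appears in the two most
// recent archived hit sets and pool.info.get_grade(i) yields 500000 and
// 250000 for i = 0 and 1, the estimate becomes 1750000 in total.
// hit_set_search_last_n bounds how many archived-set hits may contribute.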
14573
14574 // Dup op detection
14575
14576 bool PrimaryLogPG::already_complete(eversion_t v)
14577 {
14578 dout(20) << __func__ << ": " << v << dendl;
14579 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14580 !i.end();
14581 ++i) {
14582 dout(20) << __func__ << ": " << **i << dendl;
14583 // skip copy from temp object ops
14584 if ((*i)->v == eversion_t()) {
14585 dout(20) << __func__ << ": " << **i
14586 << " version is empty" << dendl;
14587 continue;
14588 }
14589 if ((*i)->v > v) {
14590 dout(20) << __func__ << ": " << **i
14591 << " (*i)->v past v" << dendl;
14592 break;
14593 }
14594 if (!(*i)->all_committed) {
14595 dout(20) << __func__ << ": " << **i
14596 << " not committed, returning false"
14597 << dendl;
14598 return false;
14599 }
14600 }
14601 dout(20) << __func__ << ": returning true" << dendl;
14602 return true;
14603 }
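// Illustrative trace: repop_queue is ordered by version. For v = 100'10,
// an entry at 100'8 that is all_committed is fine, an entry at 100'12
// ends the scan (it is past v), and an uncommitted entry at 100'9 means
// the op is not yet complete, so the dup-op reply must wait.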
14604
14605
14606 // ==========================================================================================
14607 // SCRUB
14608
14609
14610 bool PrimaryLogPG::_range_available_for_scrub(
14611 const hobject_t &begin, const hobject_t &end)
14612 {
14613 pair<hobject_t, ObjectContextRef> next;
14614 next.second = object_contexts.lookup(begin);
14615 next.first = begin;
14616 bool more = true;
14617 while (more && next.first < end) {
14618 if (next.second && next.second->is_blocked()) {
14619 next.second->requeue_scrub_on_unblock = true;
14620 dout(10) << __func__ << ": scrub delayed, "
14621 << next.first << " is blocked"
14622 << dendl;
14623 return false;
14624 }
14625 more = object_contexts.get_next(next.first, &next);
14626 }
14627 return true;
14628 }
14629
14630 static bool doing_clones(const std::optional<SnapSet> &snapset,
14631 const vector<snapid_t>::reverse_iterator &curclone) {
14632 return snapset && curclone != snapset->clones.rend();
14633 }
14634
14635 void PrimaryLogPG::log_missing(unsigned missing,
14636 const std::optional<hobject_t> &head,
14637 LogChannelRef clog,
14638 const spg_t &pgid,
14639 const char *func,
14640 const char *mode,
14641 bool allow_incomplete_clones)
14642 {
14643 ceph_assert(head);
14644 if (allow_incomplete_clones) {
14645 dout(20) << func << " " << mode << " " << pgid << " " << *head
14646 << " skipped " << missing << " clone(s) in cache tier" << dendl;
14647 } else {
14648 clog->info() << mode << " " << pgid << " " << *head
14649 << " : " << missing << " missing clone(s)";
14650 }
14651 }
14652
14653 unsigned PrimaryLogPG::process_clones_to(const std::optional<hobject_t> &head,
14654 const std::optional<SnapSet> &snapset,
14655 LogChannelRef clog,
14656 const spg_t &pgid,
14657 const char *mode,
14658 bool allow_incomplete_clones,
14659 std::optional<snapid_t> target,
14660 vector<snapid_t>::reverse_iterator *curclone,
14661 inconsistent_snapset_wrapper &e)
14662 {
14663 ceph_assert(head);
14664 ceph_assert(snapset);
14665 unsigned missing = 0;
14666
14667 // NOTE: clones are in descending order, hence the **curclone > *target test here
14668 hobject_t next_clone(*head);
14669 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
14670 ++missing;
14671 // it is okay to be missing one or more clones in a cache tier.
14672 // skip higher-numbered clones in the list.
14673 if (!allow_incomplete_clones) {
14674 next_clone.snap = **curclone;
14675 clog->error() << mode << " " << pgid << " " << *head
14676 << " : expected clone " << next_clone << " " << missing
14677 << " missing";
14678 ++scrubber.shallow_errors;
14679 e.set_clone_missing(next_clone.snap);
14680 }
14681 // Clones are descending
14682 ++(*curclone);
14683 }
14684 return missing;
14685 }
14686
14687 /*
14688 * Validate consistency of the object info and snap sets.
14689 *
14690 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
14691 * the comparison of the objects is against multiple snapset.clones. There are
14692 * multiple clone lists and in between lists we expect head.
14693 *
14694 * Example
14695 *
14696 * objects expected
14697 * ======= =======
14698 * obj1 snap 1 head, unexpected obj1 snap 1
14699 * obj2 head head, match
14700 * [SnapSet clones 6 4 2 1]
14701 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14702 * obj2 snap 6 obj2 snap 6, match
14703 * obj2 snap 4 obj2 snap 4, match
14704 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14705 * [SnapSet clones 3 1]
14706 * obj3 snap 3 obj3 snap 3 match
14707 * obj3 snap 1 obj3 snap 1 match
14708 * obj4 head head, match
14709 * [SnapSet clones 4]
14710 * EOL obj4 snap 4, (expected)
14711 */
14712 void PrimaryLogPG::scrub_snapshot_metadata(
14713 ScrubMap &scrubmap,
14714 const map<hobject_t,
14715 pair<std::optional<uint32_t>,
14716 std::optional<uint32_t>>> &missing_digest)
14717 {
14718 dout(10) << __func__ << dendl;
14719
14720 bool repair = state_test(PG_STATE_REPAIR);
14721 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14722 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14723 std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
14724
14725 // traverse in reverse order.
14726 std::optional<hobject_t> head;
14727 std::optional<SnapSet> snapset; // If initialized, head (above) will be too
14728 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
14729 unsigned missing = 0;
14730 inconsistent_snapset_wrapper soid_error, head_error;
14731 unsigned soid_error_count = 0;
14732
14733 for (map<hobject_t,ScrubMap::object>::reverse_iterator
14734 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
14735 const hobject_t& soid = p->first;
14736 ceph_assert(!soid.is_snapdir());
14737 soid_error = inconsistent_snapset_wrapper{soid};
14738 object_stat_sum_t stat;
14739 std::optional<object_info_t> oi;
14740
14741 stat.num_objects++;
14742
14743 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14744 stat.num_objects_hit_set_archive++;
14745
14746 if (soid.is_snap()) {
14747 // it's a clone
14748 stat.num_object_clones++;
14749 }
14750
14751 // basic checks.
14752 if (p->second.attrs.count(OI_ATTR) == 0) {
14753 oi = std::nullopt;
14754 osd->clog->error() << mode << " " << info.pgid << " " << soid
14755 << " : no '" << OI_ATTR << "' attr";
14756 ++scrubber.shallow_errors;
14757 soid_error.set_info_missing();
14758 } else {
14759 bufferlist bv;
14760 bv.push_back(p->second.attrs[OI_ATTR]);
14761 try {
14762 oi = object_info_t(); // Initialize optional<> before decoding into it
14763 oi->decode(bv);
14764 } catch (buffer::error& e) {
14765 oi = std::nullopt;
14766 osd->clog->error() << mode << " " << info.pgid << " " << soid
14767 << " : can't decode '" << OI_ATTR << "' attr " << e.what();
14768 ++scrubber.shallow_errors;
14769 soid_error.set_info_corrupted();
14770 soid_error.set_info_missing(); // Not available too
14771 }
14772 }
14773
14774 if (oi) {
14775 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
14776 osd->clog->error() << mode << " " << info.pgid << " " << soid
14777 << " : on disk size (" << p->second.size
14778 << ") does not match object info size ("
14779 << oi->size << ") adjusted for ondisk to ("
14780 << pgbackend->be_get_ondisk_size(oi->size)
14781 << ")";
14782 soid_error.set_size_mismatch();
14783 ++scrubber.shallow_errors;
14784 }
14785
14786 dout(20) << mode << " " << soid << " " << *oi << dendl;
14787
14788 // A clone num_bytes will be added later when we have snapset
14789 if (!soid.is_snap()) {
14790 stat.num_bytes += oi->size;
14791 }
14792 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14793 stat.num_bytes_hit_set_archive += oi->size;
14794
14795 if (oi->is_dirty())
14796 ++stat.num_objects_dirty;
14797 if (oi->is_whiteout())
14798 ++stat.num_whiteouts;
14799 if (oi->is_omap())
14800 ++stat.num_objects_omap;
14801 if (oi->is_cache_pinned())
14802 ++stat.num_objects_pinned;
14803 if (oi->has_manifest())
14804 ++stat.num_objects_manifest;
14805 }
14806
14807 // Check for any problems while processing clones
14808 if (doing_clones(snapset, curclone)) {
14809 std::optional<snapid_t> target;
14810 // Expecting an object with snap for current head
14811 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
14812
14813 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
14814 << soid << " while processing " << *head << dendl;
14815
14816 target = all_clones;
14817 } else {
14818 ceph_assert(soid.is_snap());
14819 target = soid.snap;
14820 }
14821
14822 // Log any clones we were expecting to be there up to target
14823 // This will set missing, but will be a no-op if soid.snap == *curclone.
14824 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14825 pool.info.allow_incomplete_clones(), target, &curclone,
14826 head_error);
14827 }
14828 bool expected;
14829 // Check doing_clones() again in case we ran process_clones_to()
14830 if (doing_clones(snapset, curclone)) {
14831 // A head would have processed all clones above
14832 // or all greater than *curclone.
14833 ceph_assert(soid.is_snap() && *curclone <= soid.snap);
14834
14835 // After processing above clone snap should match the expected curclone
14836 expected = (*curclone == soid.snap);
14837 } else {
14838 // If we aren't doing clones any longer, then expecting head
14839 expected = soid.has_snapset();
14840 }
14841 if (!expected) {
14842 // If we couldn't read the head's snapset, just ignore clones
14843 if (head && !snapset) {
14844 osd->clog->error() << mode << " " << info.pgid << " " << soid
14845 << " : clone ignored due to missing snapset";
14846 } else {
14847 osd->clog->error() << mode << " " << info.pgid << " " << soid
14848 << " : is an unexpected clone";
14849 }
14850 ++scrubber.shallow_errors;
14851 soid_error.set_headless();
14852 scrubber.store->add_snap_error(pool.id, soid_error);
14853 ++soid_error_count;
14854 if (head && soid.get_head() == head->get_head())
14855 head_error.set_clone(soid.snap);
14856 continue;
14857 }
14858
14859 // new snapset?
14860 if (soid.has_snapset()) {
14861
14862 if (missing) {
14863 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14864 pool.info.allow_incomplete_clones());
14865 }
14866
14867 // Save previous head error information
14868 if (head && (head_error.errors || soid_error_count))
14869 scrubber.store->add_snap_error(pool.id, head_error);
14870 // Set this as a new head object
14871 head = soid;
14872 missing = 0;
14873 head_error = soid_error;
14874 soid_error_count = 0;
14875
14876 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14877
14878 if (p->second.attrs.count(SS_ATTR) == 0) {
14879 osd->clog->error() << mode << " " << info.pgid << " " << soid
14880 << " : no '" << SS_ATTR << "' attr";
14881 ++scrubber.shallow_errors;
14882 snapset = std::nullopt;
14883 head_error.set_snapset_missing();
14884 } else {
14885 bufferlist bl;
14886 bl.push_back(p->second.attrs[SS_ATTR]);
14887 auto blp = bl.cbegin();
14888 try {
14889 snapset = SnapSet(); // Initialize optional<> before decoding into it
14890 decode(*snapset, blp);
14891 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14892 } catch (buffer::error& e) {
14893 snapset = std::nullopt;
14894 osd->clog->error() << mode << " " << info.pgid << " " << soid
14895 << " : can't decode '" << SS_ATTR << "' attr " << e.what();
14896 ++scrubber.shallow_errors;
14897 head_error.set_snapset_corrupted();
14898 }
14899 }
14900
14901 if (snapset) {
14902 // what will be next?
14903 curclone = snapset->clones.rbegin();
14904
14905 if (!snapset->clones.empty()) {
14906 dout(20) << " snapset " << *snapset << dendl;
14907 if (snapset->seq == 0) {
14908 osd->clog->error() << mode << " " << info.pgid << " " << soid
14909 << " : snaps.seq not set";
14910 ++scrubber.shallow_errors;
14911 head_error.set_snapset_error();
14912 }
14913 }
14914 }
14915 } else {
14916 ceph_assert(soid.is_snap());
14917 ceph_assert(head);
14918 ceph_assert(snapset);
14919 ceph_assert(soid.snap == *curclone);
14920
14921 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14922
14923 if (snapset->clone_size.count(soid.snap) == 0) {
14924 osd->clog->error() << mode << " " << info.pgid << " " << soid
14925 << " : is missing in clone_size";
14926 ++scrubber.shallow_errors;
14927 soid_error.set_size_mismatch();
14928 } else {
14929 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14930 osd->clog->error() << mode << " " << info.pgid << " " << soid
14931 << " : size " << oi->size << " != clone_size "
14932 << snapset->clone_size[soid.snap];
14933 ++scrubber.shallow_errors;
14934 soid_error.set_size_mismatch();
14935 }
14936
14937 if (snapset->clone_overlap.count(soid.snap) == 0) {
14938 osd->clog->error() << mode << " " << info.pgid << " " << soid
14939 << " : is missing in clone_overlap";
14940 ++scrubber.shallow_errors;
14941 soid_error.set_size_mismatch();
14942 } else {
14943 // This checking is based on get_clone_bytes(). The first 2 asserts
14944 // can't happen because we know we have a clone_size and
14945 // a clone_overlap. Now we check that the interval_set won't
14946 // cause the last assert.
14947 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14948 const interval_set<uint64_t> &overlap =
14949 snapset->clone_overlap.find(soid.snap)->second;
14950 bool bad_interval_set = false;
14951 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14952 i != overlap.end(); ++i) {
14953 if (size < i.get_len()) {
14954 bad_interval_set = true;
14955 break;
14956 }
14957 size -= i.get_len();
14958 }
14959
14960 if (bad_interval_set) {
14961 osd->clog->error() << mode << " " << info.pgid << " " << soid
14962 << " : bad interval_set in clone_overlap";
14963 ++scrubber.shallow_errors;
14964 soid_error.set_size_mismatch();
14965 } else {
14966 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14967 }
14968 }
14969 }
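	  // Worked example (hypothetical values): clone_size = 100 with
	  // clone_overlap intervals of lengths 60 and 30 subtracts 90,
	  // leaving 10 unique bytes for get_clone_bytes(). If the interval
	  // lengths summed past 100, get_clone_bytes() would underflow and
	  // assert, which is exactly what the scan above catches first.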
14970
14971 // what's next?
14972 ++curclone;
14973 if (soid_error.errors) {
14974 scrubber.store->add_snap_error(pool.id, soid_error);
14975 ++soid_error_count;
14976 }
14977 }
14978
14979 scrub_cstat.add(stat);
14980 }
14981
14982 if (doing_clones(snapset, curclone)) {
14983 dout(10) << __func__ << " " << mode << " " << info.pgid
14984 << " No more objects while processing " << *head << dendl;
14985
14986 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14987 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14988 head_error);
14989 }
14990 // There could be missing clones found by the check above, or even
14991 // ones detected before dropping out of the loop for the last head.
14992 if (missing) {
14993 log_missing(missing, head, osd->clog, info.pgid, __func__,
14994 mode, pool.info.allow_incomplete_clones());
14995 }
14996 if (head && (head_error.errors || soid_error_count))
14997 scrubber.store->add_snap_error(pool.id, head_error);
14998
14999 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
15000 ceph_assert(!p->first.is_snapdir());
15001 dout(10) << __func__ << " recording digests for " << p->first << dendl;
15002 ObjectContextRef obc = get_object_context(p->first, false);
15003 if (!obc) {
15004 osd->clog->error() << info.pgid << " " << mode
15005 << " cannot get object context for object "
15006 << p->first;
15007 continue;
15008 } else if (obc->obs.oi.soid != p->first) {
15009 osd->clog->error() << info.pgid << " " << mode
15010 << " " << p->first
15011 << " : object has a valid oi attr with a mismatched name, "
15012 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
15013 continue;
15014 }
15015 OpContextUPtr ctx = simple_opc_create(obc);
15016 ctx->at_version = get_next_version();
15017 ctx->mtime = utime_t(); // do not update mtime
15018 if (p->second.first) {
15019 ctx->new_obs.oi.set_data_digest(*p->second.first);
15020 } else {
15021 ctx->new_obs.oi.clear_data_digest();
15022 }
15023 if (p->second.second) {
15024 ctx->new_obs.oi.set_omap_digest(*p->second.second);
15025 } else {
15026 ctx->new_obs.oi.clear_omap_digest();
15027 }
15028 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
15029
15030 ctx->register_on_success(
15031 [this]() {
15032 dout(20) << "updating scrub digest" << dendl;
15033 if (--scrubber.num_digest_updates_pending == 0) {
15034 requeue_scrub();
15035 }
15036 });
15037
15038 simple_opc_submit(std::move(ctx));
15039 ++scrubber.num_digest_updates_pending;
15040 }
15041
15042 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
15043 }
15044
15045 void PrimaryLogPG::_scrub_clear_state()
15046 {
15047 scrub_cstat = object_stat_collection_t();
15048 }
15049
15050 void PrimaryLogPG::_scrub_finish()
15051 {
15052 bool repair = state_test(PG_STATE_REPAIR);
15053 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
15054 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
15055
15056 if (info.stats.stats_invalid) {
15057 recovery_state.update_stats(
15058 [=](auto &history, auto &stats) {
15059 stats.stats = scrub_cstat;
15060 stats.stats_invalid = false;
15061 return false;
15062 });
15063
15064 if (agent_state)
15065 agent_choose_mode();
15066 }
15067
15068 dout(10) << mode << " got "
15069 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15070 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15071 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15072 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15073 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15074 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15075 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15076 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15077 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
15078 << dendl;
15079
15080 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
15081 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
15082 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
15083 !info.stats.dirty_stats_invalid) ||
15084 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
15085 !info.stats.omap_stats_invalid) ||
15086 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
15087 !info.stats.pin_stats_invalid) ||
15088 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
15089 !info.stats.hitset_stats_invalid) ||
15090 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
15091 !info.stats.hitset_bytes_stats_invalid) ||
15092 (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
15093 !info.stats.manifest_stats_invalid) ||
15094 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
15095 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
15096 osd->clog->error() << info.pgid << " " << mode
15097 << " : stat mismatch, got "
15098 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15099 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15100 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15101 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15102 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15103 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15104 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
15105 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15106 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15107 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
15108 ++scrubber.shallow_errors;
15109
15110 if (repair) {
15111 ++scrubber.fixed;
15112 recovery_state.update_stats(
15113 [this](auto &history, auto &stats) {
15114 stats.stats = scrub_cstat;
15115 stats.dirty_stats_invalid = false;
15116 stats.omap_stats_invalid = false;
15117 stats.hitset_stats_invalid = false;
15118 stats.hitset_bytes_stats_invalid = false;
15119 stats.pin_stats_invalid = false;
15120 stats.manifest_stats_invalid = false;
15121 return false;
15122 });
15123 publish_stats_to_osd();
15124 recovery_state.share_pg_info();
15125 }
15126 }
15127 // Clear object context cache to get repair information
15128 if (repair)
15129 object_contexts.clear();
15130 }
15131
15132 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15133 {
15134 OpRequestRef op = ctx->op;
15135 // Only supports replicated pools
15136 ceph_assert(!pool.info.is_erasure());
15137 ceph_assert(is_primary());
15138
15139 dout(10) << __func__ << " " << soid
15140 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
15141
15142 if (!is_clean()) {
15143 block_for_clean(soid, op);
15144 return -EAGAIN;
15145 }
15146
15147 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
15148 auto& oi = ctx->new_obs.oi;
15149 eversion_t v = oi.version;
15150
15151 if (primary_error(soid, v)) {
15152 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15153 // XXX: If we knew that there is no down osd which could include this
15154 // object, it would be nice if we could return EIO here.
15155 // If a "never fail" flag was available, that could be used
15156 // for rbd to NOT return EIO until object marked lost.
15157
15158 // Drop through to save this op in case an osd comes up with the object.
15159 }
15160
15161 // Restart the op after object becomes readable again
15162 waiting_for_unreadable_object[soid].push_back(op);
15163 op->mark_delayed("waiting for missing object");
15164
15165 if (!eio_errors_to_process) {
15166 eio_errors_to_process = true;
15167 ceph_assert(is_clean());
15168 state_set(PG_STATE_REPAIR);
15169 state_clear(PG_STATE_CLEAN);
15170 queue_peering_event(
15171 PGPeeringEventRef(
15172 std::make_shared<PGPeeringEvent>(
15173 get_osdmap_epoch(),
15174 get_osdmap_epoch(),
15175 PeeringState::DoRecovery())));
15176 } else {
15177 // A prior error must have already cleared clean state and queued recovery
15178 // or a map change has triggered re-peering.
15179 // Not inlining the recovery by calling maybe_kick_recovery(soid);
15180 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
15181 }
15182
15183 return -EAGAIN;
15184 }
15185
15186 /*---SnapTrimmer Logging---*/
15187 #undef dout_prefix
15188 #define dout_prefix pg->gen_prefix(*_dout)
15189
15190 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15191 {
15192 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15193 }
15194
15195 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15196 {
15197 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15198 }
15199
15200 /*---SnapTrimmer states---*/
15201 #undef dout_prefix
15202 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15203 << "SnapTrimmer state<" << get_state_name() << ">: ")
15204
15205 /* NotTrimming */
15206 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15207 : my_base(ctx),
15208 NamedState(nullptr, "NotTrimming")
15209 {
15210 context< SnapTrimmer >().log_enter(state_name);
15211 }
15212
15213 void PrimaryLogPG::NotTrimming::exit()
15214 {
15215 context< SnapTrimmer >().log_exit(state_name, enter_time);
15216 }
15217
15218 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15219 {
15220 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15221 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15222
15223 if (!(pg->is_primary() && pg->is_active())) {
15224 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15225 return discard_event();
15226 }
15227 if (!pg->is_clean() ||
15228 pg->snap_trimq.empty()) {
15229 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15230 return discard_event();
15231 }
15232 if (pg->scrubber.active) {
15233 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15234 return transit< WaitScrub >();
15235 } else {
15236 return transit< Trimming >();
15237 }
15238 }
15239
15240 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15241 {
15242 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15243 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15244
15245 pending = nullptr;
15246 if (!context< SnapTrimmer >().can_trim()) {
15247 post_event(KickTrim());
15248 return transit< NotTrimming >();
15249 }
15250
15251 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15252 ldout(pg->cct, 10) << "NotTrimming: trimming "
15253 << pg->snap_trimq.range_start()
15254 << dendl;
15255 return transit< AwaitAsyncWork >();
15256 }
15257
15258 /* AwaitAsyncWork */
15259 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15260 : my_base(ctx),
15261 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15262 {
15263 auto *pg = context< SnapTrimmer >().pg;
15264 context< SnapTrimmer >().log_enter(state_name);
15265 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15266 pg->state_set(PG_STATE_SNAPTRIM);
15267 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15268 pg->publish_stats_to_osd();
15269 }
15270
15271 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15272 {
15273 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15274 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15275 auto &in_flight = context<Trimming>().in_flight;
15276 ceph_assert(in_flight.empty());
15277
15278 ceph_assert(pg->is_primary() && pg->is_active());
15279 if (!context< SnapTrimmer >().can_trim()) {
15280 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15281 post_event(KickTrim());
15282 return transit< NotTrimming >();
15283 }
15284
15285 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15286
15287 vector<hobject_t> to_trim;
15288 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15289 to_trim.reserve(max);
15290 int r = pg->snap_mapper.get_next_objects_to_trim(
15291 snap_to_trim,
15292 max,
15293 &to_trim);
15294 if (r != 0 && r != -ENOENT) {
15295 lderr(pg->cct) << "get_next_objects_to_trim returned "
15296 << cpp_strerror(r) << dendl;
15297 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15298 } else if (r == -ENOENT) {
15299 // Done!
15300 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15301
15302 pg->snap_trimq.erase(snap_to_trim);
15303
15304 if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15305 ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15306 pg->snap_trimq_repeat.erase(snap_to_trim);
15307 } else {
15308 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15309 << " to purged_snaps"
15310 << dendl;
15311 ObjectStore::Transaction t;
15312 pg->recovery_state.adjust_purged_snaps(
15313 [snap_to_trim](auto &purged_snaps) {
15314 purged_snaps.insert(snap_to_trim);
15315 });
15316 pg->write_if_dirty(t);
15317
15318 ldout(pg->cct, 10) << "purged_snaps now "
15319 << pg->info.purged_snaps << ", snap_trimq now "
15320 << pg->snap_trimq << dendl;
15321
15322 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15323 ceph_assert(tr == 0);
15324
15325 pg->recovery_state.share_pg_info();
15326 }
15327 post_event(KickTrim());
15328 return transit< NotTrimming >();
15329 }
15330 ceph_assert(!to_trim.empty());
15331
15332 for (auto &&object: to_trim) {
15333 // Get next
15334 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15335 OpContextUPtr ctx;
15336 int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
15337 if (error) {
15338 if (error == -ENOLCK) {
15339 ldout(pg->cct, 10) << "could not get write lock on obj "
15340 << object << dendl;
15341 } else {
15342 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15343 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15344 }
15345 if (!in_flight.empty()) {
15346 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15347 return transit< WaitRepops >();
15348 }
15349 if (error == -ENOLCK) {
15350 ldout(pg->cct, 10) << "waiting for it to clear"
15351 << dendl;
15352 return transit< WaitRWLock >();
15353 } else {
15354 return transit< NotTrimming >();
15355 }
15356 }
15357
15358 in_flight.insert(object);
15359 ctx->register_on_success(
15360 [pg, object, &in_flight]() {
15361 ceph_assert(in_flight.find(object) != in_flight.end());
15362 in_flight.erase(object);
15363 if (in_flight.empty()) {
15364 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15365 pg->snap_trimmer_machine.process_event(Reset());
15366 } else {
15367 pg->snap_trimmer_machine.process_event(RepopsComplete());
15368 }
15369 }
15370 });
15371
15372 pg->simple_opc_submit(std::move(ctx));
15373 }
15374
15375 return transit< WaitRepops >();
15376 }
15377
15378 void PrimaryLogPG::setattr_maybe_cache(
15379 ObjectContextRef obc,
15380 PGTransaction *t,
15381 const string &key,
15382 bufferlist &val)
15383 {
15384 t->setattr(obc->obs.oi.soid, key, val);
15385 }
15386
15387 void PrimaryLogPG::setattrs_maybe_cache(
15388 ObjectContextRef obc,
15389 PGTransaction *t,
15390 map<string, bufferlist> &attrs)
15391 {
15392 t->setattrs(obc->obs.oi.soid, attrs);
15393 }
15394
15395 void PrimaryLogPG::rmattr_maybe_cache(
15396 ObjectContextRef obc,
15397 PGTransaction *t,
15398 const string &key)
15399 {
15400 t->rmattr(obc->obs.oi.soid, key);
15401 }
15402
15403 int PrimaryLogPG::getattr_maybe_cache(
15404 ObjectContextRef obc,
15405 const string &key,
15406 bufferlist *val)
15407 {
15408 if (pool.info.is_erasure()) {
15409 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15410 if (i != obc->attr_cache.end()) {
15411 if (val)
15412 *val = i->second;
15413 return 0;
15414 } else {
15415 return -ENODATA;
15416 }
15417 }
15418 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15419 }
15420
15421 int PrimaryLogPG::getattrs_maybe_cache(
15422 ObjectContextRef obc,
15423 map<string, bufferlist> *out)
15424 {
15425 int r = 0;
15426 ceph_assert(out);
15427 if (pool.info.is_erasure()) {
15428 *out = obc->attr_cache;
15429 } else {
15430 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15431 }
15432 map<string, bufferlist> tmp;
15433 for (map<string, bufferlist>::iterator i = out->begin();
15434 i != out->end();
15435 ++i) {
15436 if (i->first.size() > 1 && i->first[0] == '_')
15437 tmp[i->first.substr(1, i->first.size())].claim(i->second);
15438 }
15439 tmp.swap(*out);
15440 return r;
15441 }
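// Note (convention, not new behavior): user xattrs are stored with a
// leading '_' to separate them from internal attrs; the loop above keeps
// only keys longer than one character that start with '_' and strips the
// prefix (a stored "_foo" is returned as "foo"), so internal attrs such
// as the snapset attr are filtered out of the result.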
15442
15443 bool PrimaryLogPG::check_failsafe_full() {
15444 return osd->check_failsafe_full(get_dpp());
15445 }
15446
15447 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15448 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15449
15450 #ifdef PG_DEBUG_REFS
15451 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15452 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15453 #endif
15454
15455 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15456 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }