// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

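// NOTE: completions like C_OSD_OnApplied (and the Blessed* wrappers below)
// re-take the PG lock and check pg_has_reset_since() before acting, so a
// callback queued under an older interval is dropped rather than applied
// against stale PG state.  A minimal sketch of the pattern (hypothetical
// C_Example, assuming only the lock/reset API used in this file):
//
//   struct C_Example : Context {
//     PrimaryLogPGRef pg;
//     epoch_t queued_epoch;
//     C_Example(PrimaryLogPGRef pg, epoch_t e) : pg(pg), queued_epoch(e) {}
//     void finish(int) override {
//       pg->lock();
//       if (!pg->pg_has_reset_since(queued_epoch)) {
//         // ... safe to touch PG state here ...
//       }
//       pg->unlock();
//     }
//   };
//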
/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
  public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
  public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
  public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
  public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
  public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}

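// CopyFromCallback completes a COPY_FROM: on success it restarts the op
// context (execute_ctx) so the op can finish; on error it replies to the
// client (or requeues, if the cancelled copy should be retried) and closes
// the op context.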
class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

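// on_local_recover: called when an object has been recovered to the local
// store.  It rebuilds the snap mapping for clones, rewrites the object_info
// if the recovered version corresponds to an old LOST_REVERT event, rolls
// the log forward if necessary, and registers on-applied / on-commit
// completions that requeue waiting ops once the transaction lands.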
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

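// on_global_recover: called once the object has been recovered on all
// replicas.  In addition to updating stats, it drops the recovery read
// lock, requeues ops that were waiting on the degraded/unreadable object,
// and releases any backoffs so clients can resend.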
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILL |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

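  // Only force recovery once the PG log has grown past
  // osd_max_pg_log_entries * osd_force_recovery_pg_log_entries_factor.
  // At that point log trimming is being held up by the oldest missing
  // object, so find it (across our own and every peer's missing set)
  // and kick its recovery first.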
  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

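  // The filter type is either a built-in name ("parent", "plain") or
  // "<class>.<filter>", naming a filter exported by a loadable object
  // class.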
  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
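        // Merge-walk the sorted backend listing with the PG's missing set
        // so that objects that exist only in the log (not yet recovered
        // locally) are still considered for the response.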
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

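// calc_trim_to: choose the new PG log trim point.  We never trim past
// min_last_complete_ondisk or the log's can_rollback_to, we keep at least
// `target` entries (more while degraded/recovering/backfilling), and we
// skip trims smaller than osd_pg_log_trim_min to avoid churn.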
void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILL |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

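// do_request: common entry point for all PG work.  The ordering of the
// checks matters: first make sure we have a new enough OSDMap (preserving
// per-source ordering while we wait), then drop discardable requests, then
// apply pg-wide backoffs, and only hand the op to the backend / do_op once
// the PG is peered and not mid-flush.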
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE();
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();   // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  hobject_t head = m->get_hobj();
  head.snap = CEPH_NOSNAP;

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  SessionRef session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }
    session->put();  // get_priv() takes a ref, and so does the intrusive_ptr

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (op->rmw_flags == 0) {
    int r = osd->osd->init_op_flags(op);
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_replica())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

1921 // object name too long?
1922 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1923 dout(4) << "do_op name is longer than "
1924 << cct->_conf->osd_max_object_name_len
1925 << " bytes" << dendl;
1926 osd->reply_op_error(op, -ENAMETOOLONG);
1927 return;
1928 }
1929 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1930 dout(4) << "do_op locator is longer than "
1931 << cct->_conf->osd_max_object_name_len
1932 << " bytes" << dendl;
1933 osd->reply_op_error(op, -ENAMETOOLONG);
1934 return;
1935 }
1936 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1937 dout(4) << "do_op namespace is longer than "
1938 << cct->_conf->osd_max_object_namespace_len
1939 << " bytes" << dendl;
1940 osd->reply_op_error(op, -ENAMETOOLONG);
1941 return;
1942 }
1943
1944 if (int r = osd->store->validate_hobject_key(head)) {
1945 dout(4) << "do_op object " << head << " invalid for backing store: "
1946 << r << dendl;
1947 osd->reply_op_error(op, r);
1948 return;
1949 }
1950
1951 // blacklisted?
1952 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1953 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1954 osd->reply_op_error(op, -EBLACKLISTED);
1955 return;
1956 }
1957
1958 // order this op as a write?
1959 bool write_ordered = op->rwordered();
1960
1961 // discard due to cluster full transition? (we discard any op that
1962 // originates before the cluster or pool is marked full; the client
1963 // will resend after the full flag is removed or if they expect the
1964 // op to succeed despite being full). The exceptions are FULL_FORCE and
1965 // FULL_TRY ops, which there is no reason to discard because they
1966 // bypass all full checks anyway. If this op isn't write or
1967 // read-ordered, we skip.
1968 // FIXME: we exclude mds writes for now.
1969 if (write_ordered && !(m->get_source().is_mds() ||
1970 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1971 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1972 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1973 dout(10) << __func__ << " discarding op sent before full " << m << " "
1974 << *m << dendl;
1975 return;
1976 }
1977 // mds should have stopped writing before this point.
1978 // We can't allow OSD to become non-startable even if mds
1979 // could be writing as part of file removals.
1980 ostringstream ss;
1981 if (write_ordered && osd->check_failsafe_full(ss)) {
1982 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1983 << ss.str()
1984 << dendl;
1985 return;
1986 }
1987 int64_t poolid = get_pgid().pool();
1988 if (op->may_write()) {
1989
1990 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1991 if (!pi) {
1992 return;
1993 }
1994
1995 // invalid?
1996 if (m->get_snapid() != CEPH_NOSNAP) {
1997 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1998 osd->reply_op_error(op, -EINVAL);
1999 return;
2000 }
2001
2002 // too big?
2003 if (cct->_conf->osd_max_write_size &&
2004 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2005 // journal can't hold commit!
2006 derr << "do_op msg data len " << m->get_data_len()
2007 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2008 << " on " << *m << dendl;
2009 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2010 return;
2011 }
2012 }
2013
2014 dout(10) << "do_op " << *m
2015 << (op->may_write() ? " may_write" : "")
2016 << (op->may_read() ? " may_read" : "")
2017 << (op->may_cache() ? " may_cache" : "")
2018 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2019 << " flags " << ceph_osd_flag_string(m->get_flags())
2020 << dendl;
2021
2022 // missing object?
2023 if (is_unreadable_object(head)) {
2024 if (!is_primary()) {
2025 osd->reply_op_error(op, -EAGAIN);
2026 return;
2027 }
2028 if (can_backoff &&
2029 (g_conf->osd_backoff_on_degraded ||
2030 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2031 add_backoff(session, head, head);
2032 maybe_kick_recovery(head);
2033 } else {
2034 wait_for_unreadable_object(head, op);
2035 }
2036 return;
2037 }
2038
2039 // degraded object?
2040 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2041 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2042 add_backoff(session, head, head);
2043 } else {
2044 wait_for_degraded_object(head, op);
2045 }
2046 return;
2047 }
2048
2049 if (write_ordered &&
2050 scrubber.write_blocked_by_scrub(head)) {
2051 dout(20) << __func__ << ": waiting for scrub" << dendl;
2052 waiting_for_scrub.push_back(op);
2053 op->mark_delayed("waiting for scrub");
2054 return;
2055 }
2056
2057 // blocked on snap?
2058 map<hobject_t, snapid_t>::iterator blocked_iter =
2059 objects_blocked_on_degraded_snap.find(head);
2060 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2061 hobject_t to_wait_on(head);
2062 to_wait_on.snap = blocked_iter->second;
2063 wait_for_degraded_object(to_wait_on, op);
2064 return;
2065 }
2066 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2067 objects_blocked_on_snap_promotion.find(head);
2068 if (write_ordered &&
2069 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2070 wait_for_blocked_object(
2071 blocked_snap_promote_iter->second->obs.oi.soid,
2072 op);
2073 return;
2074 }
2075 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2076 block_write_on_full_cache(head, op);
2077 return;
2078 }
2079
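// The snapdir object carries the SnapSet once the head is deleted, so a
// write may need to read or rewrite it; require it to be readable and
// not degraded, just like the head above.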
2080 // missing snapdir?
2081 hobject_t snapdir = head.get_snapdir();
2082
2083 if (is_unreadable_object(snapdir)) {
2084 wait_for_unreadable_object(snapdir, op);
2085 return;
2086 }
2087
2088 // degraded object?
2089 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2090 wait_for_degraded_object(snapdir, op);
2091 return;
2092 }
2093
2094 // dup/resent?
2095 if (op->may_write() || op->may_cache()) {
2096 // warning: we will get back *a* request for this reqid, but not
2097 // necessarily the most recent. this happens with flush and
2098 // promote ops, but we can't possibly have both in our log where
2099 // the original request is still not stable on disk, so for our
2100 // purposes here it doesn't matter which one we get.
2101 eversion_t version;
2102 version_t user_version;
2103 int return_code = 0;
2104 bool got = check_in_progress_op(
2105 m->get_reqid(), &version, &user_version, &return_code);
2106 if (got) {
2107 dout(3) << __func__ << " dup " << m->get_reqid()
2108 << " version " << version << dendl;
2109 if (already_complete(version)) {
2110 osd->reply_op_error(op, return_code, version, user_version);
2111 } else {
2112 dout(10) << " waiting for " << version << " to commit" << dendl;
2113 // always queue ondisk waiters, so that we can requeue if needed
2114 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2115 op->mark_delayed("waiting for ondisk");
2116 }
2117 return;
2118 }
2119 }
2120
2121 ObjectContextRef obc;
2122 bool can_create = op->may_write() || op->may_cache();
2123 hobject_t missing_oid;
2124 const hobject_t& oid = m->get_hobj();
2125
2126 // io blocked on obc?
2127 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2128 maybe_await_blocked_snapset(oid, op)) {
2129 return;
2130 }
2131
2132 int r = find_object_context(
2133 oid, &obc, can_create,
2134 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2135 &missing_oid);
2136
2137 if (r == -EAGAIN) {
2138 // If we're not the primary for this PG, we just return -EAGAIN.
2139 // Otherwise, we have to wait for the object.
2140 if (is_primary()) {
2141 // missing the specific snap we need; requeue and wait.
2142 assert(!op->may_write()); // only happens on a read/cache
2143 wait_for_unreadable_object(missing_oid, op);
2144 return;
2145 }
2146 } else if (r == 0) {
2147 if (is_unreadable_object(obc->obs.oi.soid)) {
2148 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2149 << " is unreadable, waiting" << dendl;
2150 wait_for_unreadable_object(obc->obs.oi.soid, op);
2151 return;
2152 }
2153
2154 // degraded object? (the check above was for head; this could be a clone)
2155 if (write_ordered &&
2156 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2157 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2158 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2159 << " is degraded, waiting" << dendl;
2160 wait_for_degraded_object(obc->obs.oi.soid, op);
2161 return;
2162 }
2163 }
2164
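// Cache tiering: record this access in the current HitSet so the
// tiering agent and the promotion checks below can gauge how hot the
// object is.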
2165 bool in_hit_set = false;
2166 if (hit_set) {
2167 if (obc.get()) {
2168 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2169 in_hit_set = true;
2170 } else {
2171 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2172 in_hit_set = true;
2173 }
2174 if (!op->hitset_inserted) {
2175 hit_set->insert(oid);
2176 op->hitset_inserted = true;
2177 if (hit_set->is_full() ||
2178 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2179 hit_set_persist();
2180 }
2181 }
2182 }
2183
2184 if (agent_state) {
2185 if (agent_choose_mode(false, op))
2186 return;
2187 }
2188
2189 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2190 if (maybe_handle_manifest(op,
2191 write_ordered,
2192 obc))
2193 return;
2194 }
2195
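// Give the cache tier a chance to proxy, redirect, promote, or block
// the op before normal processing; if it handles the op, we are done.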
2196 if (maybe_handle_cache(op,
2197 write_ordered,
2198 obc,
2199 r,
2200 missing_oid,
2201 false,
2202 in_hit_set))
2203 return;
2204
2205 if (r && (r != -ENOENT || !obc)) {
2206 // copy the reqids for copy get on ENOENT
2207 if (r == -ENOENT &&
2208 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2209 fill_in_copy_get_noent(op, oid, m->ops[0]);
2210 return;
2211 }
2212 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2213 if (op->may_write() &&
2214 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2215 record_write_error(op, oid, nullptr, r);
2216 } else {
2217 osd->reply_op_error(op, r);
2218 }
2219 return;
2220 }
2221
2222 // make sure locator is consistent
2223 object_locator_t oloc(obc->obs.oi.soid);
2224 if (m->get_object_locator() != oloc) {
2225 dout(10) << " provided locator " << m->get_object_locator()
2226 << " != object's " << obc->obs.oi.soid << dendl;
2227 osd->clog->warn() << "bad locator " << m->get_object_locator()
2228 << " on object " << oloc
2229 << " op " << *m;
2230 }
2231
2232 // io blocked on obc?
2233 if (obc->is_blocked() &&
2234 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2235 wait_for_blocked_object(obc->obs.oi.soid, op);
2236 return;
2237 }
2238
2239 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2240
2241 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2242 OSDOp& osd_op = *p;
2243
2244 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2245 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2246 m->get_snapid() != CEPH_SNAPDIR) {
2247 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2248 osd->reply_op_error(op, -EINVAL);
2249 return;
2250 }
2251 }
2252
2253 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2254
2255 if (!obc->obs.exists)
2256 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2257
2258 /* Due to obc caching, we might have a cached non-existent snapset_obc
2259 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2260 * do_op pipeline make decisions based on whether snapset_obc is
2261 * populated.
2262 */
2263 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2264 ctx->snapset_obc = ObjectContextRef();
2265
2266 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2267 dout(20) << __func__ << ": skipping rw locks" << dendl;
2268 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2269 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2270
2271 // verify there is in fact a flush in progress
2272 // FIXME: we could make this a stronger test.
2273 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2274 if (p == flush_ops.end()) {
2275 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2276 reply_ctx(ctx, -EINVAL);
2277 return;
2278 }
2279 } else if (!get_rw_locks(write_ordered, ctx)) {
2280 dout(20) << __func__ << " waiting for rw locks " << dendl;
2281 op->mark_delayed("waiting for rw locks");
2282 close_op_ctx(ctx);
2283 return;
2284 }
2285 dout(20) << __func__ << " obc " << *obc << dendl;
2286
2287 if (r) {
2288 dout(20) << __func__ << " returned an error: " << r << dendl;
2289 close_op_ctx(ctx);
2290 if (op->may_write() &&
2291 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2292 record_write_error(op, oid, nullptr, r);
2293 } else {
2294 osd->reply_op_error(op, r);
2295 }
2296 return;
2297 }
2298
2299 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2300 ctx->ignore_cache = true;
2301 }
2302
2303 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2304 // This object is lost. Reading from it returns an error.
2305 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2306 << " is lost" << dendl;
2307 reply_ctx(ctx, -ENFILE);
2308 return;
2309 }
2310 if (!op->may_write() &&
2311 !op->may_cache() &&
2312 (!obc->obs.exists ||
2313 ((m->get_snapid() != CEPH_SNAPDIR) &&
2314 obc->obs.oi.is_whiteout()))) {
2315 // copy the reqids for copy get on ENOENT
2316 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2317 fill_in_copy_get_noent(op, oid, m->ops[0]);
2318 close_op_ctx(ctx);
2319 return;
2320 }
2321 reply_ctx(ctx, -ENOENT);
2322 return;
2323 }
2324
2325 op->mark_started();
2326
2327 execute_ctx(ctx);
2328 utime_t prepare_latency = ceph_clock_now();
2329 prepare_latency -= op->get_dequeued_time();
2330 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2331 if (op->may_read() && op->may_write()) {
2332 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2333 } else if (op->may_read()) {
2334 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2335 } else if (op->may_write() || op->may_cache()) {
2336 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2337 }
2338
2339 // force recovery of the oldest missing object if too many logs
2340 maybe_force_recovery();
2341}
2342PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2343 OpRequestRef op,
2344 bool write_ordered,
2345 ObjectContextRef obc)
2346{
2347 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2348 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2349 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2350 return cache_result_t::NOOP;
2351 }
2352
2353 if (obc)
2354 dout(10) << __func__ << " " << obc->obs.oi << " "
2355 << (obc->obs.exists ? "exists" : "DNE")
2356 << dendl;
2357
2358 // if it is write-ordered and blocked, stop now
2359 if (obc.get() && obc->is_blocked() && write_ordered) {
2360 // we're already doing something with this object
2361 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2362 return cache_result_t::NOOP;
2363 }
2364
2365 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2366 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2367 OSDOp& osd_op = *p;
2368 ceph_osd_op& op = osd_op.op;
2369 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2370 return cache_result_t::NOOP;
2371 }
2372 }
2373
2374 switch (obc->obs.oi.manifest.type) {
2375 case object_manifest_t::TYPE_REDIRECT:
2376 if (op->may_write() || write_ordered) {
2377 do_proxy_write(op, obc->obs.oi.soid, obc);
2378 } else {
2379 do_proxy_read(op, obc);
2380 }
2381 return cache_result_t::HANDLED_PROXY;
2382 case object_manifest_t::TYPE_CHUNKED:
2383 default:
2384 assert(0 == "unrecognized manifest type");
2385 }
2386
2387 return cache_result_t::NOOP;
2388}
2389
2390void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2391 MOSDOpReply *orig_reply, int r)
2392{
2393 dout(20) << __func__ << " r=" << r << dendl;
2394 assert(op->may_write());
2395 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2396 ObjectContextRef obc;
2397 mempool::osd_pglog::list<pg_log_entry_t> entries;
2398 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2399 get_next_version(), eversion_t(), 0,
2400 reqid, utime_t(), r));
2401
2402 struct OnComplete {
2403 PrimaryLogPG *pg;
2404 OpRequestRef op;
2405 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2406 int r;
2407 OnComplete(
2408 PrimaryLogPG *pg,
2409 OpRequestRef op,
2410 MOSDOpReply *orig_reply,
2411 int r)
2412 : pg(pg), op(op),
2413 orig_reply(orig_reply, false /* take over ref */), r(r)
2414 {}
2415 void operator()() {
2416 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2417 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2418 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2419 MOSDOpReply *reply = orig_reply.detach();
2420 if (reply == nullptr) {
2421 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2422 flags, true);
2423 }
2424 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2425 pg->osd->send_message_osd_client(reply, m->get_connection());
2426 }
2427 };
2428
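// Persist an error entry in the pg log so that a resent op with the
// same reqid is detected as a dup and receives the same error reply.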
2429 ObcLockManager lock_manager;
2430 submit_log_entries(
2431 entries,
2432 std::move(lock_manager),
2433 boost::optional<std::function<void(void)> >(
2434 OnComplete(this, op, orig_reply, r)),
2435 op,
2436 r);
2437}
2438
2439PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2440 OpRequestRef op,
2441 bool write_ordered,
2442 ObjectContextRef obc,
2443 int r, hobject_t missing_oid,
2444 bool must_promote,
2445 bool in_hit_set,
2446 ObjectContextRef *promote_obc)
2447{
2448 if (op &&
2449 op->get_req() &&
2450 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2451 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2452 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2453 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2454 return cache_result_t::NOOP;
2455 }
2456 // return quickly if caching is not enabled
2457 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2458 return cache_result_t::NOOP;
2459
2460 must_promote = must_promote || op->need_promote();
2461
2462 if (obc)
2463 dout(25) << __func__ << " " << obc->obs.oi << " "
2464 << (obc->obs.exists ? "exists" : "DNE")
2465 << " missing_oid " << missing_oid
2466 << " must_promote " << (int)must_promote
2467 << " in_hit_set " << (int)in_hit_set
2468 << dendl;
2469 else
2470 dout(25) << __func__ << " (no obc)"
2471 << " missing_oid " << missing_oid
2472 << " must_promote " << (int)must_promote
2473 << " in_hit_set " << (int)in_hit_set
2474 << dendl;
2475
2476 // if it is write-ordered and blocked, stop now
2477 if (obc.get() && obc->is_blocked() && write_ordered) {
2478 // we're already doing something with this object
2479 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2480 return cache_result_t::NOOP;
2481 }
2482
2483 if (r == -ENOENT && missing_oid == hobject_t()) {
2484 // we know this object is logically absent (e.g., an undefined clone)
2485 return cache_result_t::NOOP;
2486 }
2487
2488 if (obc.get() && obc->obs.exists) {
2489 osd->logger->inc(l_osd_op_cache_hit);
2490 return cache_result_t::NOOP;
2491 }
2492
2493 if (missing_oid == hobject_t() && obc.get()) {
2494 missing_oid = obc->obs.oi.soid;
2495 }
2496
2497 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2498 const object_locator_t oloc = m->get_object_locator();
2499
2500 if (op->need_skip_handle_cache()) {
2501 return cache_result_t::NOOP;
2502 }
2503
2504 // older versions do not proxy the feature bits.
2505 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2506 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2507 OpRequestRef promote_op;
2508
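// Roughly, the cache modes handled below behave as follows:
// WRITEBACK absorbs writes, proxying or promoting as needed;
// FORWARD and READFORWARD redirect some or all ops to the base pool;
// READONLY promotes for reads and redirects writes;
// PROXY and READPROXY proxy ops, promoting only when forced.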
2509 switch (pool.info.cache_mode) {
2510 case pg_pool_t::CACHEMODE_WRITEBACK:
2511 if (agent_state &&
2512 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2513 if (!op->may_write() && !op->may_cache() &&
2514 !write_ordered && !must_promote) {
2515 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2516 do_proxy_read(op);
2517 return cache_result_t::HANDLED_PROXY;
2518 }
2519 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2520 block_write_on_full_cache(missing_oid, op);
2521 return cache_result_t::BLOCKED_FULL;
2522 }
2523
2524 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2525 promote_object(obc, missing_oid, oloc, op, promote_obc);
2526 return cache_result_t::BLOCKED_PROMOTE;
2527 }
2528
2529 if (op->may_write() || op->may_cache()) {
2530 if (can_proxy_write) {
2531 do_proxy_write(op, missing_oid);
2532 } else {
2533 // promote if can't proxy the write
2534 promote_object(obc, missing_oid, oloc, op, promote_obc);
2535 return cache_result_t::BLOCKED_PROMOTE;
2536 }
2537
2538 // Promote too?
2539 if (!op->need_skip_promote() &&
2540 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2541 pool.info.min_write_recency_for_promote,
2542 OpRequestRef(),
2543 promote_obc)) {
2544 return cache_result_t::BLOCKED_PROMOTE;
2545 }
2546 return cache_result_t::HANDLED_PROXY;
2547 } else {
2548 do_proxy_read(op);
2549
2550 // Avoid duplicate promotion
2551 if (obc.get() && obc->is_blocked()) {
2552 if (promote_obc)
2553 *promote_obc = obc;
2554 return cache_result_t::BLOCKED_PROMOTE;
2555 }
2556
2557 // Promote too?
2558 if (!op->need_skip_promote()) {
2559 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2560 pool.info.min_read_recency_for_promote,
2561 promote_op, promote_obc);
2562 }
2563
2564 return cache_result_t::HANDLED_PROXY;
2565 }
2566 assert(0 == "unreachable");
2567 return cache_result_t::NOOP;
2568
2569 case pg_pool_t::CACHEMODE_FORWARD:
2570 // FIXME: this mode allows requests to be reordered.
2571 do_cache_redirect(op);
2572 return cache_result_t::HANDLED_REDIRECT;
2573
2574 case pg_pool_t::CACHEMODE_READONLY:
2575 // TODO: clean this case up
2576 if (!obc.get() && r == -ENOENT) {
2577 // we don't have the object and op's a read
2578 promote_object(obc, missing_oid, oloc, op, promote_obc);
2579 return cache_result_t::BLOCKED_PROMOTE;
2580 }
2581 if (!r) { // it must be a write
2582 do_cache_redirect(op);
2583 return cache_result_t::HANDLED_REDIRECT;
2584 }
2585 // crap, there was a failure of some kind
2586 return cache_result_t::NOOP;
2587
2588 case pg_pool_t::CACHEMODE_READFORWARD:
2589 // Do writeback to the cache tier for writes
2590 if (op->may_write() || write_ordered || must_promote) {
2591 if (agent_state &&
2592 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2593 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2594 block_write_on_full_cache(missing_oid, op);
2595 return cache_result_t::BLOCKED_FULL;
2596 }
2597 promote_object(obc, missing_oid, oloc, op, promote_obc);
2598 return cache_result_t::BLOCKED_PROMOTE;
2599 }
2600
2601 // It is a read, so forward (redirect) it to the base tier.
2602 do_cache_redirect(op);
2603 return cache_result_t::HANDLED_REDIRECT;
2604
2605 case pg_pool_t::CACHEMODE_PROXY:
2606 if (!must_promote) {
2607 if (op->may_write() || op->may_cache() || write_ordered) {
2608 if (can_proxy_write) {
2609 do_proxy_write(op, missing_oid);
2610 return cache_result_t::HANDLED_PROXY;
2611 }
2612 } else {
2613 do_proxy_read(op);
2614 return cache_result_t::HANDLED_PROXY;
2615 }
2616 }
2617 // ugh, we're forced to promote.
2618 if (agent_state &&
2619 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2620 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2621 block_write_on_full_cache(missing_oid, op);
2622 return cache_result_t::BLOCKED_FULL;
2623 }
2624 promote_object(obc, missing_oid, oloc, op, promote_obc);
2625 return cache_result_t::BLOCKED_PROMOTE;
2626
2627 case pg_pool_t::CACHEMODE_READPROXY:
2628 // Do writeback to the cache tier for writes
2629 if (op->may_write() || write_ordered || must_promote) {
2630 if (agent_state &&
2631 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2632 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2633 block_write_on_full_cache(missing_oid, op);
2634 return cache_result_t::BLOCKED_FULL;
2635 }
2636 promote_object(obc, missing_oid, oloc, op, promote_obc);
2637 return cache_result_t::BLOCKED_PROMOTE;
2638 }
2639
2640 // It is a read, so proxy it to the base tier.
2641 do_proxy_read(op);
2642 return cache_result_t::HANDLED_PROXY;
2643
2644 default:
2645 assert(0 == "unrecognized cache_mode");
2646 }
2647 return cache_result_t::NOOP;
2648}
2649
2650bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2651 const hobject_t& missing_oid,
2652 const object_locator_t& oloc,
2653 bool in_hit_set,
2654 uint32_t recency,
2655 OpRequestRef promote_op,
2656 ObjectContextRef *promote_obc)
2657{
2658 dout(20) << __func__ << " missing_oid " << missing_oid
2659 << " in_hit_set " << in_hit_set << dendl;
2660
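// recency N means the object must appear in the current hit set plus
// the N-1 most recent archived hit sets to qualify for promotion;
// recency 0 promotes unconditionally (subject to the throttle below).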
2661 switch (recency) {
2662 case 0:
2663 break;
2664 case 1:
2665 // Check if in the current hit set
2666 if (in_hit_set) {
2667 break;
2668 } else {
2669 // not promoting
2670 return false;
2671 }
2672 break;
2673 default:
2674 {
2675 unsigned count = (int)in_hit_set;
2676 if (count) {
2677 // Check if in other hit sets
2678 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2679 for (map<time_t,HitSetRef>::reverse_iterator itor =
2680 agent_state->hit_set_map.rbegin();
2681 itor != agent_state->hit_set_map.rend();
2682 ++itor) {
2683 if (!itor->second->contains(oid)) {
2684 break;
2685 }
2686 ++count;
2687 if (count >= recency) {
2688 break;
2689 }
2690 }
2691 }
2692 if (count >= recency) {
2693 break;
2694 }
2695 return false; // not promoting
2696 }
2697 break;
2698 }
2699
2700 if (osd->promote_throttle()) {
2701 dout(10) << __func__ << " promote throttled" << dendl;
2702 return false;
2703 }
2704 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2705 return true;
2706}
2707
2708void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2709{
2710 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2711 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2712 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2713 get_osdmap()->get_epoch(), flags, false);
2714 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2715 reply->set_redirect(redir);
2716 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2717 << op << dendl;
2718 m->get_connection()->send_message(reply);
2719 return;
2720}
2721
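// Objecter completion for a proxied read. It retakes the PG lock and
// delivers the result only if the op wasn't canceled and no peering
// reset has occurred since the proxy read was issued.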
2722struct C_ProxyRead : public Context {
2723 PrimaryLogPGRef pg;
2724 hobject_t oid;
2725 epoch_t last_peering_reset;
2726 ceph_tid_t tid;
2727 PrimaryLogPG::ProxyReadOpRef prdop;
2728 utime_t start;
2729 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2730 const PrimaryLogPG::ProxyReadOpRef& prd)
2731 : pg(p), oid(o), last_peering_reset(lpr),
2732 tid(0), prdop(prd), start(ceph_clock_now())
2733 {}
2734 void finish(int r) override {
2735 if (prdop->canceled)
2736 return;
2737 pg->lock();
2738 if (prdop->canceled) {
2739 pg->unlock();
2740 return;
2741 }
2742 if (last_peering_reset == pg->get_last_peering_reset()) {
2743 pg->finish_proxy_read(oid, tid, r);
2744 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2745 }
2746 pg->unlock();
2747 }
2748};
2749
2750 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2751{
2752 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2753 // stash the result in the request's OSDOp vector
2754 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2755 object_locator_t oloc;
2756 hobject_t soid;
2757 /* extensible tier */
2758 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2759 switch (obc->obs.oi.manifest.type) {
2760 case object_manifest_t::TYPE_REDIRECT:
2761 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2762 soid = obc->obs.oi.manifest.redirect_target;
2763 break;
2764 case object_manifest_t::TYPE_CHUNKED:
2765 default:
2766 assert(0 == "unrecognized manifest type");
2767 }
2768 } else {
2769 /* proxy */
2770 soid = m->get_hobj();
2771 oloc = object_locator_t(m->get_object_locator());
2772 oloc.pool = pool.info.tier_of;
2773 }
2774 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2775
2776 // pass through some original flags that make sense.
2777 // - leave out redirection and balancing flags since we are
2778 // already proxying through the primary
2779 // - leave off read/write/exec flags that are derived from the op
2780 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2781 CEPH_OSD_FLAG_ORDERSNAP |
2782 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2783 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2784
2785 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2786
2787 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2788
2789 ObjectOperation obj_op;
2790 obj_op.dup(prdop->ops);
2791
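// In writeback mode (while the agent isn't in full-eviction), a promote
// is likely to follow this read, so hint the base tier to keep the data:
// add FADVISE_SEQUENTIAL and strip DONTNEED/NOCACHE from the read ops.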
2792 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2793 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2794 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2795 ceph_osd_op op = obj_op.ops[i].op;
2796 switch (op.op) {
2797 case CEPH_OSD_OP_READ:
2798 case CEPH_OSD_OP_SYNC_READ:
2799 case CEPH_OSD_OP_SPARSE_READ:
2800 case CEPH_OSD_OP_CHECKSUM:
2801 case CEPH_OSD_OP_CMPEXT:
2802 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2803 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2804 }
2805 }
2806 }
2807
2808 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2809 prdop);
2810 ceph_tid_t tid = osd->objecter->read(
2811 soid.oid, oloc, obj_op,
2812 m->get_snapid(), NULL,
2813 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2814 &prdop->user_version,
2815 &prdop->data_offset,
2816 m->get_features());
2817 fin->tid = tid;
2818 prdop->objecter_tid = tid;
2819 proxyread_ops[tid] = prdop;
2820 in_progress_proxy_ops[soid].push_back(op);
2821}
2822
2823void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2824{
2825 dout(10) << __func__ << " " << oid << " tid " << tid
2826 << " " << cpp_strerror(r) << dendl;
2827
2828 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2829 if (p == proxyread_ops.end()) {
2830 dout(10) << __func__ << " no proxyread_op found" << dendl;
2831 return;
2832 }
2833 ProxyReadOpRef prdop = p->second;
2834 if (tid != prdop->objecter_tid) {
2835 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2836 << " tid " << prdop->objecter_tid << dendl;
2837 return;
2838 }
2839 if (oid != prdop->soid) {
2840 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2841 << " soid " << prdop->soid << dendl;
2842 return;
2843 }
2844 proxyread_ops.erase(tid);
2845
2846 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2847 if (q == in_progress_proxy_ops.end()) {
2848 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2849 return;
2850 }
2851 assert(q->second.size());
2852 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2853 q->second.end(),
2854 prdop->op);
2855 assert(it != q->second.end());
2856 OpRequestRef op = *it;
2857 q->second.erase(it);
2858 if (q->second.size() == 0) {
2859 in_progress_proxy_ops.erase(oid);
2860 }
2861
2862 osd->logger->inc(l_osd_tier_proxy_read);
2863
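// Build a reply context around the proxied results and complete the op
// as if the read had been served locally.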
2864 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2865 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2866 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2867 ctx->user_at_version = prdop->user_version;
2868 ctx->data_off = prdop->data_offset;
2869 ctx->ignore_log_op_stats = true;
2870 complete_read_ctx(r, ctx);
2871}
2872
2873void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2874{
2875 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2876 if (p == in_progress_proxy_ops.end())
2877 return;
2878
2879 list<OpRequestRef>& ls = p->second;
2880 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2881 requeue_ops(ls);
2882 in_progress_proxy_ops.erase(p);
2883}
2884
2885void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2886{
2887 dout(10) << __func__ << " " << prdop->soid << dendl;
2888 prdop->canceled = true;
2889
2890 // cancel objecter op, if we can
2891 if (prdop->objecter_tid) {
2892 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2893 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2894 prdop->ops[i].outdata.clear();
2895 }
2896 proxyread_ops.erase(prdop->objecter_tid);
2897 prdop->objecter_tid = 0;
2898 }
2899}
2900
2901void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2902{
2903 dout(10) << __func__ << dendl;
2904
2905 // cancel proxy reads
2906 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2907 while (p != proxyread_ops.end()) {
2908 cancel_proxy_read((p++)->second);
2909 }
2910
2911 // cancel proxy writes
2912 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2913 while (q != proxywrite_ops.end()) {
2914 cancel_proxy_write((q++)->second);
2915 }
2916
2917 if (requeue) {
2918 map<hobject_t, list<OpRequestRef>>::iterator p =
2919 in_progress_proxy_ops.begin();
2920 while (p != in_progress_proxy_ops.end()) {
2921 list<OpRequestRef>& ls = p->second;
2922 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2923 << " requests" << dendl;
2924 requeue_ops(ls);
2925 in_progress_proxy_ops.erase(p++);
2926 }
2927 } else {
2928 in_progress_proxy_ops.clear();
2929 }
2930}
2931
2932struct C_ProxyWrite_Commit : public Context {
2933 PrimaryLogPGRef pg;
2934 hobject_t oid;
2935 epoch_t last_peering_reset;
2936 ceph_tid_t tid;
2937 PrimaryLogPG::ProxyWriteOpRef pwop;
2938 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2939 const PrimaryLogPG::ProxyWriteOpRef& pw)
2940 : pg(p), oid(o), last_peering_reset(lpr),
2941 tid(0), pwop(pw)
2942 {}
2943 void finish(int r) override {
2944 if (pwop->canceled)
2945 return;
2946 pg->lock();
2947 if (pwop->canceled) {
2948 pg->unlock();
2949 return;
2950 }
2951 if (last_peering_reset == pg->get_last_peering_reset()) {
2952 pg->finish_proxy_write(oid, tid, r);
2953 }
2954 pg->unlock();
2955 }
2956};
2957
2958 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2959{
2960 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2961 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2962 object_locator_t oloc;
2963 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2964 hobject_t soid;
2965 /* extensible tier */
2966 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2967 switch (obc->obs.oi.manifest.type) {
2968 case object_manifest_t::TYPE_REDIRECT:
2969 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2970 soid = obc->obs.oi.manifest.redirect_target;
2971 break;
2972 case object_manifest_t::TYPE_CHUNKED:
2973 default:
2974 assert(0 == "unrecognized manifest type");
2975 }
2976 } else {
2977 /* proxy */
2978 soid = m->get_hobj();
2979 oloc = object_locator_t(m->get_object_locator());
2980 oloc.pool = pool.info.tier_of;
2981 }
2982
2983 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2984 if (!(op->may_write() || op->may_cache())) {
2985 flags |= CEPH_OSD_FLAG_RWORDERED;
2986 }
2987 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2988
2989 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2990 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
2991 pwop->mtime = m->get_mtime();
2992
2993 ObjectOperation obj_op;
2994 obj_op.dup(pwop->ops);
2995
2996 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2997 this, soid, get_last_peering_reset(), pwop);
2998 ceph_tid_t tid = osd->objecter->mutate(
2999 soid.oid, oloc, obj_op, snapc,
3000 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3001 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3002 &pwop->user_version, pwop->reqid);
3003 fin->tid = tid;
3004 pwop->objecter_tid = tid;
3005 proxywrite_ops[tid] = pwop;
3006 in_progress_proxy_ops[soid].push_back(op);
3007}
3008
3009void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3010{
3011 dout(10) << __func__ << " " << oid << " tid " << tid
3012 << " " << cpp_strerror(r) << dendl;
3013
3014 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3015 if (p == proxywrite_ops.end()) {
3016 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3017 return;
3018 }
3019 ProxyWriteOpRef pwop = p->second;
3020 assert(tid == pwop->objecter_tid);
3021 assert(oid == pwop->soid);
3022
3023 proxywrite_ops.erase(tid);
3024
3025 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3026 if (q == in_progress_proxy_ops.end()) {
3027 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3028 delete pwop->ctx;
3029 pwop->ctx = NULL;
3030 return;
3031 }
3032 list<OpRequestRef>& in_progress_op = q->second;
3033 assert(in_progress_op.size());
3034 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3035 in_progress_op.end(),
3036 pwop->op);
3037 assert(it != in_progress_op.end());
3038 in_progress_op.erase(it);
3039 if (in_progress_op.size() == 0) {
3040 in_progress_proxy_ops.erase(oid);
3041 }
3042
3043 osd->logger->inc(l_osd_tier_proxy_write);
3044
3045 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3046 assert(m != NULL);
3047
3048 if (!pwop->sent_reply) {
3049 // send commit.
3050 MOSDOpReply *reply = pwop->ctx->reply;
3051 if (reply)
3052 pwop->ctx->reply = NULL;
3053 else {
3054 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3055 reply->set_reply_versions(eversion_t(), pwop->user_version);
3056 }
3057 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3058 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3059 osd->send_message_osd_client(reply, m->get_connection());
3060 pwop->sent_reply = true;
3061 pwop->ctx->op->mark_commit_sent();
3062 }
3063
3064 delete pwop->ctx;
3065 pwop->ctx = NULL;
3066}
3067
3068void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3069{
3070 dout(10) << __func__ << " " << pwop->soid << dendl;
3071 pwop->canceled = true;
3072
3073 // cancel objecter op, if we can
3074 if (pwop->objecter_tid) {
3075 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3076 delete pwop->ctx;
3077 pwop->ctx = NULL;
3078 proxywrite_ops.erase(pwop->objecter_tid);
3079 pwop->objecter_tid = 0;
3080 }
3081}
3082
3083class PromoteCallback: public PrimaryLogPG::CopyCallback {
3084 ObjectContextRef obc;
3085 PrimaryLogPG *pg;
3086 utime_t start;
3087public:
3088 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3089 : obc(obc_),
3090 pg(pg_),
3091 start(ceph_clock_now()) {}
3092
3093 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3094 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3095 int r = results.get<0>();
3096 pg->finish_promote(r, results_data, obc);
3097 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3098 }
3099};
3100
3101void PrimaryLogPG::promote_object(ObjectContextRef obc,
3102 const hobject_t& missing_oid,
3103 const object_locator_t& oloc,
3104 OpRequestRef op,
3105 ObjectContextRef *promote_obc)
3106{
3107 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3108 assert(hoid != hobject_t());
3109 if (scrubber.write_blocked_by_scrub(hoid)) {
3110 dout(10) << __func__ << " " << hoid
3111 << " blocked by scrub" << dendl;
3112 if (op) {
3113 waiting_for_scrub.push_back(op);
3114 op->mark_delayed("waiting for scrub");
3115 dout(10) << __func__ << " " << hoid
3116 << " placing op in waiting_for_scrub" << dendl;
3117 } else {
3118 dout(10) << __func__ << " " << hoid
3119 << " no op, dropping on the floor" << dendl;
3120 }
3121 return;
3122 }
3123 if (!obc) { // we need to create an ObjectContext
3124 assert(missing_oid != hobject_t());
3125 obc = get_object_context(missing_oid, true);
3126 }
3127 if (promote_obc)
3128 *promote_obc = obc;
3129
3130 /*
3131 * If there are in-progress proxy reads for the object while the
3132 * promote completes, don't use DONTNEED: the data is about to be read.
3133 */
3134 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3135 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3136 if (q == in_progress_proxy_ops.end()) {
3137 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3138 }
3139
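// Copy the object in from the base tier (tier_of); the obc remains
// blocked until PromoteCallback fires and finish_promote completes.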
3140 PromoteCallback *cb = new PromoteCallback(obc, this);
3141 object_locator_t my_oloc = oloc;
3142 my_oloc.pool = pool.info.tier_of;
3143
3144 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3145 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3146 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3147 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3148 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3149 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3150 src_fadvise_flags, 0);
3151
3152 assert(obc->is_blocked());
3153
3154 if (op)
3155 wait_for_blocked_object(obc->obs.oi.soid, op);
3156 info.stats.stats.sum.num_promote++;
3157}
3158
3159void PrimaryLogPG::execute_ctx(OpContext *ctx)
3160{
3161 FUNCTRACE();
3162 dout(10) << __func__ << " " << ctx << dendl;
3163 ctx->reset_obs(ctx->obc);
3164 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3165 OpRequestRef op = ctx->op;
3166 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3167 ObjectContextRef obc = ctx->obc;
3168 const hobject_t& soid = obc->obs.oi.soid;
3169
3170 // this method must be idempotent since we may call it several times
3171 // before we finally apply the resulting transaction.
3172 ctx->op_t.reset(new PGTransaction);
3173
3174 if (op->may_write() || op->may_cache()) {
3175 // snap
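// With pool snaps, the OSD maintains one SnapContext for the whole
// pool, so use pool.snapc unless the client set ENFORCE_SNAPC to
// supply its own self-managed SnapContext.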
3176 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3177 pool.info.is_pool_snaps_mode()) {
3178 // use pool's snapc
3179 ctx->snapc = pool.snapc;
3180 } else {
3181 // client specified snapc
3182 ctx->snapc.seq = m->get_snap_seq();
3183 ctx->snapc.snaps = m->get_snaps();
3184 filter_snapc(ctx->snapc.snaps);
3185 }
3186 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3187 ctx->snapc.seq < obc->ssc->snapset.seq) {
3188 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3189 << " < snapset seq " << obc->ssc->snapset.seq
3190 << " on " << obc->obs.oi.soid << dendl;
3191 reply_ctx(ctx, -EOLDSNAPC);
3192 return;
3193 }
3194
3195 // version
3196 ctx->at_version = get_next_version();
3197 ctx->mtime = m->get_mtime();
3198
3199 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3200 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3201 << " snapc " << ctx->snapc
3202 << " snapset " << obc->ssc->snapset
3203 << dendl;
3204 } else {
3205 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3206 << " ov " << obc->obs.oi.version
3207 << dendl;
3208 }
3209
3210 if (!ctx->user_at_version)
3211 ctx->user_at_version = obc->obs.oi.user_version;
3212 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3213
3214 if (op->may_read()) {
3215 dout(10) << " taking ondisk_read_lock" << dendl;
3216 obc->ondisk_read_lock();
3217 }
3218
3219 {
3220#ifdef WITH_LTTNG
3221 osd_reqid_t reqid = ctx->op->get_reqid();
3222#endif
3223 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3224 reqid.name._num, reqid.tid, reqid.inc);
3225 }
3226
3227 int result = prepare_transaction(ctx);
3228
3229 {
3230#ifdef WITH_LTTNG
3231 osd_reqid_t reqid = ctx->op->get_reqid();
3232#endif
3233 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3234 reqid.name._num, reqid.tid, reqid.inc);
3235 }
3236
3237 if (op->may_read()) {
3238 dout(10) << " dropping ondisk_read_lock" << dendl;
3239 obc->ondisk_read_unlock();
3240 }
3241
3242 bool pending_async_reads = !ctx->pending_async_reads.empty();
3243 if (result == -EINPROGRESS || pending_async_reads) {
3244 // come back later.
3245 if (pending_async_reads) {
3246 in_progress_async_reads.push_back(make_pair(op, ctx));
3247 ctx->start_async_reads(this);
3248 }
3249 return;
3250 }
3251
3252 if (result == -EAGAIN) {
3253 // clean up after the ctx
3254 close_op_ctx(ctx);
3255 return;
3256 }
3257
3258 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3259 // prepare the reply
3260 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3261 successful_write);
3262
3263 // Write operations aren't allowed to return a data payload because
3264 // we can't do so reliably. If the client has to resend the request
3265 // and it has already been applied, we will return 0 with no
3266 // payload. Non-deterministic behavior is no good. However, it is
3267 // possible to construct an operation that does a read, does a guard
3268 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3269 // with the write, or return a CMPXATTR and the read value.
3270 if (successful_write) {
3271 // write. normalize the result code.
3272 dout(20) << " zeroing write result code " << result << dendl;
3273 result = 0;
3274 }
3275 ctx->reply->set_result(result);
3276
3277 // read or error?
3278 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3279 // finish side-effects
3280 if (result >= 0)
3281 do_osd_op_effects(ctx, m->get_connection());
3282
3283 complete_read_ctx(result, ctx);
3284 return;
3285 }
3286
3287 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3288
3289 assert(op->may_write() || op->may_cache());
3290
3291 // trim log?
3292 calc_trim_to();
3293
3294 // verify that we are doing this in order?
3295 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3296 !pool.info.is_tier() && !pool.info.has_tiers()) {
3297 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3298 ceph_tid_t t = m->get_tid();
3299 client_t n = m->get_source().num();
3300 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3301 if (p == cm.end()) {
3302 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3303 cm[n] = t;
3304 } else {
3305 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3306 if (p->second > t) {
3307 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3308 assert(0 == "out of order op");
3309 }
3310 p->second = t;
3311 }
3312 }
3313
3314 if (ctx->update_log_only) {
3315 if (result >= 0)
3316 do_osd_op_effects(ctx, m->get_connection());
3317
3318 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3319 // save just what we need from ctx
3320 MOSDOpReply *reply = ctx->reply;
3321 ctx->reply = nullptr;
3322 reply->claim_op_out_data(*ctx->ops);
3323 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3324 close_op_ctx(ctx);
3325
3326 if (result == -ENOENT) {
3327 reply->set_enoent_reply_versions(info.last_update,
3328 info.last_user_version);
3329 }
3330 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3331 // append to pg log for dup detection - don't save buffers for now
3332 record_write_error(op, soid, reply, result);
3333 return;
3334 }
3335
3336 // no need to capture PG ref, repop cancel will handle that
3337 // Can capture the ctx by pointer, it's owned by the repop
3338 ctx->register_on_commit(
3339 [m, ctx, this](){
3340 if (ctx->op)
3341 log_op_stats(
3342 ctx);
3343
3344 if (m && !ctx->sent_reply) {
3345 MOSDOpReply *reply = ctx->reply;
3346 if (reply)
3347 ctx->reply = nullptr;
3348 else {
3349 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3350 reply->set_reply_versions(ctx->at_version,
3351 ctx->user_at_version);
3352 }
3353 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3354 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3355 osd->send_message_osd_client(reply, m->get_connection());
3356 ctx->sent_reply = true;
3357 ctx->op->mark_commit_sent();
3358 }
3359 });
3360 ctx->register_on_success(
3361 [ctx, this]() {
3362 do_osd_op_effects(
3363 ctx,
3364 ctx->op ? ctx->op->get_req()->get_connection() :
3365 ConnectionRef());
3366 });
3367 ctx->register_on_finish(
3368 [ctx, this]() {
3369 delete ctx;
3370 });
3371
3372 // issue replica writes
3373 ceph_tid_t rep_tid = osd->get_tid();
3374
3375 RepGather *repop = new_repop(ctx, obc, rep_tid);
3376
3377 issue_repop(repop, ctx);
3378 eval_repop(repop);
3379 repop->put();
3380}
3381
3382void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3383 release_object_locks(ctx->lock_manager);
3384
3385 ctx->op_t.reset();
3386
3387 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3388 ctx->on_finish.erase(p++)) {
3389 (*p)();
3390 }
3391 delete ctx;
3392}
3393
3394void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3395{
3396 if (ctx->op)
3397 osd->reply_op_error(ctx->op, r);
3398 close_op_ctx(ctx);
3399}
3400
3401void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3402{
3403 if (ctx->op)
3404 osd->reply_op_error(ctx->op, r, v, uv);
3405 close_op_ctx(ctx);
3406}
3407
3408void PrimaryLogPG::log_op_stats(OpContext *ctx)
3409{
3410 OpRequestRef op = ctx->op;
3411 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3412
3413 utime_t now = ceph_clock_now();
3414 utime_t latency = now;
3415 latency -= ctx->op->get_req()->get_recv_stamp();
3416 utime_t process_latency = now;
3417 process_latency -= ctx->op->get_dequeued_time();
3418
3419 uint64_t inb = ctx->bytes_written;
3420 uint64_t outb = ctx->bytes_read;
3421
3422 osd->logger->inc(l_osd_op);
3423
3424 osd->logger->inc(l_osd_op_outb, outb);
3425 osd->logger->inc(l_osd_op_inb, inb);
3426 osd->logger->tinc(l_osd_op_lat, latency);
3427 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3428
3429 if (op->may_read() && op->may_write()) {
3430 osd->logger->inc(l_osd_op_rw);
3431 osd->logger->inc(l_osd_op_rw_inb, inb);
3432 osd->logger->inc(l_osd_op_rw_outb, outb);
3433 osd->logger->tinc(l_osd_op_rw_lat, latency);
3434 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3435 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3436 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3437 } else if (op->may_read()) {
3438 osd->logger->inc(l_osd_op_r);
3439 osd->logger->inc(l_osd_op_r_outb, outb);
3440 osd->logger->tinc(l_osd_op_r_lat, latency);
3441 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3442 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3443 } else if (op->may_write() || op->may_cache()) {
3444 osd->logger->inc(l_osd_op_w);
3445 osd->logger->inc(l_osd_op_w_inb, inb);
3446 osd->logger->tinc(l_osd_op_w_lat, latency);
3447 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3448 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3449 } else
3450 ceph_abort();
3451
3452 dout(15) << "log_op_stats " << *m
3453 << " inb " << inb
3454 << " outb " << outb
3455 << " lat " << latency << dendl;
3456}
3457
3458void PrimaryLogPG::do_sub_op(OpRequestRef op)
3459{
3460 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3461 assert(have_same_or_newer_map(m->map_epoch));
3462 assert(m->get_type() == MSG_OSD_SUBOP);
3463 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3464
3465 if (!is_peered()) {
3466 waiting_for_peered.push_back(op);
3467 op->mark_delayed("waiting for active");
3468 return;
3469 }
3470
3471 const OSDOp *first = NULL;
3472 if (m->ops.size() >= 1) {
3473 first = &m->ops[0];
3474 }
3475
3476 if (first) {
3477 switch (first->op.op) {
3478 case CEPH_OSD_OP_DELETE:
3479 sub_op_remove(op);
3480 return;
3481 case CEPH_OSD_OP_SCRUB_RESERVE:
3482 handle_scrub_reserve_request(op);
3483 return;
3484 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3485 handle_scrub_reserve_release(op);
3486 return;
3487 case CEPH_OSD_OP_SCRUB_MAP:
3488 sub_op_scrub_map(op);
3489 return;
3490 }
3491 }
3492}
3493
3494void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3495{
3496 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3497 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3498 if (r->ops.size() >= 1) {
3499 const OSDOp& first = r->ops[0];
3500 switch (first.op.op) {
3501 case CEPH_OSD_OP_SCRUB_RESERVE:
3502 {
3503 pg_shard_t from = r->from;
3504 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3505 bool reserved;
3506 ::decode(reserved, p);
3507 if (reserved) {
3508 handle_scrub_reserve_grant(op, from);
3509 } else {
3510 handle_scrub_reserve_reject(op, from);
3511 }
3512 }
3513 return;
3514 }
3515 }
3516}
3517
3518void PrimaryLogPG::do_scan(
3519 OpRequestRef op,
3520 ThreadPool::TPHandle &handle)
3521{
3522 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3523 assert(m->get_type() == MSG_OSD_PG_SCAN);
3524 dout(10) << "do_scan " << *m << dendl;
3525
3526 op->mark_started();
3527
3528 switch (m->op) {
3529 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3530 {
3531 ostringstream ss;
3532 if (osd->check_backfill_full(ss)) {
3533 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3534 queue_peering_event(
3535 CephPeeringEvtRef(
3536 std::make_shared<CephPeeringEvt>(
3537 get_osdmap()->get_epoch(),
3538 get_osdmap()->get_epoch(),
3539 BackfillTooFull())));
3540 return;
3541 }
3542
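// Scan a bounded chunk of the local collection starting at m->begin
// and send the resulting object digest back to the primary.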
3543 BackfillInterval bi;
3544 bi.begin = m->begin;
3545 // No need to flush; there won't be any in-progress writes occurring
3546 // past m->begin
3547 scan_range(
3548 cct->_conf->osd_backfill_scan_min,
3549 cct->_conf->osd_backfill_scan_max,
3550 &bi,
3551 handle);
3552 MOSDPGScan *reply = new MOSDPGScan(
3553 MOSDPGScan::OP_SCAN_DIGEST,
3554 pg_whoami,
3555 get_osdmap()->get_epoch(), m->query_epoch,
3556 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3557 ::encode(bi.objects, reply->get_data());
3558 osd->send_message_osd_cluster(reply, m->get_connection());
3559 }
3560 break;
3561
3562 case MOSDPGScan::OP_SCAN_DIGEST:
3563 {
3564 pg_shard_t from = m->from;
3565
3566 // Check that from is in backfill_targets vector
3567 assert(is_backfill_targets(from));
3568
3569 BackfillInterval& bi = peer_backfill_info[from];
3570 bi.begin = m->begin;
3571 bi.end = m->end;
3572 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3573
3574 // take care to preserve ordering!
3575 bi.clear_objects();
3576 ::decode_noclear(bi.objects, p);
3577
3578 if (waiting_on_backfill.erase(from)) {
3579 if (waiting_on_backfill.empty()) {
3580 assert(peer_backfill_info.size() == backfill_targets.size());
3581 finish_recovery_op(hobject_t::get_max());
3582 }
3583 } else {
3584 // we canceled backfill for a while due to a too-full condition, and
3585 // this is an extra response from a non-too-full peer
3586 }
3587 }
3588 break;
3589 }
3590}
3591
3592void PrimaryLogPG::do_backfill(OpRequestRef op)
3593{
3594 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3595 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3596 dout(10) << "do_backfill " << *m << dendl;
3597
3598 op->mark_started();
3599
3600 switch (m->op) {
3601 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3602 {
3603 assert(cct->_conf->osd_kill_backfill_at != 1);
3604
3605 MOSDPGBackfill *reply = new MOSDPGBackfill(
3606 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3607 get_osdmap()->get_epoch(),
3608 m->query_epoch,
3609 spg_t(info.pgid.pgid, get_primary().shard));
3610 reply->set_priority(get_recovery_op_priority());
3611 osd->send_message_osd_cluster(reply, m->get_connection());
3612 queue_peering_event(
3613 CephPeeringEvtRef(
3614 std::make_shared<CephPeeringEvt>(
3615 get_osdmap()->get_epoch(),
3616 get_osdmap()->get_epoch(),
3617 RecoveryDone())));
3618 }
3619 // fall-thru
3620
3621 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3622 {
3623 assert(cct->_conf->osd_kill_backfill_at != 2);
3624
3625 info.set_last_backfill(m->last_backfill);
3626 info.stats = m->stats;
3627
3628 ObjectStore::Transaction t;
3629 dirty_info = true;
3630 write_if_dirty(t);
3631 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3632 assert(tr == 0);
3633 }
3634 break;
3635
3636 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3637 {
3638 assert(is_primary());
3639 assert(cct->_conf->osd_kill_backfill_at != 3);
3640 finish_recovery_op(hobject_t::get_max());
3641 }
3642 break;
3643 }
3644}
3645
3646void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3647{
3648 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3649 op->get_req());
3650 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3651 dout(7) << __func__ << " " << m->ls << dendl;
3652
3653 op->mark_started();
3654
3655 ObjectStore::Transaction t;
3656 for (auto& p : m->ls) {
3657 remove_snap_mapped_object(t, p.first);
3658 }
3659 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3660 assert(r == 0);
3661}
3662
3663int PrimaryLogPG::trim_object(
3664 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3665{
3666 *ctxp = NULL;
3667 // load clone info
3668 bufferlist bl;
3669 ObjectContextRef obc = get_object_context(coid, false, NULL);
3670 if (!obc || !obc->ssc || !obc->ssc->exists) {
3671 osd->clog->error() << __func__ << ": Can not trim " << coid
3672 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3673 return -ENOENT;
3674 }
3675
3676 hobject_t snapoid(
3677 coid.oid, coid.get_key(),
3678 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3679 info.pgid.pool(), coid.get_namespace());
3680 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3681 if (!snapset_obc) {
3682 osd->clog->error() << __func__ << ": Can not trim " << coid
3683 << " repair needed, no snapset obc for " << snapoid;
3684 return -ENOENT;
3685 }
3686
3687 SnapSet& snapset = obc->ssc->snapset;
3688
3689 bool legacy = snapset.is_legacy() ||
3690 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3691
3692 object_info_t &coi = obc->obs.oi;
3693 set<snapid_t> old_snaps;
3694 if (legacy) {
3695 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3696 } else {
3697 auto p = snapset.clone_snaps.find(coid.snap);
3698 if (p == snapset.clone_snaps.end()) {
3699 osd->clog->error() << "No clone_snaps in snapset " << snapset
3700 << " for object " << coid << "\n";
3701 return -ENOENT;
3702 }
3703 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3704 snapset.clone_snaps[coid.snap].end());
3705 }
3706 if (old_snaps.empty()) {
3707 osd->clog->error() << "No object info snaps for object " << coid;
3708 return -ENOENT;
3709 }
3710
3711 dout(10) << coid << " old_snaps " << old_snaps
3712 << " old snapset " << snapset << dendl;
3713 if (snapset.seq == 0) {
3714 osd->clog->error() << "No snapset.seq for object " << coid;
3715 return -ENOENT;
3716 }
3717
3718 set<snapid_t> new_snaps;
3719 for (set<snapid_t>::iterator i = old_snaps.begin();
3720 i != old_snaps.end();
3721 ++i) {
3722 if (!pool.info.is_removed_snap(*i))
3723 new_snaps.insert(*i);
3724 }
3725
3726 vector<snapid_t>::iterator p = snapset.clones.end();
3727
3728 if (new_snaps.empty()) {
3729 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3730 if (p == snapset.clones.end()) {
3731 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3732 return -ENOENT;
3733 }
3734 }
3735
3736 OpContextUPtr ctx = simple_opc_create(obc);
3737 ctx->snapset_obc = snapset_obc;
3738
3739 if (!ctx->lock_manager.get_snaptrimmer_write(
3740 coid,
3741 obc,
3742 first)) {
3743 close_op_ctx(ctx.release());
3744 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3745 return -ENOLCK;
3746 }
3747
3748 if (!ctx->lock_manager.get_snaptrimmer_write(
3749 snapoid,
3750 snapset_obc,
3751 first)) {
3752 close_op_ctx(ctx.release());
3753 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3754 return -ENOLCK;
3755 }
3756
3757 ctx->at_version = get_next_version();
3758
3759 PGTransaction *t = ctx->op_t.get();
3760
3761 if (new_snaps.empty()) {
3762 // remove clone
3763 dout(10) << coid << " snaps " << old_snaps << " -> "
3764 << new_snaps << " ... deleting" << dendl;
3765
3766 // ...from snapset
3767 assert(p != snapset.clones.end());
3768
3769 snapid_t last = coid.snap;
3770 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3771
3772 if (p != snapset.clones.begin()) {
3773 // not the oldest... merge overlap into next older clone
3774 vector<snapid_t>::iterator n = p - 1;
3775 hobject_t prev_coid = coid;
3776 prev_coid.snap = *n;
3777 bool adjust_prev_bytes = is_present_clone(prev_coid);
3778
3779 if (adjust_prev_bytes)
3780 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3781
3782 snapset.clone_overlap[*n].intersection_of(
3783 snapset.clone_overlap[*p]);
3784
3785 if (adjust_prev_bytes)
3786 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3787 }
3788 ctx->delta_stats.num_objects--;
3789 if (coi.is_dirty())
3790 ctx->delta_stats.num_objects_dirty--;
3791 if (coi.is_omap())
3792 ctx->delta_stats.num_objects_omap--;
3793 if (coi.is_whiteout()) {
3794 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3795 ctx->delta_stats.num_whiteouts--;
3796 }
3797 ctx->delta_stats.num_object_clones--;
3798 if (coi.is_cache_pinned())
3799 ctx->delta_stats.num_objects_pinned--;
3800 obc->obs.exists = false;
3801
3802 snapset.clones.erase(p);
3803 snapset.clone_overlap.erase(last);
3804 snapset.clone_size.erase(last);
3805 snapset.clone_snaps.erase(last);
3806
3807 ctx->log.push_back(
3808 pg_log_entry_t(
3809 pg_log_entry_t::DELETE,
3810 coid,
3811 ctx->at_version,
3812 ctx->obs->oi.version,
3813 0,
3814 osd_reqid_t(),
3815 ctx->mtime,
3816 0)
3817 );
3818 t->remove(coid);
3819 t->update_snaps(
3820 coid,
3821 old_snaps,
3822 new_snaps);
3823
3824 coi = object_info_t(coid);
3825
3826 ctx->at_version.version++;
3827 } else {
3828 // save adjusted snaps for this object
3829 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3830 if (legacy) {
3831 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3832 } else {
3833 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3834 new_snaps.rend());
3835 // we still do a 'modify' event on this object just to trigger a
3836 // snapmapper.update ... :(
3837 }
3838
3839 coi.prior_version = coi.version;
3840 coi.version = ctx->at_version;
3841 bl.clear();
3842 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3843 t->setattr(coid, OI_ATTR, bl);
3844
3845 ctx->log.push_back(
3846 pg_log_entry_t(
3847 pg_log_entry_t::MODIFY,
3848 coid,
3849 coi.version,
3850 coi.prior_version,
3851 0,
3852 osd_reqid_t(),
3853 ctx->mtime,
3854 0)
3855 );
3856 ctx->at_version.version++;
3857
3858 t->update_snaps(
3859 coid,
3860 old_snaps,
3861 new_snaps);
3862 }
3863
3864 // save head snapset
3865 dout(10) << coid << " new snapset " << snapset << " on "
3866 << snapset_obc->obs.oi << dendl;
3867 if (snapset.clones.empty() &&
3868 (!snapset.head_exists ||
3869 (snapset_obc->obs.oi.is_whiteout() &&
3870 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3871 !snapset_obc->obs.oi.is_cache_pinned()))) {
3872 // NOTE: this arguably constitutes minor interference with the
3873 // tiering agent if this is a cache tier since a snap trim event
3874 // is effectively evicting a whiteout we might otherwise want to
3875 // keep around.
3876 dout(10) << coid << " removing " << snapoid << dendl;
3877 ctx->log.push_back(
3878 pg_log_entry_t(
3879 pg_log_entry_t::DELETE,
3880 snapoid,
3881 ctx->at_version,
3882 ctx->snapset_obc->obs.oi.version,
3883 0,
3884 osd_reqid_t(),
3885 ctx->mtime,
3886 0)
3887 );
3888 if (snapoid.is_head()) {
3889 derr << "removing snap head" << dendl;
3890 object_info_t& oi = ctx->snapset_obc->obs.oi;
3891 ctx->delta_stats.num_objects--;
3892 if (oi.is_dirty()) {
3893 ctx->delta_stats.num_objects_dirty--;
3894 }
3895 if (oi.is_omap())
3896 ctx->delta_stats.num_objects_omap--;
3897 if (oi.is_whiteout()) {
3898 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3899 ctx->delta_stats.num_whiteouts--;
3900 }
3901 if (oi.is_cache_pinned()) {
3902 ctx->delta_stats.num_objects_pinned--;
3903 }
3904 }
3905 ctx->snapset_obc->obs.exists = false;
3906 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3907 t->remove(snapoid);
3908 } else {
3909 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3910 snapset.filter(pool.info);
3911 dout(10) << coid << " writing updated snapset on " << snapoid
3912 << ", snapset is " << snapset << dendl;
3913 ctx->log.push_back(
3914 pg_log_entry_t(
3915 pg_log_entry_t::MODIFY,
3916 snapoid,
3917 ctx->at_version,
3918 ctx->snapset_obc->obs.oi.version,
3919 0,
3920 osd_reqid_t(),
3921 ctx->mtime,
3922 0)
3923 );
3924
3925 ctx->snapset_obc->obs.oi.prior_version =
3926 ctx->snapset_obc->obs.oi.version;
3927 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3928
3929 map <string, bufferlist> attrs;
3930 bl.clear();
3931 ::encode(snapset, bl);
3932 attrs[SS_ATTR].claim(bl);
3933
3934 bl.clear();
3935 ::encode(ctx->snapset_obc->obs.oi, bl,
3936 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3937 attrs[OI_ATTR].claim(bl);
3938 t->setattrs(snapoid, attrs);
3939 }
3940
3941 *ctxp = std::move(ctx);
3942 return 0;
3943}
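// A worked example of the clone-removal branch above, assuming a
// hypothetical snapset with clones [1, 4, 7] and coid.snap == 4: once
// all of clone 4's snaps are gone, p points at 4 and n at 1, so
// clone_overlap[1] is intersected with clone_overlap[4] (only extents
// shared with the next older clone stay credited to it in the stats),
// then clone 4 is erased from clones/clone_overlap/clone_size/
// clone_snaps before the DELETE log entry and t->remove(coid).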
3944
3945void PrimaryLogPG::kick_snap_trim()
3946{
3947 assert(is_active());
3948 assert(is_primary());
3949 if (is_clean() && !snap_trimq.empty()) {
3950 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3951 snap_trimmer_machine.process_event(KickTrim());
3952 }
3953}
3954
3955void PrimaryLogPG::snap_trimmer_scrub_complete()
3956{
3957 if (is_primary() && is_active() && is_clean()) {
3958 assert(!snap_trimq.empty());
3959 snap_trimmer_machine.process_event(ScrubComplete());
3960 }
3961}
3962
3963void PrimaryLogPG::snap_trimmer(epoch_t queued)
3964{
3965 if (deleting || pg_has_reset_since(queued)) {
3966 return;
3967 }
3968
3969 assert(is_primary());
3970
3971 dout(10) << "snap_trimmer posting" << dendl;
3972 snap_trimmer_machine.process_event(DoSnapWork());
3973 dout(10) << "snap_trimmer complete" << dendl;
3974 return;
3975}
3976
3977int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3978{
3979 __u64 v2;
3980
3981 string v2s(xattr.c_str(), xattr.length());
3982 if (v2s.length())
3983 v2 = strtoull(v2s.c_str(), NULL, 10);
3984 else
3985 v2 = 0;
3986
3987 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3988
3989 switch (op) {
3990 case CEPH_OSD_CMPXATTR_OP_EQ:
3991 return (v1 == v2);
3992 case CEPH_OSD_CMPXATTR_OP_NE:
3993 return (v1 != v2);
3994 case CEPH_OSD_CMPXATTR_OP_GT:
3995 return (v1 > v2);
3996 case CEPH_OSD_CMPXATTR_OP_GTE:
3997 return (v1 >= v2);
3998 case CEPH_OSD_CMPXATTR_OP_LT:
3999 return (v1 < v2);
4000 case CEPH_OSD_CMPXATTR_OP_LTE:
4001 return (v1 <= v2);
4002 default:
4003 return -EINVAL;
4004 }
4005}
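// e.g. a U64 compare with op GTE and v1 == 5 against a stored xattr
// containing "10" decodes v2 == 10 and returns 0 (false), which the
// CMPXATTR handler below maps to -ECANCELED; an empty xattr value
// compares as v2 == 0.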
4006
4007int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4008{
4009 string v2s(xattr.c_str(), xattr.length());
4010
4011 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4012
4013 switch (op) {
4014 case CEPH_OSD_CMPXATTR_OP_EQ:
4015 return (v1s.compare(v2s) == 0);
4016 case CEPH_OSD_CMPXATTR_OP_NE:
4017 return (v1s.compare(v2s) != 0);
4018 case CEPH_OSD_CMPXATTR_OP_GT:
4019 return (v1s.compare(v2s) > 0);
4020 case CEPH_OSD_CMPXATTR_OP_GTE:
4021 return (v1s.compare(v2s) >= 0);
4022 case CEPH_OSD_CMPXATTR_OP_LT:
4023 return (v1s.compare(v2s) < 0);
4024 case CEPH_OSD_CMPXATTR_OP_LTE:
4025 return (v1s.compare(v2s) <= 0);
4026 default:
4027 return -EINVAL;
4028 }
4029}
4030
4031int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4032{
4033 ceph_osd_op& op = osd_op.op;
4034 vector<OSDOp> write_ops(1);
4035 OSDOp& write_op = write_ops[0];
4036 uint64_t write_length = op.writesame.length;
4037 int result = 0;
4038
4039 if (!write_length)
4040 return 0;
4041
4042 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4043 return -EINVAL;
4044
4045 if (op.writesame.data_length != osd_op.indata.length()) {
4046 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4047 return -EINVAL;
4048 }
4049
4050 while (write_length) {
4051 write_op.indata.append(osd_op.indata);
4052 write_length -= op.writesame.data_length;
4053 }
4054
4055 write_op.op.op = CEPH_OSD_OP_WRITE;
4056 write_op.op.extent.offset = op.writesame.offset;
4057 write_op.op.extent.length = op.writesame.length;
4058 result = do_osd_ops(ctx, write_ops);
4059 if (result < 0)
4060 derr << "do_writesame do_osd_ops failed " << result << dendl;
4061
4062 return result;
4063}
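// A worked WRITESAME example with hypothetical values offset == 0,
// length == 4096, data_length == 512: the 512-byte pattern in
// osd_op.indata is appended eight times into write_op.indata and then
// submitted as a single CEPH_OSD_OP_WRITE of 4096 bytes at offset 0.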
4064
4065// ========================================================================
4066// low level osd ops
4067
4068int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4069{
4070 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4071 bufferlist header, vals;
4072 int r = _get_tmap(ctx, &header, &vals);
4073 if (r < 0) {
4074 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4075 r = 0;
4076 return r;
4077 }
4078
4079 vector<OSDOp> ops(3);
4080
4081 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4082 ops[0].op.extent.offset = 0;
4083 ops[0].op.extent.length = 0;
4084
4085 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4086 ops[1].indata.claim(header);
4087
4088 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4089 ops[2].indata.claim(vals);
4090
4091 return do_osd_ops(ctx, ops);
4092}
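// In other words, the three ops above convert a TMAP object in place:
// the truncate to 0 drops the old inline-encoded blob, after which the
// decoded header and key/value map are re-stored as the object's omap
// header and omap entries within a single do_osd_ops() call.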
4093
4094int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4095 bufferlist& bl)
4096{
4097 // decode
4098 bufferlist header;
4099 map<string, bufferlist> m;
4100 if (bl.length()) {
4101 bufferlist::iterator p = bl.begin();
4102 ::decode(header, p);
4103 ::decode(m, p);
4104 assert(p.end());
4105 }
4106
4107 // do the update(s)
4108 while (!bp.end()) {
4109 __u8 op;
4110 string key;
4111 ::decode(op, bp);
4112
4113 switch (op) {
4114 case CEPH_OSD_TMAP_SET: // insert key
4115 {
4116 ::decode(key, bp);
4117 bufferlist data;
4118 ::decode(data, bp);
4119 m[key] = data;
4120 }
4121 break;
4122 case CEPH_OSD_TMAP_RM: // remove key
4123 ::decode(key, bp);
4124 if (!m.count(key)) {
4125 return -ENOENT;
4126 }
4127 m.erase(key);
4128 break;
4129 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4130 ::decode(key, bp);
4131 m.erase(key);
4132 break;
4133 case CEPH_OSD_TMAP_HDR: // update header
4134 {
4135 ::decode(header, bp);
4136 }
4137 break;
4138 default:
4139 return -EINVAL;
4140 }
4141 }
4142
4143 // reencode
4144 bufferlist obl;
4145 ::encode(header, obl);
4146 ::encode(m, obl);
4147
4148 // write it out
4149 vector<OSDOp> nops(1);
4150 OSDOp& newop = nops[0];
4151 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4152 newop.op.extent.offset = 0;
4153 newop.op.extent.length = obl.length();
4154 newop.indata = obl;
4155 do_osd_ops(ctx, nops);
4156 osd_op.outdata.claim(newop.outdata);
4157 return 0;
4158}
4159
4160int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4161{
4162 bufferlist::iterator orig_bp = bp;
4163 int result = 0;
4164 if (bp.end()) {
4165 dout(10) << "tmapup is a no-op" << dendl;
4166 } else {
4167 // read the whole object
4168 vector<OSDOp> nops(1);
4169 OSDOp& newop = nops[0];
4170 newop.op.op = CEPH_OSD_OP_READ;
4171 newop.op.extent.offset = 0;
4172 newop.op.extent.length = 0;
4173 result = do_osd_ops(ctx, nops);
4174
4175 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4176
4177 dout(30) << " starting is \n";
4178 newop.outdata.hexdump(*_dout);
4179 *_dout << dendl;
4180
4181 bufferlist::iterator ip = newop.outdata.begin();
4182 bufferlist obl;
4183
4184 dout(30) << "the update command is: \n";
4185 osd_op.indata.hexdump(*_dout);
4186 *_dout << dendl;
4187
4188 // header
4189 bufferlist header;
4190 __u32 nkeys = 0;
4191 if (newop.outdata.length()) {
4192 ::decode(header, ip);
4193 ::decode(nkeys, ip);
4194 }
4195 dout(10) << "tmapup header " << header.length() << dendl;
4196
4197 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4198 ++bp;
4199 ::decode(header, bp);
4200 dout(10) << "tmapup new header " << header.length() << dendl;
4201 }
4202
4203 ::encode(header, obl);
4204
4205 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4206
4207 // update keys
4208 bufferlist newkeydata;
4209 string nextkey, last_in_key;
4210 bufferlist nextval;
4211 bool have_next = false;
4212 if (!ip.end()) {
4213 have_next = true;
4214 ::decode(nextkey, ip);
4215 ::decode(nextval, ip);
4216 }
4217 while (!bp.end() && !result) {
4218 __u8 op;
4219 string key;
4220 try {
4221 ::decode(op, bp);
4222 ::decode(key, bp);
4223 }
4224 catch (buffer::error& e) {
4225 return -EINVAL;
4226 }
4227 if (key < last_in_key) {
4228 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4229 << "', falling back to an inefficient (unsorted) update" << dendl;
4230 bp = orig_bp;
4231 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4232 }
4233 last_in_key = key;
4234
4235 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4236
4237 // skip existing intervening keys
4238 bool key_exists = false;
4239 while (have_next && !key_exists) {
4240 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4241 if (nextkey > key)
4242 break;
4243 if (nextkey < key) {
4244 // copy untouched.
4245 ::encode(nextkey, newkeydata);
4246 ::encode(nextval, newkeydata);
4247 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4248 } else {
4249 // don't copy; discard old value. and stop.
4250 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4251 key_exists = true;
4252 nkeys--;
4253 }
4254 if (!ip.end()) {
4255 ::decode(nextkey, ip);
4256 ::decode(nextval, ip);
4257 } else {
4258 have_next = false;
4259 }
4260 }
4261
4262 if (op == CEPH_OSD_TMAP_SET) {
4263 bufferlist val;
4264 try {
4265 ::decode(val, bp);
4266 }
4267 catch (buffer::error& e) {
4268 return -EINVAL;
4269 }
4270 ::encode(key, newkeydata);
4271 ::encode(val, newkeydata);
4272 dout(20) << " set " << key << " " << val.length() << dendl;
4273 nkeys++;
4274 } else if (op == CEPH_OSD_TMAP_CREATE) {
4275 if (key_exists) {
4276 return -EEXIST;
4277 }
4278 bufferlist val;
4279 try {
4280 ::decode(val, bp);
4281 }
4282 catch (buffer::error& e) {
4283 return -EINVAL;
4284 }
4285 ::encode(key, newkeydata);
4286 ::encode(val, newkeydata);
4287 dout(20) << " create " << key << " " << val.length() << dendl;
4288 nkeys++;
4289 } else if (op == CEPH_OSD_TMAP_RM) {
4290 // do nothing.
4291 if (!key_exists) {
4292 return -ENOENT;
4293 }
4294 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4295 // do nothing
4296 } else {
4297 dout(10) << " invalid tmap op " << (int)op << dendl;
4298 return -EINVAL;
4299 }
4300 }
4301
4302 // copy remaining
4303 if (have_next) {
4304 ::encode(nextkey, newkeydata);
4305 ::encode(nextval, newkeydata);
4306 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4307 }
4308 if (!ip.end()) {
4309 bufferlist rest;
4310 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4311 dout(20) << " keep trailing " << rest.length()
4312 << " at " << newkeydata.length() << dendl;
4313 newkeydata.claim_append(rest);
4314 }
4315
4316 // encode final key count + key data
4317 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4318 ::encode(nkeys, obl);
4319 obl.claim_append(newkeydata);
4320
4321 if (0) {
4322 dout(30) << " final is \n";
4323 obl.hexdump(*_dout);
4324 *_dout << dendl;
4325
4326 // sanity check
4327 bufferlist::iterator tp = obl.begin();
4328 bufferlist h;
4329 ::decode(h, tp);
4330 map<string,bufferlist> d;
4331 ::decode(d, tp);
4332 assert(tp.end());
4333 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4334 }
4335
4336 // write it out
4337 if (!result) {
4338 dout(20) << "tmapput write " << obl.length() << dendl;
4339 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4340 newop.op.extent.offset = 0;
4341 newop.op.extent.length = obl.length();
4342 newop.indata = obl;
4343 do_osd_ops(ctx, nops);
4344 osd_op.outdata.claim(newop.outdata);
4345 }
4346 }
4347 return result;
4348}
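// A worked TMAP_UPDATE example, assuming hypothetical existing keys
// {a, c} and a sorted update stream [SET b, RM c]: 'a' is copied
// through untouched, 'b' is inserted (nkeys++), and 'c' is dropped
// (nkeys--), yielding {a, b}; an out-of-order stream such as
// [SET c, SET a] falls back to do_tmapup_slow(), which decodes the
// whole map and replays the ops.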
4349
4350static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4351{
4352 if (offset >= max ||
4353 length > max ||
4354 offset + length > max)
4355 return -EFBIG;
4356
4357 return 0;
4358}
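// For illustration, with a hypothetical max of 128 MB: a 50 MB write at
// offset 100 MB fails the offset + length check and returns -EFBIG, as
// does any offset at or past the cap or any length exceeding it.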
4359
4360struct FillInVerifyExtent : public Context {
4361 ceph_le64 *r;
4362 int32_t *rval;
4363 bufferlist *outdatap;
4364 boost::optional<uint32_t> maybe_crc;
4365 uint64_t size;
4366 OSDService *osd;
4367 hobject_t soid;
4368 __le32 flags;
4369 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4370 boost::optional<uint32_t> mc, uint64_t size,
4371 OSDService *osd, hobject_t soid, __le32 flags) :
4372 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4373 size(size), osd(osd), soid(soid), flags(flags) {}
4374 void finish(int len) override {
4375 *r = len;
4376 if (len < 0) {
4377 *rval = len;
4378 return;
4379 }
4380 *rval = 0;
4381
4382 // whole object? can we verify the checksum?
4383 if (maybe_crc && *r == size) {
4384 uint32_t crc = outdatap->crc32c(-1);
4385 if (maybe_crc != crc) {
4386 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4387 << " != expected 0x" << *maybe_crc
4388 << std::dec << " on " << soid;
4389 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4390 *rval = -EIO;
4391 *r = 0;
4392 }
4393 }
4394 }
4395 }
4396};
4397
4398struct ToSparseReadResult : public Context {
4399 int* result;
4400 bufferlist* data_bl;
4401 uint64_t data_offset;
4402 ceph_le64* len;
4403 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4404 ceph_le64* len)
4405 : result(result), data_bl(bl), data_offset(offset), len(len) {}
4406 void finish(int r) override {
4407 if (r < 0) {
4408 *result = r;
4409 return;
4410 }
4411 *result = 0;
4412 *len = r;
4413 bufferlist outdata;
4414 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4415 ::encode(extents, outdata);
4416 ::encode_destructively(*data_bl, outdata);
4417 data_bl->swap(outdata);
4418 }
4419};
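// e.g. an async read at data_offset == 0 completing with r == 4096
// rewrites data_bl into the SPARSE_READ wire format: an encoded
// single-extent map {0: 4096} followed by the 4096 data bytes.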
4420
4421template<typename V>
4422static string list_keys(const map<string, V>& m) {
4423 string s;
4424 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4425 if (!s.empty()) {
4426 s.push_back(',');
4427 }
4428 s.append(itr->first);
4429 }
4430 return s;
4431}
4432
4433template<typename T>
4434static string list_entries(const T& m) {
4435 string s;
4436 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4437 if (!s.empty()) {
4438 s.push_back(',');
4439 }
4440 s.append(*itr);
4441 }
4442 return s;
4443}
4444
4445void PrimaryLogPG::maybe_create_new_object(
4446 OpContext *ctx,
4447 bool ignore_transaction)
4448{
4449 ObjectState& obs = ctx->new_obs;
4450 if (!obs.exists) {
4451 ctx->delta_stats.num_objects++;
4452 obs.exists = true;
4453 assert(!obs.oi.is_whiteout());
4454 obs.oi.new_object();
4455 if (!ignore_transaction)
4456 ctx->op_t->create(obs.oi.soid);
4457 } else if (obs.oi.is_whiteout()) {
4458 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4459 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4460 --ctx->delta_stats.num_whiteouts;
4461 }
4462}
4463
4464struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4465 OSDOp& osd_op;
4466
4467 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4468 }
4469
4470 int execute() override {
4471 return osd_op.rval;
4472 }
4473};
4474
4475struct C_ChecksumRead : public Context {
4476 PrimaryLogPG *primary_log_pg;
4477 OSDOp &osd_op;
4478 Checksummer::CSumType csum_type;
4479 bufferlist init_value_bl;
4480 ceph_le64 read_length;
4481 bufferlist read_bl;
4482 Context *fill_extent_ctx;
4483
4484 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4485 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4486 boost::optional<uint32_t> maybe_crc, uint64_t size,
4487 OSDService *osd, hobject_t soid, __le32 flags)
4488 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4489 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4490 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4491 &read_bl, maybe_crc, size,
4492 osd, soid, flags)) {
4493 }
4494 ~C_ChecksumRead() override {
4495 delete fill_extent_ctx;
4496 }
4497
4498 void finish(int r) override {
4499 fill_extent_ctx->complete(r);
4500 fill_extent_ctx = nullptr;
4501
4502 if (osd_op.rval >= 0) {
4503 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4504 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4505 &init_value_bl_it, read_bl);
4506 }
4507 }
4508};
4509
4510int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4511 bufferlist::iterator *bl_it)
4512{
4513 dout(20) << __func__ << dendl;
4514
4515 auto& op = osd_op.op;
4516 if (op.checksum.chunk_size > 0) {
4517 if (op.checksum.length == 0) {
4518 dout(10) << __func__ << ": length required when chunk size provided"
4519 << dendl;
4520 return -EINVAL;
4521 }
4522 if (op.checksum.length % op.checksum.chunk_size != 0) {
4523 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4524 return -EINVAL;
4525 }
4526 }
4527
4528 auto& oi = ctx->new_obs.oi;
4529 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4530 // zeroed offset+length implies checksumming the whole object
4531 op.checksum.length = oi.size;
4532 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4533 return -EOVERFLOW;
4534 }
4535
4536 Checksummer::CSumType csum_type;
4537 switch (op.checksum.type) {
4538 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4539 csum_type = Checksummer::CSUM_XXHASH32;
4540 break;
4541 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4542 csum_type = Checksummer::CSUM_XXHASH64;
4543 break;
4544 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4545 csum_type = Checksummer::CSUM_CRC32C;
4546 break;
4547 default:
4548 dout(10) << __func__ << ": unknown crc type ("
4549 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4550 return -EINVAL;
4551 }
4552
4553 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4554 if (bl_it->get_remaining() < csum_init_value_size) {
4555 dout(10) << __func__ << ": init value not provided" << dendl;
4556 return -EINVAL;
4557 }
4558
4559 bufferlist init_value_bl;
4560 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4561 csum_init_value_size);
4562 bl_it->advance(csum_init_value_size);
4563
4564 if (pool.info.require_rollback() && op.checksum.length > 0) {
4565 // If there is a data digest and it is possible we are reading
4566 // entire object, pass the digest.
4567 boost::optional<uint32_t> maybe_crc;
4568 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4569 op.checksum.length >= oi.size) {
4570 maybe_crc = oi.data_digest;
4571 }
4572
4573 // async read
4574 auto& soid = oi.soid;
4575 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4576 std::move(init_value_bl), maybe_crc,
4577 oi.size, osd, soid, op.flags);
4578
4579 ctx->pending_async_reads.push_back({
4580 {op.checksum.offset, op.checksum.length, op.flags},
4581 {&checksum_ctx->read_bl, checksum_ctx}});
4582
4583 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4584 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4585 new ReadFinisher(osd_op));
4586 return -EINPROGRESS;
4587 }
4588
4589 // sync read
4590 std::vector<OSDOp> read_ops(1);
4591 auto& read_op = read_ops[0];
4592 if (op.checksum.length > 0) {
4593 read_op.op.op = CEPH_OSD_OP_READ;
4594 read_op.op.flags = op.flags;
4595 read_op.op.extent.offset = op.checksum.offset;
4596 read_op.op.extent.length = op.checksum.length;
4597 read_op.op.extent.truncate_size = 0;
4598 read_op.op.extent.truncate_seq = 0;
4599
4600 int r = do_osd_ops(ctx, read_ops);
4601 if (r < 0) {
4602 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4603 return r;
4604 }
4605 }
4606
4607 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4608 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4609 read_op.outdata);
4610}
4611
4612int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4613 Checksummer::CSumType csum_type,
4614 bufferlist::iterator *init_value_bl_it,
4615 const bufferlist &read_bl) {
4616 dout(20) << __func__ << dendl;
4617
4618 auto& op = osd_op.op;
4619
4620 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4621 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4622 << op.checksum.length << dendl;
4623 return -EINVAL;
4624 }
4625
4626 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4627 op.checksum.chunk_size : read_bl.length());
4628 uint32_t csum_count = (csum_chunk_size > 0 ?
4629 read_bl.length() / csum_chunk_size : 0);
4630
4631 bufferlist csum;
4632 bufferptr csum_data;
4633 if (csum_count > 0) {
4634 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4635 csum_data = buffer::create(csum_value_size * csum_count);
4636 csum_data.zero();
4637 csum.append(csum_data);
4638
4639 switch (csum_type) {
4640 case Checksummer::CSUM_XXHASH32:
4641 {
4642 Checksummer::xxhash32::init_value_t init_value;
4643 ::decode(init_value, *init_value_bl_it);
4644 Checksummer::calculate<Checksummer::xxhash32>(
4645 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4646 &csum_data);
4647 }
4648 break;
4649 case Checksummer::CSUM_XXHASH64:
4650 {
4651 Checksummer::xxhash64::init_value_t init_value;
4652 ::decode(init_value, *init_value_bl_it);
4653 Checksummer::calculate<Checksummer::xxhash64>(
4654 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4655 &csum_data);
4656 }
4657 break;
4658 case Checksummer::CSUM_CRC32C:
4659 {
4660 Checksummer::crc32c::init_value_t init_value;
4661 ::decode(init_value, *init_value_bl_it);
4662 Checksummer::calculate<Checksummer::crc32c>(
4663 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4664 &csum_data);
4665 }
4666 break;
4667 default:
4668 break;
4669 }
4670 }
4671
4672 ::encode(csum_count, osd_op.outdata);
4673 osd_op.outdata.claim_append(csum);
4674 return 0;
4675}
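// Example output layout, assuming length == 8192, chunk_size == 4096 and
// CSUM_CRC32C (4-byte values): csum_count == 2 is encoded first, then the
// two per-chunk CRCs are appended as csum_data, i.e. 4 + 8 bytes of
// outdata in total.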
4676
4677struct C_ExtentCmpRead : public Context {
4678 PrimaryLogPG *primary_log_pg;
4679 OSDOp &osd_op;
4680 ceph_le64 read_length;
4681 bufferlist read_bl;
4682 Context *fill_extent_ctx;
4683
4684 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4685 boost::optional<uint32_t> maybe_crc, uint64_t size,
4686 OSDService *osd, hobject_t soid, __le32 flags)
4687 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4688 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4689 &read_bl, maybe_crc, size,
4690 osd, soid, flags)) {
4691 }
4692 ~C_ExtentCmpRead() override {
4693 delete fill_extent_ctx;
4694 }
4695
4696 void finish(int r) override {
4697 if (r == -ENOENT) {
4698 osd_op.rval = 0;
4699 read_bl.clear();
4700 delete fill_extent_ctx;
4701 } else {
4702 fill_extent_ctx->complete(r);
4703 }
4704 fill_extent_ctx = nullptr;
4705
4706 if (osd_op.rval >= 0) {
4707 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4708 }
4709 }
4710};
4711
4712int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4713{
4714 dout(20) << __func__ << dendl;
4715 ceph_osd_op& op = osd_op.op;
4716
4717 if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4718 dout(20) << __func__ << " object DNE" << dendl;
4719 return finish_extent_cmp(osd_op, {});
4720 } else if (pool.info.require_rollback()) {
4721 // If there is a data digest and it is possible we are reading
4722 // entire object, pass the digest.
4723 auto& oi = ctx->new_obs.oi;
4724 boost::optional<uint32_t> maybe_crc;
4725 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4726 op.checksum.length >= oi.size) {
4727 maybe_crc = oi.data_digest;
4728 }
4729
4730 // async read
4731 auto& soid = oi.soid;
4732 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4733 osd, soid, op.flags);
4734 ctx->pending_async_reads.push_back({
4735 {op.extent.offset, op.extent.length, op.flags},
4736 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4737
4738 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4739
4740 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4741 new ReadFinisher(osd_op));
4742 return -EINPROGRESS;
4743 }
4744
4745 // sync read
4746 vector<OSDOp> read_ops(1);
4747 OSDOp& read_op = read_ops[0];
4748
4749 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4750 read_op.op.extent.offset = op.extent.offset;
4751 read_op.op.extent.length = op.extent.length;
4752 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4753 read_op.op.extent.truncate_size = op.extent.truncate_size;
4754
4755 int result = do_osd_ops(ctx, read_ops);
4756 if (result < 0) {
4757 derr << __func__ << " failed " << result << dendl;
4758 return result;
4759 }
4760 return finish_extent_cmp(osd_op, read_op.outdata);
4761}
4762
4763int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4764{
4765 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4766 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4767 if (osd_op.indata[idx] != read_byte) {
4768 return (-MAX_ERRNO - idx);
4769 }
4770 }
4771
4772 return 0;
4773}
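// e.g. if osd_op.indata is "abcd" and the read-back bytes are "abXd",
// the first mismatch is at idx == 2 and the op returns -MAX_ERRNO - 2,
// so a caller can recover the mismatch offset as (-rval - MAX_ERRNO);
// bytes past the end of the read data compare against zero.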
4774
4775int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4776 dout(20) << __func__ << dendl;
4777 auto& op = osd_op.op;
4778 auto& oi = ctx->new_obs.oi;
4779 auto& soid = oi.soid;
4780 __u32 seq = oi.truncate_seq;
4781 uint64_t size = oi.size;
4782 bool trimmed_read = false;
4783
4784 // are we beyond truncate_size?
4785 if ( (seq < op.extent.truncate_seq) &&
4786 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4787 size = op.extent.truncate_size;
4788
4789 if (op.extent.length == 0) // a length of zero means read the whole object
4790 op.extent.length = size;
4791
4792 if (op.extent.offset >= size) {
4793 op.extent.length = 0;
4794 trimmed_read = true;
4795 } else if (op.extent.offset + op.extent.length > size) {
4796 op.extent.length = size - op.extent.offset;
4797 trimmed_read = true;
4798 }
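// e.g. with oi.truncate_seq == 1 and an op carrying truncate_seq == 2,
// truncate_size == 100: a 0~4096 read of a 4096-byte object is clamped
// to 0~100 and trimmed_read is set.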
4799
4800 // read into a buffer
4801 int result = 0;
4802 if (trimmed_read && op.extent.length == 0) {
4803 // the read size was trimmed to zero, so nothing should be done here;
4804 // an explicit 0-byte read does *not* do nothing (it reads the whole
4805 // object), which is why the trimmed_read flag is needed
4806 } else if (pool.info.require_rollback()) {
4807 boost::optional<uint32_t> maybe_crc;
4808 // If there is a data digest and it is possible we are reading the
4809 // entire object, pass the digest. FillInVerifyExtent will check
4810 // the oi.size again.
4811 if (oi.is_data_digest() && op.extent.offset == 0 &&
4812 op.extent.length >= oi.size)
4813 maybe_crc = oi.data_digest;
4814 ctx->pending_async_reads.push_back(
4815 make_pair(
4816 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4817 make_pair(&osd_op.outdata,
4818 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4819 &osd_op.outdata, maybe_crc, oi.size,
4820 osd, soid, op.flags))));
4821 dout(10) << " async_read noted for " << soid << dendl;
4822
4823 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4824 new ReadFinisher(osd_op));
4825 } else {
4826 int r = pgbackend->objects_read_sync(
4827 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4828 if (r == -EIO) {
4829 r = rep_repair_primary_object(soid, ctx->op);
4830 }
4831 if (r >= 0)
4832 op.extent.length = r;
4833 else {
4834 result = r;
4835 op.extent.length = 0;
4836 }
4837 dout(10) << " read got " << r << " / " << op.extent.length
4838 << " bytes from obj " << soid << dendl;
4839
4840 // whole object? can we verify the checksum?
4841 if (op.extent.length == oi.size && oi.is_data_digest()) {
4842 uint32_t crc = osd_op.outdata.crc32c(-1);
4843 if (oi.data_digest != crc) {
4844 osd->clog->error() << info.pgid << std::hex
4845 << " full-object read crc 0x" << crc
4846 << " != expected 0x" << oi.data_digest
4847 << std::dec << " on " << soid;
4848 // FIXME fall back to replica or something?
4849 result = -EIO;
4850 }
4851 }
4852 }
4853
4854 // XXX the op.extent.length is the requested length for async read
4855 // On error this length is changed to 0 after the error comes back.
4856 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4857 ctx->delta_stats.num_rd++;
4858 return result;
4859}
4860
4861int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4862 dout(20) << __func__ << dendl;
4863 auto& op = osd_op.op;
4864 auto& oi = ctx->new_obs.oi;
4865 auto& soid = oi.soid;
4866
4867 if (op.extent.truncate_seq) {
4868 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4869 return -EINVAL;
4870 }
4871
4872 ++ctx->num_read;
4873 if (pool.info.ec_pool()) {
4874 // translate sparse read to a normal one if not supported
4875 uint64_t offset = op.extent.offset;
4876 uint64_t length = op.extent.length;
4877 if (offset > oi.size) {
4878 length = 0;
4879 } else if (offset + length > oi.size) {
4880 length = oi.size - offset;
4881 }
4882
4883 if (length > 0) {
4884 ctx->pending_async_reads.push_back(
4885 make_pair(
4886 boost::make_tuple(offset, length, op.flags),
4887 make_pair(
4888 &osd_op.outdata,
4889 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4890 &op.extent.length))));
4891 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4892
4893 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4894 new ReadFinisher(osd_op));
4895 } else {
4896 dout(10) << " sparse read ended up empty for " << soid << dendl;
4897 map<uint64_t, uint64_t> extents;
4898 ::encode(extents, osd_op.outdata);
4899 }
4900 } else {
4901 // read into a buffer
4902 map<uint64_t, uint64_t> m;
4903 uint32_t total_read = 0;
4904 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4905 info.pgid.shard),
4906 op.extent.offset, op.extent.length, m);
4907 if (r < 0) {
4908 return r;
4909 }
4910
4911 map<uint64_t, uint64_t>::iterator miter;
4912 bufferlist data_bl;
4913 uint64_t last = op.extent.offset;
4914 for (miter = m.begin(); miter != m.end(); ++miter) {
4915 // verify hole?
4916 if (cct->_conf->osd_verify_sparse_read_holes &&
4917 last < miter->first) {
4918 bufferlist t;
4919 uint64_t len = miter->first - last;
4920 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4921 if (r == -EIO) {
4922 r = rep_repair_primary_object(soid, ctx->op);
4923 }
4924 if (r < 0) {
4925 osd->clog->error() << coll << " " << soid
4926 << " sparse-read failed to read: "
4927 << r;
4928 } else if (!t.is_zero()) {
4929 osd->clog->error() << coll << " " << soid
4930 << " sparse-read found data in hole "
4931 << last << "~" << len;
4932 }
4933 }
4934
4935 bufferlist tmpbl;
4936 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4937 op.flags, &tmpbl);
4938 if (r < 0) {
4939 return r;
4940 }
4941
4942 // this usually happens when we get an extent that exceeds the actual
4943 // object size
4944 if (r < (int)miter->second)
4945 miter->second = r;
4946 total_read += r;
4947 dout(10) << "sparse-read " << miter->first << "@" << miter->second
4948 << dendl;
4949 data_bl.claim_append(tmpbl);
4950 last = miter->first + r;
4951 }
4952
4953 if (r < 0) {
4954 return r;
4955 }
4956
4957 // verify trailing hole?
4958 if (cct->_conf->osd_verify_sparse_read_holes) {
4959 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4960 if (last < end) {
4961 bufferlist t;
4962 uint64_t len = end - last;
4963 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4964 if (r < 0) {
4965 osd->clog->error() << coll << " " << soid
4966 << " sparse-read failed to read: " << r;
4967 } else if (!t.is_zero()) {
4968 osd->clog->error() << coll << " " << soid
4969 << " sparse-read found data in hole "
4970 << last << "~" << len;
4971 }
4972 }
4973 }
4974
4975 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
4976 // and while there may be few whole objects at first, more and more of
4977 // them accumulate with continued use. From that point on, verifying the
4978 // full-object checksum on sparse-read makes sense.
4979 if (total_read == oi.size && oi.is_data_digest()) {
4980 uint32_t crc = data_bl.crc32c(-1);
4981 if (oi.data_digest != crc) {
4982 osd->clog->error() << info.pgid << std::hex
4983 << " full-object read crc 0x" << crc
4984 << " != expected 0x" << oi.data_digest
4985 << std::dec << " on " << soid;
4986 // FIXME fall back to replica or something?
4987 return -EIO;
4988 }
4989 }
4990
4991 op.extent.length = total_read;
4992
4993 ::encode(m, osd_op.outdata); // re-encode since it might be modified
4994 ::encode_destructively(data_bl, osd_op.outdata);
4995
4996 dout(10) << " sparse_read got " << total_read << " bytes from object "
4997 << soid << dendl;
4998 }
4999
5000 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5001 ctx->delta_stats.num_rd++;
5002 return 0;
5003}
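// Note on the two paths above: on an EC pool the sparse read is served
// as a single plain async read whose completion (ToSparseReadResult)
// synthesizes a one-extent map, while on a replicated pool fiemap
// supplies the real extent map and holes can optionally be verified to
// be zero.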
5004
5005int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5006{
5007 int result = 0;
5008 SnapSetContext *ssc = ctx->obc->ssc;
5009 ObjectState& obs = ctx->new_obs;
5010 object_info_t& oi = obs.oi;
5011 const hobject_t& soid = oi.soid;
5012
5013 PGTransaction* t = ctx->op_t.get();
5014
5015 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5016
5017 ctx->current_osd_subop_num = 0;
5018 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
5019 OSDOp& osd_op = *p;
5020 ceph_osd_op& op = osd_op.op;
5021
5022 OpFinisher* op_finisher = nullptr;
5023 {
5024 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5025 if (op_finisher_it != ctx->op_finishers.end()) {
5026 op_finisher = op_finisher_it->second.get();
5027 }
5028 }
5029
5030 // TODO: check endianness (__le32 vs uint32_t, etc.)
5031 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5032 // but the code in this function seems to treat them as native-endian. What should the
5033 // tracepoints do?
5034 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5035
5036 dout(10) << "do_osd_op " << osd_op << dendl;
5037
5038 bufferlist::iterator bp = osd_op.indata.begin();
5039
5040 // user-visible modification?
5041 switch (op.op) {
5042 // non user-visible modifications
5043 case CEPH_OSD_OP_WATCH:
5044 case CEPH_OSD_OP_CACHE_EVICT:
5045 case CEPH_OSD_OP_CACHE_FLUSH:
5046 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5047 case CEPH_OSD_OP_UNDIRTY:
5048 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5049 case CEPH_OSD_OP_CACHE_PIN:
5050 case CEPH_OSD_OP_CACHE_UNPIN:
5051 case CEPH_OSD_OP_SET_REDIRECT:
5052 break;
5053 default:
5054 if (op.op & CEPH_OSD_OP_MODE_WR)
5055 ctx->user_modify = true;
5056 }
5057
5058 // munge -1 truncate to 0 truncate
5059 if (ceph_osd_op_uses_extent(op.op) &&
5060 op.extent.truncate_seq == 1 &&
5061 op.extent.truncate_size == (-1ULL)) {
5062 op.extent.truncate_size = 0;
5063 op.extent.truncate_seq = 0;
5064 }
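// e.g. an op carrying truncate_seq == 1 with truncate_size == -1ULL is
// rewritten to truncate_seq == 0, truncate_size == 0, i.e. treated as
// if no truncation were requested.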
5065
5066 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5067 if (op.op == CEPH_OSD_OP_ZERO &&
5068 obs.exists &&
5069 op.extent.offset < cct->_conf->osd_max_object_size &&
5070 op.extent.length >= 1 &&
5071 op.extent.length <= cct->_conf->osd_max_object_size &&
5072 op.extent.offset + op.extent.length >= oi.size) {
5073 if (op.extent.offset >= oi.size) {
5074 // no-op
5075 goto fail;
5076 }
5077 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5078 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5079 op.op = CEPH_OSD_OP_TRUNCATE;
5080 }
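// e.g. ZERO 100~400 on a 300-byte object: offset + length (500) >= the
// 300-byte size and offset (100) < size, so the op becomes TRUNCATE at
// 100; with offset >= size it would have been a no-op.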
5081
5082 switch (op.op) {
5083
5084 // --- READS ---
5085
5086 case CEPH_OSD_OP_CMPEXT:
5087 ++ctx->num_read;
5088 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5089 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5090 op.extent.length, op.extent.truncate_size,
5091 op.extent.truncate_seq);
5092
5093 if (op_finisher == nullptr) {
5094 result = do_extent_cmp(ctx, osd_op);
5095 } else {
5096 result = op_finisher->execute();
5097 }
5098 break;
5099
5100 case CEPH_OSD_OP_SYNC_READ:
5101 if (pool.info.require_rollback()) {
5102 result = -EOPNOTSUPP;
5103 break;
5104 }
5105 // fall through
5106 case CEPH_OSD_OP_READ:
5107 ++ctx->num_read;
5108 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5109 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5110 op.extent.length, op.extent.truncate_size,
5111 op.extent.truncate_seq);
5112 if (op_finisher == nullptr) {
5113 if (!ctx->data_off) {
5114 ctx->data_off = op.extent.offset;
5115 }
5116 result = do_read(ctx, osd_op);
5117 } else {
5118 result = op_finisher->execute();
5119 }
5120 break;
5121
5122 case CEPH_OSD_OP_CHECKSUM:
5123 ++ctx->num_read;
5124 {
5125 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5126 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5127 op.checksum.offset, op.checksum.length,
5128 op.checksum.chunk_size);
5129
5130 if (op_finisher == nullptr) {
5131 result = do_checksum(ctx, osd_op, &bp);
5132 } else {
5133 result = op_finisher->execute();
5134 }
5135 }
5136 break;
5137
5138 /* map extents */
5139 case CEPH_OSD_OP_MAPEXT:
5140 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5141 if (pool.info.require_rollback()) {
5142 result = -EOPNOTSUPP;
5143 break;
5144 }
5145 ++ctx->num_read;
5146 {
5147 // read into a buffer
5148 bufferlist bl;
5149 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5150 info.pgid.shard),
5151 op.extent.offset, op.extent.length, bl);
5152 osd_op.outdata.claim(bl);
5153 if (r < 0)
5154 result = r;
5155 else
5156 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5157 ctx->delta_stats.num_rd++;
5158 dout(10) << " map_extents done on object " << soid << dendl;
5159 }
5160 break;
5161
5162 /* map extents */
5163 case CEPH_OSD_OP_SPARSE_READ:
5164 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5165 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5166 op.extent.length, op.extent.truncate_size,
5167 op.extent.truncate_seq);
5168 if (op_finisher == nullptr) {
5169 result = do_sparse_read(ctx, osd_op);
5170 } else {
5171 result = op_finisher->execute();
5172 }
5173 break;
5174
5175 case CEPH_OSD_OP_CALL:
5176 {
5177 string cname, mname;
5178 bufferlist indata;
5179 try {
5180 bp.copy(op.cls.class_len, cname);
5181 bp.copy(op.cls.method_len, mname);
5182 bp.copy(op.cls.indata_len, indata);
5183 } catch (buffer::error& e) {
5184 dout(10) << "call unable to decode class + method + indata" << dendl;
5185 dout(30) << "in dump: ";
5186 osd_op.indata.hexdump(*_dout);
5187 *_dout << dendl;
5188 result = -EINVAL;
5189 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5190 break;
5191 }
5192 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5193
5194 ClassHandler::ClassData *cls;
5195 result = osd->class_handler->open_class(cname, &cls);
5196 assert(result == 0); // init_op_flags() already verified this works.
5197
5198 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5199 if (!method) {
5200 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5201 result = -EOPNOTSUPP;
5202 break;
5203 }
5204
5205 int flags = method->get_flags();
5206 if (flags & CLS_METHOD_WR)
5207 ctx->user_modify = true;
5208
5209 bufferlist outdata;
5210 dout(10) << "call method " << cname << "." << mname << dendl;
5211 int prev_rd = ctx->num_read;
5212 int prev_wr = ctx->num_write;
5213 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5214
5215 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5216 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5217 result = -EIO;
5218 break;
5219 }
5220 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5221 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5222 result = -EIO;
5223 break;
5224 }
5225
5226 dout(10) << "method called response length=" << outdata.length() << dendl;
5227 op.extent.length = outdata.length();
5228 osd_op.outdata.claim_append(outdata);
5229 dout(30) << "out dump: ";
5230 osd_op.outdata.hexdump(*_dout);
5231 *_dout << dendl;
5232 }
5233 break;
5234
5235 case CEPH_OSD_OP_STAT:
5236 // note: stat does not require RD
5237 {
5238 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5239
5240 if (obs.exists && !oi.is_whiteout()) {
5241 ::encode(oi.size, osd_op.outdata);
5242 ::encode(oi.mtime, osd_op.outdata);
5243 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5244 } else {
5245 result = -ENOENT;
5246 dout(10) << "stat oi object does not exist" << dendl;
5247 }
5248
5249 ctx->delta_stats.num_rd++;
5250 }
5251 break;
5252
5253 case CEPH_OSD_OP_ISDIRTY:
5254 ++ctx->num_read;
5255 {
5256 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5257 bool is_dirty = obs.oi.is_dirty();
5258 ::encode(is_dirty, osd_op.outdata);
5259 ctx->delta_stats.num_rd++;
5260 result = 0;
5261 }
5262 break;
5263
5264 case CEPH_OSD_OP_UNDIRTY:
5265 ++ctx->num_write;
5266 {
5267 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5268 if (oi.is_dirty()) {
5269 ctx->undirty = true; // see make_writeable()
5270 ctx->modify = true;
5271 ctx->delta_stats.num_wr++;
5272 }
5273 result = 0;
5274 }
5275 break;
5276
5277 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5278 ++ctx->num_write;
5279 {
5280 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5281 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5282 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5283 result = -EINVAL;
5284 break;
5285 }
5286 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5287 result = -EINVAL;
5288 break;
5289 }
5290 if (!obs.exists) {
5291 result = 0;
5292 break;
5293 }
5294 if (oi.is_cache_pinned()) {
5295 dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
5296 result = -EPERM;
5297 break;
5298 }
5299 if (oi.is_dirty()) {
5300 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5301 if (result == -EINPROGRESS)
5302 result = -EAGAIN;
5303 } else {
5304 result = 0;
5305 }
5306 }
5307 break;
5308
5309 case CEPH_OSD_OP_CACHE_FLUSH:
5310 ++ctx->num_write;
5311 {
5312 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5313 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5314 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5315 result = -EINVAL;
5316 break;
5317 }
5318 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5319 result = -EINVAL;
5320 break;
5321 }
5322 if (!obs.exists) {
5323 result = 0;
5324 break;
5325 }
5326 if (oi.is_cache_pinned()) {
5327 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
5328 result = -EPERM;
5329 break;
5330 }
5331 hobject_t missing;
5332 if (oi.is_dirty()) {
5333 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5334 if (result == -EINPROGRESS)
5335 result = -EAGAIN;
5336 } else {
5337 result = 0;
5338 }
5339 // Check the special return value which indicates start_flush() filled in 'missing'
5340 if (result == -ENOENT) {
5341 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5342 assert(!missing.is_min());
5343 wait_for_unreadable_object(missing, ctx->op);
5344 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5345 result = -EAGAIN;
5346 }
5347 }
5348 break;
5349
5350 case CEPH_OSD_OP_CACHE_EVICT:
5351 ++ctx->num_write;
5352 {
5353 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5354 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5355 result = -EINVAL;
5356 break;
5357 }
5358 if (!obs.exists) {
5359 result = 0;
5360 break;
5361 }
5362 if (oi.is_cache_pinned()) {
5363 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
5364 result = -EPERM;
5365 break;
5366 }
5367 if (oi.is_dirty()) {
5368 result = -EBUSY;
5369 break;
5370 }
5371 if (!oi.watchers.empty()) {
5372 result = -EBUSY;
5373 break;
5374 }
5375 if (soid.snap == CEPH_NOSNAP) {
5376 result = _verify_no_head_clones(soid, ssc->snapset);
5377 if (result < 0)
5378 break;
5379 }
5380 result = _delete_oid(ctx, true, false);
5381 if (result >= 0) {
5382 // mark that this is a cache eviction to avoid triggering normal
5383 // make_writeable() clone or snapdir object creation in finish_ctx()
5384 ctx->cache_evict = true;
5385 }
5386 osd->logger->inc(l_osd_tier_evict);
5387 }
5388 break;
5389
5390 case CEPH_OSD_OP_GETXATTR:
5391 ++ctx->num_read;
5392 {
5393 string aname;
5394 bp.copy(op.xattr.name_len, aname);
5395 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5396 string name = "_" + aname;
5397 int r = getattr_maybe_cache(
5398 ctx->obc,
5399 name,
5400 &(osd_op.outdata));
5401 if (r >= 0) {
5402 op.xattr.value_len = osd_op.outdata.length();
5403 result = 0;
5404 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5405 } else
5406 result = r;
5407
5408 ctx->delta_stats.num_rd++;
5409 }
5410 break;
5411
5412 case CEPH_OSD_OP_GETXATTRS:
5413 ++ctx->num_read;
5414 {
5415 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5416 map<string, bufferlist> out;
5417 result = getattrs_maybe_cache(
5418 ctx->obc,
5419 &out,
5420 true);
5421
5422 bufferlist bl;
5423 ::encode(out, bl);
5424 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5425 ctx->delta_stats.num_rd++;
5426 osd_op.outdata.claim_append(bl);
5427 }
5428 break;
5429
5430 case CEPH_OSD_OP_CMPXATTR:
5431 ++ctx->num_read;
5432 {
5433 string aname;
5434 bp.copy(op.xattr.name_len, aname);
5435 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5436 string name = "_" + aname;
5437 name[op.xattr.name_len + 1] = 0;
5438
5439 bufferlist xattr;
5440 result = getattr_maybe_cache(
5441 ctx->obc,
5442 name,
5443 &xattr);
5444 if (result < 0 && result != -EEXIST && result != -ENODATA)
5445 break;
5446
5447 ctx->delta_stats.num_rd++;
5448 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5449
5450 switch (op.xattr.cmp_mode) {
5451 case CEPH_OSD_CMPXATTR_MODE_STRING:
5452 {
5453 string val;
5454 bp.copy(op.xattr.value_len, val);
5455 val[op.xattr.value_len] = 0;
5456 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5457 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5458 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5459 }
5460 break;
5461
5462 case CEPH_OSD_CMPXATTR_MODE_U64:
5463 {
5464 uint64_t u64val;
5465 try {
5466 ::decode(u64val, bp);
5467 }
5468 catch (buffer::error& e) {
5469 result = -EINVAL;
5470 goto fail;
5471 }
5472 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5473 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5474 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5475 }
5476 break;
5477
5478 default:
5479 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5480 result = -EINVAL;
5481 }
5482
5483 if (!result) {
5484 dout(10) << "comparison returned false" << dendl;
5485 result = -ECANCELED;
5486 break;
5487 }
5488 if (result < 0) {
5489 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5490 break;
5491 }
5492
5493 dout(10) << "comparison returned true" << dendl;
5494 }
5495 break;
5496
5497 case CEPH_OSD_OP_ASSERT_VER:
5498 ++ctx->num_read;
5499 {
5500 uint64_t ver = op.assert_ver.ver;
5501 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5502 if (!ver)
5503 result = -EINVAL;
5504 else if (ver < oi.user_version)
5505 result = -ERANGE;
5506 else if (ver > oi.user_version)
5507 result = -EOVERFLOW;
5508 }
5509 break;
5510
5511 case CEPH_OSD_OP_LIST_WATCHERS:
5512 ++ctx->num_read;
5513 {
5514 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5515 obj_list_watch_response_t resp;
5516
5517 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5518 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5519 ++oi_iter) {
5520 dout(20) << "key cookie=" << oi_iter->first.first
5521 << " entity=" << oi_iter->first.second << " "
5522 << oi_iter->second << dendl;
5523 assert(oi_iter->first.first == oi_iter->second.cookie);
5524 assert(oi_iter->first.second.is_client());
5525
5526 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5527 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5528 resp.entries.push_back(wi);
5529 }
5530
5531 resp.encode(osd_op.outdata, ctx->get_features());
5532 result = 0;
5533
5534 ctx->delta_stats.num_rd++;
5535 break;
5536 }
5537
5538 case CEPH_OSD_OP_LIST_SNAPS:
5539 ++ctx->num_read;
5540 {
5541 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5542 obj_list_snap_response_t resp;
5543
5544 if (!ssc) {
5545 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5546 }
5547 assert(ssc);
5548
5549 int clonecount = ssc->snapset.clones.size();
5550 if (ssc->snapset.head_exists)
5551 clonecount++;
5552 resp.clones.reserve(clonecount);
5553 for (auto clone_iter = ssc->snapset.clones.begin();
5554 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5555 clone_info ci;
5556 ci.cloneid = *clone_iter;
5557
5558 hobject_t clone_oid = soid;
5559 clone_oid.snap = *clone_iter;
5560
5561 if (!ssc->snapset.is_legacy()) {
5562 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5563 if (p == ssc->snapset.clone_snaps.end()) {
5564 osd->clog->error() << "osd." << osd->whoami
5565 << ": inconsistent clone_snaps found for oid "
5566 << soid << " clone " << *clone_iter
5567 << " snapset " << ssc->snapset;
5568 result = -EINVAL;
5569 break;
5570 }
5571 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5572 ci.snaps.push_back(*q);
5573 }
5574 } else {
5575 /* No need to take a lock here. We are only inspecting state cached
5576 * in the ObjectContext, so we aren't performing an actual read unless
5577 * the clone obc is not already loaded (in which case, it cannot have
5578 * an in progress write). We also do not risk exposing uncommitted
5579 * state since we do have a read lock on the head object or snapdir,
5580 * which we would have to write lock in order to make user visible
5581 * modifications to the snapshot state (snap trim related mutations
5582 * are not user visible).
5583 */
5584 if (is_missing_object(clone_oid)) {
5585 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5586 wait_for_unreadable_object(clone_oid, ctx->op);
5587 result = -EAGAIN;
5588 break;
5589 }
5590
5591 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5592 if (!clone_obc) {
5593 if (maybe_handle_cache(
5594 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5595 // promoting the clone
5596 result = -EAGAIN;
5597 } else {
5598 osd->clog->error() << "osd." << osd->whoami
5599 << ": missing clone " << clone_oid
5600 << " for oid "
5601 << soid;
5602 // should not happen
5603 result = -ENOENT;
5604 }
5605 break;
5606 }
5607 for (vector<snapid_t>::reverse_iterator p =
5608 clone_obc->obs.oi.legacy_snaps.rbegin();
5609 p != clone_obc->obs.oi.legacy_snaps.rend();
5610 ++p) {
5611 ci.snaps.push_back(*p);
5612 }
5613 }
5614
5615 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5616
5617 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5618 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5619 if (coi == ssc->snapset.clone_overlap.end()) {
5620 osd->clog->error() << "osd." << osd->whoami
5621 << ": inconsistent clone_overlap found for oid "
5622 << soid << " clone " << *clone_iter;
5623 result = -EINVAL;
5624 break;
5625 }
5626 const interval_set<uint64_t> &o = coi->second;
5627 ci.overlap.reserve(o.num_intervals());
5628 for (interval_set<uint64_t>::const_iterator r = o.begin();
5629 r != o.end(); ++r) {
5630 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5631 r.get_len()));
5632 }
5633
5634 map<snapid_t, uint64_t>::const_iterator si;
5635 si = ssc->snapset.clone_size.find(ci.cloneid);
5636 if (si == ssc->snapset.clone_size.end()) {
5637 osd->clog->error() << "osd." << osd->whoami
5638 << ": inconsistent clone_size found for oid "
5639 << soid << " clone " << *clone_iter;
5640 result = -EINVAL;
5641 break;
5642 }
5643 ci.size = si->second;
5644
5645 resp.clones.push_back(ci);
5646 }
5647 if (result < 0) {
5648 break;
5649 }
5650 if (ssc->snapset.head_exists &&
5651 !ctx->obc->obs.oi.is_whiteout()) {
5652 assert(obs.exists);
5653 clone_info ci;
5654 ci.cloneid = CEPH_NOSNAP;
5655
5656 //Size for HEAD is oi.size
5657 ci.size = oi.size;
5658
5659 resp.clones.push_back(ci);
5660 }
5661 resp.seq = ssc->snapset.seq;
5662
5663 resp.encode(osd_op.outdata);
5664 result = 0;
5665
5666 ctx->delta_stats.num_rd++;
5667 break;
5668 }
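 // The LIST_SNAPS reply therefore carries one clone_info per clone
 // (cloneid, snaps newest-first, overlap with the next-newer object,
 // size) plus, when the head exists and is not a whiteout, a final
 // entry with cloneid CEPH_NOSNAP sized from oi.size.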
5669
5670 case CEPH_OSD_OP_NOTIFY:
5671 ++ctx->num_read;
5672 {
5673 uint32_t timeout;
5674 bufferlist bl;
5675
5676 try {
5677 uint32_t ver; // obsolete
5678 ::decode(ver, bp);
5679 ::decode(timeout, bp);
5680 ::decode(bl, bp);
5681 } catch (const buffer::error &e) {
5682 timeout = 0;
5683 }
5684 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5685 if (!timeout)
5686 timeout = cct->_conf->osd_default_notify_timeout;
5687
5688 notify_info_t n;
5689 n.timeout = timeout;
5690 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5691 n.cookie = op.watch.cookie;
5692 n.bl = bl;
5693 ctx->notifies.push_back(n);
5694
5695 // return our unique notify id to the client
5696 ::encode(n.notify_id, osd_op.outdata);
5697 }
5698 break;
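 // The notify itself is not sent here; it is queued on ctx->notifies
 // and fanned out to the object's watchers in do_osd_op_effects(), so
 // the client learns its unique notify_id immediately.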
5699
5700 case CEPH_OSD_OP_NOTIFY_ACK:
5701 ++ctx->num_read;
5702 {
5703 try {
5704 uint64_t notify_id = 0;
5705 uint64_t watch_cookie = 0;
5706 ::decode(notify_id, bp);
5707 ::decode(watch_cookie, bp);
5708 bufferlist reply_bl;
5709 if (!bp.end()) {
5710 ::decode(reply_bl, bp);
5711 }
5712 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5713 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5714 ctx->notify_acks.push_back(ack);
5715 } catch (const buffer::error &e) {
5716 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5717 OpContext::NotifyAck ack(
5718 // op.watch.cookie is actually the notify_id for historical reasons
5719 op.watch.cookie
5720 );
5721 ctx->notify_acks.push_back(ack);
5722 }
5723 }
5724 break;
5725
5726 case CEPH_OSD_OP_SETALLOCHINT:
5727 ++ctx->num_write;
5728 {
5729 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5730 maybe_create_new_object(ctx);
5731 oi.expected_object_size = op.alloc_hint.expected_object_size;
5732 oi.expected_write_size = op.alloc_hint.expected_write_size;
5733 oi.alloc_hint_flags = op.alloc_hint.flags;
5734 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5735 op.alloc_hint.expected_write_size,
5736 op.alloc_hint.flags);
5737 ctx->delta_stats.num_wr++;
5738 result = 0;
5739 }
5740 break;
5741
5742
5743 // --- WRITES ---
5744
5745 // -- object data --
5746
5747 case CEPH_OSD_OP_WRITE:
5748 ++ctx->num_write;
5749 { // write
5750 __u32 seq = oi.truncate_seq;
5751 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5752 if (op.extent.length != osd_op.indata.length()) {
5753 result = -EINVAL;
5754 break;
5755 }
5756
5757 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5758 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5759
5760 if (pool.info.requires_aligned_append() &&
5761 (op.extent.offset % pool.info.required_alignment() != 0)) {
5762 result = -EOPNOTSUPP;
5763 break;
5764 }
5765
5766 if (!obs.exists) {
5767 if (pool.info.requires_aligned_append() && op.extent.offset) {
5768 result = -EOPNOTSUPP;
5769 break;
5770 }
5771 } else if (op.extent.offset != oi.size &&
5772 pool.info.requires_aligned_append()) {
5773 result = -EOPNOTSUPP;
5774 break;
5775 }
5776
5777 if (seq && (seq > op.extent.truncate_seq) &&
5778 (op.extent.offset + op.extent.length > oi.size)) {
5779 // old write, arrived after trimtrunc
5780 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5781 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5782 << ", adjusting write length to " << op.extent.length << dendl;
5783 bufferlist t;
5784 t.substr_of(osd_op.indata, 0, op.extent.length);
5785 osd_op.indata.swap(t);
5786 }
5787 if (op.extent.truncate_seq > seq) {
5788 // write arrives before trimtrunc
5789 if (obs.exists && !oi.is_whiteout()) {
5790 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5791 << ", truncating to " << op.extent.truncate_size << dendl;
5792 t->truncate(soid, op.extent.truncate_size);
5793 oi.truncate_seq = op.extent.truncate_seq;
5794 oi.truncate_size = op.extent.truncate_size;
5795 if (op.extent.truncate_size != oi.size) {
5796 ctx->delta_stats.num_bytes -= oi.size;
5797 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5798 oi.size = op.extent.truncate_size;
5799 }
5800 } else {
5801 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5802 << ", but object is new" << dendl;
5803 oi.truncate_seq = op.extent.truncate_seq;
5804 oi.truncate_size = op.extent.truncate_size;
5805 }
5806 }
5807 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5808 if (result < 0)
5809 break;
5810
5811 maybe_create_new_object(ctx);
5812
5813 if (op.extent.length == 0) {
5814 if (op.extent.offset > oi.size) {
5815 t->truncate(
5816 soid, op.extent.offset);
5817 } else {
5818 t->nop(soid);
5819 }
5820 } else {
5821 t->write(
5822 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5823 }
5824
5825 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5826 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5827 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5828 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5829 else
5830 obs.oi.clear_data_digest();
5831 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5832 op.extent.offset, op.extent.length);
5833
5834 }
5835 break;
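 // truncate_seq recap: a write stamped with an older truncate_seq than
 // the object's must not resurrect bytes past a newer truncate (its
 // length is clipped above), while a write carrying a newer
 // truncate_seq applies the deferred truncate first. E.g. with
 // oi.truncate_seq == 3, an arriving write with truncate_seq == 2 that
 // extends past oi.size is clipped; one with truncate_seq == 4 first
 // truncates the object to its truncate_size.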
5836
5837 case CEPH_OSD_OP_WRITEFULL:
5838 ++ctx->num_write;
5839 { // write full object
5840 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5841
5842 if (op.extent.length != osd_op.indata.length()) {
5843 result = -EINVAL;
5844 break;
5845 }
5846 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5847 if (result < 0)
5848 break;
5849
5850 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5851 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5852
5853 maybe_create_new_object(ctx);
5854 if (pool.info.require_rollback()) {
5855 t->truncate(soid, 0);
5856 } else if (obs.exists && op.extent.length < oi.size) {
5857 t->truncate(soid, op.extent.length);
5858 }
5859 if (op.extent.length) {
5860 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5861 }
5862 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5863
5864 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5865 0, op.extent.length, true);
5866 }
5867 break;
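 // Unlike WRITE, WRITEFULL replaces the whole object, so the data
 // digest can always be recomputed from the incoming buffer (crc32c
 // seeded with -1) rather than extended or invalidated.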
5868
5869 case CEPH_OSD_OP_WRITESAME:
5870 ++ctx->num_write;
5871 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5872 result = do_writesame(ctx, osd_op);
5873 break;
5874
5875 case CEPH_OSD_OP_ROLLBACK :
5876 ++ctx->num_write;
5877 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5878 result = _rollback_to(ctx, op);
5879 break;
5880
5881 case CEPH_OSD_OP_ZERO:
5882 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5883 if (pool.info.requires_aligned_append()) {
5884 result = -EOPNOTSUPP;
5885 break;
5886 }
5887 ++ctx->num_write;
5888 { // zero
5889 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5890 if (result < 0)
5891 break;
5892 assert(op.extent.length);
5893 if (obs.exists && !oi.is_whiteout()) {
5894 t->zero(soid, op.extent.offset, op.extent.length);
5895 interval_set<uint64_t> ch;
5896 ch.insert(op.extent.offset, op.extent.length);
5897 ctx->modified_ranges.union_of(ch);
5898 ctx->delta_stats.num_wr++;
5899 oi.clear_data_digest();
5900 } else {
5901 // no-op
5902 }
5903 }
5904 break;
5905 case CEPH_OSD_OP_CREATE:
5906 ++ctx->num_write;
5907 {
5908 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5909 int flags = le32_to_cpu(op.flags);
5910 if (obs.exists && !oi.is_whiteout() &&
5911 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5912 result = -EEXIST; /* this is an exclusive create */
5913 } else {
5914 if (osd_op.indata.length()) {
5915 bufferlist::iterator p = osd_op.indata.begin();
5916 string category;
5917 try {
5918 ::decode(category, p);
5919 }
5920 catch (buffer::error& e) {
5921 result = -EINVAL;
5922 goto fail;
5923 }
5924 // category is no longer implemented.
5925 }
5926 if (result >= 0) {
5927 maybe_create_new_object(ctx);
5928 t->nop(soid);
5929 }
5930 }
5931 }
5932 break;
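 // A hedged client-side sketch (librados C++ API, from general
 // knowledge rather than this tree): ObjectWriteOperation op;
 // op.create(true); issues CEPH_OSD_OP_CREATE with
 // CEPH_OSD_OP_FLAG_EXCL and surfaces the -EEXIST produced above when
 // the object already exists.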
5933
5934 case CEPH_OSD_OP_TRIMTRUNC:
5935 op.extent.offset = op.extent.truncate_size;
5936 // falling through
5937
5938 case CEPH_OSD_OP_TRUNCATE:
5939 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5940 if (pool.info.requires_aligned_append()) {
5941 result = -EOPNOTSUPP;
5942 break;
5943 }
5944 ++ctx->num_write;
5945 {
5946 // truncate
5947 if (!obs.exists || oi.is_whiteout()) {
5948 dout(10) << " object dne, truncate is a no-op" << dendl;
5949 break;
5950 }
5951
5952 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5953 result = -EFBIG;
5954 break;
5955 }
5956
5957 if (op.extent.truncate_seq) {
5958 assert(op.extent.offset == op.extent.truncate_size);
5959 if (op.extent.truncate_seq <= oi.truncate_seq) {
5960 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5961 << ", no-op" << dendl;
5962 break; // old
5963 }
5964 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5965 << ", truncating" << dendl;
5966 oi.truncate_seq = op.extent.truncate_seq;
5967 oi.truncate_size = op.extent.truncate_size;
5968 }
5969
5970 maybe_create_new_object(ctx);
5971 t->truncate(soid, op.extent.offset);
5972 if (oi.size > op.extent.offset) {
5973 interval_set<uint64_t> trim;
5974 trim.insert(op.extent.offset, oi.size-op.extent.offset);
5975 ctx->modified_ranges.union_of(trim);
5976 }
5977 if (op.extent.offset != oi.size) {
5978 ctx->delta_stats.num_bytes -= oi.size;
5979 ctx->delta_stats.num_bytes += op.extent.offset;
5980 oi.size = op.extent.offset;
5981 }
5982 ctx->delta_stats.num_wr++;
5983 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
5984
5985 oi.clear_data_digest();
5986 }
5987 break;
5988
5989 case CEPH_OSD_OP_DELETE:
5990 ++ctx->num_write;
5991 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5992 {
5993 result = _delete_oid(ctx, false, ctx->ignore_cache);
5994 }
5995 break;
5996
5997 case CEPH_OSD_OP_WATCH:
5998 ++ctx->num_write;
5999 {
6000 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6001 op.watch.cookie, op.watch.op);
6002 if (!obs.exists) {
6003 result = -ENOENT;
6004 break;
6005 }
6006 uint64_t cookie = op.watch.cookie;
6007 entity_name_t entity = ctx->reqid.name;
6008 ObjectContextRef obc = ctx->obc;
6009
6010 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6011 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6012 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6013 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6014 dout(10) << "watch: peer_addr="
6015 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6016
6017 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6018 if (op.watch.timeout != 0) {
6019 timeout = op.watch.timeout;
6020 }
6021
6022 watch_info_t w(cookie, timeout,
6023 ctx->op->get_req()->get_connection()->get_peer_addr());
6024 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6025 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6026 if (oi.watchers.count(make_pair(cookie, entity))) {
6027 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6028 } else {
6029 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6030 oi.watchers[make_pair(cookie, entity)] = w;
6031 t->nop(soid); // make sure we update the object_info on disk!
6032 }
6033 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6034 ctx->watch_connects.push_back(make_pair(w, will_ping));
6035 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6036 if (!oi.watchers.count(make_pair(cookie, entity))) {
6037 result = -ENOTCONN;
6038 break;
6039 }
6040 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6041 ctx->watch_connects.push_back(make_pair(w, true));
6042 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6043 /* Note: WATCH with PING doesn't cause may_write() to return true,
6044 * so if there is nothing else in the transaction, this is going
6045 * to run do_osd_op_effects, but not write out a log entry */
6046 if (!oi.watchers.count(make_pair(cookie, entity))) {
6047 result = -ENOTCONN;
6048 break;
6049 }
6050 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6051 obc->watchers.find(make_pair(cookie, entity));
6052 if (p == obc->watchers.end() ||
6053 !p->second->is_connected()) {
6054 // client needs to reconnect
6055 result = -ETIMEDOUT;
6056 break;
6057 }
6058 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6059 p->second->got_ping(ceph_clock_now());
6060 result = 0;
6061 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6062 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6063 oi.watchers.find(make_pair(cookie, entity));
6064 if (oi_iter != oi.watchers.end()) {
6065 dout(10) << " removed watch " << oi_iter->second << " by "
6066 << entity << dendl;
6067 oi.watchers.erase(oi_iter);
6068 t->nop(soid); // update oi on disk
6069 ctx->watch_disconnects.push_back(
6070 watch_disconnect_t(cookie, entity, false));
6071 } else {
6072 dout(10) << " can't remove: no watch by " << entity << dendl;
6073 }
6074 }
6075 }
6076 break;
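 // WATCH subop summary: WATCH/LEGACY_WATCH register (and persist) the
 // watcher; RECONNECT and PING require an existing registration
 // (-ENOTCONN otherwise), and PING additionally a live connection
 // (-ETIMEDOUT otherwise); UNWATCH drops the persistent entry and
 // queues a disconnect.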
6077
6078 case CEPH_OSD_OP_CACHE_PIN:
6079 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6080 if ((!pool.info.is_tier() ||
6081 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6082 result = -EINVAL;
6083 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6084 break;
6085 }
6086 ++ctx->num_write;
6087 {
6088 if (!obs.exists || oi.is_whiteout()) {
6089 result = -ENOENT;
6090 break;
6091 }
6092
6093 if (!oi.is_cache_pinned()) {
6094 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6095 ctx->modify = true;
6096 ctx->delta_stats.num_objects_pinned++;
6097 ctx->delta_stats.num_wr++;
6098 }
6099 result = 0;
6100 }
6101 break;
6102
6103 case CEPH_OSD_OP_CACHE_UNPIN:
6104 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6105 if ((!pool.info.is_tier() ||
6106 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6107 result = -EINVAL;
6108 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6109 break;
6110 }
6111 ++ctx->num_write;
6112 {
6113 if (!obs.exists || oi.is_whiteout()) {
6114 result = -ENOENT;
6115 break;
6116 }
6117
6118 if (oi.is_cache_pinned()) {
6119 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6120 ctx->modify = true;
6121 ctx->delta_stats.num_objects_pinned--;
6122 ctx->delta_stats.num_wr++;
6123 }
6124 result = 0;
6125 }
6126 break;
6127
6128 case CEPH_OSD_OP_SET_REDIRECT:
6129 ++ctx->num_write;
6130 {
6131 if (pool.info.is_tier()) {
6132 result = -EINVAL;
6133 break;
6134 }
6135 if (!obs.exists) {
6136 result = -ENOENT;
6137 break;
6138 }
6139 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6140 result = -EOPNOTSUPP;
6141 break;
6142 }
6143
6144 object_t target_name;
6145 object_locator_t target_oloc;
6146 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6147 version_t target_version = op.copy_from.src_version;
6148 try {
6149 ::decode(target_name, bp);
6150 ::decode(target_oloc, bp);
6151 }
6152 catch (buffer::error& e) {
6153 result = -EINVAL;
6154 goto fail;
6155 }
6156 pg_t raw_pg;
6157 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6158 hobject_t target(target_name, target_oloc.key, target_snapid,
6159 raw_pg.ps(), raw_pg.pool(),
6160 target_oloc.nspace);
6161 if (target == soid) {
6162 dout(20) << " set-redirect self is invalid" << dendl;
6163 result = -EINVAL;
6164 break;
6165 }
6166 oi.set_flag(object_info_t::FLAG_MANIFEST);
6167 oi.manifest.redirect_target = target;
6168 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6169 t->truncate(soid, 0);
6170 if (oi.is_omap() && pool.info.supports_omap()) {
6171 t->omap_clear(soid);
6172 obs.oi.clear_omap_digest();
6173 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6174 }
6175 ctx->delta_stats.num_bytes -= oi.size;
6176 oi.size = 0;
6177 oi.new_object();
6178 oi.user_version = target_version;
6179 ctx->user_at_version = target_version;
6180 /* rm_attrs */
6181 map<string,bufferlist> rmattrs;
6182 result = getattrs_maybe_cache(ctx->obc,
6183 &rmattrs,
6184 true);
6185 if (result < 0) {
6186 return result;
6187 }
6188 map<string, bufferlist>::iterator iter;
6189 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6190 const string& name = iter->first;
6191 t->rmattr(soid, name);
6192 }
6193 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6194 }
6195
6196 break;
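 // After SET_REDIRECT the local object is a stub: data truncated, omap
 // and user xattrs removed, size zeroed, with oi.manifest naming the
 // redirect target and user_version pinned to target_version.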
6197
6198 // -- object attrs --
6199
6200 case CEPH_OSD_OP_SETXATTR:
6201 ++ctx->num_write;
6202 {
6203 if (cct->_conf->osd_max_attr_size > 0 &&
6204 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6205 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6206 result = -EFBIG;
6207 break;
6208 }
6209 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6210 cct->_conf->osd_max_attr_name_len);
6211 if (op.xattr.name_len > max_name_len) {
6212 result = -ENAMETOOLONG;
6213 break;
6214 }
6215 maybe_create_new_object(ctx);
6216 string aname;
6217 bp.copy(op.xattr.name_len, aname);
6218 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6219 string name = "_" + aname;
6220 bufferlist bl;
6221 bp.copy(op.xattr.value_len, bl);
6222 t->setattr(soid, name, bl);
6223 ctx->delta_stats.num_wr++;
6224 }
6225 break;
6226
6227 case CEPH_OSD_OP_RMXATTR:
6228 ++ctx->num_write;
6229 {
6230 string aname;
6231 bp.copy(op.xattr.name_len, aname);
6232 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6233 if (!obs.exists || oi.is_whiteout()) {
6234 result = -ENOENT;
6235 break;
6236 }
6237 string name = "_" + aname;
6238 t->rmattr(soid, name);
6239 ctx->delta_stats.num_wr++;
6240 }
6241 break;
6242
6243
6244 // -- fancy writers --
6245 case CEPH_OSD_OP_APPEND:
6246 {
6247 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6248 // just do it inline; this works because we are happy to execute
6249 // fancy op on replicas as well.
6250 vector<OSDOp> nops(1);
6251 OSDOp& newop = nops[0];
6252 newop.op.op = CEPH_OSD_OP_WRITE;
6253 newop.op.extent.offset = oi.size;
6254 newop.op.extent.length = op.extent.length;
6255 newop.op.extent.truncate_seq = oi.truncate_seq;
6256 newop.indata = osd_op.indata;
6257 result = do_osd_ops(ctx, nops);
6258 osd_op.outdata.claim(newop.outdata);
6259 }
6260 break;
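 // APPEND is rewritten as a WRITE at offset oi.size with the object's
 // current truncate_seq; the recursive do_osd_ops() call reuses the
 // WRITE path above, including its digest bookkeeping.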
6261
6262 case CEPH_OSD_OP_STARTSYNC:
6263 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6264 t->nop(soid);
6265 break;
6266
6267
6268 // -- trivial map --
6269 case CEPH_OSD_OP_TMAPGET:
6270 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6271 if (pool.info.require_rollback()) {
6272 result = -EOPNOTSUPP;
6273 break;
6274 }
6275 {
6276 vector<OSDOp> nops(1);
6277 OSDOp& newop = nops[0];
6278 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6279 newop.op.extent.offset = 0;
6280 newop.op.extent.length = 0;
6281 do_osd_ops(ctx, nops);
6282 osd_op.outdata.claim(newop.outdata);
6283 }
6284 break;
6285
6286 case CEPH_OSD_OP_TMAPPUT:
6287 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6288 if (pool.info.require_rollback()) {
6289 result = -EOPNOTSUPP;
6290 break;
6291 }
6292 {
6293 //_dout_lock.Lock();
6294 //osd_op.data.hexdump(*_dout);
6295 //_dout_lock.Unlock();
6296
6297 // verify sort order
6298 bool unsorted = false;
6299 if (true) {
6300 bufferlist header;
6301 ::decode(header, bp);
6302 uint32_t n;
6303 ::decode(n, bp);
6304 string last_key;
6305 while (n--) {
6306 string key;
6307 ::decode(key, bp);
6308 dout(10) << "tmapput key " << key << dendl;
6309 bufferlist val;
6310 ::decode(val, bp);
6311 if (key < last_key) {
6312 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6313 unsorted = true;
6314 break;
6315 }
6316 last_key = key;
6317 }
6318 }
6319
6320 // write it
6321 vector<OSDOp> nops(1);
6322 OSDOp& newop = nops[0];
6323 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6324 newop.op.extent.offset = 0;
6325 newop.op.extent.length = osd_op.indata.length();
6326 newop.indata = osd_op.indata;
6327
6328 if (unsorted) {
6329 bp = osd_op.indata.begin();
6330 bufferlist header;
6331 map<string, bufferlist> m;
6332 ::decode(header, bp);
6333 ::decode(m, bp);
6334 assert(bp.end());
6335 bufferlist newbl;
6336 ::encode(header, newbl);
6337 ::encode(m, newbl);
6338 newop.indata = newbl;
6339 }
6340 result = do_osd_ops(ctx, nops);
6341 assert(result == 0);
6342 }
6343 break;
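 // TMAPPUT verifies key order on the way in and, if the client sent an
 // unsorted map, re-encodes it via std::map (which iterates in key
 // order) before replaying the payload as a WRITEFULL.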
6344
6345 case CEPH_OSD_OP_TMAPUP:
6346 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6347 if (pool.info.require_rollback()) {
6348 result = -EOPNOTSUPP;
6349 break;
6350 }
6351 ++ctx->num_write;
6352 result = do_tmapup(ctx, bp, osd_op);
6353 break;
6354
6355 case CEPH_OSD_OP_TMAP2OMAP:
6356 ++ctx->num_write;
6357 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6358 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6359 break;
6360
6361 // OMAP Read ops
6362 case CEPH_OSD_OP_OMAPGETKEYS:
6363 ++ctx->num_read;
6364 {
6365 string start_after;
6366 uint64_t max_return;
6367 try {
6368 ::decode(start_after, bp);
6369 ::decode(max_return, bp);
6370 }
6371 catch (buffer::error& e) {
6372 result = -EINVAL;
6373 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6374 goto fail;
6375 }
6376 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6377 max_return = cct->_conf->osd_max_omap_entries_per_request;
6378 }
6379 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6380
6381 bufferlist bl;
6382 uint32_t num = 0;
6383 bool truncated = false;
6384 if (oi.is_omap()) {
6385 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6386 coll, ghobject_t(soid)
6387 );
6388 assert(iter);
6389 iter->upper_bound(start_after);
6390 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6391 if (num >= max_return ||
6392 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6393 truncated = true;
6394 break;
6395 }
6396 ::encode(iter->key(), bl);
6397 }
6398 } // else return empty out_set
6399 ::encode(num, osd_op.outdata);
6400 osd_op.outdata.claim_append(bl);
6401 ::encode(truncated, osd_op.outdata);
6402 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6403 ctx->delta_stats.num_rd++;
6404 }
6405 break;
6406
6407 case CEPH_OSD_OP_OMAPGETVALS:
6408 ++ctx->num_read;
6409 {
6410 string start_after;
6411 uint64_t max_return;
6412 string filter_prefix;
6413 try {
6414 ::decode(start_after, bp);
6415 ::decode(max_return, bp);
6416 ::decode(filter_prefix, bp);
6417 }
6418 catch (buffer::error& e) {
6419 result = -EINVAL;
6420 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6421 goto fail;
6422 }
6423 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6424 max_return = cct->_conf->osd_max_omap_entries_per_request;
6425 }
6426 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6427
6428 uint32_t num = 0;
6429 bool truncated = false;
6430 bufferlist bl;
6431 if (oi.is_omap()) {
6432 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6433 coll, ghobject_t(soid)
6434 );
6435 if (!iter) {
6436 result = -ENOENT;
6437 goto fail;
6438 }
6439 iter->upper_bound(start_after);
6440 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6441 for (num = 0;
6442 iter->valid() &&
6443 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6444 ++num, iter->next(false)) {
6445 dout(20) << "Found key " << iter->key() << dendl;
6446 if (num >= max_return ||
6447 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6448 truncated = true;
6449 break;
6450 }
6451 ::encode(iter->key(), bl);
6452 ::encode(iter->value(), bl);
6453 }
6454 } // else return empty out_set
6455 ::encode(num, osd_op.outdata);
6456 osd_op.outdata.claim_append(bl);
6457 ::encode(truncated, osd_op.outdata);
6458 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6459 ctx->delta_stats.num_rd++;
6460 }
6461 break;
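 // OMAPGETKEYS and OMAPGETVALS share one reply layout: a u32 count, the
 // encoded entries, then a 'truncated' bool. Results are capped by
 // osd_max_omap_entries_per_request and osd_max_omap_bytes_per_request,
 // so callers resume by re-issuing with start_after set to the last key
 // received.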
6462
6463 case CEPH_OSD_OP_OMAPGETHEADER:
6464 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6465 if (!oi.is_omap()) {
6466 // return empty header
6467 break;
6468 }
6469 ++ctx->num_read;
6470 {
6471 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6472 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6473 ctx->delta_stats.num_rd++;
6474 }
6475 break;
6476
6477 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6478 ++ctx->num_read;
6479 {
6480 set<string> keys_to_get;
6481 try {
6482 ::decode(keys_to_get, bp);
6483 }
6484 catch (buffer::error& e) {
6485 result = -EINVAL;
6486 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6487 goto fail;
6488 }
6489 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6490 map<string, bufferlist> out;
6491 if (oi.is_omap()) {
6492 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6493 } // else return empty omap entries
6494 ::encode(out, osd_op.outdata);
6495 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6496 ctx->delta_stats.num_rd++;
6497 }
6498 break;
6499
6500 case CEPH_OSD_OP_OMAP_CMP:
6501 ++ctx->num_read;
6502 {
6503 if (!obs.exists || oi.is_whiteout()) {
6504 result = -ENOENT;
6505 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6506 break;
6507 }
6508 map<string, pair<bufferlist, int> > assertions;
6509 try {
6510 ::decode(assertions, bp);
6511 }
6512 catch (buffer::error& e) {
6513 result = -EINVAL;
6514 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6515 goto fail;
6516 }
6517 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6518
6519 map<string, bufferlist> out;
6520
6521 if (oi.is_omap()) {
6522 set<string> to_get;
6523 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6524 i != assertions.end();
6525 ++i)
6526 to_get.insert(i->first);
6527 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6528 to_get, &out);
6529 if (r < 0) {
6530 result = r;
6531 break;
6532 }
6533 } // else leave out empty
6534
6535 //Should set num_rd_kb based on encode length of map
6536 ctx->delta_stats.num_rd++;
6537
6538 int r = 0;
6539 bufferlist empty;
6540 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6541 i != assertions.end();
6542 ++i) {
6543 auto out_entry = out.find(i->first);
6544 bufferlist &bl = (out_entry != out.end()) ?
6545 out_entry->second : empty;
6546 switch (i->second.second) {
6547 case CEPH_OSD_CMPXATTR_OP_EQ:
6548 if (!(bl == i->second.first)) {
6549 r = -ECANCELED;
6550 }
6551 break;
6552 case CEPH_OSD_CMPXATTR_OP_LT:
6553 if (!(bl < i->second.first)) {
6554 r = -ECANCELED;
6555 }
6556 break;
6557 case CEPH_OSD_CMPXATTR_OP_GT:
6558 if (!(bl > i->second.first)) {
6559 r = -ECANCELED;
6560 }
6561 break;
6562 default:
6563 r = -EINVAL;
6564 break;
6565 }
6566 if (r < 0)
6567 break;
6568 }
6569 if (r < 0) {
6570 result = r;
6571 }
6572 }
6573 break;
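 // OMAP_CMP applies every assertion (EQ/LT/GT, reusing the CMPXATTR op
 // codes) against the fetched values, with a missing key compared as an
 // empty bufferlist; the first failure yields -ECANCELED for the op.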
6574
6575 // OMAP Write ops
6576 case CEPH_OSD_OP_OMAPSETVALS:
6577 if (!pool.info.supports_omap()) {
6578 result = -EOPNOTSUPP;
6579 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6580 break;
6581 }
6582 ++ctx->num_write;
6583 {
6584 maybe_create_new_object(ctx);
6585 bufferlist to_set_bl;
6586 try {
6587 decode_str_str_map_to_bl(bp, &to_set_bl);
6588 }
6589 catch (buffer::error& e) {
6590 result = -EINVAL;
6591 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6592 goto fail;
6593 }
6594 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6595 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6596 dout(20) << "setting vals: " << dendl;
6597 map<string,bufferlist> to_set;
6598 bufferlist::iterator pt = to_set_bl.begin();
6599 ::decode(to_set, pt);
6600 for (map<string, bufferlist>::iterator i = to_set.begin();
6601 i != to_set.end();
6602 ++i) {
6603 dout(20) << "\t" << i->first << dendl;
6604 }
6605 }
6606 t->omap_setkeys(soid, to_set_bl);
6607 ctx->delta_stats.num_wr++;
6608 }
6609 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6610 obs.oi.clear_omap_digest();
6611 break;
6612
6613 case CEPH_OSD_OP_OMAPSETHEADER:
6614 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6615 if (!pool.info.supports_omap()) {
6616 result = -EOPNOTSUPP;
6617 break;
6618 }
6619 ++ctx->num_write;
6620 {
6621 maybe_create_new_object(ctx);
6622 t->omap_setheader(soid, osd_op.indata);
6623 ctx->delta_stats.num_wr++;
6624 }
6625 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6626 obs.oi.clear_omap_digest();
6627 break;
6628
6629 case CEPH_OSD_OP_OMAPCLEAR:
6630 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6631 if (!pool.info.supports_omap()) {
6632 result = -EOPNOTSUPP;
6633 break;
6634 }
6635 ++ctx->num_write;
6636 {
6637 if (!obs.exists || oi.is_whiteout()) {
6638 result = -ENOENT;
6639 break;
6640 }
6641 if (oi.is_omap()) {
6642 t->omap_clear(soid);
6643 ctx->delta_stats.num_wr++;
6644 obs.oi.clear_omap_digest();
6645 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6646 }
6647 }
6648 break;
6649
6650 case CEPH_OSD_OP_OMAPRMKEYS:
6651 if (!pool.info.supports_omap()) {
6652 result = -EOPNOTSUPP;
6653 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6654 break;
6655 }
6656 ++ctx->num_write;
6657 {
6658 if (!obs.exists || oi.is_whiteout()) {
6659 result = -ENOENT;
6660 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6661 break;
6662 }
6663 bufferlist to_rm_bl;
6664 try {
6665 decode_str_set_to_bl(bp, &to_rm_bl);
6666 }
6667 catch (buffer::error& e) {
6668 result = -EINVAL;
6669 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6670 goto fail;
6671 }
6672 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6673 t->omap_rmkeys(soid, to_rm_bl);
6674 ctx->delta_stats.num_wr++;
6675 }
6676 obs.oi.clear_omap_digest();
6677 break;
6678
6679 case CEPH_OSD_OP_COPY_GET:
6680 ++ctx->num_read;
6681 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6682 soid.snap.val);
6683 if (op_finisher == nullptr) {
6684 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6685 } else {
6686 result = op_finisher->execute();
6687 }
6688 break;
6689
6690 case CEPH_OSD_OP_COPY_FROM:
6691 ++ctx->num_write;
6692 {
6693 object_t src_name;
6694 object_locator_t src_oloc;
6695 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6696 version_t src_version = op.copy_from.src_version;
6697 try {
6698 ::decode(src_name, bp);
6699 ::decode(src_oloc, bp);
6700 }
6701 catch (buffer::error& e) {
6702 result = -EINVAL;
6703 tracepoint(osd,
6704 do_osd_op_pre_copy_from,
6705 soid.oid.name.c_str(),
6706 soid.snap.val,
6707 "???",
6708 0,
6709 "???",
6710 "???",
6711 0,
6712 src_snapid,
6713 src_version);
6714 goto fail;
6715 }
6716 tracepoint(osd,
6717 do_osd_op_pre_copy_from,
6718 soid.oid.name.c_str(),
6719 soid.snap.val,
6720 src_name.name.c_str(),
6721 src_oloc.pool,
6722 src_oloc.key.c_str(),
6723 src_oloc.nspace.c_str(),
6724 src_oloc.hash,
6725 src_snapid,
6726 src_version);
6727 if (op_finisher == nullptr) {
6728 // start
6729 pg_t raw_pg;
6730 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6731 hobject_t src(src_name, src_oloc.key, src_snapid,
6732 raw_pg.ps(), raw_pg.pool(),
6733 src_oloc.nspace);
6734 if (src == soid) {
6735 dout(20) << " copy from self is invalid" << dendl;
6736 result = -EINVAL;
6737 break;
6738 }
6739 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6740 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6741 new CopyFromFinisher(cb));
6742 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6743 op.copy_from.flags,
6744 false,
6745 op.copy_from.src_fadvise_flags,
6746 op.flags);
6747 result = -EINPROGRESS;
6748 } else {
6749 // finish
6750 result = op_finisher->execute();
6751 assert(result == 0);
6752
6753 // COPY_FROM cannot be executed multiple times -- it must restart
6754 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6755 }
6756 }
6757 break;
6758
6759 default:
6760 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6761 dout(1) << "unrecognized osd op " << op.op
6762 << " " << ceph_osd_op_name(op.op)
6763 << dendl;
6764 result = -EOPNOTSUPP;
6765 }
6766
6767 fail:
6768 osd_op.rval = result;
6769 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6770 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6771 result = 0;
6772
6773 if (result < 0)
6774 break;
6775 }
6776 return result;
6777}
6778
6779int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6780{
6781 if (ctx->new_obs.oi.size == 0) {
6782 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6783 return -ENODATA;
6784 }
6785 vector<OSDOp> nops(1);
6786 OSDOp &newop = nops[0];
6787 newop.op.op = CEPH_OSD_OP_TMAPGET;
6788 do_osd_ops(ctx, nops);
6789 try {
6790 bufferlist::iterator i = newop.outdata.begin();
6791 ::decode(*header, i);
6792 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6793 } catch (...) {
6794 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6795 << dendl;
6796 return -EINVAL;
6797 }
6798 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6799 << dendl;
6800 return 0;
6801}
6802
6803int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6804 const SnapSet& ss)
6805{
6806 // verify that all clones have been evicted
6807 dout(20) << __func__ << " verifying clones are absent "
6808 << ss << dendl;
6809 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6810 p != ss.clones.end();
6811 ++p) {
6812 hobject_t clone_oid = soid;
6813 clone_oid.snap = *p;
6814 if (is_missing_object(clone_oid))
6815 return -EBUSY;
6816 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6817 if (clone_obc && clone_obc->obs.exists) {
6818 dout(10) << __func__ << " cannot evict head before clone "
6819 << clone_oid << dendl;
6820 return -EBUSY;
6821 }
6822 if (copy_ops.count(clone_oid)) {
6823 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6824 << clone_oid << dendl;
6825 return -EBUSY;
6826 }
6827 }
6828 return 0;
6829}
6830
6831inline int PrimaryLogPG::_delete_oid(
6832 OpContext *ctx,
6833 bool no_whiteout, // no whiteouts, no matter what.
6834 bool try_no_whiteout) // try not to whiteout
6835{
6836 SnapSet& snapset = ctx->new_snapset;
6837 ObjectState& obs = ctx->new_obs;
6838 object_info_t& oi = obs.oi;
6839 const hobject_t& soid = oi.soid;
6840 PGTransaction* t = ctx->op_t.get();
6841
6842 // cache: set whiteout on delete?
6843 bool whiteout = false;
6844 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6845 && !no_whiteout
6846 && !try_no_whiteout) {
6847 whiteout = true;
6848 }
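 // A whiteout marks the head as logically deleted in a cache tier
 // (FLAG_WHITEOUT) without removing it, so the deletion can later be
 // propagated to the base tier and, per the luminous rule below, clones
 // remain reachable.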
6849 bool legacy;
6850 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6851 legacy = false;
6852 // in luminous or later, we can't delete the head if there are
6853 // clones. we trust the caller passing no_whiteout has already
6854 // verified they don't exist.
6855 if (!snapset.clones.empty() ||
6856 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6857 if (no_whiteout) {
6858 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6859 << dendl;
6860 } else {
6861 dout(20) << __func__ << " has or will have clones; will whiteout"
6862 << dendl;
6863 whiteout = true;
6864 }
6865 }
6866 } else {
6867 legacy = true;
6868 }
6869 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6870 << " no_whiteout=" << (int)no_whiteout
6871 << " try_no_whiteout=" << (int)try_no_whiteout
6872 << dendl;
6873 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6874 return -ENOENT;
6875
6876 t->remove(soid);
6877
6878 if (oi.size > 0) {
6879 interval_set<uint64_t> ch;
6880 ch.insert(0, oi.size);
6881 ctx->modified_ranges.union_of(ch);
6882 }
6883
6884 ctx->delta_stats.num_wr++;
6885 if (soid.is_snap()) {
6886 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6887 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6888 } else {
6889 ctx->delta_stats.num_bytes -= oi.size;
6890 }
6891 oi.size = 0;
6892 oi.new_object();
6893
6894 // disconnect all watchers
6895 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6896 oi.watchers.begin();
6897 p != oi.watchers.end();
6898 ++p) {
6899 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6900 ctx->watch_disconnects.push_back(
6901 watch_disconnect_t(p->first.first, p->first.second, true));
6902 }
6903 oi.watchers.clear();
6904
6905 if (whiteout) {
6906 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6907 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6908 ctx->delta_stats.num_whiteouts++;
6909 t->create(soid);
6910 osd->logger->inc(l_osd_tier_whiteout);
6911 return 0;
6912 }
6913
6914 // delete the head
6915 ctx->delta_stats.num_objects--;
6916 if (soid.is_snap())
6917 ctx->delta_stats.num_object_clones--;
6918 if (oi.is_whiteout()) {
6919 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6920 ctx->delta_stats.num_whiteouts--;
6921 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6922 }
6923 if (oi.is_cache_pinned()) {
6924 ctx->delta_stats.num_objects_pinned--;
6925 }
6926 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6927 snapset.head_exists = false;
6928 }
6929 obs.exists = false;
6930 return 0;
6931}
6932
6933int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6934{
6935 SnapSet& snapset = ctx->new_snapset;
6936 ObjectState& obs = ctx->new_obs;
6937 object_info_t& oi = obs.oi;
6938 const hobject_t& soid = oi.soid;
6939 PGTransaction* t = ctx->op_t.get();
6940 snapid_t snapid = (uint64_t)op.snap.snapid;
6941 hobject_t missing_oid;
6942
6943 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6944
6945 ObjectContextRef rollback_to;
6946 int ret = find_object_context(
6947 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6948 soid.get_namespace()),
6949 &rollback_to, false, false, &missing_oid);
6950 if (ret == -EAGAIN) {
6951 /* clone must be missing */
6952 assert(is_degraded_or_backfilling_object(missing_oid));
6953 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6954 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6955 block_write_on_degraded_snap(missing_oid, ctx->op);
6956 return ret;
6957 }
6958 {
6959 ObjectContextRef promote_obc;
6960 cache_result_t tier_mode_result;
6961 if (obs.exists && obs.oi.has_manifest()) {
6962 tier_mode_result =
6963 maybe_handle_manifest_detail(
6964 ctx->op,
6965 true,
6966 rollback_to);
6967 } else {
6968 tier_mode_result =
6969 maybe_handle_cache_detail(
6970 ctx->op,
6971 true,
6972 rollback_to,
6973 ret,
6974 missing_oid,
6975 true,
6976 false,
6977 &promote_obc);
6978 }
6979 switch (tier_mode_result) {
6980 case cache_result_t::NOOP:
6981 break;
6982 case cache_result_t::BLOCKED_PROMOTE:
6983 assert(promote_obc);
6984 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6985 return -EAGAIN;
6986 case cache_result_t::BLOCKED_FULL:
6987 block_write_on_full_cache(soid, ctx->op);
6988 return -EAGAIN;
6989 default:
6990 assert(0 == "must promote was set, other values are not valid");
6991 return -EAGAIN;
6992 }
6993 }
6994
6995 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6996 // there's no snapshot here, or there's no object.
6997 // if there's no snapshot, we delete the object; otherwise, do nothing.
6998 dout(20) << "_rollback_to deleting head on " << soid.oid
6999 << " because got ENOENT|whiteout on find_object_context" << dendl;
7000 if (ctx->obc->obs.oi.watchers.size()) {
7001 // Cannot delete an object with watchers
7002 ret = -EBUSY;
7003 } else {
7004 _delete_oid(ctx, false, false);
7005 ret = 0;
7006 }
7007 } else if (ret) {
7008 // ummm....huh? It *can't* return anything else at time of writing.
7009 assert(0 == "unexpected error code in _rollback_to");
7010 } else { //we got our context, let's use it to do the rollback!
7011 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7012 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7013 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7014 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7015 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7016 ret = -EAGAIN;
7017 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7018 // rolling back to the head; we just need to clone it.
7019 ctx->modify = true;
7020 } else {
7021 /* 1) Delete current head
7022 * 2) Clone correct snapshot into head
7023 * 3) Calculate clone_overlaps by following overlaps
7024 * forward from rollback snapshot */
7025 dout(10) << "_rollback_to deleting " << soid.oid
7026 << " and rolling back to old snap" << dendl;
7027
7028 if (obs.exists) {
7029 t->remove(soid);
7030 }
7031 t->clone(soid, rollback_to_sobject);
7032 snapset.head_exists = true;
7033 t->add_obc(rollback_to);
7034
7035 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7036 snapset.clone_overlap.lower_bound(snapid);
7037 assert(iter != snapset.clone_overlap.end());
7038 interval_set<uint64_t> overlaps = iter->second;
7039 for ( ;
7040 iter != snapset.clone_overlap.end();
7041 ++iter)
7042 overlaps.intersection_of(iter->second);
7043
7044 if (obs.oi.size > 0) {
7045 interval_set<uint64_t> modified;
7046 modified.insert(0, obs.oi.size);
7047 overlaps.intersection_of(modified);
7048 modified.subtract(overlaps);
7049 ctx->modified_ranges.union_of(modified);
7050 }
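 // Intersecting clone_overlap entries forward from the rollback snap
 // leaves exactly the byte ranges untouched by every write since that
 // snap; the complement within the current size is what the rollback
 // may change, so it is folded into ctx->modified_ranges.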
7051
7052 // Adjust the cached objectcontext
7053 maybe_create_new_object(ctx, true);
7054 ctx->delta_stats.num_bytes -= obs.oi.size;
7055 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7056 obs.oi.size = rollback_to->obs.oi.size;
7057 if (rollback_to->obs.oi.is_data_digest())
7058 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7059 else
7060 obs.oi.clear_data_digest();
7061 if (rollback_to->obs.oi.is_omap_digest())
7062 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7063 else
7064 obs.oi.clear_omap_digest();
7065
7066 if (rollback_to->obs.oi.is_omap()) {
7067 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7068 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7069 } else {
7070 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7071 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7072 }
7073
7074 snapset.head_exists = true;
7075 }
7076 }
7077 return ret;
7078}
7079
7080void PrimaryLogPG::_make_clone(
7081 OpContext *ctx,
7082 PGTransaction* t,
7083 ObjectContextRef obc,
7084 const hobject_t& head, const hobject_t& coid,
7085 object_info_t *poi)
7086{
7087 bufferlist bv;
7088 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7089
7090 t->clone(coid, head);
7091 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7092 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7093}
7094
7095void PrimaryLogPG::make_writeable(OpContext *ctx)
7096{
7097 const hobject_t& soid = ctx->obs->oi.soid;
7098 SnapContext& snapc = ctx->snapc;
7099
7100 // clone?
7101 assert(soid.snap == CEPH_NOSNAP);
7102 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7103 << " snapc=" << snapc << dendl;
7104
7105 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7106 if (ctx->new_obs.exists) {
7107 // we will mark the object dirty
7108 if (ctx->undirty && was_dirty) {
7109 dout(20) << " clearing DIRTY flag" << dendl;
7110 assert(ctx->new_obs.oi.is_dirty());
7111 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7112 --ctx->delta_stats.num_objects_dirty;
7113 osd->logger->inc(l_osd_tier_clean);
7114 } else if (!was_dirty && !ctx->undirty) {
7115 dout(20) << " setting DIRTY flag" << dendl;
7116 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7117 ++ctx->delta_stats.num_objects_dirty;
7118 osd->logger->inc(l_osd_tier_dirty);
7119 }
7120 } else {
7121 if (was_dirty) {
7122 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7123 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7124 --ctx->delta_stats.num_objects_dirty;
7125 }
7126 }
7127
7128 if ((ctx->new_obs.exists &&
7129 ctx->new_obs.oi.is_omap()) &&
7130 (!ctx->obc->obs.exists ||
7131 !ctx->obc->obs.oi.is_omap())) {
7132 ++ctx->delta_stats.num_objects_omap;
7133 }
7134 if ((!ctx->new_obs.exists ||
7135 !ctx->new_obs.oi.is_omap()) &&
7136 (ctx->obc->obs.exists &&
7137 ctx->obc->obs.oi.is_omap())) {
7138 --ctx->delta_stats.num_objects_omap;
7139 }
7140
7141 // use newer snapc?
7142 if (ctx->new_snapset.seq > snapc.seq) {
7143 snapc.seq = ctx->new_snapset.seq;
7144 snapc.snaps = ctx->new_snapset.snaps;
7145 filter_snapc(snapc.snaps);
7146 dout(10) << " using newer snapc " << snapc << dendl;
7147 }
7148
7149 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7150 snapc.snaps.size() && // there are snaps
7151 !ctx->cache_evict &&
7152 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7153 // clone
7154 hobject_t coid = soid;
7155 coid.snap = snapc.seq;
7156
7157 unsigned l;
7158 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7159
7160 vector<snapid_t> snaps(l);
7161 for (unsigned i=0; i<l; i++)
7162 snaps[i] = snapc.snaps[i];
7163
7164 // prepare clone
7165 object_info_t static_snap_oi(coid);
7166 object_info_t *snap_oi;
7167 if (is_primary()) {
7168 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7169 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7170 ctx->clone_obc->obs.oi = static_snap_oi;
7171 ctx->clone_obc->obs.exists = true;
7172 ctx->clone_obc->ssc = ctx->obc->ssc;
7173 ctx->clone_obc->ssc->ref++;
7174 if (pool.info.require_rollback())
7175 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7176 snap_oi = &ctx->clone_obc->obs.oi;
7177 bool got = ctx->lock_manager.get_write_greedy(
7178 coid,
7179 ctx->clone_obc,
7180 ctx->op);
7181 assert(got);
7182 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7183 } else {
7184 snap_oi = &static_snap_oi;
7185 }
7186 snap_oi->version = ctx->at_version;
7187 snap_oi->prior_version = ctx->obs->oi.version;
7188 snap_oi->copy_user_bits(ctx->obs->oi);
7189
7190 bool legacy = ctx->new_snapset.is_legacy() ||
7191 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7192 if (legacy) {
7193 snap_oi->legacy_snaps = snaps;
7194 }
7195
7196 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7197
7198 ctx->delta_stats.num_objects++;
7199 if (snap_oi->is_dirty()) {
7200 ctx->delta_stats.num_objects_dirty++;
7201 osd->logger->inc(l_osd_tier_dirty);
7202 }
7203 if (snap_oi->is_omap())
7204 ctx->delta_stats.num_objects_omap++;
7205 if (snap_oi->is_cache_pinned())
7206 ctx->delta_stats.num_objects_pinned++;
7207 ctx->delta_stats.num_object_clones++;
7208 ctx->new_snapset.clones.push_back(coid.snap);
7209 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7210 if (!legacy) {
7211 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7212 }
7213
7214 // clone_overlap should contain an entry for each clone
7215 // (an empty interval_set if there is no overlap)
7216 ctx->new_snapset.clone_overlap[coid.snap];
7217 if (ctx->obs->oi.size)
7218 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7219
7220 // log clone
7221 dout(10) << " cloning v " << ctx->obs->oi.version
7222 << " to " << coid << " v " << ctx->at_version
7223 << " snaps=" << snaps
7224 << " snapset=" << ctx->new_snapset << dendl;
7225 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7226 ctx->obs->oi.version,
7227 ctx->obs->oi.user_version,
7228 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7229 ::encode(snaps, ctx->log.back().snaps);
7230
7231 ctx->at_version.version++;
7232 }
7233
7234 // update most recent clone_overlap and usage stats
7235 if (ctx->new_snapset.clones.size() > 0) {
7236 /* we need to check whether the most recent clone exists, if it's been evicted,
7237 * it's not included in the stats */
7238 hobject_t last_clone_oid = soid;
7239 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7240 if (is_present_clone(last_clone_oid)) {
7241 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7242 ctx->modified_ranges.intersection_of(newest_overlap);
7243 // modified_ranges is still in use by the clone
7244 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7245 newest_overlap.subtract(ctx->modified_ranges);
7246 }
7247 }
7248
7249 // update snapset with latest snap context
7250 ctx->new_snapset.seq = snapc.seq;
7251 ctx->new_snapset.snaps = snapc.snaps;
7252 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7253 // pessimistic assumption that this is a net-new legacy SnapSet
7254 ctx->delta_stats.num_legacy_snapsets++;
7255 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7256 } else if (ctx->new_snapset.is_legacy()) {
7257 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7258 }
7259 dout(20) << "make_writeable " << soid
7260 << " done, snapset=" << ctx->new_snapset << dendl;
7261}
7262
7263
7264void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7265 interval_set<uint64_t>& modified, uint64_t offset,
7266 uint64_t length, bool write_full)
7267{
7268 interval_set<uint64_t> ch;
7269 if (write_full) {
7270 if (oi.size)
7271 ch.insert(0, oi.size);
7272 } else if (length)
7273 ch.insert(offset, length);
7274 modified.union_of(ch);
7275 if (write_full || offset + length > oi.size) {
7276 uint64_t new_size = offset + length;
7277 delta_stats.num_bytes -= oi.size;
7278 delta_stats.num_bytes += new_size;
7279 oi.size = new_size;
7280 }
7281 delta_stats.num_wr++;
7282 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7283}
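// Note that num_bytes only changes when the object grows (or on
// write_full); overwrites within the current size leave it untouched,
// while num_wr_kb rounds the written length up to the next KiB.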
7284
7285void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7286{
7287 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7288 delta_stats.num_bytes += p.get_len();
7289 }
7290}
7291
7292void PrimaryLogPG::complete_disconnect_watches(
7293 ObjectContextRef obc,
7294 const list<watch_disconnect_t> &to_disconnect)
7295{
7296 for (list<watch_disconnect_t>::const_iterator i =
7297 to_disconnect.begin();
7298 i != to_disconnect.end();
7299 ++i) {
7300 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7301 auto watchers_entry = obc->watchers.find(watcher);
7302 if (watchers_entry != obc->watchers.end()) {
7303 WatchRef watch = watchers_entry->second;
7304 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7305 obc->watchers.erase(watcher);
7306 watch->remove(i->send_disconnect);
7307 } else {
7308 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7309 << watcher << dendl;
7310 }
7311 }
7312}
7313
7314void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7315{
7316 entity_name_t entity = ctx->reqid.name;
7317 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7318
7319 // disconnects first
7320 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7321
7322 assert(conn);
7323
7324 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7325 if (!session.get())
7326 return;
7327 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7328
7329 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7330 i != ctx->watch_connects.end();
7331 ++i) {
7332 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7333 dout(15) << "do_osd_op_effects applying watch connect on session "
7334 << session.get() << " watcher " << watcher << dendl;
7335 WatchRef watch;
7336 if (ctx->obc->watchers.count(watcher)) {
7337 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7338 << dendl;
7339 watch = ctx->obc->watchers[watcher];
7340 } else {
7341 dout(15) << "do_osd_op_effects new watcher " << watcher
7342 << dendl;
7343 watch = Watch::makeWatchRef(
7344 this, osd, ctx->obc, i->first.timeout_seconds,
7345 i->first.cookie, entity, conn->get_peer_addr());
7346 ctx->obc->watchers.insert(
7347 make_pair(
7348 watcher,
7349 watch));
7350 }
7351 watch->connect(conn, i->second);
7352 }
7353
7354 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7355 p != ctx->notifies.end();
7356 ++p) {
7357 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7358 ConnectionRef conn(ctx->op->get_req()->get_connection());
7359 NotifyRef notif(
7360 Notify::makeNotifyRef(
7361 conn,
7362 ctx->reqid.name.num(),
7363 p->bl,
7364 p->timeout,
7365 p->cookie,
7366 p->notify_id,
7367 ctx->obc->obs.oi.user_version,
7368 osd));
7369 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7370 ctx->obc->watchers.begin();
7371 i != ctx->obc->watchers.end();
7372 ++i) {
7373 dout(10) << "starting notify on watch " << i->first << dendl;
7374 i->second->start_notify(notif);
7375 }
7376 notif->init();
7377 }
7378
7379 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7380 p != ctx->notify_acks.end();
7381 ++p) {
7382 if (p->watch_cookie)
7383 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7384 else
7385 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7386 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7387 ctx->obc->watchers.begin();
7388 i != ctx->obc->watchers.end();
7389 ++i) {
7390 if (i->first.second != entity) continue;
7391 if (p->watch_cookie &&
7392 p->watch_cookie.get() != i->first.first) continue;
7393 dout(10) << "acking notify on watch " << i->first << dendl;
7394 i->second->notify_ack(p->notify_id, p->reply_bl);
7395 }
7396 }
7397}
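/* For context, a sketch of the librados client calls that produce the
 * effects handled above (hedged -- verify signatures against
 * librados.hpp; "obj" and MyWatchCtx are made-up names):
 *
 *   uint64_t handle;
 *   MyWatchCtx wc;                          // a WatchCtx2 implementation
 *   ioctx.watch2("obj", &handle, &wc);      // -> ctx->watch_connects
 *   bufferlist bl, reply;
 *   ioctx.notify2("obj", bl, 5000, &reply); // -> ctx->notifies, fanned
 *                                           //    out to each watcher
 *   ioctx.unwatch2(handle);                 // -> ctx->watch_disconnects
 */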
7398
7399hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7400{
7401 ostringstream ss;
7402 ss << "temp_" << info.pgid << "_" << get_role()
7403 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7404 hobject_t hoid = target.make_temp_hobject(ss.str());
7405 dout(20) << __func__ << " " << hoid << dendl;
7406 return hoid;
7407}
7408
7409hobject_t PrimaryLogPG::get_temp_recovery_object(
7410 const hobject_t& target,
7411 eversion_t version)
7412{
7413 ostringstream ss;
7414 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7415 << "_" << version
7416 << "_" << info.history.same_interval_since
7417 << "_" << target.snap;
7418 // pgid + version + interval + snapid is unique, and short
7419 hobject_t hoid = target.make_temp_hobject(ss.str());
7420 dout(20) << __func__ << " " << hoid << dendl;
7421 return hoid;
7422}
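// Hypothetical examples of the two naming schemes above (pgid, role,
// global id, seq, version, interval and snap are all made up):
//   generate_temp_object     -> "temp_1.2_0_4123_17"
//   get_temp_recovery_object -> "temp_recovering_1.2_184'56_91_head"
// Both land in the temp namespace, so clients can never address them.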
7423
7424int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7425{
7426 assert(!ctx->ops->empty());
7427
7428 const hobject_t& soid = ctx->obs->oi.soid;
7429
7430 // valid snap context?
7431 if (!ctx->snapc.is_valid()) {
7432 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7433 return -EINVAL;
7434 }
7435
7436 // prepare the actual mutation
7437 int result = do_osd_ops(ctx, *ctx->ops);
7438 if (result < 0) {
7439 if (ctx->op->may_write() &&
7440 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7441 // need to save the error code in the pg log, to detect dup ops,
7442 // but do nothing else
7443 ctx->update_log_only = true;
7444 }
7445 return result;
7446 }
7447
7448 // read-op? write-op noop? done?
7449 if (ctx->op_t->empty() && !ctx->modify) {
7450 unstable_stats.add(ctx->delta_stats);
7451 if (ctx->op->may_write() &&
7452 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7453 ctx->update_log_only = true;
7454 }
7455 return result;
7456 }
7457
7458 // check for full
7459 if ((ctx->delta_stats.num_bytes > 0 ||
7460 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7461 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7462 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7463 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7464 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7465 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7466 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7467 << dendl;
7468 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7469 // they tried, they failed.
7470 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7471 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7472 } else {
7473 // drop request
7474 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7475 return -EAGAIN;
7476 }
7477 }
7478
7479 // clone, if necessary
7480 if (soid.snap == CEPH_NOSNAP)
7481 make_writeable(ctx);
7482
7483 finish_ctx(ctx,
7484 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7485 pg_log_entry_t::DELETE);
7486
7487 return result;
7488}
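// The full-pool branch above boils down to this decision table (a
// paraphrase of the code, not new behavior):
//   FULL_FORCE flag, or MDS client -> proceed with the write anyway
//   FULL_TRY flag                  -> fail fast: -EDQUOT (pool quota)
//                                     or -ENOSPC (cluster full)
//   otherwise                      -> -EAGAIN: drop the op; a correct
//                                     client resends when full clears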
7489
7490void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7491{
7492 const hobject_t& soid = ctx->obs->oi.soid;
7493 dout(20) << __func__ << " " << soid << " " << ctx
7494 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7495 << dendl;
7496 utime_t now = ceph_clock_now();
7497
7498 // snapset
7499 bufferlist bss;
7500
7501 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7502 ::encode(ctx->new_snapset, bss);
7503 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7504 !ctx->new_snapset.is_legacy());
7505
7506 if (ctx->new_obs.exists) {
7507 if (!ctx->obs->exists) {
7508 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7509 hobject_t snapoid = soid.get_snapdir();
7510 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7511 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7512 ctx->at_version,
7513 ctx->snapset_obc->obs.oi.version,
7514 0, osd_reqid_t(), ctx->mtime, 0));
7515 ctx->op_t->remove(snapoid);
7516
7517 ctx->at_version.version++;
7518
7519 ctx->snapset_obc->obs.exists = false;
7520 }
7521 }
7522 } else if (!ctx->new_snapset.clones.empty() &&
7523 !ctx->cache_evict &&
7524 !ctx->new_snapset.head_exists &&
7525 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7526 // save snapset on _snap
7527 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7528 info.pgid.pool(), soid.get_namespace());
7529 dout(10) << " final snapset " << ctx->new_snapset
7530 << " in " << snapoid << dendl;
7531 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7532 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7533 ctx->at_version,
7534 eversion_t(),
7535 0, osd_reqid_t(), ctx->mtime, 0));
7536
7537 if (!ctx->snapset_obc)
7538 ctx->snapset_obc = get_object_context(snapoid, true);
7539 bool got = false;
7540 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7541 got = ctx->lock_manager.get_write_greedy(
7542 snapoid,
7543 ctx->snapset_obc,
7544 ctx->op);
7545 } else {
7546 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7547 got = ctx->lock_manager.get_lock_type(
7548 ObjectContext::RWState::RWEXCL,
7549 snapoid,
7550 ctx->snapset_obc,
7551 ctx->op);
7552 }
7553 assert(got);
7554 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7555 ctx->snapset_obc->obs.exists = true;
7556 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7557 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7558 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7559 ctx->snapset_obc->obs.oi.local_mtime = now;
7560
7561 map<string, bufferlist> attrs;
7562 bufferlist bv(sizeof(ctx->new_obs.oi));
7563 ::encode(ctx->snapset_obc->obs.oi, bv,
7564 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7565 ctx->op_t->create(snapoid);
7566 attrs[OI_ATTR].claim(bv);
7567 attrs[SS_ATTR].claim(bss);
7568 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7569 ctx->at_version.version++;
7570 }
7571 }
7572
7573 // finish and log the op.
7574 if (ctx->user_modify) {
7575 // update the user_version for any modify ops, except for the watch op
7576 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7577 /* In order for new clients and old clients to interoperate properly
7578 * when exchanging versions, we need to lower bound the user_version
7579 * (which our new clients pay proper attention to)
7580 * by the at_version (which is all the old clients can ever see). */
7581 if (ctx->at_version.version > ctx->user_at_version)
7582 ctx->user_at_version = ctx->at_version.version;
7583 ctx->new_obs.oi.user_version = ctx->user_at_version;
7584 }
7585 ctx->bytes_written = ctx->op_t->get_bytes_written();
7586
7587 if (ctx->new_obs.exists) {
7588 // on the head object
7589 ctx->new_obs.oi.version = ctx->at_version;
7590 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7591 ctx->new_obs.oi.last_reqid = ctx->reqid;
7592 if (ctx->mtime != utime_t()) {
7593 ctx->new_obs.oi.mtime = ctx->mtime;
7594 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7595 ctx->new_obs.oi.local_mtime = now;
7596 } else {
7597 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7598 }
7599
7600 map <string, bufferlist> attrs;
7601 bufferlist bv(sizeof(ctx->new_obs.oi));
7602 ::encode(ctx->new_obs.oi, bv,
7603 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7604 attrs[OI_ATTR].claim(bv);
7605
7606 if (soid.snap == CEPH_NOSNAP) {
7607 dout(10) << " final snapset " << ctx->new_snapset
7608 << " in " << soid << dendl;
7609 attrs[SS_ATTR].claim(bss);
7610 } else {
7611 dout(10) << " no snapset (this is a clone)" << dendl;
7612 }
7613 ctx->op_t->setattrs(soid, attrs);
7614 } else {
7615 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7616 }
7617
7618 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7619 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7620
7621 // append to log
7622 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7623 ctx->obs->oi.version,
7624 ctx->user_at_version, ctx->reqid,
7625 ctx->mtime, 0));
7626 if (soid.snap < CEPH_NOSNAP) {
7627 switch (log_op_type) {
7628 case pg_log_entry_t::MODIFY:
7629 case pg_log_entry_t::PROMOTE:
7630 case pg_log_entry_t::CLEAN:
7631 if (legacy_snapset) {
7632 dout(20) << __func__ << " encoding legacy_snaps "
7633 << ctx->new_obs.oi.legacy_snaps
7634 << dendl;
7635 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7636 } else {
7637 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7638 << dendl;
7639 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7640 }
7641 break;
7642 default:
7643 break;
7644 }
7645 }
7646
7647 if (!ctx->extra_reqids.empty()) {
7648 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7649 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7650 }
7651
7652 // apply new object state.
7653 ctx->obc->obs = ctx->new_obs;
7654
7655 if (soid.is_head() && !ctx->obc->obs.exists &&
7656 (!maintain_ssc || ctx->cache_evict)) {
7657 ctx->obc->ssc->exists = false;
7658 ctx->obc->ssc->snapset = SnapSet();
7659 } else {
7660 ctx->obc->ssc->exists = true;
7661 ctx->obc->ssc->snapset = ctx->new_snapset;
7662 }
7663}
7664
7665void PrimaryLogPG::apply_stats(
7666 const hobject_t &soid,
7667 const object_stat_sum_t &delta_stats) {
7668
7669 info.stats.stats.add(delta_stats);
7670
7671 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7672 i != backfill_targets.end();
7673 ++i) {
7674 pg_shard_t bt = *i;
7675 pg_info_t& pinfo = peer_info[bt];
7676 if (soid <= pinfo.last_backfill)
7677 pinfo.stats.stats.add(delta_stats);
7678 else if (soid <= last_backfill_started)
7679 pending_backfill_updates[soid].stats.add(delta_stats);
7680 }
7681
7682 if (is_primary() && scrubber.active) {
7683 if (soid < scrubber.start) {
7684 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7685 << "," << scrubber.end << ")" << dendl;
7686 scrub_cstat.add(delta_stats);
7687 } else {
7688 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7689 << "," << scrubber.end << ")" << dendl;
7690 }
7691 }
7692}
7693
7694void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7695{
7696 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7697 assert(ctx->async_reads_complete());
7698
7699 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7700 p != ctx->ops->end() && result >= 0; ++p) {
7701 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7702 result = p->rval;
7703 break;
7704 }
7705 ctx->bytes_read += p->outdata.length();
7706 }
7707 ctx->reply->claim_op_out_data(*ctx->ops);
7708 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7709
7710 MOSDOpReply *reply = ctx->reply;
7711 ctx->reply = nullptr;
7712
7713 if (result >= 0) {
7714 if (!ctx->ignore_log_op_stats) {
7715 log_op_stats(ctx);
7716 publish_stats_to_osd();
7717 }
7718
7719 // on read, return the current object version
7720 if (ctx->obs) {
7721 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7722 } else {
7723 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7724 }
7725 } else if (result == -ENOENT) {
7726 // on ENOENT, set a floor for what the next user version will be.
7727 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7728 }
7729
7730 reply->set_result(result);
7731 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7732 osd->send_message_osd_client(reply, m->get_connection());
7733 close_op_ctx(ctx);
7734}
7735
7736// ========================================================================
7737// copyfrom
7738
7739struct C_Copyfrom : public Context {
7740 PrimaryLogPGRef pg;
7741 hobject_t oid;
7742 epoch_t last_peering_reset;
7743 ceph_tid_t tid;
7744 PrimaryLogPG::CopyOpRef cop;
7745 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7746 const PrimaryLogPG::CopyOpRef& c)
7747 : pg(p), oid(o), last_peering_reset(lpr),
7748 tid(0), cop(c)
7749 {}
7750 void finish(int r) override {
7751 if (r == -ECANCELED)
7752 return;
7753 pg->lock();
7754 if (last_peering_reset == pg->get_last_peering_reset()) {
7755 pg->process_copy_chunk(oid, tid, r);
7756 }
7757 pg->unlock();
7758 }
7759};
7760
7761struct C_CopyFrom_AsyncReadCb : public Context {
7762 OSDOp *osd_op;
7763 object_copy_data_t reply_obj;
7764 uint64_t features;
7765 size_t len;
7766 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7767 osd_op(osd_op), features(features), len(0) {}
7768 void finish(int r) override {
7769 osd_op->rval = r;
7770 if (r < 0) {
7771 return;
7772 }
7773
7774 assert(len > 0);
7775 assert(len <= reply_obj.data.length());
7776 bufferlist bl;
7777 bl.substr_of(reply_obj.data, 0, len);
7778 reply_obj.data.swap(bl);
7779 ::encode(reply_obj, osd_op->outdata, features);
7780 }
7781};
7782
7783int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7784 OSDOp& osd_op, ObjectContextRef &obc)
7785{
7786 object_info_t& oi = obc->obs.oi;
7787 hobject_t& soid = oi.soid;
7788 int result = 0;
7789 object_copy_cursor_t cursor;
7790 uint64_t out_max;
7791 try {
7792 ::decode(cursor, bp);
7793 ::decode(out_max, bp);
7794 }
7795 catch (buffer::error& e) {
7796 result = -EINVAL;
7797 return result;
7798 }
7799
7800 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7801 uint64_t features = op->get_features();
7802
7803 bool async_read_started = false;
7804 object_copy_data_t _reply_obj;
7805 C_CopyFrom_AsyncReadCb *cb = NULL;
7806 if (pool.info.require_rollback()) {
7807 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7808 }
7809 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7810 // size, mtime
7811 reply_obj.size = oi.size;
7812 reply_obj.mtime = oi.mtime;
7813 assert(obc->ssc);
7814 if (soid.snap < CEPH_NOSNAP) {
7815 if (obc->ssc->snapset.is_legacy()) {
7816 reply_obj.snaps = oi.legacy_snaps;
7817 } else {
7818 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7819 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7820 reply_obj.snaps = p->second;
7821 }
7822 } else {
7823 reply_obj.snap_seq = obc->ssc->snapset.seq;
7824 }
7825 if (oi.is_data_digest()) {
7826 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7827 reply_obj.data_digest = oi.data_digest;
7828 }
7829 if (oi.is_omap_digest()) {
7830 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7831 reply_obj.omap_digest = oi.omap_digest;
7832 }
7833 reply_obj.truncate_seq = oi.truncate_seq;
7834 reply_obj.truncate_size = oi.truncate_size;
7835
7836 // attrs
7837 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7838 if (!cursor.attr_complete) {
7839 result = getattrs_maybe_cache(
7840 ctx->obc,
7841 &out_attrs,
7842 true);
7843 if (result < 0) {
7844 if (cb) {
7845 delete cb;
7846 }
7847 return result;
7848 }
7849 cursor.attr_complete = true;
7850 dout(20) << " got attrs" << dendl;
7851 }
7852
7853 int64_t left = out_max - osd_op.outdata.length();
7854
7855 // data
7856 bufferlist& bl = reply_obj.data;
7857 if (left > 0 && !cursor.data_complete) {
7858 if (cursor.data_offset < oi.size) {
7859 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7860 if (cb) {
7861 async_read_started = true;
7862 ctx->pending_async_reads.push_back(
7863 make_pair(
7864 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7865 make_pair(&bl, cb)));
7866 cb->len = max_read;
7867
7868 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7869 new ReadFinisher(osd_op));
7870 result = -EINPROGRESS;
7871
7872 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7873 } else {
7874 result = pgbackend->objects_read_sync(
7875 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7876 if (result < 0)
7877 return result;
7878 }
7879 left -= max_read;
7880 cursor.data_offset += max_read;
7881 }
7882 if (cursor.data_offset == oi.size) {
7883 cursor.data_complete = true;
7884 dout(20) << " got data" << dendl;
7885 }
7886 assert(cursor.data_offset <= oi.size);
7887 }
7888
7889 // omap
7890 uint32_t omap_keys = 0;
7891 if (!pool.info.supports_omap() || !oi.is_omap()) {
7892 cursor.omap_complete = true;
7893 } else {
7894 if (left > 0 && !cursor.omap_complete) {
7895 assert(cursor.data_complete);
7896 if (cursor.omap_offset.empty()) {
7897 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7898 &reply_obj.omap_header);
7899 }
7900 bufferlist omap_data;
7901 ObjectMap::ObjectMapIterator iter =
7902 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7903 assert(iter);
7904 iter->upper_bound(cursor.omap_offset);
7905 for (; iter->valid(); iter->next(false)) {
7906 ++omap_keys;
7907 ::encode(iter->key(), omap_data);
7908 ::encode(iter->value(), omap_data);
7909 left -= iter->key().length() + 4 + iter->value().length() + 4;
7910 if (left <= 0)
7911 break;
7912 }
7913 if (omap_keys) {
7914 ::encode(omap_keys, reply_obj.omap_data);
7915 reply_obj.omap_data.claim_append(omap_data);
7916 }
7917 if (iter->valid()) {
7918 cursor.omap_offset = iter->key();
7919 } else {
7920 cursor.omap_complete = true;
7921 dout(20) << " got omap" << dendl;
7922 }
7923 }
7924 }
7925
7926 if (cursor.is_complete()) {
7927 // include reqids only in the final step. this is a bit fragile
7928 // but it works...
7929 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7930 dout(20) << " got reqids" << dendl;
7931 }
7932
7933 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7934 << " " << out_attrs.size() << " attrs"
7935 << " " << bl.length() << " bytes"
7936 << " " << reply_obj.omap_header.length() << " omap header bytes"
7937 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7938 << omap_keys << " keys"
7939 << " " << reply_obj.reqids.size() << " reqids"
7940 << dendl;
7941 reply_obj.cursor = cursor;
7942 if (!async_read_started) {
7943 ::encode(reply_obj, osd_op.outdata, features);
7944 }
7945 if (cb && !async_read_started) {
7946 delete cb;
7947 }
7948
7949 if (result > 0) {
7950 result = 0;
7951 }
7952 return result;
7953}
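/* A sketch of how a consumer drives the cursor protocol implemented
 * above (hedged; see _copy_some below for the real call): each copy-get
 * resumes where the previous cursor stopped -- attrs, then data, then
 * omap -- until is_complete():
 *
 *   object_copy_cursor_t cursor;             // starts at is_initial()
 *   while (!cursor.is_complete()) {
 *     ObjectOperation op;
 *     op.copy_get(&cursor, chunk_size, ...); // rewrites 'cursor'
 *     objecter->read(oid, oloc, op, snapid, NULL, flags, onfinish);
 *     // wait for onfinish before issuing the next chunk
 *   }
 */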
7954
7955void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7956 OSDOp& osd_op)
7957{
7958 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7959 // be careful not to modify anything else that will upset a racing
7960 // operator<<
7961 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7962 uint64_t features = m->get_features();
7963 object_copy_data_t reply_obj;
7964
7965 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7966 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7967 ::encode(reply_obj, osd_op.outdata, features);
7968 osd_op.rval = -ENOENT;
7969 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7970 reply->claim_op_out_data(m->ops);
7971 reply->set_result(-ENOENT);
7972 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7973 osd->send_message_osd_client(reply, m->get_connection());
7974}
7975
7976void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7977 hobject_t src, object_locator_t oloc,
7978 version_t version, unsigned flags,
7979 bool mirror_snapset,
7980 unsigned src_obj_fadvise_flags,
7981 unsigned dest_obj_fadvise_flags)
7982{
7983 const hobject_t& dest = obc->obs.oi.soid;
7984 dout(10) << __func__ << " " << dest
7985 << " from " << src << " " << oloc << " v" << version
7986 << " flags " << flags
7987 << (mirror_snapset ? " mirror_snapset" : "")
7988 << dendl;
7989
7990 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7991 src.snap == CEPH_SNAPDIR));
7992
7993 // cancel a previous in-progress copy?
7994 if (copy_ops.count(dest)) {
7995 // FIXME: if the src etc match, we could avoid restarting from the
7996 // beginning.
7997 CopyOpRef cop = copy_ops[dest];
7998 cancel_copy(cop, false);
7999 }
8000
8001 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8002 mirror_snapset, src_obj_fadvise_flags,
8003 dest_obj_fadvise_flags));
8004 copy_ops[dest] = cop;
8005 obc->start_block();
8006
8007 _copy_some(obc, cop);
8008}
8009
8010void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8011{
8012 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8013
8014 unsigned flags = 0;
8015 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8016 flags |= CEPH_OSD_FLAG_FLUSH;
8017 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8018 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8019 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8020 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8021 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8022 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8023 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8024 flags |= CEPH_OSD_FLAG_RWORDERED;
8025
8026 C_GatherBuilder gather(cct);
8027
8028 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8029 // list snaps too.
8030 assert(cop->src.snap == CEPH_NOSNAP);
8031 ObjectOperation op;
8032 op.list_snaps(&cop->results.snapset, NULL);
8033 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8034 CEPH_SNAPDIR, NULL,
8035 flags, gather.new_sub(), NULL);
8036 cop->objecter_tid2 = tid;
8037 }
8038
8039 ObjectOperation op;
8040 if (cop->results.user_version) {
8041 op.assert_version(cop->results.user_version);
8042 } else {
8043 // we should learn the version after the first chunk, if we didn't know
8044 // it already!
8045 assert(cop->cursor.is_initial());
8046 }
8047 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8048 &cop->results.object_size, &cop->results.mtime,
8049 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8050 &cop->results.snaps, &cop->results.snap_seq,
8051 &cop->results.flags,
8052 &cop->results.source_data_digest,
8053 &cop->results.source_omap_digest,
8054 &cop->results.reqids,
8055 &cop->results.truncate_seq,
8056 &cop->results.truncate_size,
8057 &cop->rval);
8058 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8059
8060 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8061 get_last_peering_reset(), cop);
8062 gather.set_finisher(new C_OnFinisher(fin,
8063 &osd->objecter_finisher));
8064
8065 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8066 cop->src.snap, NULL,
8067 flags,
8068 gather.new_sub(),
8069 // discover the object version if we don't know it yet
8070 cop->results.user_version ? NULL : &cop->results.user_version);
8071 fin->tid = tid;
8072 cop->objecter_tid = tid;
8073 gather.activate();
8074}
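// Note on the C_GatherBuilder pattern used above: each new_sub()
// returns a sub-Context handed to an async op, and the finisher set via
// set_finisher() runs only once activate() has been called and every
// sub has completed -- here, after both the optional list-snaps read
// and the copy-get read have returned.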
8075
8076void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8077{
8078 dout(10) << __func__ << " " << oid << " tid " << tid
8079 << " " << cpp_strerror(r) << dendl;
8080 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8081 if (p == copy_ops.end()) {
8082 dout(10) << __func__ << " no copy_op found" << dendl;
8083 return;
8084 }
8085 CopyOpRef cop = p->second;
8086 if (tid != cop->objecter_tid) {
8087 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8088 << " tid " << cop->objecter_tid << dendl;
8089 return;
8090 }
8091
8092 if (cop->omap_data.length() || cop->omap_header.length())
8093 cop->results.has_omap = true;
8094
8095 if (r >= 0 && !pool.info.supports_omap() &&
8096 (cop->omap_data.length() || cop->omap_header.length())) {
8097 r = -EOPNOTSUPP;
8098 }
8099 cop->objecter_tid = 0;
8100 cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
8101 ObjectContextRef& cobc = cop->obc;
8102
8103 if (r < 0)
8104 goto out;
8105
8106 assert(cop->rval >= 0);
8107
8108 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8109 // verify snap hasn't been deleted
8110 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8111 while (p != cop->results.snaps.end()) {
8112 if (pool.info.is_removed_snap(*p)) {
8113 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8114 << dendl;
8115 for (vector<snapid_t>::iterator q = p + 1;
8116 q != cop->results.snaps.end();
8117 ++q)
8118 *(q - 1) = *q;
8119 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8120 } else {
8121 ++p;
8122 }
8123 }
8124 if (cop->results.snaps.empty()) {
8125 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8126 r = -ENOENT;
8127 goto out;
8128 }
8129 }
8130
8131 assert(cop->rval >= 0);
8132
8133 if (!cop->temp_cursor.data_complete) {
8134 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8135 }
8136 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8137 if (cop->omap_header.length()) {
8138 cop->results.omap_digest =
8139 cop->omap_header.crc32c(cop->results.omap_digest);
8140 }
8141 if (cop->omap_data.length()) {
8142 bufferlist keys;
8143 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8144 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8145 }
8146 }
8147
8148 if (!cop->temp_cursor.attr_complete) {
8149 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8150 p != cop->attrs.end();
8151 ++p) {
8152 cop->results.attrs[string("_") + p->first] = p->second;
8153 }
8154 cop->attrs.clear();
8155 }
8156
8157 if (!cop->cursor.is_complete()) {
8158 // write out what we have so far
8159 if (cop->temp_cursor.is_initial()) {
8160 assert(!cop->results.started_temp_obj);
8161 cop->results.started_temp_obj = true;
8162 cop->results.temp_oid = generate_temp_object(oid);
8163 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8164 }
8165 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8166 OpContextUPtr ctx = simple_opc_create(tempobc);
8167 if (cop->temp_cursor.is_initial()) {
8168 ctx->new_temp_oid = cop->results.temp_oid;
8169 }
8170 _write_copy_chunk(cop, ctx->op_t.get());
8171 simple_opc_submit(std::move(ctx));
8172 dout(10) << __func__ << " fetching more" << dendl;
8173 _copy_some(cobc, cop);
8174 return;
8175 }
8176
8177 // verify digests?
8178 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8179 dout(20) << __func__ << std::hex
8180 << " got digest: rx data 0x" << cop->results.data_digest
8181 << " omap 0x" << cop->results.omap_digest
8182 << ", source: data 0x" << cop->results.source_data_digest
8183 << " omap 0x" << cop->results.source_omap_digest
8184 << std::dec
8185 << " flags " << cop->results.flags
8186 << dendl;
8187 }
8188 if (cop->results.is_data_digest() &&
8189 cop->results.data_digest != cop->results.source_data_digest) {
8190 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8191 << " != source 0x" << cop->results.source_data_digest << std::dec
8192 << dendl;
8193 osd->clog->error() << info.pgid << " copy from " << cop->src
8194 << " to " << cop->obc->obs.oi.soid << std::hex
8195 << " data digest 0x" << cop->results.data_digest
8196 << " != source 0x" << cop->results.source_data_digest
8197 << std::dec;
8198 r = -EIO;
8199 goto out;
8200 }
8201 if (cop->results.is_omap_digest() &&
8202 cop->results.omap_digest != cop->results.source_omap_digest) {
8203 derr << __func__ << std::hex
8204 << " omap digest 0x" << cop->results.omap_digest
8205 << " != source 0x" << cop->results.source_omap_digest
8206 << std::dec << dendl;
8207 osd->clog->error() << info.pgid << " copy from " << cop->src
8208 << " to " << cop->obc->obs.oi.soid << std::hex
8209 << " omap digest 0x" << cop->results.omap_digest
8210 << " != source 0x" << cop->results.source_omap_digest
8211 << std::dec;
8212 r = -EIO;
8213 goto out;
8214 }
8215 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8216 derr << __func__ << " injecting copyfrom failure" << dendl;
8217 r = -EIO;
8218 goto out;
8219 }
8220
8221 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8222 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8223 ObjectState& obs = cop->obc->obs;
8224 if (cop->temp_cursor.is_initial()) {
8225 dout(20) << "fill_in_final_tx: writing "
8226 << "directly to final object" << dendl;
8227 // write directly to final object
8228 cop->results.temp_oid = obs.oi.soid;
8229 _write_copy_chunk(cop, t);
8230 } else {
8231 // finish writing to temp object, then move into place
8232 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8233 _write_copy_chunk(cop, t);
8234 t->rename(obs.oi.soid, cop->results.temp_oid);
8235 }
8236 t->setattrs(obs.oi.soid, cop->results.attrs);
8237 });
8238
8239 dout(20) << __func__ << " success; committing" << dendl;
8240
8241 out:
8242 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8243 CopyCallbackResults results(r, &cop->results);
8244 cop->cb->complete(results);
8245
8246 copy_ops.erase(cobc->obs.oi.soid);
8247 cobc->stop_block();
8248
8249 if (r < 0 && cop->results.started_temp_obj) {
8250 dout(10) << __func__ << " deleting partial temp object "
8251 << cop->results.temp_oid << dendl;
8252 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8253 OpContextUPtr ctx = simple_opc_create(tempobc);
8254 ctx->op_t->remove(cop->results.temp_oid);
8255 ctx->discard_temp_oid = cop->results.temp_oid;
8256 simple_opc_submit(std::move(ctx));
8257 }
8258
8259 // cancel and requeue proxy ops on this object
8260 if (!r) {
8261 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8262 it != proxyread_ops.end();) {
8263 if (it->second->soid == cobc->obs.oi.soid) {
8264 cancel_proxy_read((it++)->second);
8265 } else {
8266 ++it;
8267 }
8268 }
8269 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8270 it != proxywrite_ops.end();) {
8271 if (it->second->soid == cobc->obs.oi.soid) {
8272 cancel_proxy_write((it++)->second);
8273 } else {
8274 ++it;
8275 }
8276 }
8277 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8278 }
8279
8280 kick_object_context_blocked(cobc);
8281}
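// The incremental digests above rely on crc32c chaining: seeding each
// chunk's crc with the running digest equals one crc over the whole
// stream. Minimal illustration (object digests use seed -1):
//
//   bufferlist a, b, whole;
//   a.append("foo"); b.append("bar"); whole.append("foobar");
//   assert(b.crc32c(a.crc32c(-1)) == whole.crc32c(-1));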
8282
8283void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8284{
8285 dout(20) << __func__ << " " << cop
8286 << " " << cop->attrs.size() << " attrs"
8287 << " " << cop->data.length() << " bytes"
8288 << " " << cop->omap_header.length() << " omap header bytes"
8289 << " " << cop->omap_data.length() << " omap data bytes"
8290 << dendl;
8291 if (!cop->temp_cursor.attr_complete) {
8292 t->create(cop->results.temp_oid);
8293 }
8294 if (!cop->temp_cursor.data_complete) {
8295 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8296 cop->cursor.data_offset);
8297 if (pool.info.requires_aligned_append() &&
8298 !cop->cursor.data_complete) {
8299 /**
8300 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8301 * to pick it up on the next pass.
8302 */
8303 assert(cop->temp_cursor.data_offset %
8304 pool.info.required_alignment() == 0);
8305 if (cop->data.length() % pool.info.required_alignment() != 0) {
8306 uint64_t to_trim =
8307 cop->data.length() % pool.info.required_alignment();
8308 bufferlist bl;
8309 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8310 cop->data.swap(bl);
8311 cop->cursor.data_offset -= to_trim;
8312 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8313 cop->cursor.data_offset);
8314 }
8315 }
8316 if (cop->data.length()) {
8317 t->write(
8318 cop->results.temp_oid,
8319 cop->temp_cursor.data_offset,
8320 cop->data.length(),
8321 cop->data,
8322 cop->dest_obj_fadvise_flags);
8323 }
8324 cop->data.clear();
8325 }
8326 if (pool.info.supports_omap()) {
8327 if (!cop->temp_cursor.omap_complete) {
8328 if (cop->omap_header.length()) {
8329 t->omap_setheader(
8330 cop->results.temp_oid,
8331 cop->omap_header);
8332 cop->omap_header.clear();
8333 }
8334 if (cop->omap_data.length()) {
8335 map<string,bufferlist> omap;
8336 bufferlist::iterator p = cop->omap_data.begin();
8337 ::decode(omap, p);
8338 t->omap_setkeys(cop->results.temp_oid, omap);
8339 cop->omap_data.clear();
8340 }
8341 }
8342 } else {
8343 assert(cop->omap_header.length() == 0);
8344 assert(cop->omap_data.length() == 0);
8345 }
8346 cop->temp_cursor = cop->cursor;
8347}
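// Worked example of the alignment trim above (hypothetical numbers):
// with required_alignment() = 65536 and a 153600-byte chunk,
// to_trim = 153600 % 65536 = 22528, so we write the first 131072 bytes
// and rewind cursor.data_offset by 22528; the next copy-get re-fetches
// the trimmed tail.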
8348
8349 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8350 {
8351 OpContext *ctx = cb->ctx;
8352 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8353
8354 ObjectState& obs = ctx->new_obs;
8355 if (obs.exists) {
8356 dout(20) << __func__ << ": exists, removing" << dendl;
8357 ctx->op_t->remove(obs.oi.soid);
8358 } else {
8359 ctx->delta_stats.num_objects++;
8360 obs.exists = true;
8361 }
8362 if (cb->is_temp_obj_used()) {
8363 ctx->discard_temp_oid = cb->results->temp_oid;
8364 }
8365 cb->results->fill_in_final_tx(ctx->op_t.get());
8366
8367 // CopyFromCallback fills this in for us
8368 obs.oi.user_version = ctx->user_at_version;
8369
8370 obs.oi.set_data_digest(cb->results->data_digest);
8371 obs.oi.set_omap_digest(cb->results->omap_digest);
8372
8373 obs.oi.truncate_seq = cb->results->truncate_seq;
8374 obs.oi.truncate_size = cb->results->truncate_size;
8375
8376 ctx->extra_reqids = cb->results->reqids;
8377
8378 // cache: clear whiteout?
8379 if (obs.oi.is_whiteout()) {
8380 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8381 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8382 --ctx->delta_stats.num_whiteouts;
8383 }
8384
8385 if (cb->results->has_omap) {
8386 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8387 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8388 } else {
8389 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8390 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8391 }
8392
8393 interval_set<uint64_t> ch;
8394 if (obs.oi.size > 0)
8395 ch.insert(0, obs.oi.size);
8396 ctx->modified_ranges.union_of(ch);
8397
8398 if (cb->get_data_size() != obs.oi.size) {
8399 ctx->delta_stats.num_bytes -= obs.oi.size;
8400 obs.oi.size = cb->get_data_size();
8401 ctx->delta_stats.num_bytes += obs.oi.size;
8402 }
8403 ctx->delta_stats.num_wr++;
8404 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8405
8406 osd->logger->inc(l_osd_copyfrom);
8407}
8408
8409void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8410 ObjectContextRef obc)
8411{
8412 const hobject_t& soid = obc->obs.oi.soid;
8413 dout(10) << __func__ << " " << soid << " r=" << r
8414 << " uv" << results->user_version << dendl;
8415
8416 if (r == -ECANCELED) {
8417 return;
8418 }
8419
8420 if (r != -ENOENT && soid.is_snap()) {
8421 if (results->snaps.empty()) {
8422 // we must have read "snap" content from the head object in
8423 // the base pool. use snap_seq to construct what snaps should
8424 // be for this clone (what it was before we evicted the clean
8425 // clone from this pool, and what it will be when we flush and
8426 // the clone eventually happens in the base pool).
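// (Worked example with made-up ids: given snapset.snaps = [10, 8, 4, 3, 2]
// newest-first, soid.snap = 8 and results->snap_seq = 3, the loops below
// skip 10, then collect [8, 4] -- every snap in (3, 8] -- as this
// clone's reconstructed snaps.)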
8427 SnapSet& snapset = obc->ssc->snapset;
8428 vector<snapid_t>::iterator p = snapset.snaps.begin();
8429 while (p != snapset.snaps.end() && *p > soid.snap)
8430 ++p;
8431 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8432 results->snaps.push_back(*p);
8433 ++p;
8434 }
8435 }
8436
8437 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8438 filter_snapc(results->snaps);
8439
8440 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8441 if (results->snaps.empty()) {
8442 dout(20) << __func__
8443 << " snaps are empty, clone is invalid,"
8444 << " setting r to ENOENT" << dendl;
8445 r = -ENOENT;
8446 }
8447 }
8448
8449 if (r < 0 && results->started_temp_obj) {
8450 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8451 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8452 assert(tempobc);
8453 OpContextUPtr ctx = simple_opc_create(tempobc);
8454 ctx->op_t->remove(results->temp_oid);
8455 simple_opc_submit(std::move(ctx));
8456 results->started_temp_obj = false;
8457 }
8458
8459 if (r == -ENOENT && soid.is_snap()) {
8460 dout(10) << __func__
8461 << ": enoent while trying to promote clone, " << soid
8462 << " must have been trimmed, removing from snapset"
8463 << dendl;
8464 hobject_t head(soid.get_head());
8465 ObjectContextRef obc = get_object_context(head, false);
8466 assert(obc);
8467
8468 OpContextUPtr tctx = simple_opc_create(obc);
8469 tctx->at_version = get_next_version();
8470 filter_snapc(tctx->new_snapset.snaps);
8471 vector<snapid_t> new_clones;
8472 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8473 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8474 i != tctx->new_snapset.clones.end();
8475 ++i) {
8476 if (*i != soid.snap) {
8477 new_clones.push_back(*i);
8478 auto p = tctx->new_snapset.clone_snaps.find(*i);
8479 if (p != tctx->new_snapset.clone_snaps.end()) {
8480 new_clone_snaps[*i] = p->second;
8481 }
8482 }
8483 }
8484 tctx->new_snapset.clones.swap(new_clones);
8485 tctx->new_snapset.clone_overlap.erase(soid.snap);
8486 tctx->new_snapset.clone_size.erase(soid.snap);
8487 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8488
8489 // take RWWRITE lock for duration of our local write. ignore starvation.
8490 if (!tctx->lock_manager.take_write_lock(
8491 head,
8492 obc)) {
8493 assert(0 == "problem!");
8494 }
8495 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8496
8497 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8498
8499 simple_opc_submit(std::move(tctx));
8500 return;
8501 }
8502
8503 bool whiteout = false;
8504 if (r == -ENOENT) {
8505 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8506 dout(10) << __func__ << " whiteout " << soid << dendl;
8507 whiteout = true;
8508 }
8509
8510 if (r < 0 && !whiteout) {
8511 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8512 // pass error to everyone blocked on this object
8513 // FIXME: this is pretty sloppy, but at this point we got
8514 // something unexpected and don't have many other options.
8515 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8516 waiting_for_blocked_object.find(soid);
8517 if (blocked_iter != waiting_for_blocked_object.end()) {
8518 while (!blocked_iter->second.empty()) {
8519 osd->reply_op_error(blocked_iter->second.front(), r);
8520 blocked_iter->second.pop_front();
8521 }
8522 waiting_for_blocked_object.erase(blocked_iter);
8523 }
8524 return;
8525 }
8526
8527 osd->promote_finish(results->object_size);
8528
8529 OpContextUPtr tctx = simple_opc_create(obc);
8530 tctx->at_version = get_next_version();
8531
8532 ++tctx->delta_stats.num_objects;
8533 if (soid.snap < CEPH_NOSNAP)
8534 ++tctx->delta_stats.num_object_clones;
8535 tctx->new_obs.exists = true;
8536
8537 tctx->extra_reqids = results->reqids;
8538
8539 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8540 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8541
8542 if (whiteout) {
8543 // create a whiteout
8544 tctx->op_t->create(soid);
8545 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8546 ++tctx->delta_stats.num_whiteouts;
8547 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8548 osd->logger->inc(l_osd_tier_whiteout);
8549 } else {
8550 if (results->has_omap) {
8551 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8552 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8553 ++tctx->delta_stats.num_objects_omap;
8554 }
8555
8556 results->fill_in_final_tx(tctx->op_t.get());
8557 if (results->started_temp_obj) {
8558 tctx->discard_temp_oid = results->temp_oid;
8559 }
8560 tctx->new_obs.oi.size = results->object_size;
8561 tctx->new_obs.oi.user_version = results->user_version;
8562 // the source object may lack data or omap digests; only set ours when present
8563 if (results->object_size)
8564 tctx->new_obs.oi.set_data_digest(results->data_digest);
8565 if (results->has_omap)
8566 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8567 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8568 tctx->new_obs.oi.truncate_size = results->truncate_size;
8569
8570 if (soid.snap != CEPH_NOSNAP) {
8571 if (legacy_snapset) {
8572 tctx->new_obs.oi.legacy_snaps = results->snaps;
8573 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8574 } else {
8575 // it's already in the snapset
8576 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8577 }
8578 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8579 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8580 results->object_size);
8581 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8582
8583 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8584 } else {
8585 tctx->delta_stats.num_bytes += results->object_size;
8586 }
8587 }
8588
8589 if (results->mirror_snapset) {
8590 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8591 tctx->new_snapset.from_snap_set(
8592 results->snapset,
8593 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8594 }
8595 tctx->new_snapset.head_exists = true;
8596 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8597
8598 // take RWWRITE lock for duration of our local write. ignore starvation.
8599 if (!tctx->lock_manager.take_write_lock(
8600 obc->obs.oi.soid,
8601 obc)) {
8602 assert(0 == "problem!");
8603 }
8604 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8605
8606 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8607
8608 simple_opc_submit(std::move(tctx));
8609
8610 osd->logger->inc(l_osd_tier_promote);
8611
8612 if (agent_state &&
8613 agent_state->is_idle())
8614 agent_choose_mode();
8615}
8616
8617void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8618{
8619 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8620 << " from " << cop->src << " " << cop->oloc
8621 << " v" << cop->results.user_version << dendl;
8622
8623 // cancel objecter op, if we can
8624 if (cop->objecter_tid) {
8625 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8626 cop->objecter_tid = 0;
8627 if (cop->objecter_tid2) {
8628 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8629 cop->objecter_tid2 = 0;
8630 }
8631 }
8632
8633 copy_ops.erase(cop->obc->obs.oi.soid);
8634 cop->obc->stop_block();
8635
8636 kick_object_context_blocked(cop->obc);
8637 cop->results.should_requeue = requeue;
8638 CopyCallbackResults result(-ECANCELED, &cop->results);
8639 cop->cb->complete(result);
8640
8641 // There may still be an objecter callback referencing this copy op.
8642 // That callback will not need the obc since it's been canceled, and
8643 // we need the obc reference to go away prior to flush.
8644 cop->obc = ObjectContextRef();
8645}
8646
8647void PrimaryLogPG::cancel_copy_ops(bool requeue)
8648{
8649 dout(10) << __func__ << dendl;
8650 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8651 while (p != copy_ops.end()) {
8652 // requeue this op? can I queue up all of them?
8653 cancel_copy((p++)->second, requeue);
8654 }
8655}
8656
8657
8658// ========================================================================
8659// flush
8660//
8661// Flush a dirty object in the cache tier by writing it back to the
8662// base tier. The sequence looks like:
8663//
8664// * send a copy-from operation to the base tier to copy the current
8665// version of the object
8666// * base tier will pull the object via (perhaps multiple) copy-get(s)
8667// * on completion, we check if the object has been modified. if so,
8668// just reply with -EAGAIN.
8669// * try to take a write lock so we can clear the dirty flag. if this
8670// fails, wait and retry
8671// * start a repop that clears the bit.
8672//
8673// If we have to wait, we will retry by coming back through the
8674// start_flush method. We check if a flush is already in progress
8675// and, if so, try to finish it by rechecking the version and trying
8676// to clear the dirty bit.
8677//
8678// In order for the cache-flush (a write op) to not block the copy-get
8679// from reading the object, the client *must* set the SKIPRWLOCKS
8680// flag.
8681//
8682// NOTE: normally writes are strictly ordered for the client, but
8683// flushes are special in that they can be reordered with respect to
8684// other writes. In particular, we can't have a flush request block
8685// an update to the cache pool object!
8686
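// A hedged sketch of the client side (API from memory -- the rados
// CLI's cache-flush/cache-try-flush subcommands do roughly this;
// verify flags against librados.hpp):
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();        // non-blocking; op.cache_flush() blocks
//   librados::AioCompletion *c = librados::Rados::aio_create_completion();
//   ioctx.aio_operate("obj", c, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_SKIPRWLOCKS,  // see NOTE above
//                     nullptr);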
8687struct C_Flush : public Context {
8688 PrimaryLogPGRef pg;
8689 hobject_t oid;
8690 epoch_t last_peering_reset;
8691 ceph_tid_t tid;
8692 utime_t start;
8693 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8694 : pg(p), oid(o), last_peering_reset(lpr),
8695 tid(0), start(ceph_clock_now())
8696 {}
8697 void finish(int r) override {
8698 if (r == -ECANCELED)
8699 return;
8700 pg->lock();
8701 if (last_peering_reset == pg->get_last_peering_reset()) {
8702 pg->finish_flush(oid, tid, r);
8703 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8704 }
8705 pg->unlock();
8706 }
8707};
8708
8709int PrimaryLogPG::start_flush(
8710 OpRequestRef op, ObjectContextRef obc,
8711 bool blocking, hobject_t *pmissing,
8712 boost::optional<std::function<void()>> &&on_flush)
8713{
8714 const object_info_t& oi = obc->obs.oi;
8715 const hobject_t& soid = oi.soid;
8716 dout(10) << __func__ << " " << soid
8717 << " v" << oi.version
8718 << " uv" << oi.user_version
8719 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8720 << dendl;
8721
8722 // get a filtered snapset: drop any snaps that have since been removed
8723 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8724
8725 // check for (older) dirty clones -- they must be flushed first
8726 {
8727 dout(20) << " snapset " << snapset << dendl;
8728 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8729 while (p != snapset.clones.rend() && *p >= soid.snap)
8730 ++p;
8731 if (p != snapset.clones.rend()) {
8732 hobject_t next = soid;
8733 next.snap = *p;
8734 assert(next.snap < soid.snap);
8735 if (pg_log.get_missing().is_missing(next)) {
8736 dout(10) << __func__ << " missing clone is " << next << dendl;
8737 if (pmissing)
8738 *pmissing = next;
8739 return -ENOENT;
8740 }
8741 ObjectContextRef older_obc = get_object_context(next, false);
8742 if (older_obc) {
8743 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8744 << dendl;
8745 if (older_obc->obs.oi.is_dirty()) {
8746 dout(10) << __func__ << " next oldest clone is dirty: "
8747 << older_obc->obs.oi << dendl;
8748 return -EBUSY;
8749 }
8750 } else {
8751 dout(20) << __func__ << " next oldest clone " << next
8752 << " is not present; implicitly clean" << dendl;
8753 }
8754 } else {
8755 dout(20) << __func__ << " no older clones" << dendl;
8756 }
8757 }
8758
8759 if (blocking)
8760 obc->start_block();
8761
8762 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8763 if (p != flush_ops.end()) {
8764 FlushOpRef fop = p->second;
8765 if (fop->op == op) {
8766 // we couldn't take the write lock on a cache-try-flush before;
8767 // now we are trying again for the lock.
8768 return try_flush_mark_clean(fop);
8769 }
8770 if (fop->flushed_version == obc->obs.oi.user_version &&
8771 (fop->blocking || !blocking)) {
8772 // nonblocking can join anything
8773 // blocking can only join a blocking flush
8774 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8775 if (op)
8776 fop->dup_ops.push_back(op);
8777 return -EAGAIN; // clean up this ctx; op will retry later
8778 }
8779
8780 // cancel current flush since it will fail anyway, or because we
8781 // are blocking and the existing flush is nonblocking.
8782 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8783 if (fop->op)
8784 osd->reply_op_error(fop->op, -EBUSY);
8785 while (!fop->dup_ops.empty()) {
8786 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8787 fop->dup_ops.pop_front();
8788 }
8789 cancel_flush(fop, false);
8790 }
8791
8792 /**
8793 * In general, we need to send a delete and a copyfrom.
8794 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8795 * where 4 is marked as clean. To flush 10, we have to:
8796 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8797 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8798 *
8799 * There is a complicating case. Suppose there had been a clone 7
8800 * for snaps [7, 6] that has since been trimmed because those snaps no longer exist.
8801 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8802 * the delete, the snap will be promoted to 5, and the head will become
8803 * a snapdir. When the copy-from goes through, we'll end up with
8804 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8805 *
8806 * Another complication is the case where there is an interval change
8807 * after doing the delete and the flush but before marking the object
8808 * clean. We'll happily delete head and then recreate it at the same
8809 * sequence number, which works out ok.
8810 */
8811
8812 SnapContext snapc, dsnapc;
8813 if (snapset.seq != 0) {
8814 if (soid.snap == CEPH_NOSNAP) {
8815 snapc.seq = snapset.seq;
8816 snapc.snaps = snapset.snaps;
8817 } else {
8818 snapid_t min_included_snap;
8819 if (snapset.is_legacy()) {
8820 min_included_snap = oi.legacy_snaps.back();
8821 } else {
8822 auto p = snapset.clone_snaps.find(soid.snap);
8823 assert(p != snapset.clone_snaps.end());
8824 min_included_snap = p->second.back();
8825 }
8826 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8827 }
8828
8829 snapid_t prev_snapc = 0;
8830 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8831 citer != snapset.clones.rend();
8832 ++citer) {
8833 if (*citer < soid.snap) {
8834 prev_snapc = *citer;
8835 break;
8836 }
8837 }
8838
8839 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8840 }
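// Plugging the comment's example into the code above: flushing clone 10
// of snapc 10:[10, 9, 8, 4, 3, 2] with clones [10(10, 9), 4(4, 3, 2)]
// gives min_included_snap = 9, so snapc = get_ssc_as_of(8) =
// 8:[8, 4, 3, 2]; the newest older clone is 4, so dsnapc =
// get_ssc_as_of(4) = 4:[4, 3, 2]. Since dsnapc.seq < snapc.seq, the
// delete below is sent with dsnapc and the copy-from with snapc --
// exactly the two steps described in the block comment.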
8841
8842 object_locator_t base_oloc(soid);
8843 base_oloc.pool = pool.info.tier_of;
8844
8845 if (dsnapc.seq < snapc.seq) {
8846 ObjectOperation o;
8847 o.remove();
8848 osd->objecter->mutate(
8849 soid.oid,
8850 base_oloc,
8851 o,
8852 dsnapc,
8853 ceph::real_clock::from_ceph_timespec(oi.mtime),
8854 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8855 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8856 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8857 }
8858
8859 FlushOpRef fop(std::make_shared<FlushOp>());
8860 fop->obc = obc;
8861 fop->flushed_version = oi.user_version;
8862 fop->blocking = blocking;
8863 fop->on_flush = std::move(on_flush);
8864 fop->op = op;
8865
8866 ObjectOperation o;
8867 if (oi.is_whiteout()) {
8868 fop->removal = true;
8869 o.remove();
8870 } else {
8871 object_locator_t oloc(soid);
8872 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8873 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8874 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8875 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8876 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8877 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8878
8879 // hint that the base tier need not cache the data after this flush
8880 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8881 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8882 }
8883 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8884
8885 ceph_tid_t tid = osd->objecter->mutate(
8886 soid.oid, base_oloc, o, snapc,
8887 ceph::real_clock::from_ceph_timespec(oi.mtime),
8888 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8889 new C_OnFinisher(fin,
8890 &osd->objecter_finisher));
8891 /* we're under the pg lock and fin->finish() is grabbing that */
8892 fin->tid = tid;
8893 fop->objecter_tid = tid;
8894
8895 flush_ops[soid] = fop;
8896 info.stats.stats.sum.num_flush++;
8897 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8898 return -EINPROGRESS;
8899}
8900
8901void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8902{
8903 dout(10) << __func__ << " " << oid << " tid " << tid
8904 << " " << cpp_strerror(r) << dendl;
8905 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8906 if (p == flush_ops.end()) {
8907 dout(10) << __func__ << " no flush_op found" << dendl;
8908 return;
8909 }
8910 FlushOpRef fop = p->second;
8911 if (tid != fop->objecter_tid) {
8912 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8913 << " tid " << fop->objecter_tid << dendl;
8914 return;
8915 }
8916 ObjectContextRef obc = fop->obc;
8917 fop->objecter_tid = 0;
8918
8919 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8920 if (fop->op)
8921 osd->reply_op_error(fop->op, -EBUSY);
8922 if (fop->blocking) {
8923 obc->stop_block();
8924 kick_object_context_blocked(obc);
8925 }
8926
8927 if (!fop->dup_ops.empty()) {
8928 dout(20) << __func__ << " requeueing dups" << dendl;
8929 requeue_ops(fop->dup_ops);
8930 }
8931 if (fop->on_flush) {
8932 (*(fop->on_flush))();
8933 fop->on_flush = boost::none;
8934 }
8935 flush_ops.erase(oid);
8936 return;
8937 }
8938
8939 r = try_flush_mark_clean(fop);
8940 if (r == -EBUSY && fop->op) {
8941 osd->reply_op_error(fop->op, r);
8942 }
8943}
8944
8945int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8946{
8947 ObjectContextRef obc = fop->obc;
8948 const hobject_t& oid = obc->obs.oi.soid;
8949
8950 if (fop->blocking) {
8951 obc->stop_block();
8952 kick_object_context_blocked(obc);
8953 }
8954
8955 if (fop->flushed_version != obc->obs.oi.user_version ||
8956 !obc->obs.exists) {
8957 if (obc->obs.exists)
8958 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8959 << " != current " << obc->obs.oi.user_version
8960 << dendl;
8961 else
8962 dout(10) << __func__ << " object no longer exists" << dendl;
8963
8964 if (!fop->dup_ops.empty()) {
8965 dout(20) << __func__ << " requeueing dups" << dendl;
8966 requeue_ops(fop->dup_ops);
8967 }
8968 if (fop->on_flush) {
8969 (*(fop->on_flush))();
8970 fop->on_flush = boost::none;
8971 }
8972 flush_ops.erase(oid);
8973 if (fop->blocking)
8974 osd->logger->inc(l_osd_tier_flush_fail);
8975 else
8976 osd->logger->inc(l_osd_tier_try_flush_fail);
8977 return -EBUSY;
8978 }
8979
8980 if (!fop->blocking &&
8981 scrubber.write_blocked_by_scrub(oid)) {
8982 if (fop->op) {
8983 dout(10) << __func__ << " blocked by scrub" << dendl;
8984 requeue_op(fop->op);
8985 requeue_ops(fop->dup_ops);
8986 return -EAGAIN; // will retry
8987 } else {
8988 osd->logger->inc(l_osd_tier_try_flush_fail);
8989 cancel_flush(fop, false);
8990 return -ECANCELED;
8991 }
8992 }
8993
8994 // successfully flushed, can we evict this object?
8995 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8996 agent_maybe_evict(obc, true)) {
8997 osd->logger->inc(l_osd_tier_clean);
8998 if (fop->on_flush) {
8999 (*(fop->on_flush))();
9000 fop->on_flush = boost::none;
9001 }
9002 flush_ops.erase(oid);
9003 return 0;
9004 }
9005
9006 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9007 OpContextUPtr ctx = simple_opc_create(fop->obc);
9008
9009 // successfully flushed; can we clear the dirty bit?
 9010	  // take the write lock through the lock_manager directly, since
 9011	  // this ctx did not come in through the normal op pipeline.
9012 if (ctx->lock_manager.get_lock_type(
9013 ObjectContext::RWState::RWWRITE,
9014 oid,
9015 obc,
9016 fop->op)) {
9017 dout(20) << __func__ << " took write lock" << dendl;
9018 } else if (fop->op) {
9019 dout(10) << __func__ << " waiting on write lock" << dendl;
9020 close_op_ctx(ctx.release());
9021 requeue_op(fop->op);
9022 requeue_ops(fop->dup_ops);
9023 return -EAGAIN; // will retry
9024 } else {
9025 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9026 close_op_ctx(ctx.release());
9027 osd->logger->inc(l_osd_tier_try_flush_fail);
9028 cancel_flush(fop, false);
9029 return -ECANCELED;
9030 }
9031
9032 if (fop->on_flush) {
9033 ctx->register_on_finish(*(fop->on_flush));
9034 fop->on_flush = boost::none;
9035 }
9036
9037 ctx->at_version = get_next_version();
9038
9039 ctx->new_obs = obc->obs;
9040 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9041 --ctx->delta_stats.num_objects_dirty;
9042
9043 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9044
9045 osd->logger->inc(l_osd_tier_clean);
9046
9047 if (!fop->dup_ops.empty() || fop->op) {
9048 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9049 list<OpRequestRef> ls;
9050 if (fop->op)
9051 ls.push_back(fop->op);
9052 ls.splice(ls.end(), fop->dup_ops);
9053 requeue_ops(ls);
9054 }
9055
9056 simple_opc_submit(std::move(ctx));
9057
9058 flush_ops.erase(oid);
9059
9060 if (fop->blocking)
9061 osd->logger->inc(l_osd_tier_flush);
9062 else
9063 osd->logger->inc(l_osd_tier_try_flush);
9064
9065 return -EINPROGRESS;
9066}
9067
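// Abort a FlushOp: cancel any in-flight objecter op, unblock the obc
// for blocking flushes, optionally requeue the waiting ops, and fire
// the on_flush callback before dropping the entry from flush_ops.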
9068void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9069{
9070 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9071 << fop->objecter_tid << dendl;
9072 if (fop->objecter_tid) {
9073 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9074 fop->objecter_tid = 0;
9075 }
9076 if (fop->blocking) {
9077 fop->obc->stop_block();
9078 kick_object_context_blocked(fop->obc);
9079 }
9080 if (requeue) {
9081 if (fop->op)
9082 requeue_op(fop->op);
9083 requeue_ops(fop->dup_ops);
9084 }
9085 if (fop->on_flush) {
9086 (*(fop->on_flush))();
9087 fop->on_flush = boost::none;
9088 }
9089 flush_ops.erase(fop->obc->obs.oi.soid);
9090}
9091
9092void PrimaryLogPG::cancel_flush_ops(bool requeue)
9093{
9094 dout(10) << __func__ << dendl;
9095 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9096 while (p != flush_ops.end()) {
9097 cancel_flush((p++)->second, requeue);
9098 }
9099}
9100
9101bool PrimaryLogPG::is_present_clone(hobject_t coid)
9102{
9103 if (!pool.info.allow_incomplete_clones())
9104 return true;
9105 if (is_missing_object(coid))
9106 return true;
9107 ObjectContextRef obc = get_object_context(coid, false);
9108 return obc && obc->obs.exists;
9109}
9110
9111// ========================================================================
9112// rep op gather
9113
9114class C_OSD_RepopApplied : public Context {
9115 PrimaryLogPGRef pg;
9116 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9117public:
9118 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9119 : pg(pg), repop(repop) {}
9120 void finish(int) override {
9121 pg->repop_all_applied(repop.get());
9122 }
9123};
9124
9125
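// Replicated writes complete in two stages: "applied" (visible for
// reads) and "committed" (durable). The two callbacks below flip the
// corresponding flag on the RepGather and re-run eval_repop(); when
// applies_with_commit is set, commit implies apply.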
9126void PrimaryLogPG::repop_all_applied(RepGather *repop)
9127{
9128 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9129 << dendl;
9130 assert(!repop->applies_with_commit);
9131 repop->all_applied = true;
9132 if (!repop->rep_aborted) {
9133 eval_repop(repop);
9134 }
9135}
9136
9137class C_OSD_RepopCommit : public Context {
9138 PrimaryLogPGRef pg;
9139 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9140public:
9141 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9142 : pg(pg), repop(repop) {}
9143 void finish(int) override {
9144 pg->repop_all_committed(repop.get());
9145 }
9146};
9147
9148void PrimaryLogPG::repop_all_committed(RepGather *repop)
9149{
9150 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9151 << dendl;
9152 repop->all_committed = true;
9153 if (repop->applies_with_commit) {
9154 assert(!repop->all_applied);
9155 repop->all_applied = true;
9156 }
9157
9158 if (!repop->rep_aborted) {
9159 if (repop->v != eversion_t()) {
9160 last_update_ondisk = repop->v;
9161 last_complete_ondisk = repop->pg_local_last_complete;
9162 }
9163 eval_repop(repop);
9164 }
9165}
9166
9167void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9168{
9169 dout(10) << "op_applied version " << applied_version << dendl;
9170 if (applied_version == eversion_t())
9171 return;
9172 assert(applied_version > last_update_applied);
9173 assert(applied_version <= info.last_update);
9174 last_update_applied = applied_version;
9175 if (is_primary()) {
9176 if (scrubber.active) {
 9177 if (last_update_applied >= scrubber.subset_last_update) {
9178 if (ops_blocked_by_scrub()) {
9179 requeue_scrub(true);
9180 } else {
9181 requeue_scrub(false);
9182 }
9183
9184 }
9185 } else {
9186 assert(scrubber.start == scrubber.end);
9187 }
9188 } else {
9189 if (scrubber.active_rep_scrub) {
 9190 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9191 scrubber.active_rep_scrub->get_req())->scrub_to) {
9192 osd->enqueue_back(
9193 info.pgid,
9194 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9195 scrubber.active_rep_scrub = OpRequestRef();
9196 }
9197 }
9198 }
9199}
9200
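// Re-evaluate a RepGather after an ack: on all-committed, run the
// on_committed callbacks and answer any dup ops parked in
// waiting_for_ondisk; on all-applied, run on_applied; once both are
// true, mark it done and retire finished repops from the front of
// repop_queue in order.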
9201void PrimaryLogPG::eval_repop(RepGather *repop)
9202{
9203 const MOSDOp *m = NULL;
9204 if (repop->op)
9205 m = static_cast<const MOSDOp *>(repop->op->get_req());
9206
9207 if (m)
9208 dout(10) << "eval_repop " << *repop
9209 << (repop->rep_done ? " DONE" : "")
9210 << dendl;
9211 else
9212 dout(10) << "eval_repop " << *repop << " (no op)"
9213 << (repop->rep_done ? " DONE" : "")
9214 << dendl;
9215
9216 if (repop->rep_done)
9217 return;
9218
9219 // ondisk?
9220 if (repop->all_committed) {
9221 dout(10) << " commit: " << *repop << dendl;
9222 for (auto p = repop->on_committed.begin();
9223 p != repop->on_committed.end();
9224 repop->on_committed.erase(p++)) {
9225 (*p)();
9226 }
9227 // send dup commits, in order
9228 if (waiting_for_ondisk.count(repop->v)) {
9229 assert(waiting_for_ondisk.begin()->first == repop->v);
9230 for (list<pair<OpRequestRef, version_t> >::iterator i =
9231 waiting_for_ondisk[repop->v].begin();
9232 i != waiting_for_ondisk[repop->v].end();
9233 ++i) {
9234 osd->reply_op_error(i->first, repop->r, repop->v,
9235 i->second);
9236 }
9237 waiting_for_ondisk.erase(repop->v);
9238 }
9239 }
9240
9241 // applied?
9242 if (repop->all_applied) {
9243 if (repop->applies_with_commit) {
9244 assert(repop->on_applied.empty());
9245 }
9246 dout(10) << " applied: " << *repop << " " << dendl;
9247 for (auto p = repop->on_applied.begin();
9248 p != repop->on_applied.end();
9249 repop->on_applied.erase(p++)) {
9250 (*p)();
9251 }
9252 }
9253
9254 // done.
9255 if (repop->all_applied && repop->all_committed) {
9256 repop->rep_done = true;
9257
9258 publish_stats_to_osd();
9259 calc_min_last_complete_ondisk();
9260
9261 dout(10) << " removing " << *repop << dendl;
9262 assert(!repop_queue.empty());
9263 dout(20) << " q front is " << *repop_queue.front() << dendl;
9264 if (repop_queue.front() != repop) {
9265 if (!repop->applies_with_commit) {
9266 dout(0) << " removing " << *repop << dendl;
9267 dout(0) << " q front is " << *repop_queue.front() << dendl;
9268 assert(repop_queue.front() == repop);
9269 }
9270 } else {
9271 RepGather *to_remove = nullptr;
9272 while (!repop_queue.empty() &&
9273 (to_remove = repop_queue.front())->rep_done) {
9274 repop_queue.pop_front();
9275 for (auto p = to_remove->on_success.begin();
9276 p != to_remove->on_success.end();
9277 to_remove->on_success.erase(p++)) {
9278 (*p)();
9279 }
9280 remove_repop(to_remove);
9281 }
9282 }
9283 }
9284}
9285
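// Ship a prepared OpContext to the backend. Before submitting we bump
// peer last_update/last_complete so peer_info stays current, take
// ondisk write locks on every obc the transaction touches (released by
// the onapplied_sync context), and fold the new log entries into
// projected_log.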
9286void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9287{
9288 FUNCTRACE();
9289 const hobject_t& soid = ctx->obs->oi.soid;
9290 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9291 << " o " << soid
9292 << dendl;
9293
9294 repop->v = ctx->at_version;
9295 if (ctx->at_version > eversion_t()) {
9296 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9297 i != actingbackfill.end();
9298 ++i) {
9299 if (*i == get_primary()) continue;
9300 pg_info_t &pinfo = peer_info[*i];
9301 // keep peer_info up to date
9302 if (pinfo.last_complete == pinfo.last_update)
9303 pinfo.last_complete = ctx->at_version;
9304 pinfo.last_update = ctx->at_version;
9305 }
9306 }
9307
9308 ctx->obc->ondisk_write_lock();
9309
9310 bool unlock_snapset_obc = false;
9311 ctx->op_t->add_obc(ctx->obc);
9312 if (ctx->clone_obc) {
9313 ctx->clone_obc->ondisk_write_lock();
9314 ctx->op_t->add_obc(ctx->clone_obc);
9315 }
9316 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9317 ctx->obc->obs.oi.soid) {
9318 ctx->snapset_obc->ondisk_write_lock();
9319 unlock_snapset_obc = true;
9320 ctx->op_t->add_obc(ctx->snapset_obc);
9321 }
9322
9323 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9324 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9325 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9326 ctx->obc,
9327 ctx->clone_obc,
9328 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9329 if (!(ctx->log.empty())) {
9330 assert(ctx->at_version >= projected_last_update);
9331 projected_last_update = ctx->at_version;
9332 }
9333 for (auto &&entry: ctx->log) {
9334 projected_log.add(entry);
9335 }
9336 pgbackend->submit_transaction(
9337 soid,
9338 ctx->delta_stats,
9339 ctx->at_version,
9340 std::move(ctx->op_t),
9341 pg_trim_to,
9342 min_last_complete_ondisk,
9343 ctx->log,
9344 ctx->updated_hset_history,
9345 onapplied_sync,
9346 on_all_applied,
9347 on_all_commit,
9348 repop->rep_tid,
9349 ctx->reqid,
9350 ctx->op);
9351}
9352
9353PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9354 OpContext *ctx, ObjectContextRef obc,
9355 ceph_tid_t rep_tid)
9356{
9357 if (ctx->op)
9358 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9359 else
9360 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9361
9362 RepGather *repop = new RepGather(
9363 ctx, rep_tid, info.last_complete, false);
9364
9365 repop->start = ceph_clock_now();
9366
9367 repop_queue.push_back(&repop->queue_item);
9368 repop->get();
9369
9370 osd->logger->inc(l_osd_op_wip);
9371
9372 dout(10) << __func__ << ": " << *repop << dendl;
9373 return repop;
9374}
9375
9376boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9377 eversion_t version,
9378 int r,
9379 ObcLockManager &&manager,
9380 OpRequestRef &&op,
9381 boost::optional<std::function<void(void)> > &&on_complete)
9382{
9383 RepGather *repop = new RepGather(
9384 std::move(manager),
9385 std::move(op),
9386 std::move(on_complete),
9387 osd->get_tid(),
9388 info.last_complete,
9389 true,
9390 r);
9391 repop->v = version;
9392
9393 repop->start = ceph_clock_now();
9394
9395 repop_queue.push_back(&repop->queue_item);
9396
9397 osd->logger->inc(l_osd_op_wip);
9398
9399 dout(10) << __func__ << ": " << *repop << dendl;
9400 return boost::intrusive_ptr<RepGather>(repop);
9401}
9402
9403void PrimaryLogPG::remove_repop(RepGather *repop)
9404{
9405 dout(20) << __func__ << " " << *repop << dendl;
9406
9407 for (auto p = repop->on_finish.begin();
9408 p != repop->on_finish.end();
9409 repop->on_finish.erase(p++)) {
9410 (*p)();
9411 }
9412
9413 release_object_locks(
9414 repop->lock_manager);
9415 repop->put();
9416
9417 osd->logger->dec(l_osd_op_wip);
9418}
9419
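// Helpers for ops the OSD generates internally (no client message).
// Illustrative usage, as seen in handle_watch_timeout() below:
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   // ... mutate ctx->new_obs, ctx->op_t, ctx->log ...
//   simple_opc_submit(std::move(ctx));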
9420PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9421{
9422 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9423 ceph_tid_t rep_tid = osd->get_tid();
9424 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
 9425 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9426 ctx->op_t.reset(new PGTransaction());
9427 ctx->mtime = ceph_clock_now();
9428 return ctx;
9429}
9430
9431void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9432{
9433 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9434 dout(20) << __func__ << " " << repop << dendl;
9435 issue_repop(repop, ctx.get());
9436 eval_repop(repop);
 9437 calc_trim_to();
9438 repop->put();
9439}
9440
9441
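// Persist log entries that have no accompanying object write (e.g. the
// LOST_* markers from mark_all_unfound_lost). On clusters requiring
// >= jewel this goes through MOSDPGUpdateLogMissing plus a RepGather
// that waits on every shard (ourselves included); older clusters get a
// plain MOSDPGLog broadcast and a local completion instead.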
9442void PrimaryLogPG::submit_log_entries(
 9443 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9444 ObcLockManager &&manager,
9445 boost::optional<std::function<void(void)> > &&_on_complete,
9446 OpRequestRef op,
9447 int r)
9448{
9449 dout(10) << __func__ << " " << entries << dendl;
9450 assert(is_primary());
9451
9452 eversion_t version;
9453 if (!entries.empty()) {
9454 assert(entries.rbegin()->version >= projected_last_update);
9455 version = projected_last_update = entries.rbegin()->version;
9456 }
9457
9458 boost::intrusive_ptr<RepGather> repop;
9459 boost::optional<std::function<void(void)> > on_complete;
 9460 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9461 repop = new_repop(
9462 version,
9463 r,
9464 std::move(manager),
9465 std::move(op),
9466 std::move(_on_complete));
9467 } else {
9468 on_complete = std::move(_on_complete);
9469 }
9470
9471 pgbackend->call_write_ordered(
9472 [this, entries, repop, on_complete]() {
9473 ObjectStore::Transaction t;
9474 eversion_t old_last_update = info.last_update;
9475 merge_new_log_entries(entries, t);
9476
9477
9478 set<pg_shard_t> waiting_on;
9479 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9480 i != actingbackfill.end();
9481 ++i) {
9482 pg_shard_t peer(*i);
9483 if (peer == pg_whoami) continue;
9484 assert(peer_missing.count(peer));
9485 assert(peer_info.count(peer));
 9486 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9487 assert(repop);
9488 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9489 entries,
9490 spg_t(info.pgid.pgid, i->shard),
9491 pg_whoami.shard,
9492 get_osdmap()->get_epoch(),
9493 last_peering_reset,
9494 repop->rep_tid);
9495 osd->send_message_osd_cluster(
9496 peer.osd, m, get_osdmap()->get_epoch());
9497 waiting_on.insert(peer);
9498 } else {
9499 MOSDPGLog *m = new MOSDPGLog(
9500 peer.shard, pg_whoami.shard,
9501 info.last_update.epoch,
9502 info);
9503 m->log.log = entries;
9504 m->log.tail = old_last_update;
9505 m->log.head = info.last_update;
9506 osd->send_message_osd_cluster(
9507 peer.osd, m, get_osdmap()->get_epoch());
9508 }
9509 }
 9510 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9511 ceph_tid_t rep_tid = repop->rep_tid;
9512 waiting_on.insert(pg_whoami);
9513 log_entry_update_waiting_on.insert(
9514 make_pair(
9515 rep_tid,
9516 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9517 ));
9518 struct OnComplete : public Context {
9519 PrimaryLogPGRef pg;
9520 ceph_tid_t rep_tid;
9521 epoch_t epoch;
9522 OnComplete(
9523 PrimaryLogPGRef pg,
9524 ceph_tid_t rep_tid,
9525 epoch_t epoch)
9526 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9527 void finish(int) override {
9528 pg->lock();
9529 if (!pg->pg_has_reset_since(epoch)) {
9530 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9531 assert(it != pg->log_entry_update_waiting_on.end());
9532 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9533 assert(it2 != it->second.waiting_on.end());
9534 it->second.waiting_on.erase(it2);
9535 if (it->second.waiting_on.empty()) {
9536 pg->repop_all_committed(it->second.repop.get());
9537 pg->log_entry_update_waiting_on.erase(it);
9538 }
9539 }
9540 pg->unlock();
9541 }
9542 };
9543 t.register_on_commit(
9544 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9545 } else {
9546 if (on_complete) {
9547 struct OnComplete : public Context {
9548 PrimaryLogPGRef pg;
9549 std::function<void(void)> on_complete;
9550 epoch_t epoch;
9551 OnComplete(
9552 PrimaryLogPGRef pg,
9553 const std::function<void(void)> &on_complete,
9554 epoch_t epoch)
9555 : pg(pg),
9556 on_complete(std::move(on_complete)),
9557 epoch(epoch) {}
9558 void finish(int) override {
9559 pg->lock();
9560 if (!pg->pg_has_reset_since(epoch))
9561 on_complete();
9562 pg->unlock();
9563 }
9564 };
9565 t.register_on_complete(
9566 new OnComplete{
9567 this, *on_complete, get_osdmap()->get_epoch()
9568 });
9569 }
9570 }
9571 t.register_on_applied(
9572 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9573 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9574 assert(r == 0);
9575 });
9576}
9577
9578void PrimaryLogPG::cancel_log_updates()
9579{
9580 // get rid of all the LogUpdateCtx so their references to repops are
9581 // dropped
9582 log_entry_update_waiting_on.clear();
9583}
9584
9585// -------------------------------------------------------
9586
9587void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9588{
9589 pair<hobject_t, ObjectContextRef> i;
9590 while (object_contexts.get_next(i.first, &i)) {
9591 ObjectContextRef obc(i.second);
9592 get_obc_watchers(obc, pg_watchers);
9593 }
9594}
9595
9596void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9597{
9598 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9599 obc->watchers.begin();
9600 j != obc->watchers.end();
9601 ++j) {
9602 obj_watch_item_t owi;
9603
9604 owi.obj = obc->obs.oi.soid;
9605 owi.wi.addr = j->second->get_peer_addr();
9606 owi.wi.name = j->second->get_entity();
9607 owi.wi.cookie = j->second->get_cookie();
9608 owi.wi.timeout_seconds = j->second->get_timeout();
9609
9610 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9611 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9612
9613 pg_watchers.push_back(owi);
9614 }
9615}
9616
9617void PrimaryLogPG::check_blacklisted_watchers()
9618{
9619 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9620 pair<hobject_t, ObjectContextRef> i;
9621 while (object_contexts.get_next(i.first, &i))
9622 check_blacklisted_obc_watchers(i.second);
9623}
9624
9625void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9626{
9627 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9628 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9629 obc->watchers.begin();
9630 k != obc->watchers.end();
9631 ) {
 9632 // Advance the iterator now so handle_watch_timeout() can erase the element
9633 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9634 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9635 entity_addr_t ea = j->second->get_peer_addr();
9636 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9637 if (get_osdmap()->is_blacklisted(ea)) {
9638 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9639 assert(j->second->get_pg() == this);
9640 j->second->unregister_cb();
9641 handle_watch_timeout(j->second);
9642 }
9643 }
9644}
9645
9646void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9647{
9648 assert(is_active());
9649 assert((recovering.count(obc->obs.oi.soid) ||
9650 !is_missing_object(obc->obs.oi.soid)) ||
9651 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9652 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9653 pg_log_entry_t::LOST_REVERT &&
9654 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9655 obc->obs.oi.version));
9656
9657 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9658 assert(obc->watchers.empty());
9659 // populate unconnected_watchers
9660 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9661 obc->obs.oi.watchers.begin();
9662 p != obc->obs.oi.watchers.end();
9663 ++p) {
9664 utime_t expire = info.stats.last_became_active;
9665 expire += p->second.timeout_seconds;
9666 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9667 WatchRef watch(
9668 Watch::makeWatchRef(
9669 this, osd, obc, p->second.timeout_seconds, p->first.first,
9670 p->first.second, p->second.addr));
9671 watch->disconnect();
9672 obc->watchers.insert(
9673 make_pair(
9674 make_pair(p->first.first, p->first.second),
9675 watch));
9676 }
9677 // Look for watchers from blacklisted clients and drop
9678 check_blacklisted_obc_watchers(obc);
9679}
9680
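// A watch has timed out: remove it from object_info_t::watchers and
// persist that through a simple op context. If the object is degraded,
// backfilling, or blocked by scrub, park the timeout as a delayed
// callback and retry later instead.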
9681void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9682{
9683 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9684 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9685
9686 if (!is_active()) {
9687 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9688 return;
9689 }
9690 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9691 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9692 watch->get_delayed_cb()
9693 );
9694 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9695 << obc->obs.oi.soid
9696 << dendl;
9697 return;
9698 }
9699
9700 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9701 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9702 << obc->obs.oi.soid
9703 << dendl;
9704 scrubber.add_callback(
 9705 watch->get_delayed_cb() // re-run this timeout once the scrub completes
9706 );
9707 return;
9708 }
9709
9710 OpContextUPtr ctx = simple_opc_create(obc);
9711 ctx->at_version = get_next_version();
9712
9713 object_info_t& oi = ctx->new_obs.oi;
9714 oi.watchers.erase(make_pair(watch->get_cookie(),
9715 watch->get_entity()));
9716
9717 list<watch_disconnect_t> watch_disconnects = {
9718 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9719 };
9720 ctx->register_on_success(
9721 [this, obc, watch_disconnects]() {
9722 complete_disconnect_watches(obc, watch_disconnects);
9723 });
9724
9725
9726 PGTransaction *t = ctx->op_t.get();
9727 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9728 ctx->at_version,
9729 oi.version,
9730 0,
9731 osd_reqid_t(), ctx->mtime, 0));
9732
9733 oi.prior_version = obc->obs.oi.version;
9734 oi.version = ctx->at_version;
9735 bufferlist bl;
9736 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9737 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9738
9739 // apply new object state.
9740 ctx->obc->obs = ctx->new_obs;
9741
9742 // no ctx->delta_stats
9743 simple_opc_submit(std::move(ctx));
9744}
9745
9746ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9747 SnapSetContext *ssc)
9748{
9749 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9750 assert(obc->destructor_callback == NULL);
9751 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9752 obc->obs.oi = oi;
9753 obc->obs.exists = false;
9754 obc->ssc = ssc;
9755 if (ssc)
9756 register_snapset_context(ssc);
9757 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9758 if (is_active())
9759 populate_obc_watchers(obc);
9760 return obc;
9761}
9762
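// Find or construct the ObjectContext for soid, consulting the obc
// cache first and then OI_ATTR (either from the supplied attrs map or
// from disk). Returns a null ref to signal -ENOENT unless can_create;
// for pools that require rollback (erasure-coded), the full attr set
// is cached on the obc as well.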
9763ObjectContextRef PrimaryLogPG::get_object_context(
9764 const hobject_t& soid,
9765 bool can_create,
9766 const map<string, bufferlist> *attrs)
9767{
9768 assert(
9769 attrs || !pg_log.get_missing().is_missing(soid) ||
9770 // or this is a revert... see recover_primary()
9771 (pg_log.get_log().objects.count(soid) &&
9772 pg_log.get_log().objects.find(soid)->second->op ==
9773 pg_log_entry_t::LOST_REVERT));
9774 ObjectContextRef obc = object_contexts.lookup(soid);
9775 osd->logger->inc(l_osd_object_ctx_cache_total);
9776 if (obc) {
9777 osd->logger->inc(l_osd_object_ctx_cache_hit);
9778 dout(10) << __func__ << ": found obc in cache: " << obc
9779 << dendl;
9780 } else {
9781 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9782 // check disk
9783 bufferlist bv;
9784 if (attrs) {
9785 assert(attrs->count(OI_ATTR));
9786 bv = attrs->find(OI_ATTR)->second;
9787 } else {
9788 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9789 if (r < 0) {
9790 if (!can_create) {
9791 dout(10) << __func__ << ": no obc for soid "
9792 << soid << " and !can_create"
9793 << dendl;
9794 return ObjectContextRef(); // -ENOENT!
9795 }
9796
9797 dout(10) << __func__ << ": no obc for soid "
9798 << soid << " but can_create"
9799 << dendl;
9800 // new object.
9801 object_info_t oi(soid);
9802 SnapSetContext *ssc = get_snapset_context(
9803 soid, true, 0, false);
 9804 assert(ssc);
9805 obc = create_object_context(oi, ssc);
9806 dout(10) << __func__ << ": " << obc << " " << soid
9807 << " " << obc->rwstate
9808 << " oi: " << obc->obs.oi
9809 << " ssc: " << obc->ssc
9810 << " snapset: " << obc->ssc->snapset << dendl;
9811 return obc;
9812 }
9813 }
9814
9815 object_info_t oi;
9816 try {
9817 bufferlist::iterator bliter = bv.begin();
9818 ::decode(oi, bliter);
9819 } catch (...) {
9820 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9821 return ObjectContextRef(); // -ENOENT!
9822 }
9823
9824 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9825
9826 obc = object_contexts.lookup_or_create(oi.soid);
9827 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9828 obc->obs.oi = oi;
9829 obc->obs.exists = true;
9830
9831 obc->ssc = get_snapset_context(
9832 soid, true,
9833 soid.has_snapset() ? attrs : 0);
9834
9835 if (is_active())
9836 populate_obc_watchers(obc);
9837
9838 if (pool.info.require_rollback()) {
9839 if (attrs) {
9840 obc->attr_cache = *attrs;
9841 } else {
9842 int r = pgbackend->objects_get_attrs(
9843 soid,
9844 &obc->attr_cache);
9845 assert(r == 0);
9846 }
9847 }
9848
9849 dout(10) << __func__ << ": creating obc from disk: " << obc
9850 << dendl;
9851 }
9852
9853 // XXX: Caller doesn't expect this
9854 if (obc->ssc == NULL) {
9855 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9856 return ObjectContextRef(); // -ENOENT!
9857 }
9858
9859 dout(10) << __func__ << ": " << obc << " " << soid
9860 << " " << obc->rwstate
9861 << " oi: " << obc->obs.oi
9862 << " exists: " << (int)obc->obs.exists
9863 << " ssc: " << obc->ssc
9864 << " snapset: " << obc->ssc->snapset << dendl;
9865 return obc;
9866}
9867
9868void PrimaryLogPG::context_registry_on_change()
9869{
9870 pair<hobject_t, ObjectContextRef> i;
9871 while (object_contexts.get_next(i.first, &i)) {
9872 ObjectContextRef obc(i.second);
9873 if (obc) {
9874 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9875 obc->watchers.begin();
9876 j != obc->watchers.end();
9877 obc->watchers.erase(j++)) {
9878 j->second->discard();
9879 }
9880 }
9881 }
9882}
9883
9884
9885/*
9886 * If we return an error, and set *pmissing, then promoting that
9887 * object may help.
9888 *
9889 * If we return -EAGAIN, we will always set *pmissing to the missing
9890 * object to wait for.
9891 *
9892 * If we return an error but do not set *pmissing, then we know the
9893 * object does not exist.
9894 */
9895int PrimaryLogPG::find_object_context(const hobject_t& oid,
9896 ObjectContextRef *pobc,
9897 bool can_create,
9898 bool map_snapid_to_clone,
9899 hobject_t *pmissing)
9900{
9901 FUNCTRACE();
9902 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9903 // want the head?
9904 if (oid.snap == CEPH_NOSNAP) {
9905 ObjectContextRef obc = get_object_context(oid, can_create);
9906 if (!obc) {
9907 if (pmissing)
9908 *pmissing = oid;
9909 return -ENOENT;
9910 }
9911 dout(10) << "find_object_context " << oid
9912 << " @" << oid.snap
9913 << " oi=" << obc->obs.oi
9914 << dendl;
9915 *pobc = obc;
9916
9917 return 0;
9918 }
9919
9920 hobject_t head = oid.get_head();
9921
9922 // want the snapdir?
9923 if (oid.snap == CEPH_SNAPDIR) {
9924 // return head or snapdir, whichever exists.
9925 ObjectContextRef headobc = get_object_context(head, can_create);
9926 ObjectContextRef obc = headobc;
9927 if (!obc || !obc->obs.exists)
9928 obc = get_object_context(oid, can_create);
9929 if (!obc || !obc->obs.exists) {
9930 // if we have neither, we would want to promote the head.
9931 if (pmissing)
9932 *pmissing = head;
9933 if (pobc)
9934 *pobc = headobc; // may be null
9935 return -ENOENT;
9936 }
9937 dout(10) << "find_object_context " << oid
9938 << " @" << oid.snap
9939 << " oi=" << obc->obs.oi
9940 << dendl;
9941 *pobc = obc;
9942
9943 // always populate ssc for SNAPDIR...
9944 if (!obc->ssc)
9945 obc->ssc = get_snapset_context(
9946 oid, true);
9947 return 0;
9948 }
9949
9950 // we want a snap
9951 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9952 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9953 return -ENOENT;
9954 }
9955
9956 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9957 if (!ssc || !(ssc->exists || can_create)) {
9958 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9959 if (pmissing)
9960 *pmissing = head; // start by getting the head
9961 if (ssc)
9962 put_snapset_context(ssc);
9963 return -ENOENT;
9964 }
9965
9966 if (map_snapid_to_clone) {
9967 dout(10) << "find_object_context " << oid << " @" << oid.snap
9968 << " snapset " << ssc->snapset
9969 << " map_snapid_to_clone=true" << dendl;
9970 if (oid.snap > ssc->snapset.seq) {
9971 // already must be readable
9972 ObjectContextRef obc = get_object_context(head, false);
9973 dout(10) << "find_object_context " << oid << " @" << oid.snap
9974 << " snapset " << ssc->snapset
9975 << " maps to head" << dendl;
9976 *pobc = obc;
9977 put_snapset_context(ssc);
9978 return (obc && obc->obs.exists) ? 0 : -ENOENT;
9979 } else {
9980 vector<snapid_t>::const_iterator citer = std::find(
9981 ssc->snapset.clones.begin(),
9982 ssc->snapset.clones.end(),
9983 oid.snap);
9984 if (citer == ssc->snapset.clones.end()) {
9985 dout(10) << "find_object_context " << oid << " @" << oid.snap
9986 << " snapset " << ssc->snapset
9987 << " maps to nothing" << dendl;
9988 put_snapset_context(ssc);
9989 return -ENOENT;
9990 }
9991
9992 dout(10) << "find_object_context " << oid << " @" << oid.snap
9993 << " snapset " << ssc->snapset
9994 << " maps to " << oid << dendl;
9995
9996 if (pg_log.get_missing().is_missing(oid)) {
9997 dout(10) << "find_object_context " << oid << " @" << oid.snap
9998 << " snapset " << ssc->snapset
9999 << " " << oid << " is missing" << dendl;
10000 if (pmissing)
10001 *pmissing = oid;
10002 put_snapset_context(ssc);
10003 return -EAGAIN;
10004 }
10005
10006 ObjectContextRef obc = get_object_context(oid, false);
10007 if (!obc || !obc->obs.exists) {
10008 dout(10) << "find_object_context " << oid << " @" << oid.snap
10009 << " snapset " << ssc->snapset
10010 << " " << oid << " is not present" << dendl;
10011 if (pmissing)
10012 *pmissing = oid;
10013 put_snapset_context(ssc);
10014 return -ENOENT;
10015 }
10016 dout(10) << "find_object_context " << oid << " @" << oid.snap
10017 << " snapset " << ssc->snapset
10018 << " " << oid << " HIT" << dendl;
10019 *pobc = obc;
10020 put_snapset_context(ssc);
10021 return 0;
10022 }
10023 ceph_abort(); //unreachable
10024 }
10025
10026 dout(10) << "find_object_context " << oid << " @" << oid.snap
10027 << " snapset " << ssc->snapset << dendl;
10028
10029 // head?
10030 if (oid.snap > ssc->snapset.seq) {
10031 if (ssc->snapset.head_exists) {
10032 ObjectContextRef obc = get_object_context(head, false);
10033 dout(10) << "find_object_context " << head
10034 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10035 << " -- HIT " << obc->obs
10036 << dendl;
10037 if (!obc->ssc)
10038 obc->ssc = ssc;
10039 else {
10040 assert(ssc == obc->ssc);
10041 put_snapset_context(ssc);
10042 }
10043 *pobc = obc;
10044 return 0;
10045 }
10046 dout(10) << "find_object_context " << head
10047 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10048 << " but head dne -- DNE"
10049 << dendl;
10050 put_snapset_context(ssc);
10051 return -ENOENT;
10052 }
10053
10054 // which clone would it be?
10055 unsigned k = 0;
10056 while (k < ssc->snapset.clones.size() &&
10057 ssc->snapset.clones[k] < oid.snap)
10058 k++;
10059 if (k == ssc->snapset.clones.size()) {
10060 dout(10) << "find_object_context no clones with last >= oid.snap "
10061 << oid.snap << " -- DNE" << dendl;
10062 put_snapset_context(ssc);
10063 return -ENOENT;
10064 }
10065 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10066 info.pgid.pool(), oid.get_namespace());
10067
10068 if (pg_log.get_missing().is_missing(soid)) {
10069 dout(20) << "find_object_context " << soid << " missing, try again later"
10070 << dendl;
10071 if (pmissing)
10072 *pmissing = soid;
10073 put_snapset_context(ssc);
10074 return -EAGAIN;
10075 }
10076
10077 ObjectContextRef obc = get_object_context(soid, false);
10078 if (!obc || !obc->obs.exists) {
10079 if (pmissing)
10080 *pmissing = soid;
10081 put_snapset_context(ssc);
10082 if (is_degraded_or_backfilling_object(soid)) {
10083 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10084 return -EAGAIN;
10085 } else {
10086 dout(20) << __func__ << " missing clone " << soid << dendl;
10087 return -ENOENT;
10088 }
10089 }
10090
10091 if (!obc->ssc) {
10092 obc->ssc = ssc;
10093 } else {
10094 assert(obc->ssc == ssc);
10095 put_snapset_context(ssc);
10096 }
10097 ssc = 0;
10098
10099 // clone
10100 dout(20) << "find_object_context " << soid
10101 << " snapset " << obc->ssc->snapset
10102 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10103 << dendl;
10104 snapid_t first, last;
10105 if (obc->ssc->snapset.is_legacy()) {
10106 first = obc->obs.oi.legacy_snaps.back();
10107 last = obc->obs.oi.legacy_snaps.front();
10108 } else {
10109 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10110 assert(p != obc->ssc->snapset.clone_snaps.end());
10111 first = p->second.back();
10112 last = p->second.front();
10113 }
10114 if (first <= oid.snap) {
10115 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10116 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10117 *pobc = obc;
10118 return 0;
10119 } else {
10120 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10121 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10122 return -ENOENT;
10123 }
10124}
10125
10126void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10127{
10128 if (obc->ssc)
10129 put_snapset_context(obc->ssc);
10130}
10131
10132void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10133{
10134 object_info_t& oi = obc->obs.oi;
10135
10136 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10137 object_stat_sum_t stat;
10138
10139 stat.num_bytes += oi.size;
10140
10141 if (oi.soid.snap != CEPH_SNAPDIR)
10142 stat.num_objects++;
10143 if (oi.is_dirty())
10144 stat.num_objects_dirty++;
10145 if (oi.is_whiteout())
10146 stat.num_whiteouts++;
10147 if (oi.is_omap())
10148 stat.num_objects_omap++;
10149 if (oi.is_cache_pinned())
10150 stat.num_objects_pinned++;
10151
10152 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10153 stat.num_object_clones++;
10154
10155 if (!obc->ssc)
10156 obc->ssc = get_snapset_context(oi.soid, false);
10157 assert(obc->ssc);
10158
10159 // subtract off clone overlap
10160 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10161 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10162 for (interval_set<uint64_t>::const_iterator r = o.begin();
10163 r != o.end();
10164 ++r) {
10165 stat.num_bytes -= r.get_len();
10166 }
10167 }
10168 }
10169
10170 // add it in
10171 pgstat->stats.sum.add(stat);
10172}
10173
10174void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10175{
10176 const hobject_t& soid = obc->obs.oi.soid;
10177 if (obc->is_blocked()) {
10178 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10179 return;
10180 }
10181
10182 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10183 if (p != waiting_for_blocked_object.end()) {
10184 list<OpRequestRef>& ls = p->second;
10185 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10186 requeue_ops(ls);
10187 waiting_for_blocked_object.erase(p);
10188 }
10189
10190 map<hobject_t, ObjectContextRef>::iterator i =
10191 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10192 if (i != objects_blocked_on_snap_promotion.end()) {
10193 assert(i->second == obc);
10194 objects_blocked_on_snap_promotion.erase(i);
10195 }
10196
10197 if (obc->requeue_scrub_on_unblock) {
10198 obc->requeue_scrub_on_unblock = false;
10199 requeue_scrub();
10200 }
10201}
10202
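// SnapSetContexts are refcounted, keyed by the snapdir form of the
// oid, and protected by snapset_contexts_lock. On a cache miss the
// SS_ATTR is read from the head first, then from the snapdir. Every
// successful lookup bumps the ref; callers must balance it with
// put_snapset_context().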
10203SnapSetContext *PrimaryLogPG::get_snapset_context(
10204 const hobject_t& oid,
10205 bool can_create,
10206 const map<string, bufferlist> *attrs,
10207 bool oid_existed)
10208{
10209 Mutex::Locker l(snapset_contexts_lock);
10210 SnapSetContext *ssc;
10211 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10212 oid.get_snapdir());
10213 if (p != snapset_contexts.end()) {
10214 if (can_create || p->second->exists) {
10215 ssc = p->second;
10216 } else {
10217 return NULL;
10218 }
10219 } else {
10220 bufferlist bv;
10221 if (!attrs) {
10222 int r = -ENOENT;
10223 if (!(oid.is_head() && !oid_existed))
10224 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10225 if (r < 0) {
10226 // try _snapset
10227 if (!(oid.is_snapdir() && !oid_existed))
10228 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10229 if (r < 0 && !can_create)
10230 return NULL;
10231 }
10232 } else {
10233 assert(attrs->count(SS_ATTR));
10234 bv = attrs->find(SS_ATTR)->second;
10235 }
10236 ssc = new SnapSetContext(oid.get_snapdir());
10237 _register_snapset_context(ssc);
10238 if (bv.length()) {
10239 bufferlist::iterator bvp = bv.begin();
10240 try {
10241 ssc->snapset.decode(bvp);
10242 } catch (buffer::error& e) {
10243 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10244 return NULL;
10245 }
10246 ssc->exists = true;
10247 } else {
10248 ssc->exists = false;
10249 }
10250 }
10251 assert(ssc);
10252 ssc->ref++;
10253 return ssc;
10254}
10255
10256void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10257{
10258 Mutex::Locker l(snapset_contexts_lock);
10259 --ssc->ref;
10260 if (ssc->ref == 0) {
10261 if (ssc->registered)
10262 snapset_contexts.erase(ssc->oid);
10263 delete ssc;
10264 }
10265}
10266
10267/** pull - request object from a peer
10268 */
10269
10270/*
10271 * Return values:
10272 * NONE - didn't pull anything
10273 * YES - pulled what the caller wanted
10274 * OTHER - needed to pull something else first (_head or _snapdir)
10275 */
10276enum { PULL_NONE, PULL_OTHER, PULL_YES };
10277
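// Recover a single missing object on the primary. Deleted objects go
// down the remove_missing_object() path; for a snapped clone we may
// first recurse to pull the head or snapdir (returning PULL_OTHER),
// since recovering the clone requires its SnapSet.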
10278int PrimaryLogPG::recover_missing(
10279 const hobject_t &soid, eversion_t v,
10280 int priority,
10281 PGBackend::RecoveryHandle *h)
10282{
10283 if (missing_loc.is_unfound(soid)) {
10284 dout(7) << "pull " << soid
10285 << " v " << v
10286 << " but it is unfound" << dendl;
10287 return PULL_NONE;
10288 }
10289
10290 if (missing_loc.is_deleted(soid)) {
10291 start_recovery_op(soid);
10292 assert(!recovering.count(soid));
10293 recovering.insert(make_pair(soid, ObjectContextRef()));
10294 epoch_t cur_epoch = get_osdmap()->get_epoch();
10295 remove_missing_object(soid, v, new FunctionContext(
10296 [=](int) {
10297 lock();
10298 if (!pg_has_reset_since(cur_epoch)) {
10299 bool object_missing = false;
10300 for (const auto& shard : actingbackfill) {
10301 if (shard == pg_whoami)
10302 continue;
10303 if (peer_missing[shard].is_missing(soid)) {
10304 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10305 object_missing = true;
10306 break;
10307 }
10308 }
10309 if (!object_missing) {
10310 object_stat_sum_t stat_diff;
10311 stat_diff.num_objects_recovered = 1;
10312 on_global_recover(soid, stat_diff, true);
10313 } else {
10314 auto recovery_handle = pgbackend->open_recovery_op();
10315 pgbackend->recover_delete_object(soid, v, recovery_handle);
10316 pgbackend->run_recovery_op(recovery_handle, priority);
10317 }
10318 }
10319 unlock();
10320 }));
10321 return PULL_YES;
10322 }
10323
 10324 // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10325 ObjectContextRef obc;
10326 ObjectContextRef head_obc;
10327 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10328 // do we have the head and/or snapdir?
10329 hobject_t head = soid.get_head();
10330 if (pg_log.get_missing().is_missing(head)) {
10331 if (recovering.count(head)) {
10332 dout(10) << " missing but already recovering head " << head << dendl;
10333 return PULL_NONE;
10334 } else {
10335 int r = recover_missing(
10336 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10337 h);
10338 if (r != PULL_NONE)
10339 return PULL_OTHER;
10340 return PULL_NONE;
10341 }
10342 }
10343 head = soid.get_snapdir();
10344 if (pg_log.get_missing().is_missing(head)) {
10345 if (recovering.count(head)) {
10346 dout(10) << " missing but already recovering snapdir " << head << dendl;
10347 return PULL_NONE;
10348 } else {
10349 int r = recover_missing(
10350 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10351 h);
10352 if (r != PULL_NONE)
10353 return PULL_OTHER;
10354 return PULL_NONE;
10355 }
10356 }
10357
10358 // we must have one or the other
10359 head_obc = get_object_context(
10360 soid.get_head(),
10361 false,
10362 0);
10363 if (!head_obc)
10364 head_obc = get_object_context(
10365 soid.get_snapdir(),
10366 false,
10367 0);
10368 assert(head_obc);
10369 }
10370 start_recovery_op(soid);
10371 assert(!recovering.count(soid));
10372 recovering.insert(make_pair(soid, obc));
 10373 int r = pgbackend->recover_object(
10374 soid,
10375 v,
10376 head_obc,
10377 obc,
10378 h);
10379 // This is only a pull which shouldn't return an error
10380 assert(r >= 0);
10381 return PULL_YES;
10382}
10383
10384void PrimaryLogPG::send_remove_op(
10385 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10386{
10387 ceph_tid_t tid = osd->get_tid();
10388 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10389
10390 dout(10) << "send_remove_op " << oid << " from osd." << peer
10391 << " tid " << tid << dendl;
10392
10393 MOSDSubOp *subop = new MOSDSubOp(
10394 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10395 oid, CEPH_OSD_FLAG_ACK,
10396 get_osdmap()->get_epoch(), tid, v);
10397 subop->ops = vector<OSDOp>(1);
10398 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10399
10400 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10401}
10402
10403void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10404 eversion_t v, Context *on_complete)
10405{
10406 dout(20) << __func__ << " " << soid << " " << v << dendl;
10407 assert(on_complete != nullptr);
10408 // delete locally
10409 ObjectStore::Transaction t;
10410 remove_snap_mapped_object(t, soid);
10411
10412 ObjectRecoveryInfo recovery_info;
10413 recovery_info.soid = soid;
10414 recovery_info.version = v;
10415
10416 epoch_t cur_epoch = get_osdmap()->get_epoch();
10417 t.register_on_complete(new FunctionContext(
10418 [=](int) {
10419 lock();
10420 if (!pg_has_reset_since(cur_epoch)) {
10421 ObjectStore::Transaction t2;
10422 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10423 t2.register_on_complete(on_complete);
10424 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10425 assert(r == 0);
10426 unlock();
10427 } else {
10428 unlock();
10429 on_complete->complete(-EAGAIN);
10430 }
10431 }));
10432 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10433 assert(r == 0);
10434}
10435
10436void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10437{
10438 dout(10) << "finish_degraded_object " << oid << dendl;
10439 if (callbacks_for_degraded_object.count(oid)) {
10440 list<Context*> contexts;
10441 contexts.swap(callbacks_for_degraded_object[oid]);
10442 callbacks_for_degraded_object.erase(oid);
10443 for (list<Context*>::iterator i = contexts.begin();
10444 i != contexts.end();
10445 ++i) {
10446 (*i)->complete(0);
10447 }
10448 }
10449 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10450 oid.get_head());
10451 if (i != objects_blocked_on_degraded_snap.end() &&
10452 i->second == oid.snap)
10453 objects_blocked_on_degraded_snap.erase(i);
10454}
10455
10456void PrimaryLogPG::_committed_pushed_object(
10457 epoch_t epoch, eversion_t last_complete)
10458{
10459 lock();
10460 if (!pg_has_reset_since(epoch)) {
10461 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10462 last_complete_ondisk = last_complete;
10463
10464 if (last_complete_ondisk == info.last_update) {
10465 if (!is_primary()) {
 10466 // We are either a replica or a backfill target.
10467 // we are fully up to date. tell the primary!
10468 osd->send_message_osd_cluster(
10469 get_primary().osd,
10470 new MOSDPGTrim(
10471 get_osdmap()->get_epoch(),
10472 spg_t(info.pgid.pgid, get_primary().shard),
10473 last_complete_ondisk),
10474 get_osdmap()->get_epoch());
10475 } else {
10476 calc_min_last_complete_ondisk();
10477 }
10478 }
10479
10480 } else {
10481 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10482 }
10483
10484 unlock();
10485}
10486
10487void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10488{
10489 lock();
10490 dout(20) << __func__ << dendl;
10491 if (obc) {
10492 dout(20) << "obc = " << *obc << dendl;
10493 }
10494 assert(active_pushes >= 1);
10495 --active_pushes;
10496
10497 // requeue an active chunky scrub waiting on recovery ops
10498 if (!deleting && active_pushes == 0
10499 && scrubber.is_chunky_scrub_active()) {
10500 if (ops_blocked_by_scrub()) {
10501 requeue_scrub(true);
10502 } else {
10503 requeue_scrub(false);
10504 }
 10505 }
10506 unlock();
10507}
10508
10509void PrimaryLogPG::_applied_recovered_object_replica()
10510{
10511 lock();
 10512 dout(20) << __func__ << dendl;
10513 assert(active_pushes >= 1);
10514 --active_pushes;
10515
10516 // requeue an active chunky scrub waiting on recovery ops
10517 if (!deleting && active_pushes == 0 &&
10518 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10519 scrubber.active_rep_scrub->get_req())->chunky) {
10520 osd->enqueue_back(
10521 info.pgid,
10522 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10523 scrubber.active_rep_scrub = OpRequestRef();
10524 }
10525 unlock();
10526}
10527
10528void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10529{
10530 dout(10) << "got missing " << oid << " v " << v << dendl;
10531 pg_log.recover_got(oid, v, info);
10532 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10533 dout(10) << "last_complete now " << info.last_complete
10534 << " log.complete_to " << pg_log.get_log().complete_to->version
10535 << dendl;
10536 } else {
10537 dout(10) << "last_complete now " << info.last_complete
10538 << " log.complete_to at end" << dendl;
10539 //below is not true in the repair case.
10540 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10541 assert(info.last_complete == info.last_update);
10542 }
10543}
10544
10545void PrimaryLogPG::primary_failed(const hobject_t &soid)
10546{
10547 list<pg_shard_t> fl = { pg_whoami };
10548 failed_push(fl, soid);
10549}
10550
10551void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10552{
10553 dout(20) << __func__ << ": " << soid << dendl;
10554 assert(recovering.count(soid));
10555 auto obc = recovering[soid];
10556 if (obc) {
10557 list<OpRequestRef> blocked_ops;
10558 obc->drop_recovery_read(&blocked_ops);
10559 requeue_ops(blocked_ops);
10560 }
10561 recovering.erase(soid);
10562 for (auto&& i : from)
10563 missing_loc.remove_location(soid, i);
10564 dout(0) << __func__ << " " << soid << " from shard " << from
10565 << ", reps on " << missing_loc.get_locations(soid)
10566 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
 10567 finish_recovery_op(soid); // close out this attempt
10568}
10569
10570void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10571{
10572 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10573 assert(m->get_type() == MSG_OSD_SUBOP);
10574 dout(7) << "sub_op_remove " << m->poid << dendl;
10575
10576 op->mark_started();
10577
10578 ObjectStore::Transaction t;
10579 remove_snap_mapped_object(t, m->poid);
10580 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10581 assert(r == 0);
10582}
10583
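// For LOST_REVERT: pick the newest version of oid that anyone in
// actingbackfill still has, i.e. the max of our own "have" and each
// peer's, so the revert targets recoverable data.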
10584eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10585{
10586 eversion_t v;
10587 pg_missing_item pmi;
10588 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10589 assert(is_missing);
10590 v = pmi.have;
10591 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10592
10593 assert(!actingbackfill.empty());
10594 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10595 i != actingbackfill.end();
10596 ++i) {
10597 if (*i == get_primary()) continue;
10598 pg_shard_t peer = *i;
10599 if (!peer_missing[peer].is_missing(oid)) {
10600 continue;
10601 }
10602 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10603 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10604 if (h > v)
10605 v = h;
10606 }
10607
10608 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10609 return v;
10610}
10611
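// Replica-side handler for MOSDPGUpdateLogMissing: apply the entries
// to our log/missing sets and reply once the transaction commits (or,
// for pre-kraken EC pools, completes -- see the ordering note below).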
10612void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10613{
10614 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10615 op->get_req());
10616 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10617 ObjectStore::Transaction t;
10618 append_log_entries_update_missing(m->entries, t);
10619
10620 Context *complete = new FunctionContext(
10621 [=](int) {
10622 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10623 op->get_req());
10624 lock();
10625 if (!pg_has_reset_since(msg->get_epoch())) {
10626 MOSDPGUpdateLogMissingReply *reply =
10627 new MOSDPGUpdateLogMissingReply(
10628 spg_t(info.pgid.pgid, primary_shard().shard),
10629 pg_whoami.shard,
10630 msg->get_epoch(),
10631 msg->min_epoch,
10632 msg->get_tid());
10633 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10634 msg->get_connection()->send_message(reply);
10635 }
10636 unlock();
10637 });
10638
 10639 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10640 t.register_on_commit(complete);
10641 } else {
10642 /* Hack to work around the fact that ReplicatedBackend sends
10643 * ack+commit if commit happens first
10644 *
10645 * This behavior is no longer necessary, but we preserve it so old
10646 * primaries can keep their repops in order */
10647 if (pool.info.ec_pool()) {
10648 t.register_on_complete(complete);
10649 } else {
10650 t.register_on_commit(complete);
10651 }
10652 }
10653 t.register_on_applied(
10654 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10655 int tr = osd->store->queue_transaction(
10656 osr.get(),
10657 std::move(t),
10658 nullptr);
10659 assert(tr == 0);
10660}
10661
10662void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10663{
10664 const MOSDPGUpdateLogMissingReply *m =
10665 static_cast<const MOSDPGUpdateLogMissingReply*>(
10666 op->get_req());
10667 dout(20) << __func__ << " got reply from "
10668 << m->get_from() << dendl;
10669
10670 auto it = log_entry_update_waiting_on.find(m->get_tid());
10671 if (it != log_entry_update_waiting_on.end()) {
10672 if (it->second.waiting_on.count(m->get_from())) {
10673 it->second.waiting_on.erase(m->get_from());
10674 } else {
10675 osd->clog->error()
10676 << info.pgid << " got reply "
10677 << *m << " from shard we are not waiting for "
10678 << m->get_from();
10679 }
10680
10681 if (it->second.waiting_on.empty()) {
10682 repop_all_committed(it->second.repop.get());
10683 log_entry_update_waiting_on.erase(it);
10684 }
10685 } else {
10686 osd->clog->error()
10687 << info.pgid << " got reply "
10688 << *m << " on unknown tid " << m->get_tid();
10689 }
10690}
10691
10692/* Mark all unfound objects as lost.
10693 */
10694void PrimaryLogPG::mark_all_unfound_lost(
10695 int what,
10696 ConnectionRef con,
10697 ceph_tid_t tid)
10698{
10699 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
 10700 list<hobject_t> oids;
10701
10702 dout(30) << __func__ << ": log before:\n";
10703 pg_log.get_log().print(*_dout);
10704 *_dout << dendl;
10705
 10706 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10707
10708 utime_t mtime = ceph_clock_now();
10709 map<hobject_t, pg_missing_item>::const_iterator m =
10710 missing_loc.get_needs_recovery().begin();
10711 map<hobject_t, pg_missing_item>::const_iterator mend =
10712 missing_loc.get_needs_recovery().end();
10713
10714 ObcLockManager manager;
10715 eversion_t v = get_next_version();
10716 v.epoch = get_osdmap()->get_epoch();
10717 uint64_t num_unfound = missing_loc.num_unfound();
10718 while (m != mend) {
10719 const hobject_t &oid(m->first);
10720 if (!missing_loc.is_unfound(oid)) {
10721 // We only care about unfound objects
10722 ++m;
10723 continue;
10724 }
10725
10726 ObjectContextRef obc;
10727 eversion_t prev;
10728
10729 switch (what) {
10730 case pg_log_entry_t::LOST_MARK:
10731 assert(0 == "actually, not implemented yet!");
10732 break;
10733
10734 case pg_log_entry_t::LOST_REVERT:
10735 prev = pick_newest_available(oid);
10736 if (prev > eversion_t()) {
10737 // log it
10738 pg_log_entry_t e(
10739 pg_log_entry_t::LOST_REVERT, oid, v,
10740 m->second.need, 0, osd_reqid_t(), mtime, 0);
10741 e.reverting_to = prev;
10742 e.mark_unrollbackable();
10743 log_entries.push_back(e);
10744 dout(10) << e << dendl;
10745
10746 // we are now missing the new version; recovery code will sort it out.
10747 ++v.version;
10748 ++m;
10749 break;
10750 }
10751
10752 case pg_log_entry_t::LOST_DELETE:
10753 {
10754 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10755 0, osd_reqid_t(), mtime, 0);
 10756 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10757 if (pool.info.require_rollback()) {
10758 e.mod_desc.try_rmobject(v.version);
10759 } else {
10760 e.mark_unrollbackable();
10761 }
10762 } // otherwise, just do what we used to do
10763 dout(10) << e << dendl;
10764 log_entries.push_back(e);
 10765 oids.push_back(oid);
10766
10767 ++v.version;
10768 ++m;
10769 }
10770 break;
10771
10772 default:
10773 ceph_abort();
10774 }
10775 }
10776
10777 info.stats.stats_invalid = true;
10778
10779 submit_log_entries(
10780 log_entries,
10781 std::move(manager),
10782 boost::optional<std::function<void(void)> >(
10783 [this, oids, con, num_unfound, tid]() {
10784 if (perform_deletes_during_peering()) {
10785 for (auto oid : oids) {
10786 // clear old locations - merge_new_log_entries will have
10787 // handled rebuilding missing_loc for each of these
10788 // objects if we have the RECOVERY_DELETES flag
10789 missing_loc.recovered(oid);
10790 }
10791 }
10792
10793 for (auto& p : waiting_for_unreadable_object) {
10794 release_backoffs(p.first);
10795 }
10796 requeue_object_waiters(waiting_for_unreadable_object);
10797 queue_recovery();
10798
10799 stringstream ss;
10800 ss << "pg has " << num_unfound
10801 << " objects unfound and apparently lost marking";
10802 string rs = ss.str();
10803 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10804 osd->clog->info() << rs;
10805 if (con) {
10806 MCommandReply *reply = new MCommandReply(0, rs);
10807 reply->set_tid(tid);
10808 con->send_message(reply);
10809 }
10810 }),
10811 OpRequestRef());
10812}
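/*
 * Usage sketch (not part of this file): this path is normally reached by the
 * operator command
 *
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *
 * which arrives via do_command() and invokes mark_all_unfound_lost() with
 * what == LOST_REVERT or LOST_DELETE respectively.
 */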
10813
10814void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10815{
10816 assert(repop_queue.empty());
10817}
10818
10819/*
10820 * pg status change notification
10821 */
10822
10823void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10824{
10825 list<OpRequestRef> rq;
10826
10827 // apply all repops
10828 while (!repop_queue.empty()) {
10829 RepGather *repop = repop_queue.front();
10830 repop_queue.pop_front();
10831 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10832 repop->rep_aborted = true;
10833 repop->on_applied.clear();
10834 repop->on_committed.clear();
10835 repop->on_success.clear();
10836
10837 if (requeue) {
10838 if (repop->op) {
10839 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10840 rq.push_back(repop->op);
10841 repop->op = OpRequestRef();
10842 }
10843
10844 // also requeue any dups, interleaved into position
10845 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10846 waiting_for_ondisk.find(repop->v);
10847 if (p != waiting_for_ondisk.end()) {
10848 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10849 for (list<pair<OpRequestRef, version_t> >::iterator i =
10850 p->second.begin();
10851 i != p->second.end();
10852 ++i) {
10853 rq.push_back(i->first);
10854 }
10855 waiting_for_ondisk.erase(p);
10856 }
10857 }
10858
10859 remove_repop(repop);
10860 }
10861
10862 assert(repop_queue.empty());
10863
10864 if (requeue) {
10865 requeue_ops(rq);
10866 if (!waiting_for_ondisk.empty()) {
10867 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10868 waiting_for_ondisk.begin();
10869 i != waiting_for_ondisk.end();
10870 ++i) {
10871 for (list<pair<OpRequestRef, version_t> >::iterator j =
10872 i->second.begin();
10873 j != i->second.end();
10874 ++j) {
10875 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10876 << i->first << dendl;
10877 }
10878 }
10879 assert(waiting_for_ondisk.empty());
10880 }
10881 }
10882
10883 waiting_for_ondisk.clear();
10884}
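/*
 * Note (illustrative): with requeue == true, each aborted repop's op is
 * requeued first, followed by any waiting_for_ondisk dups recorded for the
 * same version, so a replay after an interval change sees the ops in their
 * original submission order.
 */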
10885
10886void PrimaryLogPG::on_flushed()
10887{
10888 assert(flushes_in_progress > 0);
10889 flushes_in_progress--;
10890 if (flushes_in_progress == 0) {
10891 requeue_ops(waiting_for_peered);
10892 }
10893 if (!is_peered() || !is_primary()) {
10894 pair<hobject_t, ObjectContextRef> i;
10895 while (object_contexts.get_next(i.first, &i)) {
10896 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10897 }
10898 assert(object_contexts.empty());
10899 }
10900 pgbackend->on_flushed();
10901}
10902
10903void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10904{
10905 dout(10) << "on_removal" << dendl;
10906
10907 // adjust info to backfill
10908 info.set_last_backfill(hobject_t());
10909 pg_log.reset_backfill();
10910 dirty_info = true;
10911
10912
10913 // clear log
10914 PGLogEntryHandler rollbacker{this, t};
10915 pg_log.roll_forward(&rollbacker);
10916
10917 write_if_dirty(*t);
10918
10919 if (!deleting)
10920 on_shutdown();
10921}
10922
10923void PrimaryLogPG::clear_async_reads()
10924{
10925 dout(10) << __func__ << dendl;
10926 for(auto& i : in_progress_async_reads) {
10927 dout(10) << "clear ctx: "
10928 << "OpRequestRef " << i.first
10929 << " OpContext " << i.second
10930 << dendl;
10931 close_op_ctx(i.second);
10932 }
10933}
10934
10935void PrimaryLogPG::on_shutdown()
10936{
10937 dout(10) << "on_shutdown" << dendl;
10938
10939 // remove from queues
10940 osd->pg_stat_queue_dequeue(this);
10941 osd->peering_wq.dequeue(this);
10942
10943 // handles queue races
10944 deleting = true;
10945
10946 if (recovery_queued) {
10947 recovery_queued = false;
10948 osd->clear_queued_recovery(this);
10949 }
10950
10951 clear_scrub_reserved();
10952 scrub_clear_state();
10953
10954 unreg_next_scrub();
10955 cancel_copy_ops(false);
10956 cancel_flush_ops(false);
10957 cancel_proxy_ops(false);
10958 apply_and_flush_repops(false);
10959 cancel_log_updates();
10960 // we must remove PGRefs, so do this prior to release_backoffs() callers
10961 clear_backoffs();
10962 // clean up snap trim references
10963 snap_trimmer_machine.process_event(Reset());
10964
10965 pgbackend->on_change();
10966
10967 context_registry_on_change();
10968 object_contexts.clear();
10969
10970 clear_async_reads();
10971
10972 osd->remote_reserver.cancel_reservation(info.pgid);
10973 osd->local_reserver.cancel_reservation(info.pgid);
10974
10975 clear_primary_state();
10976 cancel_recovery();
10977}
10978
10979void PrimaryLogPG::on_activate()
10980{
10981 // all clean?
10982 if (needs_recovery()) {
10983 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10984 queue_peering_event(
10985 CephPeeringEvtRef(
10986 std::make_shared<CephPeeringEvt>(
10987 get_osdmap()->get_epoch(),
10988 get_osdmap()->get_epoch(),
10989 DoRecovery())));
10990 } else if (needs_backfill()) {
10991 dout(10) << "activate queueing backfill" << dendl;
10992 queue_peering_event(
10993 CephPeeringEvtRef(
10994 std::make_shared<CephPeeringEvt>(
10995 get_osdmap()->get_epoch(),
10996 get_osdmap()->get_epoch(),
10997 RequestBackfill())));
10998 } else {
10999 dout(10) << "activate all replicas clean, no recovery" << dendl;
11000 eio_errors_to_process = false;
11001 queue_peering_event(
11002 CephPeeringEvtRef(
11003 std::make_shared<CephPeeringEvt>(
11004 get_osdmap()->get_epoch(),
11005 get_osdmap()->get_epoch(),
11006 AllReplicasRecovered())));
11007 }
11008
11009 publish_stats_to_osd();
11010
11011 if (!backfill_targets.empty()) {
11012 last_backfill_started = earliest_backfill();
11013 new_backfill = true;
11014 assert(!last_backfill_started.is_max());
11015 dout(5) << "on activate: bft=" << backfill_targets
11016 << " from " << last_backfill_started << dendl;
11017 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11018 i != backfill_targets.end();
11019 ++i) {
11020 dout(5) << "target shard " << *i
11021 << " from " << peer_info[*i].last_backfill
11022 << dendl;
11023 }
11024 }
11025
11026 hit_set_setup();
11027 agent_setup();
11028}
11029
11030void PrimaryLogPG::_on_new_interval()
11031{
11032 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11033 if (!pg_log.get_missing().may_include_deletes &&
11034 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11035 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11036 }
11037 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11038}
11039
11040void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11041{
11042 dout(10) << "on_change" << dendl;
11043
11044 if (hit_set && hit_set->insert_count() == 0) {
11045 dout(20) << " discarding empty hit_set" << dendl;
11046 hit_set_clear();
11047 }
11048
11049 if (recovery_queued) {
11050 recovery_queued = false;
11051 osd->clear_queued_recovery(this);
11052 }
11053
11054 // requeue everything in the reverse order they should be
11055 // reexamined.
11056 requeue_ops(waiting_for_peered);
11057 requeue_ops(waiting_for_active);
11058
11059 clear_scrub_reserved();
11060
11061 cancel_copy_ops(is_primary());
11062 cancel_flush_ops(is_primary());
11063 cancel_proxy_ops(is_primary());
11064
11065 // requeue object waiters
11066 for (auto& p : waiting_for_unreadable_object) {
11067 release_backoffs(p.first);
11068 }
11069 if (is_primary()) {
11070 requeue_object_waiters(waiting_for_unreadable_object);
11071 } else {
11072 waiting_for_unreadable_object.clear();
11073 }
11074 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11075 p != waiting_for_degraded_object.end();
11076 waiting_for_degraded_object.erase(p++)) {
11077 release_backoffs(p->first);
11078 if (is_primary())
11079 requeue_ops(p->second);
11080 else
11081 p->second.clear();
11082 finish_degraded_object(p->first);
11083 }
11084
11085 // requeues waiting_for_scrub
11086 scrub_clear_state();
11087
11088 for (auto p = waiting_for_blocked_object.begin();
11089 p != waiting_for_blocked_object.end();
11090 waiting_for_blocked_object.erase(p++)) {
11091 if (is_primary())
11092 requeue_ops(p->second);
11093 else
11094 p->second.clear();
11095 }
11096 for (auto i = callbacks_for_degraded_object.begin();
11097 i != callbacks_for_degraded_object.end();
11098 ) {
11099 finish_degraded_object((i++)->first);
11100 }
11101 assert(callbacks_for_degraded_object.empty());
11102
11103 if (is_primary()) {
11104 requeue_ops(waiting_for_cache_not_full);
11105 } else {
11106 waiting_for_cache_not_full.clear();
11107 }
11108 objects_blocked_on_cache_full.clear();
11109
11110 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11111 in_progress_async_reads.begin();
11112 i != in_progress_async_reads.end();
11113 in_progress_async_reads.erase(i++)) {
11114 close_op_ctx(i->second);
11115 if (is_primary())
11116 requeue_op(i->first);
11117 }
11118
11119 // this will requeue ops we were working on but didn't finish, and
11120 // any dups
11121 apply_and_flush_repops(is_primary());
11122 cancel_log_updates();
11123
11124 // do this *after* apply_and_flush_repops so that we catch any newly
11125 // registered watches.
11126 context_registry_on_change();
11127
11128 pgbackend->on_change_cleanup(t);
11129 scrubber.cleanup_store(t);
11130 pgbackend->on_change();
11131
11132 // clear snap_trimmer state
11133 snap_trimmer_machine.process_event(Reset());
11134
11135 debug_op_order.clear();
11136 unstable_stats.clear();
11137
11138 // we don't want to cache object_contexts through the interval change
11139 // NOTE: we actually assert that all currently live references are dead
11140 // by the time the flush for the next interval completes.
11141 object_contexts.clear();
11142
11143 // should have been cleared above by finishing all of the degraded objects
11144 assert(objects_blocked_on_degraded_snap.empty());
11145}
11146
11147void PrimaryLogPG::on_role_change()
11148{
11149 dout(10) << "on_role_change" << dendl;
11150 if (get_role() != 0 && hit_set) {
11151 dout(10) << " clearing hit set" << dendl;
11152 hit_set_clear();
11153 }
11154}
11155
11156void PrimaryLogPG::on_pool_change()
11157{
11158 dout(10) << __func__ << dendl;
11159 // requeue cache full waiters just in case the cache_mode is
11160 // changing away from writeback mode. note that if we are not
11161 // active the normal requeuing machinery is sufficient (and properly
11162 // ordered).
11163 if (is_active() &&
11164 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11165 !waiting_for_cache_not_full.empty()) {
11166 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11167 << dendl;
11168 requeue_ops(waiting_for_cache_not_full);
11169 objects_blocked_on_cache_full.clear();
11170 }
11171 hit_set_setup();
11172 agent_setup();
11173}
11174
11175// clear state. called on recovery completion AND cancellation.
11176void PrimaryLogPG::_clear_recovery_state()
11177{
11178 missing_loc.clear();
11179#ifdef DEBUG_RECOVERY_OIDS
11180 recovering_oids.clear();
11181#endif
11182 last_backfill_started = hobject_t();
11183 set<hobject_t>::iterator i = backfills_in_flight.begin();
11184 while (i != backfills_in_flight.end()) {
11185 assert(recovering.count(*i));
11186 backfills_in_flight.erase(i++);
11187 }
11188
11189 list<OpRequestRef> blocked_ops;
11190 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11191 i != recovering.end();
11192 recovering.erase(i++)) {
11193 if (i->second) {
11194 i->second->drop_recovery_read(&blocked_ops);
11195 requeue_ops(blocked_ops);
11196 }
11197 }
11198 assert(backfills_in_flight.empty());
11199 pending_backfill_updates.clear();
11200 assert(recovering.empty());
11201 pgbackend->clear_recovery_state();
11202}
11203
11204void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11205{
11206 dout(20) << __func__ << ": " << soid << dendl;
11207 assert(recovering.count(soid));
11208 ObjectContextRef obc = recovering[soid];
11209 if (obc) {
11210 list<OpRequestRef> blocked_ops;
11211 obc->drop_recovery_read(&blocked_ops);
11212 requeue_ops(blocked_ops);
11213 }
11214 recovering.erase(soid);
11215 finish_recovery_op(soid);
11216 release_backoffs(soid);
11217 if (waiting_for_degraded_object.count(soid)) {
11218 dout(20) << " kicking degraded waiters on " << soid << dendl;
11219 requeue_ops(waiting_for_degraded_object[soid]);
11220 waiting_for_degraded_object.erase(soid);
11221 }
11222 if (waiting_for_unreadable_object.count(soid)) {
11223 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11224 requeue_ops(waiting_for_unreadable_object[soid]);
11225 waiting_for_unreadable_object.erase(soid);
11226 }
11227 if (is_missing_object(soid))
11228 pg_log.set_last_requested(0); // get recover_primary to start over
11229 finish_degraded_object(soid);
11230}
11231
11232void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11233{
11234 /*
11235 * check that any peers we are planning to (or currently) pulling
11236 * objects from are dealt with.
11237 */
11238 missing_loc.check_recovery_sources(osdmap);
11239 pgbackend->check_recovery_sources(osdmap);
11240
11241 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11242 i != peer_log_requested.end();
11243 ) {
11244 if (!osdmap->is_up(i->osd)) {
11245 dout(10) << "peer_log_requested removing " << *i << dendl;
11246 peer_log_requested.erase(i++);
11247 } else {
11248 ++i;
11249 }
11250 }
11251
11252 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11253 i != peer_missing_requested.end();
11254 ) {
11255 if (!osdmap->is_up(i->osd)) {
11256 dout(10) << "peer_missing_requested removing " << *i << dendl;
11257 peer_missing_requested.erase(i++);
11258 } else {
11259 ++i;
11260 }
11261 }
11262}
11263
11264void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11265{
11266 set<pg_shard_t> now_down;
11267 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11268 p != missing_loc_sources.end();
11269 ) {
11270 if (osdmap->is_up(p->osd)) {
11271 ++p;
11272 continue;
11273 }
11274 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11275 now_down.insert(*p);
11276 missing_loc_sources.erase(p++);
11277 }
11278
11279 if (now_down.empty()) {
11280 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11281 } else {
11282 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11283 << missing_loc_sources << dendl;
11284
11285 // filter missing_loc
11286 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11287 while (p != missing_loc.end()) {
11288 set<pg_shard_t>::iterator q = p->second.begin();
11289 while (q != p->second.end())
11290 if (now_down.count(*q)) {
11291 p->second.erase(q++);
11292 } else {
11293 ++q;
11294 }
11295 if (p->second.empty())
11296 missing_loc.erase(p++);
11297 else
11298 ++p;
11299 }
11300 }
11301}
11302
11303
11304bool PrimaryLogPG::start_recovery_ops(
11305 uint64_t max,
11306 ThreadPool::TPHandle &handle,
11307 uint64_t *ops_started)
11308{
11309 uint64_t& started = *ops_started;
11310 started = 0;
11311 bool work_in_progress = false;
11312 assert(is_primary());
11313
11314 if (!state_test(PG_STATE_RECOVERING) &&
11315 !state_test(PG_STATE_BACKFILL)) {
11316 /* TODO: I think this case is broken and will make do_recovery()
11317 * unhappy since we're returning false */
11318 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11319 return false;
11320 }
11321
11322 const auto &missing = pg_log.get_missing();
11323
11324 unsigned int num_missing = missing.num_missing();
11325 uint64_t num_unfound = get_num_unfound();
11326
11327 if (num_missing == 0) {
11328 info.last_complete = info.last_update;
11329 }
11330
11331 if (num_missing == num_unfound) {
11332 // All of the missing objects we have are unfound.
11333 // Recover the replicas.
11334 started = recover_replicas(max, handle);
11335 }
11336 if (!started) {
11337 // We still have missing objects that we should grab from replicas.
11338 started += recover_primary(max, handle);
11339 }
11340 if (!started && num_unfound != get_num_unfound()) {
11341 // second chance to recover replicas
11342 started = recover_replicas(max, handle);
11343 }
11344
11345 if (started)
11346 work_in_progress = true;
11347
11348 bool deferred_backfill = false;
11349 if (recovering.empty() &&
11350 state_test(PG_STATE_BACKFILL) &&
11351 !backfill_targets.empty() && started < max &&
11352 missing.num_missing() == 0 &&
11353 waiting_on_backfill.empty()) {
11354 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11355 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11356 deferred_backfill = true;
11357 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11358 !is_degraded()) {
11359 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11360 deferred_backfill = true;
11361 } else if (!backfill_reserved) {
11362 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11363 if (!backfill_reserving) {
11364 dout(10) << "queueing RequestBackfill" << dendl;
11365 backfill_reserving = true;
11366 queue_peering_event(
11367 CephPeeringEvtRef(
11368 std::make_shared<CephPeeringEvt>(
11369 get_osdmap()->get_epoch(),
11370 get_osdmap()->get_epoch(),
11371 RequestBackfill())));
11372 }
11373 deferred_backfill = true;
11374 } else {
11375 started += recover_backfill(max - started, handle, &work_in_progress);
11376 }
11377 }
11378
11379 dout(10) << " started " << started << dendl;
11380 osd->logger->inc(l_osd_rop, started);
11381
11382 if (!recovering.empty() ||
11383 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11384 return work_in_progress;
11385
11386 assert(recovering.empty());
11387 assert(recovery_ops_active == 0);
11388
11389 dout(10) << __func__ << " needs_recovery: "
11390 << missing_loc.get_needs_recovery()
11391 << dendl;
11392 dout(10) << __func__ << " missing_loc: "
11393 << missing_loc.get_missing_locs()
11394 << dendl;
11395 int unfound = get_num_unfound();
11396 if (unfound) {
11397 dout(10) << " still have " << unfound << " unfound" << dendl;
11398 return work_in_progress;
11399 }
11400
11401 if (missing.num_missing() > 0) {
11402 // this shouldn't happen!
11403 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11404 << missing.num_missing() << ": " << missing.get_items();
11405 return work_in_progress;
11406 }
11407
11408 if (needs_recovery()) {
11409 // this shouldn't happen!
11410 // We already checked num_missing() so we must have missing replicas
11411 osd->clog->error() << info.pgid
11412 << " Unexpected Error: recovery ending with missing replicas";
11413 return work_in_progress;
11414 }
11415
11416 if (state_test(PG_STATE_RECOVERING)) {
11417 state_clear(PG_STATE_RECOVERING);
11418 state_clear(PG_STATE_FORCED_RECOVERY);
11419 if (needs_backfill()) {
11420 dout(10) << "recovery done, queuing backfill" << dendl;
11421 queue_peering_event(
11422 CephPeeringEvtRef(
11423 std::make_shared<CephPeeringEvt>(
11424 get_osdmap()->get_epoch(),
11425 get_osdmap()->get_epoch(),
11426 RequestBackfill())));
11427 } else {
11428 dout(10) << "recovery done, no backfill" << dendl;
11429 eio_errors_to_process = false;
11430 state_clear(PG_STATE_FORCED_BACKFILL);
11431 queue_peering_event(
11432 CephPeeringEvtRef(
11433 std::make_shared<CephPeeringEvt>(
11434 get_osdmap()->get_epoch(),
11435 get_osdmap()->get_epoch(),
11436 AllReplicasRecovered())));
11437 }
11438 } else { // backfilling
11439 state_clear(PG_STATE_BACKFILL);
11440 state_clear(PG_STATE_FORCED_BACKFILL);
11441 state_clear(PG_STATE_FORCED_RECOVERY);
11442 dout(10) << "recovery done, backfill done" << dendl;
11443 eio_errors_to_process = false;
11444 queue_peering_event(
11445 CephPeeringEvtRef(
11446 std::make_shared<CephPeeringEvt>(
11447 get_osdmap()->get_epoch(),
11448 get_osdmap()->get_epoch(),
11449 Backfilled())));
11450 }
11451
11452 return false;
11453}
11454
11455/**
11456 * start up to max recovery ops.
11457 * returns the number of ops started.
11458 */
11459uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11460{
11461 assert(is_primary());
11462
11463 const auto &missing = pg_log.get_missing();
11464
11465 dout(10) << "recover_primary recovering " << recovering.size()
11466 << " in pg" << dendl;
11467 dout(10) << "recover_primary " << missing << dendl;
11468 dout(25) << "recover_primary " << missing.get_items() << dendl;
11469
11470 // look at log!
11471 pg_log_entry_t *latest = 0;
11472 unsigned started = 0;
11473 int skipped = 0;
11474
11475 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11476 map<version_t, hobject_t>::const_iterator p =
11477 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11478 while (p != missing.get_rmissing().end()) {
11479 handle.reset_tp_timeout();
11480 hobject_t soid;
11481 version_t v = p->first;
11482
11483 if (pg_log.get_log().objects.count(p->second)) {
11484 latest = pg_log.get_log().objects.find(p->second)->second;
11485 assert(latest->is_update() || latest->is_delete());
11486 soid = latest->soid;
11487 } else {
11488 latest = 0;
11489 soid = p->second;
11490 }
11491 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11492 ++p;
11493
11494 hobject_t head = soid.get_head();
11495
11496 eversion_t need = item.need;
11497
11498 dout(10) << "recover_primary "
11499 << soid << " " << item.need
11500 << (missing.is_missing(soid) ? " (missing)":"")
11501 << (missing.is_missing(head) ? " (missing head)":"")
11502 << (recovering.count(soid) ? " (recovering)":"")
11503 << (recovering.count(head) ? " (recovering head)":"")
11504 << dendl;
11505
11506 if (latest) {
11507 switch (latest->op) {
11508 case pg_log_entry_t::CLONE:
11509 /*
11510 * Handling for this special case removed for now, until we
11511 * can correctly construct an accurate SnapSet from the old
11512 * one.
11513 */
11514 break;
11515
11516 case pg_log_entry_t::LOST_REVERT:
11517 {
11518 if (item.have == latest->reverting_to) {
11519 ObjectContextRef obc = get_object_context(soid, true);
11520
11521 if (obc->obs.oi.version == latest->version) {
11522 // I'm already reverting
11523 dout(10) << " already reverting " << soid << dendl;
11524 } else {
11525 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11526 obc->ondisk_write_lock();
11527 obc->obs.oi.version = latest->version;
11528
11529 ObjectStore::Transaction t;
11530 bufferlist b2;
11531 obc->obs.oi.encode(
11532 b2,
11533 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11534 assert(!pool.info.require_rollback());
11535 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11536
11537 recover_got(soid, latest->version);
11538 missing_loc.add_location(soid, pg_whoami);
11539
11540 ++active_pushes;
11541
11542 osd->store->queue_transaction(osr.get(), std::move(t),
11543 new C_OSD_AppliedRecoveredObject(this, obc),
11544 new C_OSD_CommittedPushedObject(
11545 this,
11546 get_osdmap()->get_epoch(),
11547 info.last_complete),
11548 new C_OSD_OndiskWriteUnlock(obc));
11549 continue;
11550 }
11551 } else {
11552 /*
11553 * Pull the old version of the object. Update missing_loc here to have the location
11554 * of the version we want.
11555 *
11556 * This doesn't use the usual missing_loc paths, but that's okay:
11557 * - if we have it locally, we hit the case above, and go from there.
11558 * - if we don't, we always pass through this case during recovery and set up the location
11559 * properly.
11560 * - this way we don't need to mangle the missing code to be general about needing an old
11561 * version...
11562 */
11563 eversion_t alternate_need = latest->reverting_to;
11564 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11565
11566 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11567 p != peer_missing.end();
11568 ++p)
11569 if (p->second.is_missing(soid, need) &&
11570 p->second.get_items().at(soid).have == alternate_need) {
11571 missing_loc.add_location(soid, p->first);
11572 }
11573 dout(10) << " will pull " << alternate_need << " or " << need
11574 << " from one of " << missing_loc.get_locations(soid)
11575 << dendl;
11576 }
11577 }
11578 break;
11579 }
11580 }
11581
11582 if (!recovering.count(soid)) {
11583 if (recovering.count(head)) {
11584 ++skipped;
11585 } else {
11586 int r = recover_missing(
11587 soid, need, get_recovery_op_priority(), h);
11588 switch (r) {
11589 case PULL_YES:
11590 ++started;
11591 break;
11592 case PULL_OTHER:
11593 ++started;
11594 case PULL_NONE:
11595 ++skipped;
11596 break;
11597 default:
11598 ceph_abort();
11599 }
11600 if (started >= max)
11601 break;
11602 }
11603 }
11604
11605 // only advance last_requested if we haven't skipped anything
11606 if (!skipped)
11607 pg_log.set_last_requested(v);
11608 }
11609
11610 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11611 return started;
11612}
11613
11614bool PrimaryLogPG::primary_error(
11615 const hobject_t& soid, eversion_t v)
11616{
11617 pg_log.missing_add(soid, v, eversion_t());
11618 pg_log.set_last_requested(0);
11619 missing_loc.remove_location(soid, pg_whoami);
11620 bool uhoh = true;
11621 assert(!actingbackfill.empty());
11622 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11623 i != actingbackfill.end();
11624 ++i) {
11625 if (*i == get_primary()) continue;
11626 pg_shard_t peer = *i;
11627 if (!peer_missing[peer].is_missing(soid, v)) {
11628 missing_loc.add_location(soid, peer);
11629 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11630 << ", there should be a copy on shard " << peer << dendl;
11631 uhoh = false;
11632 }
11633 }
11634 if (uhoh)
11635 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11636 else
11637 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11638 << ", will try copies on " << missing_loc.get_locations(soid);
11639 return uhoh;
11640}
11641
11642int PrimaryLogPG::prep_object_replica_deletes(
11643 const hobject_t& soid, eversion_t v,
11644 PGBackend::RecoveryHandle *h)
11645{
11646 assert(is_primary());
11647 dout(10) << __func__ << ": on " << soid << dendl;
11648
11649 start_recovery_op(soid);
11650 assert(!recovering.count(soid));
11651 recovering.insert(make_pair(soid, ObjectContextRef()));
11652
11653 pgbackend->recover_delete_object(soid, v, h);
11654 return 1;
11655}
11656
11657int PrimaryLogPG::prep_object_replica_pushes(
11658 const hobject_t& soid, eversion_t v,
11659 PGBackend::RecoveryHandle *h)
11660{
11661 assert(is_primary());
11662 dout(10) << __func__ << ": on " << soid << dendl;
11663
11664 // NOTE: we know we will get a valid oloc off of disk here.
11665 ObjectContextRef obc = get_object_context(soid, false);
11666 if (!obc) {
11667 primary_error(soid, v);
11668 return 0;
11669 }
11670
11671 if (!obc->get_recovery_read()) {
11672 dout(20) << "recovery delayed on " << soid
11673 << "; could not get rw_manager lock" << dendl;
11674 return 0;
11675 } else {
11676 dout(20) << "recovery got recovery read lock on " << soid
11677 << dendl;
11678 }
11679
11680 start_recovery_op(soid);
11681 assert(!recovering.count(soid));
11682 recovering.insert(make_pair(soid, obc));
11683
11684 /* We need this in case there is an in progress write on the object. In fact,
11685 * the only possible write is an update to the xattr due to a lost_revert --
11686 * a client write would be blocked since the object is degraded.
11687 * In almost all cases, therefore, this lock should be uncontended.
11688 */
11689 obc->ondisk_read_lock();
11690 int r = pgbackend->recover_object(
11691 soid,
11692 v,
11693 ObjectContextRef(),
11694 obc, // has snapset context
11695 h);
11696 obc->ondisk_read_unlock();
11697 if (r < 0) {
11698 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11699 primary_failed(soid);
11700 primary_error(soid, v);
11701 return 0;
11702 }
11703 return 1;
11704}
11705
11706uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11707{
11708 dout(10) << __func__ << "(" << max << ")" << dendl;
11709 uint64_t started = 0;
11710
11711 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11712
11713 // this is FAR from an optimal recovery order. pretty lame, really.
11714 assert(!actingbackfill.empty());
11715 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11716 i != actingbackfill.end();
11717 ++i) {
11718 if (*i == get_primary()) continue;
11719 pg_shard_t peer = *i;
11720 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11721 assert(pm != peer_missing.end());
11722 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11723 assert(pi != peer_info.end());
11724 size_t m_sz = pm->second.num_missing();
11725
11726 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11727 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11728
11729 // oldest first!
11730 const pg_missing_t &m(pm->second);
11731 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11732 p != m.get_rmissing().end() && started < max;
11733 ++p) {
11734 handle.reset_tp_timeout();
11735 const hobject_t soid(p->second);
11736
11737 if (missing_loc.is_unfound(soid)) {
11738 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11739 continue;
11740 }
11741
11742 if (soid > pi->second.last_backfill) {
11743 if (!recovering.count(soid)) {
11744 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11745 derr << __func__ << ": object added to missing set for backfill, but "
11746 << "is not in recovering, error!" << dendl;
11747 ceph_abort();
11748 }
11749 continue;
11750 }
11751
11752 if (recovering.count(soid)) {
11753 dout(10) << __func__ << ": already recovering " << soid << dendl;
11754 continue;
11755 }
11756
11757 if (missing_loc.is_deleted(soid)) {
11758 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11759 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11760 started += prep_object_replica_deletes(soid, r->second.need, h);
11761 continue;
11762 }
11763
11764 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11765 dout(10) << __func__ << ": " << soid.get_head()
11766 << " still missing on primary" << dendl;
11767 continue;
11768 }
11769
11770 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11771 dout(10) << __func__ << ": " << soid.get_snapdir()
11772 << " still missing on primary" << dendl;
11773 continue;
11774 }
11775
11776 if (pg_log.get_missing().is_missing(soid)) {
11777 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11778 continue;
11779 }
11780
11781 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11782 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11783 started += prep_object_replica_pushes(soid, r->second.need,
11784 h);
11785 }
11786 }
11787
11788 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11789 return started;
11790}
11791
11792hobject_t PrimaryLogPG::earliest_peer_backfill() const
11793{
11794 hobject_t e = hobject_t::get_max();
11795 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11796 i != backfill_targets.end();
11797 ++i) {
11798 pg_shard_t peer = *i;
11799 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11800 peer_backfill_info.find(peer);
11801 assert(iter != peer_backfill_info.end());
11802 if (iter->second.begin < e)
11803 e = iter->second.begin;
11804 }
11805 return e;
11806}
11807
11808bool PrimaryLogPG::all_peer_done() const
11809{
11810 // Primary hasn't got any more objects
11811 assert(backfill_info.empty());
11812
11813 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11814 i != backfill_targets.end();
11815 ++i) {
11816 pg_shard_t bt = *i;
11817 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11818 peer_backfill_info.find(bt);
11819 assert(piter != peer_backfill_info.end());
11820 const BackfillInterval& pbi = piter->second;
11821 // See if peer has more to process
11822 if (!pbi.extends_to_end() || !pbi.empty())
11823 return false;
11824 }
11825 return true;
11826}
11827
11828/**
11829 * recover_backfill
11830 *
11831 * Invariants:
11832 *
11833 * backfilled: fully pushed to replica or present in replica's missing set (both
11834 * our copy and theirs).
11835 *
11836 * All objects on a backfill_target in
11837 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11838 * objects have been actually deleted and all logically-valid objects are replicated.
11839 * There may be PG objects in this interval yet to be backfilled.
11840 *
11841 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11842 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11843 *
11844 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11845 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11846 * interval remain on the backfill target.
11847 *
11848 * For a backfill target, all objects <= peer_info[target].last_backfill
11849 * have been backfilled to target
11850 *
11851 * There *MAY* be missing/outdated objects between last_backfill_started and
11852 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11853 * io created objects since the last scan. For this reason, we call
11854 * update_range() again before continuing backfill.
11855 */
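/*
 * Concrete picture (illustrative only): if backfill_info.begin == B and each
 * target t has peer_backfill_info[t].begin == P_t, then every object below
 * min(B, min over t of P_t) is fully backfilled on every target, and that
 * watermark bounds how far last_backfill may be advanced below.
 */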
11856uint64_t PrimaryLogPG::recover_backfill(
11857 uint64_t max,
11858 ThreadPool::TPHandle &handle, bool *work_started)
11859{
11860 dout(10) << "recover_backfill (" << max << ")"
11861 << " bft=" << backfill_targets
11862 << " last_backfill_started " << last_backfill_started
11863 << (new_backfill ? " new_backfill":"")
11864 << dendl;
11865 assert(!backfill_targets.empty());
11866
11867 // Initialize from prior backfill state
11868 if (new_backfill) {
11869 // on_activate() was called prior to getting here
11870 assert(last_backfill_started == earliest_backfill());
11871 new_backfill = false;
11872
11873 // initialize BackfillIntervals
11874 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11875 i != backfill_targets.end();
11876 ++i) {
11877 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11878 }
11879 backfill_info.reset(last_backfill_started);
11880
11881 backfills_in_flight.clear();
11882 pending_backfill_updates.clear();
11883 }
11884
11885 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11886 i != backfill_targets.end();
11887 ++i) {
11888 dout(10) << "peer osd." << *i
11889 << " info " << peer_info[*i]
11890 << " interval " << peer_backfill_info[*i].begin
11891 << "-" << peer_backfill_info[*i].end
11892 << " " << peer_backfill_info[*i].objects.size() << " objects"
11893 << dendl;
11894 }
11895
11896 // update our local interval to cope with recent changes
11897 backfill_info.begin = last_backfill_started;
11898 update_range(&backfill_info, handle);
11899
11900 unsigned ops = 0;
11901 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11902 set<hobject_t> add_to_stat;
11903
11904 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11905 i != backfill_targets.end();
11906 ++i) {
11907 peer_backfill_info[*i].trim_to(
11908 std::max(peer_info[*i].last_backfill, last_backfill_started));
11909 }
11910 backfill_info.trim_to(last_backfill_started);
11911
11912 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11913 while (ops < max) {
11914 if (backfill_info.begin <= earliest_peer_backfill() &&
11915 !backfill_info.extends_to_end() && backfill_info.empty()) {
11916 hobject_t next = backfill_info.end;
11917 backfill_info.reset(next);
11918 backfill_info.end = hobject_t::get_max();
11919 update_range(&backfill_info, handle);
11920 backfill_info.trim();
11921 }
11922
11923 dout(20) << " my backfill interval " << backfill_info << dendl;
11924
11925 bool sent_scan = false;
11926 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11927 i != backfill_targets.end();
11928 ++i) {
11929 pg_shard_t bt = *i;
11930 BackfillInterval& pbi = peer_backfill_info[bt];
11931
11932 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11933 if (pbi.begin <= backfill_info.begin &&
11934 !pbi.extends_to_end() && pbi.empty()) {
11935 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11936 epoch_t e = get_osdmap()->get_epoch();
11937 MOSDPGScan *m = new MOSDPGScan(
11938 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11939 spg_t(info.pgid.pgid, bt.shard),
11940 pbi.end, hobject_t());
11941 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11942 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11943 waiting_on_backfill.insert(bt);
11944 sent_scan = true;
11945 }
11946 }
11947
11948 // Count simultaneous scans as a single op and let those complete
11949 if (sent_scan) {
11950 ops++;
11951 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11952 break;
11953 }
11954
11955 if (backfill_info.empty() && all_peer_done()) {
11956 dout(10) << " reached end for both local and all peers" << dendl;
11957 break;
11958 }
11959
11960 // Get object within set of peers to operate on and
11961 // the set of targets for which that object applies.
11962 hobject_t check = earliest_peer_backfill();
11963
11964 if (check < backfill_info.begin) {
11965
11966 set<pg_shard_t> check_targets;
11967 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11968 i != backfill_targets.end();
11969 ++i) {
11970 pg_shard_t bt = *i;
11971 BackfillInterval& pbi = peer_backfill_info[bt];
11972 if (pbi.begin == check)
11973 check_targets.insert(bt);
11974 }
11975 assert(!check_targets.empty());
11976
11977 dout(20) << " BACKFILL removing " << check
11978 << " from peers " << check_targets << dendl;
11979 for (set<pg_shard_t>::iterator i = check_targets.begin();
11980 i != check_targets.end();
11981 ++i) {
11982 pg_shard_t bt = *i;
11983 BackfillInterval& pbi = peer_backfill_info[bt];
11984 assert(pbi.begin == check);
11985
11986 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11987 pbi.pop_front();
11988 }
11989
11990 /* This requires a bit of explanation. We compare head against
11991 * last_backfill to determine whether to send an operation
11992 * to the replica. A single write operation can touch up to three
11993 * objects: head, the snapdir, and a new clone which sorts closer to
11994 * head than any existing clone. If last_backfill points at a clone,
11995 * the transaction won't be sent and all 3 must lie on the right side
11996 * of the line (i.e., we'll backfill them later). If last_backfill
11997 * points at snapdir, it sorts greater than head, so we send the
11998 * transaction which is correct because all three must lie to the left
11999 * of the line.
12000 *
12001 * If it points at head, we have a bit of an issue. If head actually
12002 * exists, no problem, because any transaction which touches snapdir
12003 * must end up creating it (and deleting head), so sending the
12004 * operation won't pose a problem -- we'll end up having to scan it,
12005 * but it'll end up being the right version so we won't bother to
12006 * rebackfill it. However, if head doesn't exist, any write on head
12007 * will remove snapdir. For a replicated pool, this isn't a problem,
12008 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12009 * It only poses a problem for EC pools, because we never just delete
12010 * an object, we rename it into a rollback object. That operation
12011 * will end up crashing the osd with ENOENT. Tolerating the failure
12012 * wouldn't work either, even if snapdir exists, we'd be creating a
12013 * rollback object past the last_backfill line which wouldn't get
12014 * cleaned up (no rollback objects past the last_backfill line is an
12015 * existing important invariant). Thus, let's avoid the whole issue
12016 * by just not updating last_backfill_started here if head doesn't
12017 * exist and snapdir does. We aren't using up a recovery count here,
12018 * so we're going to recover snapdir immediately anyway. We'll only
12019 * fail "backward" if we fail to get the rw lock and that just means
12020 * we'll re-process this section of the hash space again.
12021 *
12022 * I'm choosing this hack here because the really "correct" answer is
12023 * going to be to unify snapdir and head into a single object (a
12024 * snapdir is really just a confusing way to talk about head existing
12025 * as a whiteout), but doing that is going to be a somewhat larger
12026 * undertaking.
12027 *
12028 * @see http://tracker.ceph.com/issues/17668
12029 */
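      // restated (illustrative): skip advancing last_backfill_started only
      // when check is a head object that exists solely as a snapdir locally,
      // i.e. our next local entry is that object's snapdir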
12030 if (!(check.is_head() &&
12031 backfill_info.begin.is_snapdir() &&
12032 check == backfill_info.begin.get_head()))
12033 last_backfill_started = check;
12034
12035 // Don't increment ops here: deletions are cheap and, unlike
12036 // real recovery_ops, they are not replied to; also, we can't
12037 // increment ops without requeueing ourselves
12038 // for recovery.
12039 } else {
12040 eversion_t& obj_v = backfill_info.objects.begin()->second;
12041
12042 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12043 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12044 i != backfill_targets.end();
12045 ++i) {
12046 pg_shard_t bt = *i;
12047 BackfillInterval& pbi = peer_backfill_info[bt];
12048 // Find all check peers that have the wrong version
12049 if (check == backfill_info.begin && check == pbi.begin) {
12050 if (pbi.objects.begin()->second != obj_v) {
12051 need_ver_targs.push_back(bt);
12052 } else {
12053 keep_ver_targs.push_back(bt);
12054 }
12055 } else {
12056 pg_info_t& pinfo = peer_info[bt];
12057
12058 // Only include peers that we've caught up to their backfill line
12059 // otherwise, they only appear to be missing this object
12060 // because their pbi.begin > backfill_info.begin.
12061 if (backfill_info.begin > pinfo.last_backfill)
12062 missing_targs.push_back(bt);
12063 else
12064 skip_targs.push_back(bt);
12065 }
12066 }
12067
12068 if (!keep_ver_targs.empty()) {
12069 // These peers have version obj_v
12070 dout(20) << " BACKFILL keeping " << check
12071 << " with ver " << obj_v
12072 << " on peers " << keep_ver_targs << dendl;
12073 //assert(!waiting_for_degraded_object.count(check));
12074 }
12075 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12076 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12077 assert(obc);
12078 if (obc->get_recovery_read()) {
12079 if (!need_ver_targs.empty()) {
12080 dout(20) << " BACKFILL replacing " << check
12081 << " with ver " << obj_v
12082 << " to peers " << need_ver_targs << dendl;
12083 }
12084 if (!missing_targs.empty()) {
12085 dout(20) << " BACKFILL pushing " << backfill_info.begin
12086 << " with ver " << obj_v
12087 << " to peers " << missing_targs << dendl;
12088 }
12089 vector<pg_shard_t> all_push = need_ver_targs;
12090 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12091
12092 handle.reset_tp_timeout();
12093 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12094 if (r < 0) {
12095 *work_started = true;
12096 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12097 break;
12098 }
12099 ops++;
12100 } else {
12101 *work_started = true;
12102 dout(20) << "backfill blocking on " << backfill_info.begin
12103 << "; could not get rw_manager lock" << dendl;
12104 break;
12105 }
12106 }
12107 dout(20) << "need_ver_targs=" << need_ver_targs
12108 << " keep_ver_targs=" << keep_ver_targs << dendl;
12109 dout(20) << "backfill_targets=" << backfill_targets
12110 << " missing_targs=" << missing_targs
12111 << " skip_targs=" << skip_targs << dendl;
12112
12113 last_backfill_started = backfill_info.begin;
12114 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12115 backfill_info.pop_front();
12116 vector<pg_shard_t> check_targets = need_ver_targs;
12117 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12118 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12119 i != check_targets.end();
12120 ++i) {
12121 pg_shard_t bt = *i;
12122 BackfillInterval& pbi = peer_backfill_info[bt];
12123 pbi.pop_front();
12124 }
12125 }
12126 }
12127
12128 hobject_t backfill_pos =
12129 std::min(backfill_info.begin, earliest_peer_backfill());
12130
12131 for (set<hobject_t>::iterator i = add_to_stat.begin();
12132 i != add_to_stat.end();
12133 ++i) {
12134 ObjectContextRef obc = get_object_context(*i, false);
12135 assert(obc);
12136 pg_stat_t stat;
12137 add_object_context_to_pg_stat(obc, &stat);
12138 pending_backfill_updates[*i] = stat;
12139 }
12140 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12141 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12142 for (unsigned i = 0; i < to_remove.size(); ++i) {
12143 handle.reset_tp_timeout();
12144 const hobject_t& oid = to_remove[i].get<0>();
12145 eversion_t v = to_remove[i].get<1>();
12146 pg_shard_t peer = to_remove[i].get<2>();
12147 MOSDPGBackfillRemove *m;
12148 auto it = reqs.find(peer);
12149 if (it != reqs.end()) {
12150 m = it->second;
12151 } else {
12152 m = reqs[peer] = new MOSDPGBackfillRemove(
12153 spg_t(info.pgid.pgid, peer.shard),
12154 get_osdmap()->get_epoch());
12155 }
12156 m->ls.push_back(make_pair(oid, v));
12157
12158 if (oid <= last_backfill_started)
12159 pending_backfill_updates[oid]; // add empty stat!
12160 }
12161 for (auto p : reqs) {
12162 osd->send_message_osd_cluster(p.first.osd, p.second,
12163 get_osdmap()->get_epoch());
12164 }
12165 } else {
12166 // for jewel targets
12167 for (unsigned i = 0; i < to_remove.size(); ++i) {
12168 handle.reset_tp_timeout();
12169
12170 // ordered before any subsequent updates
12171 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12172 to_remove[i].get<2>());
12173
12174 if (to_remove[i].get<0>() <= last_backfill_started)
12175 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12176 }
12177 }
12178
12179 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12180
12181 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12182 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12183 i != backfills_in_flight.end();
12184 ++i) {
12185 dout(20) << *i << " is still in flight" << dendl;
12186 }
12187
12188 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12189 backfill_pos : *(backfills_in_flight.begin());
12190 hobject_t new_last_backfill = earliest_backfill();
12191 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12192 for (map<hobject_t, pg_stat_t>::iterator i =
12193 pending_backfill_updates.begin();
12194 i != pending_backfill_updates.end() &&
12195 i->first < next_backfill_to_complete;
12196 pending_backfill_updates.erase(i++)) {
12197 dout(20) << " pending_backfill_update " << i->first << dendl;
12198 assert(i->first > new_last_backfill);
12199 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12200 j != backfill_targets.end();
12201 ++j) {
12202 pg_shard_t bt = *j;
12203 pg_info_t& pinfo = peer_info[bt];
12204 // Add stats to all peers that were missing the object
12205 if (i->first > pinfo.last_backfill)
12206 pinfo.stats.add(i->second);
12207 }
12208 new_last_backfill = i->first;
12209 }
12210 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12211
12212 assert(!pending_backfill_updates.empty() ||
12213 new_last_backfill == last_backfill_started);
12214 if (pending_backfill_updates.empty() &&
12215 backfill_pos.is_max()) {
12216 assert(backfills_in_flight.empty());
12217 new_last_backfill = backfill_pos;
12218 last_backfill_started = backfill_pos;
12219 }
12220 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12221
12222 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12223 // all the backfill targets. Otherwise, we will move last_backfill up on
12224 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12225 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12226 i != backfill_targets.end();
12227 ++i) {
12228 pg_shard_t bt = *i;
12229 pg_info_t& pinfo = peer_info[bt];
12230
12231 if (new_last_backfill > pinfo.last_backfill) {
12232 pinfo.set_last_backfill(new_last_backfill);
12233 epoch_t e = get_osdmap()->get_epoch();
12234 MOSDPGBackfill *m = NULL;
12235 if (pinfo.last_backfill.is_max()) {
12236 m = new MOSDPGBackfill(
12237 MOSDPGBackfill::OP_BACKFILL_FINISH,
12238 e,
12239 last_peering_reset,
12240 spg_t(info.pgid.pgid, bt.shard));
12241 // Use default priority here, must match sub_op priority
12242 /* pinfo.stats might be wrong if we did log-based recovery on the
12243 * backfilled portion in addition to continuing backfill.
12244 */
12245 pinfo.stats = info.stats;
12246 start_recovery_op(hobject_t::get_max());
12247 } else {
12248 m = new MOSDPGBackfill(
12249 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12250 e,
12251 last_peering_reset,
12252 spg_t(info.pgid.pgid, bt.shard));
12253 // Use default priority here, must match sub_op priority
12254 }
12255 m->last_backfill = pinfo.last_backfill;
12256 m->stats = pinfo.stats;
12257 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12258 dout(10) << " peer " << bt
12259 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12260 << " / " << info.stats.stats.sum.num_objects << dendl;
12261 }
12262 }
12263
12264 if (ops)
12265 *work_started = true;
12266 return ops;
12267}
12268
12269int PrimaryLogPG::prep_backfill_object_push(
12270 hobject_t oid, eversion_t v,
12271 ObjectContextRef obc,
12272 vector<pg_shard_t> peers,
12273 PGBackend::RecoveryHandle *h)
12274{
12275 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12276 assert(!peers.empty());
12277
12278 backfills_in_flight.insert(oid);
12279 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12280 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12281 assert(bpm != peer_missing.end());
12282 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12283 }
12284
12285 assert(!recovering.count(oid));
12286
12287 start_recovery_op(oid);
12288 recovering.insert(make_pair(oid, obc));
12289
12290 // We need to take the read_lock here in order to flush in-progress writes
12291 obc->ondisk_read_lock();
12292 int r = pgbackend->recover_object(
12293 oid,
12294 v,
12295 ObjectContextRef(),
12296 obc,
12297 h);
12298 obc->ondisk_read_unlock();
12299 if (r < 0) {
12300 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12301 primary_failed(oid);
12302 primary_error(oid, v);
12303 backfills_in_flight.erase(oid);
12304 missing_loc.add_missing(oid, v, eversion_t());
12305 }
12306 return r;
12307}
12308
12309void PrimaryLogPG::update_range(
12310 BackfillInterval *bi,
12311 ThreadPool::TPHandle &handle)
12312{
12313 int local_min = cct->_conf->osd_backfill_scan_min;
12314 int local_max = cct->_conf->osd_backfill_scan_max;
12315
12316 if (bi->version < info.log_tail) {
12317 dout(10) << __func__ << ": bi is old, rescanning local backfill_info"
12318 << dendl;
12319 if (last_update_applied >= info.log_tail) {
12320 bi->version = last_update_applied;
12321 } else {
12322 osr->flush();
12323 bi->version = info.last_update;
12324 }
12325 scan_range(local_min, local_max, bi, handle);
12326 }
12327
12328 if (bi->version >= projected_last_update) {
12329 dout(10) << __func__ << ": bi is current " << dendl;
12330 assert(bi->version == projected_last_update);
12331 } else if (bi->version >= info.log_tail) {
12332 if (pg_log.get_log().empty() && projected_log.empty()) {
12333 /* Because we don't move log_tail on split, the log might be
12334 * empty even if log_tail != last_update. However, the only
12335 * way to get here with an empty log is if log_tail is actually
12336 * eversion_t(), because otherwise the entry which changed
12337 * last_update since the last scan would have to be present.
12338 */
12339 assert(bi->version == eversion_t());
12340 return;
12341 }
12342
12343 dout(10) << __func__ << ": bi is old, (" << bi->version
12344 << ") can be updated with log to projected_last_update "
12345 << projected_last_update << dendl;
12346
12347 auto func = [&](const pg_log_entry_t &e) {
12348 dout(10) << __func__ << ": updating from version " << e.version
12349 << dendl;
12350 const hobject_t &soid = e.soid;
12351 if (soid >= bi->begin &&
12352 soid < bi->end) {
12353 if (e.is_update()) {
12354 dout(10) << __func__ << ": " << e.soid << " updated to version "
12355 << e.version << dendl;
12356 bi->objects.erase(e.soid);
12357 bi->objects.insert(
12358 make_pair(
12359 e.soid,
12360 e.version));
12361 } else if (e.is_delete()) {
12362 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12363 bi->objects.erase(e.soid);
12364 }
12365 }
12366 };
12367 dout(10) << "scanning pg log first" << dendl;
12368 pg_log.get_log().scan_log_after(bi->version, func);
12369 dout(10) << "scanning projected log" << dendl;
12370 projected_log.scan_log_after(bi->version, func);
12371 bi->version = projected_last_update;
12372 } else {
12373 assert(0 == "scan_range should have raised bi->version past log_tail");
12374 }
12375}
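/*
 * Summary sketch (illustrative): update_range() first rescans on-disk state
 * via scan_range() when the interval predates log_tail, then replays pg log
 * and projected log entries newer than bi->version over the interval, so bi
 * ends up current as of projected_last_update.
 */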
12376
12377void PrimaryLogPG::scan_range(
12378 int min, int max, BackfillInterval *bi,
12379 ThreadPool::TPHandle &handle)
12380{
12381 assert(is_locked());
12382 dout(10) << "scan_range from " << bi->begin << dendl;
12383 bi->clear_objects();
12384
12385 vector<hobject_t> ls;
12386 ls.reserve(max);
12387 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12388 assert(r >= 0);
12389 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12390 dout(20) << ls << dendl;
12391
12392 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12393 handle.reset_tp_timeout();
12394 ObjectContextRef obc;
12395 if (is_primary())
12396 obc = object_contexts.lookup(*p);
12397 if (obc) {
12398 bi->objects[*p] = obc->obs.oi.version;
12399 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12400 } else {
12401 bufferlist bl;
12402 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12403
12404 /* If the object does not exist here, it must have been removed
12405       * between the objects_list_partial call and here. This can happen
12406 * for the first item in the range, which is usually last_backfill.
12407 */
12408 if (r == -ENOENT)
12409 continue;
12410
12411 assert(r >= 0);
12412 object_info_t oi(bl);
12413 bi->objects[*p] = oi.version;
12414 dout(20) << " " << *p << " " << oi.version << dendl;
12415 }
12416 }
12417}
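// Note on the min/max arguments: objects_list_partial is expected to
// return at least min and at most max entries (when that many remain)
// starting at bi->begin, setting bi->end to the first object past the
// listing; update_range above passes osd_backfill_scan_min/_max through.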
12418
12419
12420/** check_local
12421 *
12422 * verifies that stray objects have been deleted
12423 */
12424void PrimaryLogPG::check_local()
12425{
12426 dout(10) << __func__ << dendl;
12427
12428 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12429
12430 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12431 return;
12432
12433 // just scan the log.
12434 set<hobject_t> did;
12435 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12436 p != pg_log.get_log().log.rend();
12437 ++p) {
12438 if (did.count(p->soid))
12439 continue;
12440 did.insert(p->soid);
12441
12442 if (p->is_delete() && !is_missing_object(p->soid)) {
12443 dout(10) << " checking " << p->soid
12444 << " at " << p->version << dendl;
12445 struct stat st;
12446 int r = osd->store->stat(
12447 ch,
12448 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12449 &st);
12450 if (r != -ENOENT) {
12451 derr << __func__ << " " << p->soid << " exists, but should have been "
12452 << "deleted" << dendl;
12453 assert(0 == "erroneously present object");
12454 }
12455 } else {
12456 // ignore old(+missing) objects
12457 }
12458 }
12459}
12460
12461
12462
12463// ===========================
12464// hit sets
12465
12466hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12467{
12468 ostringstream ss;
12469 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12470 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12471 info.pgid.ps(), info.pgid.pool(),
12472 cct->_conf->osd_hit_set_namespace);
12473 dout(20) << __func__ << " " << hoid << dendl;
12474 return hoid;
12475}
12476
12477hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12478 utime_t end,
12479 bool using_gmt)
12480{
12481 ostringstream ss;
12482 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12483 if (using_gmt) {
12484 start.gmtime(ss) << "_";
12485 end.gmtime(ss);
12486 } else {
12487 start.localtime(ss) << "_";
12488 end.localtime(ss);
12489 }
12490 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12491 info.pgid.ps(), info.pgid.pool(),
12492 cct->_conf->osd_hit_set_namespace);
12493 dout(20) << __func__ << " " << hoid << dendl;
12494 return hoid;
12495}
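// For a pool with pg 1.0 this yields names of roughly the form
//   hit_set_1.0_archive_<start>_<end>
// where the exact stamp text comes from utime_t::gmtime()/localtime().
// Rendering the stamps in GMT keeps the generated name identical on every
// replica regardless of local timezone, which is why using_gmt is recorded
// per hit set in pg_hit_set_info_t.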
12496
12497void PrimaryLogPG::hit_set_clear()
12498{
12499 dout(20) << __func__ << dendl;
12500 hit_set.reset();
12501 hit_set_start_stamp = utime_t();
12502}
12503
12504void PrimaryLogPG::hit_set_setup()
12505{
12506 if (!is_active() ||
12507 !is_primary()) {
12508 hit_set_clear();
12509 return;
12510 }
12511
12512 if (is_active() && is_primary() &&
12513 (!pool.info.hit_set_count ||
12514 !pool.info.hit_set_period ||
12515 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12516 hit_set_clear();
12517
12518 // only primary is allowed to remove all the hit set objects
12519 hit_set_remove_all();
12520 return;
12521 }
12522
12523 // FIXME: discard any previous data for now
12524 hit_set_create();
12525
12526 // include any writes we know about from the pg log. this doesn't
12527 // capture reads, but it is better than nothing!
12528 hit_set_apply_log();
12529}
12530
12531void PrimaryLogPG::hit_set_remove_all()
12532{
12533 // If any archives are degraded we skip this
12534 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12535 p != info.hit_set.history.end();
12536 ++p) {
12537 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12538
12539 // Once we hit a degraded object just skip
12540 if (is_degraded_or_backfilling_object(aoid))
12541 return;
12542 if (scrubber.write_blocked_by_scrub(aoid))
12543 return;
12544 }
12545
12546 if (!info.hit_set.history.empty()) {
12547 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12548 assert(p != info.hit_set.history.rend());
12549 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12550 assert(!is_degraded_or_backfilling_object(oid));
12551 ObjectContextRef obc = get_object_context(oid, false);
12552 assert(obc);
12553
12554 OpContextUPtr ctx = simple_opc_create(obc);
12555 ctx->at_version = get_next_version();
12556 ctx->updated_hset_history = info.hit_set;
12557 utime_t now = ceph_clock_now();
12558 ctx->mtime = now;
12559 hit_set_trim(ctx, 0);
12560 simple_opc_submit(std::move(ctx));
12561 }
12562
12563 info.hit_set = pg_hit_set_history_t();
12564 if (agent_state) {
12565 agent_state->discard_hit_sets();
12566 }
12567}
12568
12569void PrimaryLogPG::hit_set_create()
12570{
12571 utime_t now = ceph_clock_now();
12572 // make a copy of the params to modify
12573 HitSet::Params params(pool.info.hit_set_params);
12574
12575 dout(20) << __func__ << " " << params << dendl;
12576 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12577 BloomHitSet::Params *p =
12578 static_cast<BloomHitSet::Params*>(params.impl.get());
12579
12580 // convert false positive rate so it holds up across the full period
12581 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12582 if (p->get_fpp() <= 0.0)
12583 p->set_fpp(.01); // fpp cannot be zero!
12584
12585 // if we don't have a specified size, estimate target size based on the
12586 // previous bin!
12587 if (p->target_size == 0 && hit_set) {
12588 utime_t dur = now - hit_set_start_stamp;
12589 unsigned unique = hit_set->approx_unique_insert_count();
12590 dout(20) << __func__ << " previous set had approx " << unique
12591 << " unique items over " << dur << " seconds" << dendl;
12592 p->target_size = (double)unique * (double)pool.info.hit_set_period
12593 / (double)dur;
12594 }
12595 if (p->target_size <
12596 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12597 p->target_size = cct->_conf->osd_hit_set_min_size;
12598
12599 if (p->target_size
12600 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12601 p->target_size = cct->_conf->osd_hit_set_max_size;
12602
12603 p->seed = now.sec();
12604
12605 dout(10) << __func__ << " target_size " << p->target_size
12606 << " fpp " << p->get_fpp() << dendl;
12607 }
12608 hit_set.reset(new HitSet(params));
12609 hit_set_start_stamp = now;
12610}
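// Worked example (illustrative numbers): with hit_set_count=4 and a
// configured fpp of 0.04, each bloom bin is created with fpp 0.01, so the
// false-positive chance across the whole period stays near the configured
// value. If the previous bin saw ~1000 unique inserts over a 120s window
// and hit_set_period=240, target_size is estimated as 1000*240/120 = 2000,
// then clamped to [osd_hit_set_min_size, osd_hit_set_max_size].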
12611
12612/**
12613 * apply log entries to set
12614 *
12615 * this would only happen after peering, to at least capture writes
12616 * during an interval that was potentially lost.
12617 */
12618bool PrimaryLogPG::hit_set_apply_log()
12619{
12620 if (!hit_set)
12621 return false;
12622
12623 eversion_t to = info.last_update;
12624 eversion_t from = info.hit_set.current_last_update;
12625 if (to <= from) {
12626 dout(20) << __func__ << " no update" << dendl;
12627 return false;
12628 }
12629
12630 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12631 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12632 while (p != pg_log.get_log().log.rend() && p->version > to)
12633 ++p;
12634 while (p != pg_log.get_log().log.rend() && p->version > from) {
12635 hit_set->insert(p->soid);
12636 ++p;
12637 }
12638
12639 return true;
12640}
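// For instance, if current_last_update is 10'50 and last_update is 10'57,
// the first loop skips any entries newer than 10'57 and the second inserts
// the soid of every entry in (10'50, 10'57], approximating the write
// traffic from the interval the in-memory hit set did not observe.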
12641
12642void PrimaryLogPG::hit_set_persist()
12643{
12644 dout(10) << __func__ << dendl;
12645 bufferlist bl;
12646 unsigned max = pool.info.hit_set_count;
12647
12648 utime_t now = ceph_clock_now();
12649 hobject_t oid;
12650
12651 // If any archives are degraded we skip this persist request
12652 // account for the additional entry being added below
12653 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12654 p != info.hit_set.history.end();
12655 ++p) {
12656 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12657
12658 // Once we hit a degraded object just skip further trim
12659 if (is_degraded_or_backfilling_object(aoid))
12660 return;
12661 if (scrubber.write_blocked_by_scrub(aoid))
12662 return;
12663 }
12664
12665 // If backfill is in progress and we could possibly overlap with the
12666 // hit_set_* objects, back off. Since these all have
12667 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12668 // look just at that. This is necessary because our transactions
12669 // may include a modify of the new hit_set *and* a delete of the
12670 // old one, and this may span the backfill boundary.
12671 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12672 p != backfill_targets.end();
12673 ++p) {
12674 assert(peer_info.count(*p));
12675 const pg_info_t& pi = peer_info[*p];
12676 if (pi.last_backfill == hobject_t() ||
12677 pi.last_backfill.get_hash() == info.pgid.ps()) {
12678 dout(10) << __func__ << " backfill target osd." << *p
12679 << " last_backfill has not progressed past pgid ps"
12680 << dendl;
12681 return;
12682 }
12683 }
12684
12685
12686 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12687 new_hset.begin = hit_set_start_stamp;
12688 new_hset.end = now;
12689 oid = get_hit_set_archive_object(
12690 new_hset.begin,
12691 new_hset.end,
12692 new_hset.using_gmt);
12693
12694 // If the current object is degraded we skip this persist request
12695 if (scrubber.write_blocked_by_scrub(oid))
12696 return;
12697
12698 hit_set->seal();
12699 ::encode(*hit_set, bl);
12700 dout(20) << __func__ << " archive " << oid << dendl;
12701
12702 if (agent_state) {
12703 agent_state->add_hit_set(new_hset.begin, hit_set);
12704 uint32_t size = agent_state->hit_set_map.size();
12705 if (size >= pool.info.hit_set_count) {
12706 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12707 }
12708 hit_set_in_memory_trim(size);
12709 }
12710
12711 ObjectContextRef obc = get_object_context(oid, true);
12712 OpContextUPtr ctx = simple_opc_create(obc);
12713
12714 ctx->at_version = get_next_version();
12715 ctx->updated_hset_history = info.hit_set;
12716 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12717
12718 updated_hit_set_hist.current_last_update = info.last_update;
12719 new_hset.version = ctx->at_version;
12720
12721 updated_hit_set_hist.history.push_back(new_hset);
12722 hit_set_create();
12723
12724 // fabricate an object_info_t and SnapSet
12725 obc->obs.oi.version = ctx->at_version;
12726 obc->obs.oi.mtime = now;
12727 obc->obs.oi.size = bl.length();
12728 obc->obs.exists = true;
12729 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12730
12731 ctx->new_obs = obc->obs;
12732
12733 obc->ssc->snapset.head_exists = true;
12734 ctx->new_snapset = obc->ssc->snapset;
12735
12736 ctx->delta_stats.num_objects++;
12737 ctx->delta_stats.num_objects_hit_set_archive++;
12738 ctx->delta_stats.num_bytes += bl.length();
12739 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12740
12741 bufferlist bss;
12742 ::encode(ctx->new_snapset, bss);
12743 bufferlist boi(sizeof(ctx->new_obs.oi));
12744 ::encode(ctx->new_obs.oi, boi,
12745 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12746
12747 ctx->op_t->create(oid);
12748 if (bl.length()) {
12749 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12750 }
12751 map <string, bufferlist> attrs;
12752 attrs[OI_ATTR].claim(boi);
12753 attrs[SS_ATTR].claim(bss);
12754 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12755 ctx->log.push_back(
12756 pg_log_entry_t(
12757 pg_log_entry_t::MODIFY,
12758 oid,
12759 ctx->at_version,
12760 eversion_t(),
12761 0,
12762 osd_reqid_t(),
12763 ctx->mtime,
12764 0)
12765 );
12766
12767 hit_set_trim(ctx, max);
12768
12769 simple_opc_submit(std::move(ctx));
12770}
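// Net effect: a single repop whose transaction creates the new archive
// object (with fabricated OI_ATTR/SS_ATTR), appends one MODIFY log entry
// for it, and -- via hit_set_trim below -- one DELETE log entry per
// archive trimmed beyond hit_set_count, all versioned from the same
// get_next_version() base with at_version.version bumped per delete.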
12771
12772void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12773{
12774 assert(ctx->updated_hset_history);
12775 pg_hit_set_history_t &updated_hit_set_hist =
12776 *(ctx->updated_hset_history);
12777 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12778 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12779 assert(p != updated_hit_set_hist.history.end());
12780 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12781
12782 assert(!is_degraded_or_backfilling_object(oid));
12783
12784 dout(20) << __func__ << " removing " << oid << dendl;
12785 ++ctx->at_version.version;
12786 ctx->log.push_back(
12787 pg_log_entry_t(pg_log_entry_t::DELETE,
12788 oid,
12789 ctx->at_version,
12790 p->version,
12791 0,
12792 osd_reqid_t(),
12793 ctx->mtime,
12794 0));
12795
12796 ctx->op_t->remove(oid);
12797 updated_hit_set_hist.history.pop_front();
12798
12799 ObjectContextRef obc = get_object_context(oid, false);
12800 assert(obc);
12801 --ctx->delta_stats.num_objects;
12802 --ctx->delta_stats.num_objects_hit_set_archive;
12803 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12804 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12805 }
12806}
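// Example: if updated_hit_set_hist.history holds 5 archives and max is 3,
// the loop runs twice, deleting the two oldest archive objects, popping
// them from the front of the history list, and crediting the freed objects
// and bytes back to the delta stats.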
12807
12808void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12809{
12810 while (agent_state->hit_set_map.size() > max_in_memory) {
12811 agent_state->remove_oldest_hit_set();
12812 }
12813}
12814
12815
12816// =======================================
12817// cache agent
12818
12819void PrimaryLogPG::agent_setup()
12820{
12821 assert(is_locked());
12822 if (!is_active() ||
12823 !is_primary() ||
12824 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12825 pool.info.tier_of < 0 ||
12826 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12827 agent_clear();
12828 return;
12829 }
12830 if (!agent_state) {
12831 agent_state.reset(new TierAgentState);
12832
12833 // choose random starting position
12834 agent_state->position = hobject_t();
12835 agent_state->position.pool = info.pgid.pool();
12836 agent_state->position.set_hash(pool.info.get_random_pg_position(
12837 info.pgid.pgid,
12838 rand()));
12839 agent_state->start = agent_state->position;
12840
12841 dout(10) << __func__ << " allocated new state, position "
12842 << agent_state->position << dendl;
12843 } else {
12844 dout(10) << __func__ << " keeping existing state" << dendl;
12845 }
12846
12847 if (info.stats.stats_invalid) {
12848 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12849 }
12850
12851 agent_choose_mode();
12852}
12853
12854void PrimaryLogPG::agent_clear()
12855{
12856 agent_stop();
12857 agent_state.reset(NULL);
12858}
12859
12860// Return false if no objects operated on since start of object hash space
12861bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12862{
12863 lock();
12864 if (!agent_state) {
12865 dout(10) << __func__ << " no agent state, stopping" << dendl;
12866 unlock();
12867 return true;
12868 }
12869
12870 assert(!deleting);
12871
12872 if (agent_state->is_idle()) {
12873 dout(10) << __func__ << " idle, stopping" << dendl;
12874 unlock();
12875 return true;
12876 }
12877
12878 osd->logger->inc(l_osd_agent_wake);
12879
12880 dout(10) << __func__
12881 << " max " << start_max
12882 << ", flush " << agent_state->get_flush_mode_name()
12883 << ", evict " << agent_state->get_evict_mode_name()
12884 << ", pos " << agent_state->position
12885 << dendl;
12886 assert(is_primary());
12887 assert(is_active());
12888
12889 agent_load_hit_sets();
12890
12891 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12892 assert(base_pool);
12893
12894 int ls_min = 1;
12895 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12896
12897 // list some objects. this conveniently lists clones (oldest to
12898 // newest) before heads... the same order we want to flush in.
12899 //
12900 // NOTE: do not flush the Sequencer. we will assume that the
12901 // listing we get back is imprecise.
12902 vector<hobject_t> ls;
12903 hobject_t next;
12904 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12905 &ls, &next);
12906 assert(r >= 0);
12907 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12908 int started = 0;
12909 for (vector<hobject_t>::iterator p = ls.begin();
12910 p != ls.end();
12911 ++p) {
12912 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12913 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12914 osd->logger->inc(l_osd_agent_skip);
12915 continue;
12916 }
12917 if (is_degraded_or_backfilling_object(*p)) {
12918 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12919 osd->logger->inc(l_osd_agent_skip);
12920 continue;
12921 }
12922 if (is_missing_object(p->get_head())) {
12923 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12924 osd->logger->inc(l_osd_agent_skip);
12925 continue;
12926 }
12927 ObjectContextRef obc = get_object_context(*p, false, NULL);
12928 if (!obc) {
12929 // we didn't flush; we may miss something here.
12930 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12931 osd->logger->inc(l_osd_agent_skip);
12932 continue;
12933 }
12934 if (!obc->obs.exists) {
12935 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12936 osd->logger->inc(l_osd_agent_skip);
12937 continue;
12938 }
12939 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12940 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12941 osd->logger->inc(l_osd_agent_skip);
12942 continue;
12943 }
12944 if (obc->is_blocked()) {
12945 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12946 osd->logger->inc(l_osd_agent_skip);
12947 continue;
12948 }
12949 if (obc->is_request_pending()) {
12950 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12951 osd->logger->inc(l_osd_agent_skip);
12952 continue;
12953 }
12954
12955 // be careful flushing omap to an EC pool.
12956 if (!base_pool->supports_omap() &&
12957 obc->obs.oi.is_omap()) {
12958 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12959 osd->logger->inc(l_osd_agent_skip);
12960 continue;
12961 }
12962
12963 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12964 agent_maybe_evict(obc, false))
12965 ++started;
12966 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12967 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12968 ++started;
12969 --agent_flush_quota;
12970 }
12971 if (started >= start_max) {
12972 // If finishing early, set "next" to the next object
12973 if (++p != ls.end())
12974 next = *p;
12975 break;
12976 }
12977 }
12978
12979 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12980 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12981 agent_state->hist_age = 0;
12982 agent_state->temp_hist.decay();
12983 }
12984
12985 // Total objects operated on so far
12986 int total_started = agent_state->started + started;
12987 bool need_delay = false;
12988
12989 dout(20) << __func__ << " start pos " << agent_state->position
12990 << " next start pos " << next
12991 << " started " << total_started << dendl;
12992
12993 // See if we've made a full pass over the object hash space
12994 // This might check at most ls_max objects a second time to notice that
12995 // we've checked every object at least once.
12996 if (agent_state->position < agent_state->start &&
12997 next >= agent_state->start) {
12998 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
12999 if (total_started == 0)
13000 need_delay = true;
13001 else
13002 total_started = 0;
13003 agent_state->start = next;
13004 }
13005 agent_state->started = total_started;
13006
13007 // See if we are starting from beginning
13008 if (next.is_max())
13009 agent_state->position = hobject_t();
13010 else
13011 agent_state->position = next;
13012
13013 // Discard old in memory HitSets
13014 hit_set_in_memory_trim(pool.info.hit_set_count);
13015
13016 if (need_delay) {
13017 assert(agent_state->delaying == false);
13018 agent_delay();
13019 unlock();
13020 return false;
13021 }
13022 agent_choose_mode();
13023 unlock();
13024 return true;
13025}
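// Wrap-around bookkeeping sketch: the agent remembers the hash position
// where this sweep began (agent_state->start). When a later call observes
// position < start <= next, it has covered the whole hash space once; if
// nothing was flushed or evicted over that full cycle (total_started == 0)
// it delays via agent_delay() instead of spinning over a PG with no work.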
13026
13027void PrimaryLogPG::agent_load_hit_sets()
13028{
13029 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13030 return;
13031 }
13032
13033 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13034 dout(10) << __func__ << dendl;
13035 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13036 p != info.hit_set.history.end(); ++p) {
13037 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13038 dout(10) << __func__ << " loading " << p->begin << "-"
13039 << p->end << dendl;
13040 if (!pool.info.is_replicated()) {
13041 // FIXME: EC not supported here yet
13042 derr << __func__ << " on non-replicated pool" << dendl;
13043 break;
13044 }
13045
13046 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13047 if (is_unreadable_object(oid)) {
13048 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13049 break;
13050 }
13051
13052 ObjectContextRef obc = get_object_context(oid, false);
13053 if (!obc) {
13054 derr << __func__ << ": could not load hitset " << oid << dendl;
13055 break;
13056 }
13057
13058 bufferlist bl;
13059 {
13060 obc->ondisk_read_lock();
13061 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13062 assert(r >= 0);
13063 obc->ondisk_read_unlock();
13064 }
13065 HitSetRef hs(new HitSet);
13066 bufferlist::iterator pbl = bl.begin();
13067 ::decode(*hs, pbl);
13068 agent_state->add_hit_set(p->begin.sec(), hs);
13069 }
13070 }
13071 }
13072}
13073
13074bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13075{
13076 if (!obc->obs.oi.is_dirty()) {
13077 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13078 osd->logger->inc(l_osd_agent_skip);
13079 return false;
13080 }
13081 if (obc->obs.oi.is_cache_pinned()) {
13082 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13083 osd->logger->inc(l_osd_agent_skip);
13084 return false;
13085 }
13086
13087 utime_t now = ceph_clock_now();
13088 utime_t ob_local_mtime;
13089 if (obc->obs.oi.local_mtime != utime_t()) {
13090 ob_local_mtime = obc->obs.oi.local_mtime;
13091 } else {
13092 ob_local_mtime = obc->obs.oi.mtime;
13093 }
13094 bool evict_mode_full =
13095 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13096 if (!evict_mode_full &&
13097 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13098 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13099 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13100 osd->logger->inc(l_osd_agent_skip);
13101 return false;
13102 }
13103
13104 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13105 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13106 osd->logger->inc(l_osd_agent_skip);
13107 return false;
13108 }
13109
13110 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13111
13112 // FIXME: flush anything dirty, regardless of what distribution of
13113 // ages we expect.
13114
13115 hobject_t oid = obc->obs.oi.soid;
13116 osd->agent_start_op(oid);
13117 // no need to capture a pg ref, can't outlive fop or ctx
13118 std::function<void()> on_flush = [this, oid]() {
13119 osd->agent_finish_op(oid);
13120 };
13121
13122 int result = start_flush(
13123 OpRequestRef(), obc, false, NULL,
13124 on_flush);
13125 if (result != -EINPROGRESS) {
13126 on_flush();
13127 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13128 << " with " << result << dendl;
13129 osd->logger->inc(l_osd_agent_skip);
13130 return false;
13131 }
13132
13133 osd->logger->inc(l_osd_agent_flush);
13134 return true;
13135}
13136
13137bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13138{
13139 const hobject_t& soid = obc->obs.oi.soid;
13140 if (!after_flush && obc->obs.oi.is_dirty()) {
13141 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13142 return false;
13143 }
13144 if (!obc->obs.oi.watchers.empty()) {
13145 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13146 return false;
13147 }
13148 if (obc->is_blocked()) {
13149 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13150 return false;
13151 }
13152 if (obc->obs.oi.is_cache_pinned()) {
13153 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13154 return false;
13155 }
13156
13157 if (soid.snap == CEPH_NOSNAP) {
13158 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13159 if (result < 0) {
13160 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13161 return false;
13162 }
13163 }
13164
13165 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13166 // is this object older than cache_min_evict_age?
13167 utime_t now = ceph_clock_now();
13168 utime_t ob_local_mtime;
13169 if (obc->obs.oi.local_mtime != utime_t()) {
13170 ob_local_mtime = obc->obs.oi.local_mtime;
13171 } else {
13172 ob_local_mtime = obc->obs.oi.mtime;
13173 }
13174 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13175 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13176 osd->logger->inc(l_osd_agent_skip);
13177 return false;
13178 }
13179 // is this object old and/or cold enough?
13180 int temp = 0;
13181 uint64_t temp_upper = 0, temp_lower = 0;
13182 if (hit_set)
13183 agent_estimate_temp(soid, &temp);
13184 agent_state->temp_hist.add(temp);
13185 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13186
13187 dout(20) << __func__
13188 << " temp " << temp
13189 << " pos " << temp_lower << "-" << temp_upper
13190 << ", evict_effort " << agent_state->evict_effort
13191 << dendl;
13192 dout(30) << "agent_state:\n";
13193 Formatter *f = Formatter::create("");
13194 f->open_object_section("agent_state");
13195 agent_state->dump(f);
13196 f->close_section();
13197 f->flush(*_dout);
13198 delete f;
13199 *_dout << dendl;
13200
13201 if (1000000 - temp_upper >= agent_state->evict_effort)
13202 return false;
13203 }
13204
13205 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13206 OpContextUPtr ctx = simple_opc_create(obc);
13207
13208 if (!ctx->lock_manager.get_lock_type(
13209 ObjectContext::RWState::RWWRITE,
13210 obc->obs.oi.soid,
13211 obc,
13212 OpRequestRef())) {
13213 close_op_ctx(ctx.release());
13214 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13215 return false;
13216 }
13217
13218 osd->agent_start_evict_op();
13219 ctx->register_on_finish(
13220 [this]() {
13221 osd->agent_finish_evict_op();
13222 });
13223
13224 ctx->at_version = get_next_version();
13225 assert(ctx->new_obs.exists);
13226 int r = _delete_oid(ctx.get(), true, false);
13227 if (obc->obs.oi.is_omap())
13228 ctx->delta_stats.num_objects_omap--;
13229 ctx->delta_stats.num_evict++;
13230 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13231 if (obc->obs.oi.is_dirty())
13232 --ctx->delta_stats.num_objects_dirty;
13233 assert(r == 0);
13234 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13235 simple_opc_submit(std::move(ctx));
13236 osd->logger->inc(l_osd_tier_evict);
13237 osd->logger->inc(l_osd_agent_evict);
13238 return true;
13239}
13240
13241void PrimaryLogPG::agent_stop()
13242{
13243 dout(20) << __func__ << dendl;
13244 if (agent_state && !agent_state->is_idle()) {
13245 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13246 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13247 osd->agent_disable_pg(this, agent_state->evict_effort);
13248 }
13249}
13250
13251void PrimaryLogPG::agent_delay()
13252{
13253 dout(20) << __func__ << dendl;
13254 if (agent_state && !agent_state->is_idle()) {
13255 assert(agent_state->delaying == false);
13256 agent_state->delaying = true;
13257 osd->agent_disable_pg(this, agent_state->evict_effort);
13258 }
13259}
13260
13261void PrimaryLogPG::agent_choose_mode_restart()
13262{
13263 dout(20) << __func__ << dendl;
13264 lock();
13265 if (agent_state && agent_state->delaying) {
13266 agent_state->delaying = false;
13267 agent_choose_mode(true);
13268 }
13269 unlock();
13270}
13271
13272bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13273{
13274 bool requeued = false;
13275 // Let delay play out
13276 if (agent_state->delaying) {
13277 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13278 return requeued;
13279 }
13280
13281 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13282 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13283 unsigned evict_effort = 0;
13284
13285 if (info.stats.stats_invalid) {
13286 // idle; stats can't be trusted until we scrub.
13287 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13288 goto skip_calc;
13289 }
13290
13291 {
13292 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13293 assert(divisor > 0);
13294
13295 // adjust (effective) user objects down based on the number
13296 // of HitSet objects, which should not count toward our total since
13297 // they cannot be flushed.
13298 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13299
13300 // also exclude omap objects if ec backing pool
13301 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13302 assert(base_pool);
13303 if (!base_pool->supports_omap())
13304 unflushable += info.stats.stats.sum.num_objects_omap;
13305
13306 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13307 if (num_user_objects > unflushable)
13308 num_user_objects -= unflushable;
13309 else
13310 num_user_objects = 0;
13311
13312 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13313 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13314 num_user_bytes -= unflushable_bytes;
13315 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13316 num_user_bytes += num_overhead_bytes;
13317
13318 // also exclude omap objects from num_dirty when the base pool lacks omap
13319 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13320 if (!base_pool->supports_omap()) {
13321 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13322 num_dirty -= info.stats.stats.sum.num_objects_omap;
13323 else
13324 num_dirty = 0;
13325 }
13326
13327 dout(10) << __func__
13328 << " flush_mode: "
13329 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13330 << " evict_mode: "
13331 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13332 << " num_objects: " << info.stats.stats.sum.num_objects
13333 << " num_bytes: " << info.stats.stats.sum.num_bytes
13334 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13335 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13336 << " num_dirty: " << num_dirty
13337 << " num_user_objects: " << num_user_objects
13338 << " num_user_bytes: " << num_user_bytes
13339 << " num_overhead_bytes: " << num_overhead_bytes
13340 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13341 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13342 << dendl;
13343
13344 // get dirty, full ratios
13345 uint64_t dirty_micro = 0;
13346 uint64_t full_micro = 0;
13347 if (pool.info.target_max_bytes && num_user_objects > 0) {
13348 uint64_t avg_size = num_user_bytes / num_user_objects;
13349 dirty_micro =
13350 num_dirty * avg_size * 1000000 /
13351 MAX(pool.info.target_max_bytes / divisor, 1);
13352 full_micro =
13353 num_user_objects * avg_size * 1000000 /
13354 MAX(pool.info.target_max_bytes / divisor, 1);
13355 }
13356 if (pool.info.target_max_objects > 0) {
13357 uint64_t dirty_objects_micro =
13358 num_dirty * 1000000 /
13359 MAX(pool.info.target_max_objects / divisor, 1);
13360 if (dirty_objects_micro > dirty_micro)
13361 dirty_micro = dirty_objects_micro;
13362 uint64_t full_objects_micro =
13363 num_user_objects * 1000000 /
13364 MAX(pool.info.target_max_objects / divisor, 1);
13365 if (full_objects_micro > full_micro)
13366 full_micro = full_objects_micro;
13367 }
13368 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13369 << " full " << ((float)full_micro / 1000000.0)
13370 << dendl;
13371
13372 // flush mode
13373 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13374 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13375 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13376 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13377 flush_target += flush_slop;
13378 flush_high_target += flush_slop;
13379 } else {
13380 flush_target -= MIN(flush_target, flush_slop);
13381 flush_high_target -= MIN(flush_high_target, flush_slop);
13382 }
13383
13384 if (dirty_micro > flush_high_target) {
13385 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13386 } else if (dirty_micro > flush_target) {
13387 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13388 }
13389
13390 // evict mode
13391 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13392 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13393 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13394 evict_target += evict_slop;
13395 else
13396 evict_target -= MIN(evict_target, evict_slop);
13397
13398 if (full_micro > 1000000) {
13399 // evict anything clean
13400 evict_mode = TierAgentState::EVICT_MODE_FULL;
13401 evict_effort = 1000000;
13402 } else if (full_micro > evict_target) {
13404 // set effort in [0..1] range based on where we are between the
13404 // evict target and 100% full
13404 evict_mode = TierAgentState::EVICT_MODE_SOME;
13405 uint64_t over = full_micro - evict_target;
13406 uint64_t span = 1000000 - evict_target;
13407 evict_effort = MAX(over * 1000000 / span,
13408 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13409
13410 // quantize effort to avoid too much reordering in the agent_queue.
13411 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13412 assert(inc > 0);
13413 uint64_t was = evict_effort;
13414 evict_effort -= evict_effort % inc;
13415 if (evict_effort < inc)
13416 evict_effort = inc;
13417 assert(evict_effort >= inc && evict_effort <= 1000000);
13418 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13419 }
13420 }
13421
13422 skip_calc:
13423 bool old_idle = agent_state->is_idle();
13424 if (flush_mode != agent_state->flush_mode) {
13425 dout(5) << __func__ << " flush_mode "
13426 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13427 << " -> "
13428 << TierAgentState::get_flush_mode_name(flush_mode)
13429 << dendl;
13430 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13431 osd->agent_inc_high_count();
13432 info.stats.stats.sum.num_flush_mode_high = 1;
13433 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13434 info.stats.stats.sum.num_flush_mode_low = 1;
13435 }
13436 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13437 osd->agent_dec_high_count();
13438 info.stats.stats.sum.num_flush_mode_high = 0;
13439 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13440 info.stats.stats.sum.num_flush_mode_low = 0;
13441 }
13442 agent_state->flush_mode = flush_mode;
13443 }
13444 if (evict_mode != agent_state->evict_mode) {
13445 dout(5) << __func__ << " evict_mode "
13446 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13447 << " -> "
13448 << TierAgentState::get_evict_mode_name(evict_mode)
13449 << dendl;
13450 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13451 is_active()) {
13452 if (op)
13453 requeue_op(op);
13454 requeue_ops(waiting_for_active);
13455 requeue_ops(waiting_for_scrub);
13456 requeue_ops(waiting_for_cache_not_full);
13457 objects_blocked_on_cache_full.clear();
13458 requeued = true;
13459 }
13460 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13461 info.stats.stats.sum.num_evict_mode_some = 1;
13462 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13463 info.stats.stats.sum.num_evict_mode_full = 1;
13464 }
13465 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13466 info.stats.stats.sum.num_evict_mode_some = 0;
13467 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13468 info.stats.stats.sum.num_evict_mode_full = 0;
13469 }
13470 agent_state->evict_mode = evict_mode;
13471 }
13472 uint64_t old_effort = agent_state->evict_effort;
13473 if (evict_effort != agent_state->evict_effort) {
13474 dout(5) << __func__ << " evict_effort "
13475 << ((float)agent_state->evict_effort / 1000000.0)
13476 << " -> "
13477 << ((float)evict_effort / 1000000.0)
13478 << dendl;
13479 agent_state->evict_effort = evict_effort;
13480 }
13481
13482 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13483 // (including flush). This is probably fine (they should be
13484 // correlated) but it is not precisely correct.
13485 if (agent_state->is_idle()) {
13486 if (!restart && !old_idle) {
13487 osd->agent_disable_pg(this, old_effort);
13488 }
13489 } else {
13490 if (restart || old_idle) {
13491 osd->agent_enable_pg(this, agent_state->evict_effort);
13492 } else if (old_effort != agent_state->evict_effort) {
13493 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13494 }
13495 }
13496 return requeued;
13497}
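// Worked example (illustrative numbers): divisor 8 and target_max_bytes
// 80e9 give this PG a 10e9-byte target. 10000 user objects averaging
// 524288 bytes yield full_micro = 10000*524288*1000000/10e9 ~= 524288
// (0.52 full), and 4000 dirty objects yield dirty_micro ~= 209715 (0.21).
// With cache_target_dirty_ratio 0.4 and cache_target_full_ratio 0.8 (plus
// the osd_agent_slop hysteresis) both modes stay IDLE; dirty_micro rising
// past ~400000 would flip flush_mode to FLUSH_MODE_LOW.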
13498
13499void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13500{
13501 assert(hit_set);
13502 assert(temp);
13503 *temp = 0;
13504 if (hit_set->contains(oid))
13505 *temp = 1000000;
13506 unsigned i = 0;
13507 int last_n = pool.info.hit_set_search_last_n;
13508 for (map<time_t,HitSetRef>::reverse_iterator p =
13509 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13510 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13511 if (p->second->contains(oid)) {
13512 *temp += pool.info.get_grade(i);
13513 --last_n;
13514 }
13515 }
13516}
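// Temperature model sketch: a hit in the current in-memory set starts
// temp at 1000000; then, walking archived sets newest-first, each set
// containing the object adds pool.info.get_grade(i) -- a value that decays
// with the set's age rank i -- until hit_set_search_last_n hits have been
// counted, so recently and repeatedly accessed objects score hottest.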
13517
13518// Dup op detection
13519
13520bool PrimaryLogPG::already_complete(eversion_t v)
13521{
13522 dout(20) << __func__ << ": " << v << dendl;
13523 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13524 !i.end();
13525 ++i) {
13526 dout(20) << __func__ << ": " << **i << dendl;
13527 // skip copy from temp object ops
13528 if ((*i)->v == eversion_t()) {
13529 dout(20) << __func__ << ": " << **i
13530 << " version is empty" << dendl;
13531 continue;
13532 }
13533 if ((*i)->v > v) {
13534 dout(20) << __func__ << ": " << **i
13535 << " (*i)->v past v" << dendl;
13536 break;
13537 }
13538 if (!(*i)->all_committed) {
13539 dout(20) << __func__ << ": " << **i
13540 << " not committed, returning false"
13541 << dendl;
13542 return false;
13543 }
13544 }
13545 dout(20) << __func__ << ": returning true" << dendl;
13546 return true;
13547}
13548
13549bool PrimaryLogPG::already_ack(eversion_t v)
13550{
13551 dout(20) << __func__ << ": " << v << dendl;
13552 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13553 !i.end();
13554 ++i) {
13555 // skip copy from temp object ops
13556 if ((*i)->v == eversion_t()) {
13557 dout(20) << __func__ << ": " << **i
13558 << " version is empty" << dendl;
13559 continue;
13560 }
13561 if ((*i)->v > v) {
13562 dout(20) << __func__ << ": " << **i
13563 << " (*i)->v past v" << dendl;
13564 break;
13565 }
13566 if (!(*i)->all_applied) {
13567 dout(20) << __func__ << ": " << **i
13568 << " not applied, returning false"
13569 << dendl;
13570 return false;
13571 }
13572 }
13573 dout(20) << __func__ << ": returning true" << dendl;
13574 return true;
13575}
13576
13577
13578// ==========================================================================================
13579// SCRUB
13580
13581
13582bool PrimaryLogPG::_range_available_for_scrub(
13583 const hobject_t &begin, const hobject_t &end)
13584{
13585 pair<hobject_t, ObjectContextRef> next;
13586 next.second = object_contexts.lookup(begin);
13587 next.first = begin;
13588 bool more = true;
13589 while (more && next.first < end) {
13590 if (next.second && next.second->is_blocked()) {
13591 next.second->requeue_scrub_on_unblock = true;
13592 dout(10) << __func__ << ": scrub delayed, "
13593 << next.first << " is blocked"
13594 << dendl;
13595 return false;
13596 }
13597 more = object_contexts.get_next(next.first, &next);
13598 }
13599 return true;
13600}
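// In other words, [begin, end) is scrubbable only if no cached object
// context inside it is currently blocked; a blocked obc sets
// requeue_scrub_on_unblock so the scrub is retried when the blocker
// clears rather than polling.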
13601
13602static bool doing_clones(const boost::optional<SnapSet> &snapset,
13603 const vector<snapid_t>::reverse_iterator &curclone) {
13604 return snapset && curclone != snapset.get().clones.rend();
13605}
13606
13607void PrimaryLogPG::log_missing(unsigned missing,
13608 const boost::optional<hobject_t> &head,
13609 LogChannelRef clog,
13610 const spg_t &pgid,
13611 const char *func,
13612 const char *mode,
13613 bool allow_incomplete_clones)
13614{
13615 assert(head);
13616 if (allow_incomplete_clones) {
13617 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13618 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13619 } else {
13620 clog->info() << mode << " " << pgid << " " << head.get()
13621 << " " << missing << " missing clone(s)";
13622 }
13623}
13624
13625unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13626 const boost::optional<SnapSet> &snapset,
13627 LogChannelRef clog,
13628 const spg_t &pgid,
13629 const char *mode,
13630 bool allow_incomplete_clones,
13631 boost::optional<snapid_t> target,
13632 vector<snapid_t>::reverse_iterator *curclone,
13633 inconsistent_snapset_wrapper &e)
13634{
13635 assert(head);
13636 assert(snapset);
13637 unsigned missing = 0;
13638
13639 // NOTE: clones are in descending order, thus **curclone > target test here
13640 hobject_t next_clone(head.get());
13641 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13642 ++missing;
13643 // it is okay to be missing one or more clones in a cache tier.
13644 // skip higher-numbered clones in the list.
13645 if (!allow_incomplete_clones) {
13646 next_clone.snap = **curclone;
13647 clog->error() << mode << " " << pgid << " " << head.get()
13648 << " expected clone " << next_clone << " " << missing
13649 << " missing";
13650 ++scrubber.shallow_errors;
13651 e.set_clone_missing(next_clone.snap);
13652 }
13653 // Clones are descending
13654 ++(*curclone);
13655 }
13656 return missing;
13657}
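// Example: with snapset.clones = [1, 2, 4, 6] (*curclone walks 6, 4, 2, 1)
// and target = 2, clones 6 and 4 are counted missing -- each logged as an
// error unless the pool allows incomplete clones -- and *curclone is left
// at 2 for the caller's match check.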
13658
13659/*
13660 * Validate consistency of the object info and snap sets.
13661 *
13662 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13663 * the comparison of the objects is against multiple snapset.clones. There are
13664 * multiple clone lists and in between lists we expect head or snapdir.
13665 *
13666 * Example
13667 *
13668 * objects expected
13669 * ======= =======
13670 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13671 * obj2 head head/snapdir, head ok
13672 * [SnapSet clones 6 4 2 1]
13673 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13674 * obj2 snap 6 obj2 snap 6, match
13675 * obj2 snap 4 obj2 snap 4, match
13676 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13677 * [Snapset clones 3 1]
13678 * obj3 snap 3 obj3 snap 3 match
13679 * obj3 snap 1 obj3 snap 1 match
13680 * obj4 snapdir head/snapdir, snapdir ok
13681 * [Snapset clones 4]
13682 * EOL obj4 snap 4, (expected)
13683 */
13684void PrimaryLogPG::scrub_snapshot_metadata(
13685 ScrubMap &scrubmap,
13686 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13687{
13688 dout(10) << __func__ << dendl;
13689
13690 coll_t c(info.pgid);
13691 bool repair = state_test(PG_STATE_REPAIR);
13692 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13693 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13694 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13695
13696 /// snapsets to repair
13697 map<hobject_t,SnapSet> snapset_to_repair;
13698
13699 // traverse in reverse order.
13700 boost::optional<hobject_t> head;
13701 boost::optional<SnapSet> snapset; // If initialized so will head (above)
13702 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13703 unsigned missing = 0;
13704 inconsistent_snapset_wrapper soid_error, head_error;
13705
13706 bufferlist last_data;
13707
13708 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13709 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13710 const hobject_t& soid = p->first;
13711 soid_error = inconsistent_snapset_wrapper{soid};
13712 object_stat_sum_t stat;
13713 boost::optional<object_info_t> oi;
13714
13715 if (!soid.is_snapdir())
13716 stat.num_objects++;
13717
13718 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13719 stat.num_objects_hit_set_archive++;
13720
13721 if (soid.is_snap()) {
13722 // it's a clone
13723 stat.num_object_clones++;
13724 }
13725
13726 // basic checks.
13727 if (p->second.attrs.count(OI_ATTR) == 0) {
13728 oi = boost::none;
13729 osd->clog->error() << mode << " " << info.pgid << " " << soid
13730 << " no '" << OI_ATTR << "' attr";
13731 ++scrubber.shallow_errors;
13732 soid_error.set_oi_attr_missing();
13733 } else {
13734 bufferlist bv;
13735 bv.push_back(p->second.attrs[OI_ATTR]);
13736 try {
13737 oi = object_info_t(); // Initialize optional<> before decoding into it
13738 oi.get().decode(bv);
13739 } catch (buffer::error& e) {
13740 oi = boost::none;
13741 osd->clog->error() << mode << " " << info.pgid << " " << soid
13742 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13743 ++scrubber.shallow_errors;
13744 soid_error.set_oi_attr_corrupted();
13745 soid_error.set_oi_attr_missing(); // Not available too
13746 }
13747 }
13748
13749 if (oi) {
13750 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13751 osd->clog->error() << mode << " " << info.pgid << " " << soid
13752 << " on disk size (" << p->second.size
13753 << ") does not match object info size ("
13754 << oi->size << ") adjusted for ondisk to ("
13755 << pgbackend->be_get_ondisk_size(oi->size)
13756 << ")";
13757 soid_error.set_size_mismatch();
13758 ++scrubber.shallow_errors;
13759 }
13760
13761 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13762
13763 // A clone num_bytes will be added later when we have snapset
13764 if (!soid.is_snap()) {
13765 stat.num_bytes += oi->size;
13766 }
13767 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13768 stat.num_bytes_hit_set_archive += oi->size;
13769
13770 if (!soid.is_snapdir()) {
13771 if (oi->is_dirty())
13772 ++stat.num_objects_dirty;
13773 if (oi->is_whiteout())
13774 ++stat.num_whiteouts;
13775 if (oi->is_omap())
13776 ++stat.num_objects_omap;
13777 if (oi->is_cache_pinned())
13778 ++stat.num_objects_pinned;
13779 }
13780 } else {
13781 // pessimistic assumption that this object might contain a
13782 // legacy SnapSet
13783 stat.num_legacy_snapsets++;
13784 }
13785
13786 // Check for any problems while processing clones
13787 if (doing_clones(snapset, curclone)) {
13788 boost::optional<snapid_t> target;
13789 // Expecting an object with snap for current head
13790 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13791
13792 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13793 << soid << " while processing " << head.get() << dendl;
13794
13795 target = all_clones;
13796 } else {
13797 assert(soid.is_snap());
13798 target = soid.snap;
13799 }
13800
13801 // Log any clones we were expecting to be there up to target
13802 // This will set missing, but will be a no-op if soid.snap == *curclone.
13803 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13804 pool.info.allow_incomplete_clones(), target, &curclone,
13805 head_error);
13806 }
13807 bool expected;
13808 // Check doing_clones() again in case we ran process_clones_to()
13809 if (doing_clones(snapset, curclone)) {
13810 // A head/snapdir would have processed all clones above
13811 // or all greater than *curclone.
13812 assert(soid.is_snap() && *curclone <= soid.snap);
13813
13814 // After processing above clone snap should match the expected curclone
13815 expected = (*curclone == soid.snap);
13816 } else {
13817 // If we aren't doing clones any longer, then expecting head/snapdir
13818 expected = soid.has_snapset();
13819 }
13820 if (!expected) {
13821 // If we couldn't read the head's snapset, just ignore clones
13822 if (head && !snapset) {
13823 osd->clog->error() << mode << " " << info.pgid << " " << soid
13824 << " clone ignored due to missing snapset";
13825 } else {
13826 osd->clog->error() << mode << " " << info.pgid << " " << soid
13827 << " is an unexpected clone";
13828 }
13829 ++scrubber.shallow_errors;
13830 soid_error.set_headless();
13831 scrubber.store->add_snap_error(pool.id, soid_error);
13832 if (head && soid.get_head() == head->get_head())
13833 head_error.set_clone(soid.snap);
13834 continue;
13835 }
13836
13837 // new snapset?
13838 if (soid.has_snapset()) {
13839
13840 if (missing) {
13841 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13842 pool.info.allow_incomplete_clones());
13843 }
13844
13845 // Save previous head error information
13846 if (head && head_error.errors)
13847 scrubber.store->add_snap_error(pool.id, head_error);
13848 // Set this as a new head object
13849 head = soid;
13850 missing = 0;
13851 head_error = soid_error;
13852
13853 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13854
13855 if (p->second.attrs.count(SS_ATTR) == 0) {
13856 osd->clog->error() << mode << " " << info.pgid << " " << soid
13857 << " no '" << SS_ATTR << "' attr";
13858 ++scrubber.shallow_errors;
13859 snapset = boost::none;
13860 head_error.set_ss_attr_missing();
13861 } else {
13862 bufferlist bl;
13863 bl.push_back(p->second.attrs[SS_ATTR]);
13864 bufferlist::iterator blp = bl.begin();
13865 try {
13866 snapset = SnapSet(); // Initialize optional<> before decoding into it
13867 ::decode(snapset.get(), blp);
13868 } catch (buffer::error& e) {
13869 snapset = boost::none;
13870 osd->clog->error() << mode << " " << info.pgid << " " << soid
13871 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13872 ++scrubber.shallow_errors;
13873 head_error.set_ss_attr_corrupted();
13874 }
13875 }
13876
13877 if (snapset) {
13878 // what will be next?
13879 curclone = snapset->clones.rbegin();
13880
13881 if (!snapset->clones.empty()) {
13882 dout(20) << " snapset " << snapset.get() << dendl;
13883 if (snapset->seq == 0) {
13884 osd->clog->error() << mode << " " << info.pgid << " " << soid
13885 << " snaps.seq not set";
13886 ++scrubber.shallow_errors;
13887 head_error.set_snapset_mismatch();
13888 }
13889 }
13890
13891 if (soid.is_head() && !snapset->head_exists) {
13892 osd->clog->error() << mode << " " << info.pgid << " " << soid
13893 << " snapset.head_exists=false, but head exists";
13894 ++scrubber.shallow_errors;
13895 head_error.set_head_mismatch();
13896 // Fix head_exists locally so is_legacy() returns correctly
13897 snapset->head_exists = true;
13898 }
13899 if (soid.is_snapdir() && snapset->head_exists) {
13900 osd->clog->error() << mode << " " << info.pgid << " " << soid
13901 << " snapset.head_exists=true, but snapdir exists";
13902 ++scrubber.shallow_errors;
13903 head_error.set_head_mismatch();
13904 // For symmetry fix this too, but probably doesn't matter
13905 snapset->head_exists = false;
13906 }
13907
13908 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13909 if (soid.is_snapdir()) {
13910 dout(10) << " will move snapset to head from " << soid << dendl;
13911 snapset_to_repair[soid.get_head()] = *snapset;
13912 } else if (snapset->is_legacy()) {
13913 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13914 << dendl;
13915 snapset_to_repair[soid.get_head()] = *snapset;
13916 }
13917 } else {
13918 stat.num_legacy_snapsets++;
13919 }
13920 } else {
13921 // pessimistic assumption that this object might contain a
13922 // legacy SnapSet
13923 stat.num_legacy_snapsets++;
13924 }
13925 } else {
13926 assert(soid.is_snap());
13927 assert(head);
13928 assert(snapset);
13929 assert(soid.snap == *curclone);
13930
13931 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13932
13933 if (snapset->clone_size.count(soid.snap) == 0) {
13934 osd->clog->error() << mode << " " << info.pgid << " " << soid
13935 << " is missing in clone_size";
13936 ++scrubber.shallow_errors;
13937 soid_error.set_size_mismatch();
13938 } else {
13939 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13940 osd->clog->error() << mode << " " << info.pgid << " " << soid
13941 << " size " << oi->size << " != clone_size "
13942 << snapset->clone_size[*curclone];
13943 ++scrubber.shallow_errors;
13944 soid_error.set_size_mismatch();
13945 }
13946
13947 if (snapset->clone_overlap.count(soid.snap) == 0) {
13948 osd->clog->error() << mode << " " << info.pgid << " " << soid
13949 << " is missing in clone_overlap";
13950 ++scrubber.shallow_errors;
13951 soid_error.set_size_mismatch();
13952 } else {
13953 // This checking is based on get_clone_bytes(). The first 2 asserts
13954 // can't happen because we know we have a clone_size and
13955 // a clone_overlap. Now we check that the interval_set won't
13956 // cause the last assert.
13957 uint64_t size = snapset->clone_size.find(soid.snap)->second;
13958 const interval_set<uint64_t> &overlap =
13959 snapset->clone_overlap.find(soid.snap)->second;
13960 bool bad_interval_set = false;
13961 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13962 i != overlap.end(); ++i) {
13963 if (size < i.get_len()) {
13964 bad_interval_set = true;
13965 break;
13966 }
13967 size -= i.get_len();
13968 }
13969
13970 if (bad_interval_set) {
13971 osd->clog->error() << mode << " " << info.pgid << " " << soid
13972 << " bad interval_set in clone_overlap";
13973 ++scrubber.shallow_errors;
13974 soid_error.set_size_mismatch();
13975 } else {
13976 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13977 }
13978 }
13979 }
13980
13981 // migrate legacy_snaps to snapset?
13982 auto p = snapset_to_repair.find(soid.get_head());
13983 if (p != snapset_to_repair.end()) {
13984 if (!oi || oi->legacy_snaps.empty()) {
13985 osd->clog->error() << mode << " " << info.pgid << " " << soid
13986 << " has no oi or legacy_snaps; cannot convert "
13987 << *snapset;
13988 ++scrubber.shallow_errors;
13989 } else {
13990 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
13991 << " to snapset " << p->second << dendl;
13992 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13993 }
13994 }
13995
13996 // what's next?
13997 ++curclone;
13998 if (soid_error.errors)
13999 scrubber.store->add_snap_error(pool.id, soid_error);
14000 }
14001
14002 scrub_cstat.add(stat);
14003 }
14004
14005 if (doing_clones(snapset, curclone)) {
14006 dout(10) << __func__ << " " << mode << " " << info.pgid
14007 << " No more objects while processing " << head.get() << dendl;
14008
14009 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14010 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14011 head_error);
14012 }
14013 // There could be missing found by the test above or even
14014 // before dropping out of the loop for the last head.
14015 if (missing) {
14016 log_missing(missing, head, osd->clog, info.pgid, __func__,
14017 mode, pool.info.allow_incomplete_clones());
14018 }
14019 if (head && head_error.errors)
14020 scrubber.store->add_snap_error(pool.id, head_error);
14021
14022 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14023 missing_digest.begin();
14024 p != missing_digest.end();
14025 ++p) {
14026 if (p->first.is_snapdir())
14027 continue;
14028 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14029 ObjectContextRef obc = get_object_context(p->first, false);
14030 if (!obc) {
14031 osd->clog->error() << info.pgid << " " << mode
c07f9fc5 14032 << " cannot get object context for object "
7c673cae
FG
14033 << p->first;
14034 continue;
14035 } else if (obc->obs.oi.soid != p->first) {
14036 osd->clog->error() << info.pgid << " " << mode
14037 << " object " << p->first
14038 << " has a valid oi attr with a mismatched name, "
14039 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14040 continue;
14041 }
14042 OpContextUPtr ctx = simple_opc_create(obc);
14043 ctx->at_version = get_next_version();
14044 ctx->mtime = utime_t(); // do not update mtime
14045 ctx->new_obs.oi.set_data_digest(p->second.first);
14046 ctx->new_obs.oi.set_omap_digest(p->second.second);
14047 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14048
14049 ctx->register_on_success(
14050 [this]() {
14051 dout(20) << "updating scrub digest" << dendl;
14052 if (--scrubber.num_digest_updates_pending == 0) {
14053 requeue_scrub();
14054 }
14055 });
14056
14057 simple_opc_submit(std::move(ctx));
14058 ++scrubber.num_digest_updates_pending;
14059 }
14060 for (auto& p : snapset_to_repair) {
14061 // cache pools may not have the clones, which means we won't know
14062 // what snaps they have. fake out the clone_snaps entries anyway (with
14063 // blank snap lists).
14064 p.second.head_exists = true;
14065 if (pool.info.allow_incomplete_clones()) {
14066 for (auto s : p.second.clones) {
14067 if (p.second.clone_snaps.count(s) == 0) {
14068 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14069 << s << dendl;
14070 p.second.clone_snaps[s]; // default-construct an empty snap list
14071 }
14072 }
14073 }
14074 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14075 p.second.is_legacy()) {
14076 // this happens if we encounter other errors above, like a missing
14077 // or extra clone.
14078 dout(10) << __func__ << " not writing snapset to " << p.first
14079 << " snapset " << p.second << " clones " << p.second.clones
14080 << "; didn't convert fully" << dendl;
14081 scrub_cstat.sum.num_legacy_snapsets++;
14082 continue;
14083 }
14084 dout(10) << __func__ << " writing snapset to " << p.first
14085 << " " << p.second << dendl;
14086 ObjectContextRef obc = get_object_context(p.first, true);
14087 if (!obc) {
14088 osd->clog->error() << info.pgid << " " << mode
14089 << " cannot get object context for object "
14090 << p.first;
14091 continue;
14092 } else if (obc->obs.oi.soid != p.first) {
14093 osd->clog->error() << info.pgid << " " << mode
14094 << " object " << p.first
14095 << " has a valid oi attr with a mismatched name, "
14096 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14097 continue;
14098 }
14099 ObjectContextRef snapset_obc;
14100 if (!obc->obs.exists) {
14101 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14102 if (!snapset_obc) {
14103 osd->clog->error() << info.pgid << " " << mode
14104 << " cannot get object context for "
14105 << p.first.get_snapdir();
14106 continue;
14107 }
14108 }
14109 OpContextUPtr ctx = simple_opc_create(obc);
14110 PGTransaction *t = ctx->op_t.get();
14111 ctx->snapset_obc = snapset_obc;
14112 ctx->at_version = get_next_version();
14113 ctx->mtime = utime_t(); // do not update mtime
14114 ctx->new_snapset = p.second;
14115 if (!ctx->new_obs.exists) {
14116 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14117 ctx->new_obs.exists = true;
14118 ctx->new_snapset.head_exists = true;
14119 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14120 ++ctx->delta_stats.num_whiteouts;
14121 ++ctx->delta_stats.num_objects;
14122 t->create(p.first);
14123 if (p.first < scrubber.start) {
14124 dout(20) << __func__ << " kludging around update outside of scrub range"
14125 << dendl;
14126 } else {
14127 scrub_cstat.add(ctx->delta_stats);
14128 }
14129 }
14130 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14131 assert(!ctx->new_snapset.is_legacy());
14132 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14133 ctx->register_on_success(
14134 [this]() {
14135 dout(20) << "updating snapset" << dendl;
14136 if (--scrubber.num_digest_updates_pending == 0) {
14137 requeue_scrub();
14138 }
14139 });
14140
14141 simple_opc_submit(std::move(ctx));
14142 ++scrubber.num_digest_updates_pending;
14143 }
14144
14145 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14146}
14147
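// Reset the scrub stat accumulator when scrub state is torn down.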
14148void PrimaryLogPG::_scrub_clear_state()
14149{
14150 scrub_cstat = object_stat_collection_t();
14151}
14152
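// _scrub_finish reconciles the stats accumulated over the scrub pass
// (scrub_cstat) against the PG's recorded stats. A mismatch is logged and
// counted as a shallow error; in repair mode the recorded stats are
// replaced with the scrubbed values and shared with peers.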
14153void PrimaryLogPG::_scrub_finish()
14154{
14155 bool repair = state_test(PG_STATE_REPAIR);
14156 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14157 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14158
14159 if (info.stats.stats_invalid) {
14160 info.stats.stats = scrub_cstat;
14161 info.stats.stats_invalid = false;
14162
14163 if (agent_state)
14164 agent_choose_mode();
14165 }
14166
14167 dout(10) << mode << " got "
14168 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14169 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14170 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14171 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14172 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14173 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14174 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14175 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14176 << dendl;
14177
14178 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14179 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14180 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14181 !info.stats.dirty_stats_invalid) ||
14182 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14183 !info.stats.omap_stats_invalid) ||
14184 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14185 !info.stats.pin_stats_invalid) ||
14186 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14187 !info.stats.hitset_stats_invalid) ||
14188 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14189 !info.stats.hitset_bytes_stats_invalid) ||
14190 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14191 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14192 osd->clog->error() << info.pgid << " " << mode
14193 << " stat mismatch, got "
14194 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14195 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14196 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14197 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14198 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14199 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14200 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14201 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14202 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14203 ++scrubber.shallow_errors;
14204
14205 if (repair) {
14206 ++scrubber.fixed;
14207 info.stats.stats = scrub_cstat;
14208 info.stats.dirty_stats_invalid = false;
14209 info.stats.omap_stats_invalid = false;
14210 info.stats.hitset_stats_invalid = false;
14211 info.stats.hitset_bytes_stats_invalid = false;
14212 publish_stats_to_osd();
14213 share_pg_info();
14214 }
14215 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14216 info.stats.stats.sum.num_legacy_snapsets) {
14217 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14218 << " from " << info.stats.stats.sum.num_legacy_snapsets
14219 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14220 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14221 publish_stats_to_osd();
14222 share_pg_info();
14223 }
14224 // Clear object context cache to get repair information
14225 if (repair)
14226 object_contexts.clear();
14227}
14228
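// Thin wrapper: defer the osdmap full check to the OSD service.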
14229bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14230{
14231 return osd->check_osdmap_full(missing_on);
14232}
14233
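// Repair a read error detected on the primary (replicated pools only):
// mark the object missing on the primary so normal recovery pulls a copy
// from a replica, queue a DoRecovery peering event if one is not already
// pending, and requeue the op until the object is readable again.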
14234int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14235{
14236 // Only supports replicated pools
14237 assert(!pool.info.require_rollback());
14238 assert(is_primary());
14239
14240 dout(10) << __func__ << " " << soid
14241 << " peers osd.{" << actingbackfill << "}" << dendl;
14242
14243 if (!is_clean()) {
14244 block_for_clean(soid, op);
14245 return -EAGAIN;
14246 }
14247
14248 assert(!pg_log.get_missing().is_missing(soid));
14249 bufferlist bv;
14250 object_info_t oi;
14251 eversion_t v;
14252 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14253 if (r < 0) {
14254 // Getting the attr failed; leave v default and try to repair without a version.
14255 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14256 << soid << " error=" << r << dendl;
14257 } else try {
14258 bufferlist::iterator bliter = bv.begin();
14259 ::decode(oi, bliter);
14260 v = oi.version;
14261 } catch (...) {
14262 // Leave v as default constructed. This will fail when sent to older OSDs, but
14263 // not much worse than failing here.
14264 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14265 }
14266
14267 missing_loc.add_missing(soid, v, eversion_t());
14268 if (primary_error(soid, v)) {
14269 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14270 // XXX: If we knew that there is no down osd which could include this
14271 // object, it would be nice if we could return EIO here.
14272 // If a "never fail" flag was available, that could be used
14273 // for rbd to NOT return EIO until object marked lost.
14274
14275 // Drop through to save this op in case an osd comes up with the object.
14276 }
14277
14278 // Restart the op after object becomes readable again
14279 waiting_for_unreadable_object[soid].push_back(op);
14280 op->mark_delayed("waiting for missing object");
14281
14282 if (!eio_errors_to_process) {
14283 eio_errors_to_process = true;
14284 assert(is_clean());
14285 queue_peering_event(
14286 CephPeeringEvtRef(
14287 std::make_shared<CephPeeringEvt>(
14288 get_osdmap()->get_epoch(),
14289 get_osdmap()->get_epoch(),
14290 DoRecovery())));
14291 } else {
14292 // A prior error must have already cleared clean state and queued recovery
14293 // or a map change has triggered re-peering.
14294 // We do not inline the recovery here by calling maybe_kick_recovery(soid).
14295 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14296 }
14297
14298 return -EAGAIN;
14299}
14300
14301/*---SnapTrimmer Logging---*/
14302#undef dout_prefix
14303#define dout_prefix *_dout << pg->gen_prefix()
14304
14305void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14306{
14307 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14308}
14309
14310void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14311{
14312 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14313}
14314
14315/*---SnapTrimmer states---*/
14316#undef dout_prefix
14317#define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14318 << "SnapTrimmer state<" << get_state_name() << ">: ")
14319
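// Rough shape of the trimming flow, as far as the states below show it:
// NotTrimming reacts to KickTrim and, when the PG is primary, active and
// clean with a non-empty snap_trimq, transits to Trimming (or to WaitScrub
// if a scrub is in progress). Once the trim reservation is granted
// (SnapTrimReserved), AwaitAsyncWork queues the actual trim work; while
// repops are in flight we sit in WaitRepops, and completion (or an error)
// kicks the machine back around via RepopsComplete or Reset.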
14320/* NotTrimming */
14321PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14322 : my_base(ctx),
14323 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14324{
14325 context< SnapTrimmer >().log_enter(state_name);
14326}
14327
14328void PrimaryLogPG::NotTrimming::exit()
14329{
14330 context< SnapTrimmer >().log_exit(state_name, enter_time);
14331}
14332
14333boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14334{
14335 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14336 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14337
14338 if (!(pg->is_primary() && pg->is_active())) {
14339 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14340 return discard_event();
14341 }
14342 if (!pg->is_clean() ||
14343 pg->snap_trimq.empty()) {
14344 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14345 return discard_event();
14346 }
14347 if (pg->scrubber.active) {
14348 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14349 return transit< WaitScrub >();
14350 } else {
14351 return transit< Trimming >();
14352 }
14353}
14354
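// The trim reservation has been granted: pick the first snap in
// snap_trimq and move on to the actual work.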
14355boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14356{
14357 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14358 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14359
14360 pending = nullptr;
14361 if (!context< SnapTrimmer >().can_trim()) {
14362 post_event(KickTrim());
14363 return transit< NotTrimming >();
14364 }
14365
14366 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14367 ldout(pg->cct, 10) << "WaitReservation: trimming "
14368 << pg->snap_trimq.range_start()
14369 << dendl;
14370 return transit< AwaitAsyncWork >();
14371}
14372
14373/* AwaitAsyncWork */
14374PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14375 : my_base(ctx),
14376 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14377{
14378 auto *pg = context< SnapTrimmer >().pg;
14379 context< SnapTrimmer >().log_enter(state_name);
14380 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14381 pg->state_set(PG_STATE_SNAPTRIM);
14382 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14383 pg->publish_stats_to_osd();
14384}
14385
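// DoSnapWork: fetch up to osd_pg_max_concurrent_snap_trims objects still
// mapped to the snap being trimmed and submit a trim repop for each.
// -ENOENT from the snap mapper means the snap is fully trimmed, so it is
// moved from snap_trimq to purged_snaps and persisted.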
14386boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14387{
14388 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14389 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14390 auto &in_flight = context<Trimming>().in_flight;
14391 assert(in_flight.empty());
14392
14393 assert(pg->is_primary() && pg->is_active());
14394 if (!context< SnapTrimmer >().can_trim()) {
14395 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14396 post_event(KickTrim());
14397 return transit< NotTrimming >();
14398 }
14399
14400 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14401
14402 vector<hobject_t> to_trim;
14403 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14404 to_trim.reserve(max);
14405 int r = pg->snap_mapper.get_next_objects_to_trim(
14406 snap_to_trim,
14407 max,
14408 &to_trim);
14409 if (r != 0 && r != -ENOENT) {
14410 lderr(pg->cct) << "get_next_objects_to_trim returned "
14411 << cpp_strerror(r) << dendl;
14412 assert(0 == "get_next_objects_to_trim returned an invalid code");
14413 } else if (r == -ENOENT) {
14414 // Done!
14415 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14416
14417 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14418 << " to purged_snaps"
14419 << dendl;
14420 pg->info.purged_snaps.insert(snap_to_trim);
14421 pg->snap_trimq.erase(snap_to_trim);
14422 ldout(pg->cct, 10) << "purged_snaps now "
14423 << pg->info.purged_snaps << ", snap_trimq now "
14424 << pg->snap_trimq << dendl;
14425
14426 ObjectStore::Transaction t;
14427 pg->dirty_big_info = true;
14428 pg->write_if_dirty(t);
14429 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14430 assert(tr == 0);
14431
14432 pg->share_pg_info();
14433 post_event(KickTrim());
14434 return transit< NotTrimming >();
14435 }
14436 assert(!to_trim.empty());
14437
14438 for (auto &&object: to_trim) {
14439 // Get next
14440 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14441 OpContextUPtr ctx;
14442 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14443 if (error) {
14444 if (error == -ENOLCK) {
14445 ldout(pg->cct, 10) << "could not get write lock on obj "
14446 << object << dendl;
14447 } else {
14448 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14449 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14450 }
14451 if (!in_flight.empty()) {
14452 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14453 return transit< WaitRepops >();
14454 }
14455 if (error == -ENOLCK) {
14456 ldout(pg->cct, 10) << "waiting for it to clear"
14457 << dendl;
14458 return transit< WaitRWLock >();
14459 } else {
14460 return transit< NotTrimming >();
14461 }
14462 }
14463
14464 in_flight.insert(object);
14465 ctx->register_on_success(
14466 [pg, object, &in_flight]() {
14467 assert(in_flight.find(object) != in_flight.end());
14468 in_flight.erase(object);
14469 if (in_flight.empty()) {
14470 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14471 pg->snap_trimmer_machine.process_event(Reset());
14472 } else {
14473 pg->snap_trimmer_machine.process_event(RepopsComplete());
14474 }
14475 }
14476 });
14477
14478 pg->simple_opc_submit(std::move(ctx));
14479 }
14480
14481 return transit< WaitRepops >();
14482}
14483
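// The *_maybe_cache helpers let attr reads be served from the per-object
// attr_cache on pools that require rollback (erasure-coded pools); the
// write-side variants here simply pass straight through to the
// transaction, and reads fall back to the PG backend otherwise.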
14484void PrimaryLogPG::setattr_maybe_cache(
14485 ObjectContextRef obc,
14486 OpContext *op,
14487 PGTransaction *t,
14488 const string &key,
14489 bufferlist &val)
14490{
14491 t->setattr(obc->obs.oi.soid, key, val);
14492}
14493
14494void PrimaryLogPG::setattrs_maybe_cache(
14495 ObjectContextRef obc,
14496 OpContext *op,
14497 PGTransaction *t,
14498 map<string, bufferlist> &attrs)
14499{
14500 t->setattrs(obc->obs.oi.soid, attrs);
14501}
14502
14503void PrimaryLogPG::rmattr_maybe_cache(
14504 ObjectContextRef obc,
14505 OpContext *op,
14506 PGTransaction *t,
14507 const string &key)
14508{
14509 t->rmattr(obc->obs.oi.soid, key);
14510}
14511
14512int PrimaryLogPG::getattr_maybe_cache(
14513 ObjectContextRef obc,
14514 const string &key,
14515 bufferlist *val)
14516{
14517 if (pool.info.require_rollback()) {
14518 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14519 if (i != obc->attr_cache.end()) {
14520 if (val)
14521 *val = i->second;
14522 return 0;
14523 } else {
14524 return -ENODATA;
14525 }
14526 }
14527 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14528}
14529
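// When user_only is set, only xattrs carrying the '_' prefix (user attrs)
// are returned, with the prefix stripped.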
14530int PrimaryLogPG::getattrs_maybe_cache(
14531 ObjectContextRef obc,
14532 map<string, bufferlist> *out,
14533 bool user_only)
14534{
14535 int r = 0;
14536 if (pool.info.require_rollback()) {
14537 if (out)
14538 *out = obc->attr_cache;
14539 } else {
14540 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14541 }
14542 if (out && user_only) {
14543 map<string, bufferlist> tmp;
14544 for (map<string, bufferlist>::iterator i = out->begin();
14545 i != out->end();
14546 ++i) {
14547 if (i->first.size() > 1 && i->first[0] == '_')
14548 tmp[i->first.substr(1, i->first.size())].claim(i->second);
14549 }
14550 tmp.swap(*out);
14551 }
14552 return r;
14553}
14554
14555bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14556 return osd->check_failsafe_full(ss);
14557}
14558
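// boost::intrusive_ptr refcounting hooks for PrimaryLogPG and RepGather.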
14559void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14560void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14561
14562#ifdef PG_DEBUG_REFS
14563uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14564void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14565#endif
14566
14567void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14568void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }