[ceph.git] / ceph / src / osd / PrimaryLogPG.cc
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#include "boost/tuple/tuple.hpp"
19#include "boost/intrusive_ptr.hpp"
20#include "PG.h"
21#include "PrimaryLogPG.h"
22#include "OSD.h"
23#include "OpRequest.h"
24#include "ScrubStore.h"
25#include "Session.h"
26#include "objclass/objclass.h"
27
28#include "common/errno.h"
29#include "common/scrub_types.h"
30#include "common/perf_counters.h"
31
32#include "messages/MOSDOp.h"
33#include "messages/MOSDBackoff.h"
34#include "messages/MOSDSubOp.h"
35#include "messages/MOSDSubOpReply.h"
36#include "messages/MOSDPGTrim.h"
37#include "messages/MOSDPGScan.h"
38#include "messages/MOSDRepScrub.h"
39#include "messages/MOSDPGBackfill.h"
40#include "messages/MOSDPGBackfillRemove.h"
41#include "messages/MOSDPGUpdateLogMissing.h"
42#include "messages/MOSDPGUpdateLogMissingReply.h"
43#include "messages/MCommandReply.h"
44#include "messages/MOSDScrubReserve.h"
45#include "mds/inode_backtrace.h" // Ugh
46#include "common/EventTrace.h"
47
48#include "common/config.h"
49#include "include/compat.h"
50#include "mon/MonClient.h"
51#include "osdc/Objecter.h"
52#include "json_spirit/json_spirit_value.h"
53#include "json_spirit/json_spirit_reader.h"
54#include "include/assert.h" // json_spirit clobbers it
55#include "include/rados/rados_types.hpp"
56
57#ifdef WITH_LTTNG
58#include "tracing/osd.h"
59#else
60#define tracepoint(...)
61#endif
62
63#define dout_context cct
64#define dout_subsys ceph_subsys_osd
65#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
66#undef dout_prefix
67#define dout_prefix _prefix(_dout, this)
68template <typename T>
69static ostream& _prefix(std::ostream *_dout, T *pg) {
70 return *_dout << pg->gen_prefix();
71}
72
73
74#include <sstream>
75#include <utility>
76
77#include <errno.h>
78
79MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
80
81PGLSFilter::PGLSFilter() : cct(nullptr)
82{
83}
84
85PGLSFilter::~PGLSFilter()
86{
87}
88
89struct PrimaryLogPG::C_OSD_OnApplied : Context {
90 PrimaryLogPGRef pg;
91 epoch_t epoch;
92 eversion_t v;
93 C_OSD_OnApplied(
94 PrimaryLogPGRef pg,
95 epoch_t epoch,
96 eversion_t v)
97 : pg(pg), epoch(epoch), v(v) {}
98 void finish(int) override {
99 pg->lock();
100 if (!pg->pg_has_reset_since(epoch))
101 pg->op_applied(v);
102 pg->unlock();
103 }
104};
105
106/**
107 * The CopyCallback class defines an interface for completions to the
108 * copy_start code. Users of the copy infrastructure must implement
109 * one and give an instance of the class to start_copy.
110 *
111 * The implementer is responsible for making sure that the CopyCallback
112 * can associate itself with the correct copy operation.
113 */
114class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
115protected:
116 CopyCallback() {}
117 /**
118 * results.get<0>() is the return code: 0 for success; -ECANCELED if
119 * the operation was cancelled by the local OSD; -errno for other issues.
120 * results.get<1>() is a pointer to a CopyResults object, which you are
121 * responsible for deleting.
122 */
123 void finish(CopyCallbackResults results_) override = 0;
124
125public:
126 /// Virtual destructor; subclasses clean up any state they own.
127 ~CopyCallback() override {}
128};
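// --- Illustrative sketch (editor's addition, not part of the original
// source): a minimal CopyCallback implementation following the contract
// documented above. The class name is hypothetical; per the comment on
// finish(), the callee owns and must delete the CopyResults.
class ExampleCopyCallback : public PrimaryLogPG::CopyCallback {
  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    int r = results_.get<0>();                     // 0, -ECANCELED, or -errno
    PrimaryLogPG::CopyResults *results = results_.get<1>();
    if (r == 0) {
      // copy finished; fields such as results->object_size are valid here
    }
    delete results;  // we are responsible for deleting it (see above)
  }
};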
129
130template <typename T>
131class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
132 PrimaryLogPGRef pg;
133 unique_ptr<GenContext<T>> c;
134 epoch_t e;
135public:
136 BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
137 : pg(pg), c(c), e(e) {}
138 void finish(T t) override {
139 pg->lock();
140 if (pg->pg_has_reset_since(e))
141 c.reset();
142 else
143 c.release()->complete(t);
144 pg->unlock();
145 }
146};
147
148GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
149 GenContext<ThreadPool::TPHandle&> *c) {
150 return new BlessedGenContext<ThreadPool::TPHandle&>(
151 this, c, get_osdmap()->get_epoch());
152}
153
154class PrimaryLogPG::BlessedContext : public Context {
155 PrimaryLogPGRef pg;
156 unique_ptr<Context> c;
157 epoch_t e;
158public:
159 BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
160 : pg(pg), c(c), e(e) {}
161 void finish(int r) override {
162 pg->lock();
163 if (pg->pg_has_reset_since(e))
164 c.reset();
165 else
166 c.release()->complete(r);
167 pg->unlock();
168 }
169};
170
171
172Context *PrimaryLogPG::bless_context(Context *c) {
173 return new BlessedContext(this, c, get_osdmap()->get_epoch());
174}
175
176class PrimaryLogPG::C_PG_ObjectContext : public Context {
177 PrimaryLogPGRef pg;
178 ObjectContext *obc;
179 public:
180 C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
181 pg(p), obc(o) {}
182 void finish(int r) override {
183 pg->object_context_destructor_callback(obc);
184 }
185};
186
187class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
188 ObjectContextRef obc, obc2, obc3;
189 public:
190 C_OSD_OndiskWriteUnlock(
191 ObjectContextRef o,
192 ObjectContextRef o2 = ObjectContextRef(),
193 ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
194 void finish(int r) override {
195 obc->ondisk_write_unlock();
196 if (obc2)
197 obc2->ondisk_write_unlock();
198 if (obc3)
199 obc3->ondisk_write_unlock();
200 }
201};
202
203struct OnReadComplete : public Context {
204 PrimaryLogPG *pg;
205 PrimaryLogPG::OpContext *opcontext;
206 OnReadComplete(
207 PrimaryLogPG *pg,
208 PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
209 void finish(int r) override {
210 opcontext->finish_read(pg);
211 }
212 ~OnReadComplete() override {}
213};
214
215class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
216 PrimaryLogPGRef pg;
217 ObjectContextRef obc;
218 public:
219 C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
220 pg(p), obc(o) {}
221 void finish(int r) override {
222 pg->_applied_recovered_object(obc);
223 }
224};
225
226class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
227 PrimaryLogPGRef pg;
228 epoch_t epoch;
229 eversion_t last_complete;
230 public:
231 C_OSD_CommittedPushedObject(
232 PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
233 pg(p), epoch(epoch), last_complete(lc) {
234 }
235 void finish(int r) override {
236 pg->_committed_pushed_object(epoch, last_complete);
237 }
238};
239
240class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
241 PrimaryLogPGRef pg;
242 public:
243 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
244 pg(p) {}
245 void finish(int r) override {
246 pg->_applied_recovered_object_replica();
247 }
248};
249
250// OpContext
251void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
252{
253 inflightreads = 1;
254 list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
255 pair<bufferlist*, Context*> > > in;
256 in.swap(pending_async_reads);
257 pg->pgbackend->objects_read_async(
258 obc->obs.oi.soid,
259 in,
260 new OnReadComplete(pg, this), pg->get_pool().fast_read);
261}
262void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
263{
264 assert(inflightreads > 0);
265 --inflightreads;
266 if (async_reads_complete()) {
267 assert(pg->in_progress_async_reads.size());
268 assert(pg->in_progress_async_reads.front().second == this);
269 pg->in_progress_async_reads.pop_front();
270
271 // Restart the op context now that all reads have been
272 // completed. Read failures will be handled by the op finisher
273 pg->execute_ctx(this);
274 }
275}
276
277class CopyFromCallback : public PrimaryLogPG::CopyCallback {
278public:
279 PrimaryLogPG::CopyResults *results = nullptr;
280 PrimaryLogPG::OpContext *ctx;
281 OSDOp &osd_op;
282
283 CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
284 : ctx(ctx), osd_op(osd_op) {
285 }
286 ~CopyFromCallback() override {}
287
288 void finish(PrimaryLogPG::CopyCallbackResults results_) override {
289 results = results_.get<1>();
290 int r = results_.get<0>();
291
292 // for finish_copyfrom
293 ctx->user_at_version = results->user_version;
294
295 if (r >= 0) {
296 ctx->pg->execute_ctx(ctx);
297 } else {
298 if (r != -ECANCELED) { // on cancel just toss it out; client resends
299 if (ctx->op)
300 ctx->pg->osd->reply_op_error(ctx->op, r);
301 } else if (results->should_requeue) {
302 if (ctx->op)
303 ctx->pg->requeue_op(ctx->op);
304 }
305 ctx->pg->close_op_ctx(ctx);
306 }
307 }
308
309 bool is_temp_obj_used() {
310 return results->started_temp_obj;
311 }
312 uint64_t get_data_size() {
313 return results->object_size;
314 }
315};
316
317struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
318 CopyFromCallback *copy_from_callback;
319
320 CopyFromFinisher(CopyFromCallback *copy_from_callback)
321 : copy_from_callback(copy_from_callback) {
322 }
323
324 int execute() override {
325 // instance will be destructed after this method completes
326 copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
327 return 0;
328 }
329};
330
331// ======================
332// PGBackend::Listener
333
334void PrimaryLogPG::on_local_recover(
335 const hobject_t &hoid,
336 const ObjectRecoveryInfo &_recovery_info,
337 ObjectContextRef obc,
338 bool is_delete,
339 ObjectStore::Transaction *t
340 )
341{
342 dout(10) << __func__ << ": " << hoid << dendl;
343
344 ObjectRecoveryInfo recovery_info(_recovery_info);
345 clear_object_snap_mapping(t, hoid);
346 if (!is_delete && recovery_info.soid.is_snap()) {
347 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
348 set<snapid_t> snaps;
349 dout(20) << " snapset " << recovery_info.ss
350 << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
351 if (recovery_info.ss.is_legacy() ||
352 recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
353 assert(recovery_info.oi.legacy_snaps.size());
354 snaps.insert(recovery_info.oi.legacy_snaps.begin(),
355 recovery_info.oi.legacy_snaps.end());
356 } else {
357 auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
358 assert(p != recovery_info.ss.clone_snaps.end()); // hmm, should we warn?
359 snaps.insert(p->second.begin(), p->second.end());
360 }
361 dout(20) << " snaps " << snaps << dendl;
362 snap_mapper.add_oid(
363 recovery_info.soid,
364 snaps,
365 &_t);
366 }
367 if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
368 pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
369 assert(is_primary());
370 const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
371 if (latest->op == pg_log_entry_t::LOST_REVERT &&
372 latest->reverting_to == recovery_info.version) {
373 dout(10) << " got old revert version " << recovery_info.version
374 << " for " << *latest << dendl;
375 recovery_info.version = latest->version;
376 // update the attr to the revert event version
377 recovery_info.oi.prior_version = recovery_info.oi.version;
378 recovery_info.oi.version = latest->version;
379 bufferlist bl;
380 ::encode(recovery_info.oi, bl,
381 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
382 assert(!pool.info.require_rollback());
383 t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
384 if (obc)
385 obc->attr_cache[OI_ATTR] = bl;
386 }
387 }
388
389 // keep track of active pushes for scrub
390 ++active_pushes;
391
392 if (recovery_info.version > pg_log.get_can_rollback_to()) {
393 /* This can only happen during a repair, and even then, it would
394 * be one heck of a race. If we are repairing the object, the
395 * write in question must be fully committed, so it's not valid
396 * to roll it back anyway (and we'll be rolled forward shortly
397 * anyway) */
398 PGLogEntryHandler h{this, t};
399 pg_log.roll_forward_to(recovery_info.version, &h);
400 }
401 recover_got(recovery_info.soid, recovery_info.version);
402
403 if (is_primary()) {
404 if (!is_delete) {
405 obc->obs.exists = true;
406 obc->ondisk_write_lock();
407
408 bool got = obc->get_recovery_read();
409 assert(got);
410
411 assert(recovering.count(obc->obs.oi.soid));
412 recovering[obc->obs.oi.soid] = obc;
413 obc->obs.oi = recovery_info.oi; // may have been updated above
414 t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
415 }
416
417 t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
418
419 publish_stats_to_osd();
420 assert(missing_loc.needs_recovery(hoid));
421 if (!is_delete)
422 missing_loc.add_location(hoid, pg_whoami);
423 release_backoffs(hoid);
424 if (!is_unreadable_object(hoid)) {
425 auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
426 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
427 dout(20) << " kicking unreadable waiters on " << hoid << dendl;
428 requeue_ops(unreadable_object_entry->second);
429 waiting_for_unreadable_object.erase(unreadable_object_entry);
430 }
431 }
432 } else {
433 t->register_on_applied(
434 new C_OSD_AppliedRecoveredObjectReplica(this));
435
436 }
437
438 t->register_on_commit(
439 new C_OSD_CommittedPushedObject(
440 this,
441 get_osdmap()->get_epoch(),
442 info.last_complete));
443
444 // update pg
445 dirty_info = true;
446 write_if_dirty(*t);
447}
448
449void PrimaryLogPG::on_global_recover(
450 const hobject_t &soid,
451 const object_stat_sum_t &stat_diff,
452 bool is_delete)
453{
454 info.stats.stats.sum.add(stat_diff);
455 missing_loc.recovered(soid);
456 publish_stats_to_osd();
457 dout(10) << "pushed " << soid << " to all replicas" << dendl;
458 map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
459 assert(i != recovering.end());
460
461 if (!is_delete) {
462 // recover missing won't have had an obc, but it gets filled in
463 // during on_local_recover
464 assert(i->second);
465 list<OpRequestRef> requeue_list;
466 i->second->drop_recovery_read(&requeue_list);
467 requeue_ops(requeue_list);
468 }
469
470 backfills_in_flight.erase(soid);
471
472 recovering.erase(i);
473 finish_recovery_op(soid);
474 release_backoffs(soid);
475 auto degraded_object_entry = waiting_for_degraded_object.find(soid);
476 if (degraded_object_entry != waiting_for_degraded_object.end()) {
477 dout(20) << " kicking degraded waiters on " << soid << dendl;
478 requeue_ops(degraded_object_entry->second);
479 waiting_for_degraded_object.erase(degraded_object_entry);
480 }
481 auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
482 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
483 dout(20) << " kicking unreadable waiters on " << soid << dendl;
484 requeue_ops(unreadable_object_entry->second);
485 waiting_for_unreadable_object.erase(unreadable_object_entry);
486 }
487 finish_degraded_object(soid);
488}
489
490void PrimaryLogPG::on_peer_recover(
491 pg_shard_t peer,
492 const hobject_t &soid,
493 const ObjectRecoveryInfo &recovery_info)
494{
495 publish_stats_to_osd();
496 // done!
497 peer_missing[peer].got(soid, recovery_info.version);
498}
499
500void PrimaryLogPG::begin_peer_recover(
501 pg_shard_t peer,
502 const hobject_t soid)
503{
504 peer_missing[peer].revise_have(soid, eversion_t());
505}
506
507void PrimaryLogPG::schedule_recovery_work(
508 GenContext<ThreadPool::TPHandle&> *c)
509{
510 osd->recovery_gen_wq.queue(c);
511}
512
513void PrimaryLogPG::send_message_osd_cluster(
514 int peer, Message *m, epoch_t from_epoch)
515{
516 osd->send_message_osd_cluster(peer, m, from_epoch);
517}
518
519void PrimaryLogPG::send_message_osd_cluster(
520 Message *m, Connection *con)
521{
522 osd->send_message_osd_cluster(m, con);
523}
524
525void PrimaryLogPG::send_message_osd_cluster(
526 Message *m, const ConnectionRef& con)
527{
528 osd->send_message_osd_cluster(m, con);
529}
530
531void PrimaryLogPG::on_primary_error(
532 const hobject_t &oid,
533 eversion_t v)
534{
535 dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
536 primary_failed(oid);
537 primary_error(oid, v);
538 backfill_add_missing(oid, v);
539}
540
541void PrimaryLogPG::backfill_add_missing(
542 const hobject_t &oid,
543 eversion_t v)
544{
545 dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
546 backfills_in_flight.erase(oid);
547 missing_loc.add_missing(oid, v, eversion_t());
548}
549
550ConnectionRef PrimaryLogPG::get_con_osd_cluster(
551 int peer, epoch_t from_epoch)
552{
553 return osd->get_con_osd_cluster(peer, from_epoch);
554}
555
556PerfCounters *PrimaryLogPG::get_logger()
557{
558 return osd->logger;
559}
560
561
562// ====================
563// missing objects
564
565bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
566{
567 return pg_log.get_missing().get_items().count(soid);
568}
569
570void PrimaryLogPG::maybe_kick_recovery(
571 const hobject_t &soid)
572{
573 eversion_t v;
574 if (!missing_loc.needs_recovery(soid, &v))
575 return;
576
577 map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
578 if (p != recovering.end()) {
579 dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
580 } else if (missing_loc.is_unfound(soid)) {
581 dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
582 } else {
583 dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
584 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
585 if (is_missing_object(soid)) {
586 recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
587 } else if (missing_loc.is_deleted(soid)) {
588 prep_object_replica_deletes(soid, v, h);
589 } else {
590 prep_object_replica_pushes(soid, v, h);
591 }
592 pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
593 }
594}
595
596void PrimaryLogPG::wait_for_unreadable_object(
597 const hobject_t& soid, OpRequestRef op)
598{
599 assert(is_unreadable_object(soid));
600 maybe_kick_recovery(soid);
601 waiting_for_unreadable_object[soid].push_back(op);
602 op->mark_delayed("waiting for missing object");
603}
604
605bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
606{
607 /* The conditions below may clear (on_local_recover, before we queue
608 * the transaction) before we actually requeue the degraded waiters
609 * in on_global_recover after the transaction completes.
610 */
611 if (waiting_for_degraded_object.count(soid))
612 return true;
613 if (pg_log.get_missing().get_items().count(soid))
614 return true;
615 assert(!actingbackfill.empty());
616 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
617 i != actingbackfill.end();
618 ++i) {
619 if (*i == get_primary()) continue;
620 pg_shard_t peer = *i;
621 auto peer_missing_entry = peer_missing.find(peer);
622 if (peer_missing_entry != peer_missing.end() &&
623 peer_missing_entry->second.get_items().count(soid))
624 return true;
625
626 // Object is degraded if after last_backfill AND
627 // we are backfilling it
628 if (is_backfill_targets(peer) &&
629 peer_info[peer].last_backfill <= soid &&
630 last_backfill_started >= soid &&
631 backfills_in_flight.count(soid))
632 return true;
633 }
634 return false;
635}
636
637void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
638{
639 assert(is_degraded_or_backfilling_object(soid));
640
641 maybe_kick_recovery(soid);
642 waiting_for_degraded_object[soid].push_back(op);
643 op->mark_delayed("waiting for degraded object");
644}
645
646void PrimaryLogPG::block_write_on_full_cache(
647 const hobject_t& _oid, OpRequestRef op)
648{
649 const hobject_t oid = _oid.get_head();
650 dout(20) << __func__ << ": blocking object " << oid
651 << " on full cache" << dendl;
652 objects_blocked_on_cache_full.insert(oid);
653 waiting_for_cache_not_full.push_back(op);
654 op->mark_delayed("waiting for cache not full");
655}
656
657void PrimaryLogPG::block_for_clean(
658 const hobject_t& oid, OpRequestRef op)
659{
660 dout(20) << __func__ << ": blocking object " << oid
661 << " on primary repair" << dendl;
662 waiting_for_clean_to_primary_repair.push_back(op);
663 op->mark_delayed("waiting for clean to repair");
664}
665
666void PrimaryLogPG::block_write_on_snap_rollback(
667 const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
668{
669 dout(20) << __func__ << ": blocking object " << oid.get_head()
670 << " on snap promotion " << obc->obs.oi.soid << dendl;
671 // otherwise, we'd have blocked in do_op
672 assert(oid.is_head());
673 assert(objects_blocked_on_snap_promotion.count(oid) == 0);
674 objects_blocked_on_snap_promotion[oid] = obc;
675 wait_for_blocked_object(obc->obs.oi.soid, op);
676}
677
678void PrimaryLogPG::block_write_on_degraded_snap(
679 const hobject_t& snap, OpRequestRef op)
680{
681 dout(20) << __func__ << ": blocking object " << snap.get_head()
682 << " on degraded snap " << snap << dendl;
683 // otherwise, we'd have blocked in do_op
684 assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
685 objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
686 wait_for_degraded_object(snap, op);
687}
688
689bool PrimaryLogPG::maybe_await_blocked_snapset(
690 const hobject_t &hoid,
691 OpRequestRef op)
692{
693 ObjectContextRef obc;
694 obc = object_contexts.lookup(hoid.get_head());
695 if (obc) {
696 if (obc->is_blocked()) {
697 wait_for_blocked_object(obc->obs.oi.soid, op);
698 return true;
699 } else {
700 return false;
701 }
702 }
703 obc = object_contexts.lookup(hoid.get_snapdir());
704 if (obc) {
705 if (obc->is_blocked()) {
706 wait_for_blocked_object(obc->obs.oi.soid, op);
707 return true;
708 } else {
709 return false;
710 }
711 }
712 return false;
713}
714
715void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
716{
717 dout(10) << __func__ << " " << soid << " " << op << dendl;
718 waiting_for_blocked_object[soid].push_back(op);
719 op->mark_delayed("waiting for blocked object");
720}
721
722void PrimaryLogPG::maybe_force_recovery()
723{
724 // no force if not in degraded/recovery/backfill states
725 if (!is_degraded() &&
726 !state_test(PG_STATE_RECOVERING |
727 PG_STATE_RECOVERY_WAIT |
728 PG_STATE_BACKFILLING |
729 PG_STATE_BACKFILL_WAIT |
730 PG_STATE_BACKFILL_TOOFULL))
731 return;
732
733 if (pg_log.get_log().approx_size() <
734 cct->_conf->osd_max_pg_log_entries *
735 cct->_conf->osd_force_recovery_pg_log_entries_factor)
736 return;
737
738 // find the oldest missing object
739 version_t min_version = 0;
740 hobject_t soid;
741 if (!pg_log.get_missing().get_items().empty()) {
742 min_version = pg_log.get_missing().get_rmissing().begin()->first;
743 soid = pg_log.get_missing().get_rmissing().begin()->second;
744 }
745 assert(!actingbackfill.empty());
746 for (set<pg_shard_t>::iterator it = actingbackfill.begin();
747 it != actingbackfill.end();
748 ++it) {
749 if (*it == get_primary()) continue;
750 pg_shard_t peer = *it;
751 if (peer_missing.count(peer) &&
752 !peer_missing[peer].get_items().empty() &&
753 min_version > peer_missing[peer].get_rmissing().begin()->first) {
754 min_version = peer_missing[peer].get_rmissing().begin()->first;
755 soid = peer_missing[peer].get_rmissing().begin()->second;
756 }
757 }
758
759 // recover it
760 if (soid != hobject_t())
761 maybe_kick_recovery(soid);
762}
763
764class PGLSPlainFilter : public PGLSFilter {
765 string val;
766public:
767 int init(bufferlist::iterator &params) override
768 {
769 try {
770 ::decode(xattr, params);
771 ::decode(val, params);
772 } catch (buffer::error &e) {
773 return -EINVAL;
774 }
775
776 return 0;
777 }
778 ~PGLSPlainFilter() override {}
779 bool filter(const hobject_t &obj, bufferlist& xattr_data,
780 bufferlist& outdata) override;
781};
782
783class PGLSParentFilter : public PGLSFilter {
784 inodeno_t parent_ino;
785public:
786 CephContext* cct;
787 PGLSParentFilter(CephContext* cct) : cct(cct) {
788 xattr = "_parent";
789 }
790 int init(bufferlist::iterator &params) override
791 {
792 try {
793 ::decode(parent_ino, params);
794 } catch (buffer::error &e) {
795 return -EINVAL;
796 }
797 generic_dout(0) << "parent_ino=" << parent_ino << dendl;
798
799 return 0;
800 }
801 ~PGLSParentFilter() override {}
802 bool filter(const hobject_t &obj, bufferlist& xattr_data,
803 bufferlist& outdata) override;
804};
805
806bool PGLSParentFilter::filter(const hobject_t &obj,
807 bufferlist& xattr_data, bufferlist& outdata)
808{
809 bufferlist::iterator iter = xattr_data.begin();
810 inode_backtrace_t bt;
811
812 generic_dout(0) << "PGLSParentFilter::filter" << dendl;
813
814 ::decode(bt, iter);
815
816 vector<inode_backpointer_t>::iterator vi;
817 for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
818 generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
819 if (vi->dirino == parent_ino) {
820 ::encode(*vi, outdata);
821 return true;
822 }
823 }
824
825 return false;
826}
827
828bool PGLSPlainFilter::filter(const hobject_t &obj,
829 bufferlist& xattr_data, bufferlist& outdata)
830{
831 if (val.size() != xattr_data.length())
832 return false;
833
834 if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
835 return false;
836
837 return true;
838}
839
840bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
841{
842 bufferlist bl;
843
844 // If filter has expressed an interest in an xattr, load it.
845 if (!filter->get_xattr().empty()) {
846 int ret = pgbackend->objects_get_attr(
847 sobj,
848 filter->get_xattr(),
849 &bl);
850 dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
851 if (ret < 0) {
852 if (ret != -ENODATA || filter->reject_empty_xattr()) {
853 return false;
854 }
855 }
856 }
857
858 return filter->filter(sobj, bl, outdata);
859}
860
861int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
862{
863 string type;
864 PGLSFilter *filter;
865
866 try {
867 ::decode(type, iter);
868 }
869 catch (buffer::error& e) {
870 return -EINVAL;
871 }
872
873 if (type.compare("parent") == 0) {
874 filter = new PGLSParentFilter(cct);
875 } else if (type.compare("plain") == 0) {
876 filter = new PGLSPlainFilter();
877 } else {
878 std::size_t dot = type.find(".");
879 if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
880 return -EINVAL;
881 }
882
883 const std::string class_name = type.substr(0, dot);
884 const std::string filter_name = type.substr(dot + 1);
885 ClassHandler::ClassData *cls = NULL;
886 int r = osd->class_handler->open_class(class_name, &cls);
887 if (r != 0) {
888 derr << "Error opening class '" << class_name << "': "
889 << cpp_strerror(r) << dendl;
890 if (r != -EPERM) // propagate permission error
891 r = -EINVAL;
892 return r;
893 } else {
894 assert(cls);
895 }
896
897 ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
898 if (class_filter == NULL) {
899 derr << "Error finding filter '" << filter_name << "' in class "
900 << class_name << dendl;
901 return -EINVAL;
902 }
903 filter = class_filter->fn();
904 if (!filter) {
905 // Object classes are obliged to return us something, but let's
906 // give an error rather than asserting out.
907 derr << "Buggy class " << class_name << " failed to construct "
908 "filter " << filter_name << dendl;
909 return -EINVAL;
910 }
911 }
912
913 assert(filter);
914 int r = filter->init(iter);
915 if (r < 0) {
916 derr << "Error initializing filter " << type << ": "
917 << cpp_strerror(r) << dendl;
918 delete filter;
919 return -EINVAL;
920 } else {
921 // Successfully constructed and initialized, return it.
922 *pfilter = filter;
923 return 0;
924 }
925}
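// --- Illustrative sketch (editor's addition, assumptions noted): the
// portion of the wire format that get_pgls_filter() above consumes is a
// filter type string followed by filter-specific parameters. For the
// "plain" filter, PGLSPlainFilter::init() decodes the xattr name and the
// expected value; the xattr name and value below are hypothetical.
static void example_encode_plain_filter_desc(bufferlist& bl)
{
  ::encode(string("plain"), bl);    // type, decoded by get_pgls_filter()
  ::encode(string("_mykey"), bl);   // xattr the filter should load
  ::encode(string("myvalue"), bl);  // value it must match exactly
}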
926
927
928// ==========================================================
929
930int PrimaryLogPG::do_command(
931 cmdmap_t cmdmap,
932 ostream& ss,
933 bufferlist& idata,
934 bufferlist& odata,
935 ConnectionRef con,
936 ceph_tid_t tid)
937{
938 const auto &missing = pg_log.get_missing();
939 string prefix;
940 string format;
941
942 cmd_getval(cct, cmdmap, "format", format);
943 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));
944
945 string command;
946 cmd_getval(cct, cmdmap, "cmd", command);
947 if (command == "query") {
948 f->open_object_section("pg");
949 f->dump_string("state", pg_state_string(get_state()));
950 f->dump_stream("snap_trimq") << snap_trimq;
951 f->dump_unsigned("snap_trimq_len", snap_trimq.size());
952 f->dump_unsigned("epoch", get_osdmap()->get_epoch());
953 f->open_array_section("up");
954 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
955 f->dump_unsigned("osd", *p);
956 f->close_section();
957 f->open_array_section("acting");
958 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
959 f->dump_unsigned("osd", *p);
960 f->close_section();
961 if (!backfill_targets.empty()) {
962 f->open_array_section("backfill_targets");
963 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
964 p != backfill_targets.end();
965 ++p)
966 f->dump_stream("shard") << *p;
967 f->close_section();
968 }
969 if (!actingbackfill.empty()) {
970 f->open_array_section("actingbackfill");
971 for (set<pg_shard_t>::iterator p = actingbackfill.begin();
972 p != actingbackfill.end();
973 ++p)
974 f->dump_stream("shard") << *p;
975 f->close_section();
976 }
977 f->open_object_section("info");
978 _update_calc_stats();
979 info.dump(f.get());
980 f->close_section();
981
982 f->open_array_section("peer_info");
983 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
984 p != peer_info.end();
985 ++p) {
986 f->open_object_section("info");
987 f->dump_stream("peer") << p->first;
988 p->second.dump(f.get());
989 f->close_section();
990 }
991 f->close_section();
992
993 f->open_array_section("recovery_state");
994 handle_query_state(f.get());
995 f->close_section();
996
997 f->open_object_section("agent_state");
998 if (agent_state)
999 agent_state->dump(f.get());
1000 f->close_section();
1001
1002 f->close_section();
1003 f->flush(odata);
1004 return 0;
1005 }
1006 else if (command == "mark_unfound_lost") {
1007 string mulcmd;
1008 cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
1009 int mode = -1;
1010 if (mulcmd == "revert") {
1011 if (pool.info.ec_pool()) {
1012 ss << "mode must be 'delete' for ec pool";
1013 return -EINVAL;
1014 }
1015 mode = pg_log_entry_t::LOST_REVERT;
1016 } else if (mulcmd == "delete") {
1017 mode = pg_log_entry_t::LOST_DELETE;
1018 } else {
1019 ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1020 return -EINVAL;
1021 }
1022 assert(mode == pg_log_entry_t::LOST_REVERT ||
1023 mode == pg_log_entry_t::LOST_DELETE);
1024
1025 if (!is_primary()) {
1026 ss << "not primary";
1027 return -EROFS;
1028 }
1029
1030 uint64_t unfound = missing_loc.num_unfound();
1031 if (!unfound) {
1032 ss << "pg has no unfound objects";
1033 return 0; // make command idempotent
1034 }
1035
1036 if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1037 ss << "pg has " << unfound
1038 << " unfound objects but we haven't probed all sources, not marking lost";
1039 return -EINVAL;
1040 }
1041
1042 mark_all_unfound_lost(mode, con, tid);
1043 return -EAGAIN;
1044 }
1045 else if (command == "list_missing") {
1046 hobject_t offset;
1047 string offset_json;
1048 if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
1049 json_spirit::Value v;
1050 try {
1051 if (!json_spirit::read(offset_json, v))
1052 throw std::runtime_error("bad json");
1053 offset.decode(v);
1054 } catch (std::runtime_error& e) {
1055 ss << "error parsing offset: " << e.what();
1056 return -EINVAL;
1057 }
1058 }
1059 f->open_object_section("missing");
1060 {
1061 f->open_object_section("offset");
1062 offset.dump(f.get());
1063 f->close_section();
1064 }
1065 f->dump_int("num_missing", missing.num_missing());
1066 f->dump_int("num_unfound", get_num_unfound());
1067 const map<hobject_t, pg_missing_item> &needs_recovery_map =
1068 missing_loc.get_needs_recovery();
1069 map<hobject_t, pg_missing_item>::const_iterator p =
1070 needs_recovery_map.upper_bound(offset);
1071 {
1072 f->open_array_section("objects");
1073 int32_t num = 0;
1074 for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
1075 if (missing_loc.is_unfound(p->first)) {
1076 f->open_object_section("object");
1077 {
1078 f->open_object_section("oid");
1079 p->first.dump(f.get());
1080 f->close_section();
1081 }
1082 p->second.dump(f.get()); // have, need keys
1083 {
1084 f->open_array_section("locations");
1085 for (set<pg_shard_t>::iterator r =
1086 missing_loc.get_locations(p->first).begin();
1087 r != missing_loc.get_locations(p->first).end();
1088 ++r)
1089 f->dump_stream("shard") << *r;
1090 f->close_section();
1091 }
1092 f->close_section();
1093 num++;
1094 }
1095 }
1096 f->close_section();
1097 }
1098 f->dump_bool("more", p != needs_recovery_map.end());
1099 f->close_section();
1100 f->flush(odata);
1101 return 0;
1102 }
1103
1104 ss << "unknown pg command " << command;
1105 return -EINVAL;
1106}
1107
1108// ==========================================================
1109
1110void PrimaryLogPG::do_pg_op(OpRequestRef op)
1111{
1112 // NOTE: this is non-const because we modify the OSDOp.outdata in
1113 // place
1114 MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
1115 assert(m->get_type() == CEPH_MSG_OSD_OP);
1116 dout(10) << "do_pg_op " << *m << dendl;
1117
1118 op->mark_started();
1119
1120 int result = 0;
1121 string cname, mname;
1122 PGLSFilter *filter = NULL;
1123 bufferlist filter_out;
1124
1125 snapid_t snapid = m->get_snapid();
1126
1127 vector<OSDOp> ops = m->ops;
1128
1129 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
1130 OSDOp& osd_op = *p;
1131 bufferlist::iterator bp = p->indata.begin();
1132 switch (p->op.op) {
1133 case CEPH_OSD_OP_PGNLS_FILTER:
1134 try {
1135 ::decode(cname, bp);
1136 ::decode(mname, bp);
1137 }
1138 catch (const buffer::error& e) {
1139 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1140 result = -EINVAL;
1141 break;
1142 }
1143 if (filter) {
1144 delete filter;
1145 filter = NULL;
1146 }
1147 result = get_pgls_filter(bp, &filter);
1148 if (result < 0)
1149 break;
1150
1151 assert(filter);
1152
1153 // fall through
1154
1155 case CEPH_OSD_OP_PGNLS:
1156 if (snapid != CEPH_NOSNAP) {
1157 result = -EINVAL;
1158 break;
1159 }
1160 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1161 dout(10) << " pgnls pg=" << m->get_pg()
1162 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1163 << " != " << info.pgid << dendl;
1164 result = 0; // hmm?
1165 } else {
1166 unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
1167
1168 dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
1169 // read into a buffer
1170 vector<hobject_t> sentries;
1171 pg_nls_response_t response;
1172 try {
1173 ::decode(response.handle, bp);
1174 }
1175 catch (const buffer::error& e) {
1176 dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1177 result = -EINVAL;
1178 break;
1179 }
1180
1181 hobject_t next;
1182 hobject_t lower_bound = response.handle;
1183 hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1184 hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1185 dout(10) << " pgnls lower_bound " << lower_bound
1186 << " pg_end " << pg_end << dendl;
1187 if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1188 (lower_bound != hobject_t() && lower_bound < pg_start))) {
1189 // this should only happen with a buggy client.
1190 dout(10) << "outside of PG bounds " << pg_start << " .. "
1191 << pg_end << dendl;
1192 result = -EINVAL;
1193 break;
1194 }
1195
1196 hobject_t current = lower_bound;
1197 osr->flush();
1198 int r = pgbackend->objects_list_partial(
1199 current,
1200 list_size,
1201 list_size,
1202 &sentries,
1203 &next);
1204 if (r != 0) {
1205 result = -EINVAL;
1206 break;
1207 }
1208
1209 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1210 pg_log.get_missing().get_items().lower_bound(current);
1211 vector<hobject_t>::iterator ls_iter = sentries.begin();
1212 hobject_t _max = hobject_t::get_max();
1213 while (1) {
1214 const hobject_t &mcand =
1215 missing_iter == pg_log.get_missing().get_items().end() ?
1216 _max :
1217 missing_iter->first;
1218 const hobject_t &lcand =
1219 ls_iter == sentries.end() ?
1220 _max :
1221 *ls_iter;
1222
1223 hobject_t candidate;
1224 if (mcand == lcand) {
1225 candidate = mcand;
1226 if (!mcand.is_max()) {
1227 ++ls_iter;
1228 ++missing_iter;
1229 }
1230 } else if (mcand < lcand) {
1231 candidate = mcand;
1232 assert(!mcand.is_max());
1233 ++missing_iter;
1234 } else {
1235 candidate = lcand;
1236 assert(!lcand.is_max());
1237 ++ls_iter;
1238 }
1239
1240 dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
1241 << " vs lower bound 0x" << lower_bound.get_hash() << dendl;
1242
1243 if (candidate >= next) {
1244 break;
1245 }
1246
1247 if (response.entries.size() == list_size) {
1248 next = candidate;
1249 break;
1250 }
1251
1252 // skip snapdir objects
1253 if (candidate.snap == CEPH_SNAPDIR)
1254 continue;
1255
1256 if (candidate.snap != CEPH_NOSNAP)
1257 continue;
1258
1259 // skip internal namespace
1260 if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1261 continue;
1262
1263 if (missing_loc.is_deleted(candidate))
1264 continue;
1265
1266 // skip wrong namespace
1267 if (m->get_hobj().nspace != librados::all_nspaces &&
1268 candidate.get_namespace() != m->get_hobj().nspace)
1269 continue;
1270
1271 if (filter && !pgls_filter(filter, candidate, filter_out))
1272 continue;
1273
1274 dout(20) << "pgnls item 0x" << std::hex
1275 << candidate.get_hash()
1276 << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1277 << std::dec << " "
1278 << candidate.oid.name << dendl;
1279
1280 librados::ListObjectImpl item;
1281 item.nspace = candidate.get_namespace();
1282 item.oid = candidate.oid.name;
1283 item.locator = candidate.get_key();
1284 response.entries.push_back(item);
1285 }
1286
1287 if (next.is_max() &&
1288 missing_iter == pg_log.get_missing().get_items().end() &&
1289 ls_iter == sentries.end()) {
1290 result = 1;
1291
1292 // Set response.handle to the start of the next PG according
1293 // to the object sort order.
1294 response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1295 } else {
1296 response.handle = next;
1297 }
1298 dout(10) << "pgnls handle=" << response.handle << dendl;
1299 ::encode(response, osd_op.outdata);
1300 if (filter)
1301 ::encode(filter_out, osd_op.outdata);
1302 dout(10) << " pgnls result=" << result << " outdata.length()="
1303 << osd_op.outdata.length() << dendl;
1304 }
1305 break;
1306
1307 case CEPH_OSD_OP_PGLS_FILTER:
1308 try {
1309 ::decode(cname, bp);
1310 ::decode(mname, bp);
1311 }
1312 catch (const buffer::error& e) {
1313 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1314 result = -EINVAL;
1315 break;
1316 }
1317 if (filter) {
1318 delete filter;
1319 filter = NULL;
1320 }
1321 result = get_pgls_filter(bp, &filter);
1322 if (result < 0)
1323 break;
1324
1325 assert(filter);
1326
1327 // fall through
1328
1329 case CEPH_OSD_OP_PGLS:
1330 if (snapid != CEPH_NOSNAP) {
1331 result = -EINVAL;
1332 break;
1333 }
1334 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1335 dout(10) << " pgls pg=" << m->get_pg()
1336 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1337 << " != " << info.pgid << dendl;
1338 result = 0; // hmm?
1339 } else {
1340 unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
1341
1342 dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1343 // read into a buffer
1344 vector<hobject_t> sentries;
1345 pg_ls_response_t response;
1346 try {
1347 ::decode(response.handle, bp);
1348 }
1349 catch (const buffer::error& e) {
1350 dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1351 result = -EINVAL;
1352 break;
1353 }
1354
1355 hobject_t next;
1356 hobject_t current = response.handle;
1357 osr->flush();
1358 int r = pgbackend->objects_list_partial(
1359 current,
1360 list_size,
1361 list_size,
1362 &sentries,
1363 &next);
1364 if (r != 0) {
1365 result = -EINVAL;
1366 break;
1367 }
1368
1369 assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());
1370
1371 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1372 pg_log.get_missing().get_items().lower_bound(current);
1373 vector<hobject_t>::iterator ls_iter = sentries.begin();
1374 hobject_t _max = hobject_t::get_max();
1375 while (1) {
1376 const hobject_t &mcand =
1377 missing_iter == pg_log.get_missing().get_items().end() ?
1378 _max :
1379 missing_iter->first;
1380 const hobject_t &lcand =
1381 ls_iter == sentries.end() ?
1382 _max :
1383 *ls_iter;
1384
1385 hobject_t candidate;
1386 if (mcand == lcand) {
1387 candidate = mcand;
1388 if (!mcand.is_max()) {
1389 ++ls_iter;
1390 ++missing_iter;
1391 }
1392 } else if (mcand < lcand) {
1393 candidate = mcand;
1394 assert(!mcand.is_max());
1395 ++missing_iter;
1396 } else {
1397 candidate = lcand;
1398 assert(!lcand.is_max());
1399 ++ls_iter;
1400 }
1401
1402 if (candidate >= next) {
1403 break;
1404 }
1405
1406 if (response.entries.size() == list_size) {
1407 next = candidate;
1408 break;
1409 }
1410
1411 // skip snapdir objects
1412 if (candidate.snap == CEPH_SNAPDIR)
1413 continue;
1414
1415 if (candidate.snap != CEPH_NOSNAP)
1416 continue;
1417
1418 // skip wrong namespace
1419 if (candidate.get_namespace() != m->get_hobj().nspace)
1420 continue;
1421
1422 if (missing_loc.is_deleted(candidate))
1423 continue;
1424
1425 if (filter && !pgls_filter(filter, candidate, filter_out))
1426 continue;
1427
1428 response.entries.push_back(make_pair(candidate.oid,
1429 candidate.get_key()));
1430 }
1431 if (next.is_max() &&
1432 missing_iter == pg_log.get_missing().get_items().end() &&
1433 ls_iter == sentries.end()) {
1434 result = 1;
1435 }
1436 response.handle = next;
1437 ::encode(response, osd_op.outdata);
1438 if (filter)
1439 ::encode(filter_out, osd_op.outdata);
1440 dout(10) << " pgls result=" << result << " outdata.length()="
1441 << osd_op.outdata.length() << dendl;
1442 }
1443 break;
1444
1445 case CEPH_OSD_OP_PG_HITSET_LS:
1446 {
1447 list< pair<utime_t,utime_t> > ls;
1448 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1449 p != info.hit_set.history.end();
1450 ++p)
1451 ls.push_back(make_pair(p->begin, p->end));
1452 if (hit_set)
1453 ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
1454 ::encode(ls, osd_op.outdata);
1455 }
1456 break;
1457
1458 case CEPH_OSD_OP_PG_HITSET_GET:
1459 {
1460 utime_t stamp(osd_op.op.hit_set_get.stamp);
1461 if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1462 // read the current in-memory HitSet, not the version we've
1463 // checkpointed.
1464 if (!hit_set) {
1465 result= -ENOENT;
1466 break;
1467 }
1468 ::encode(*hit_set, osd_op.outdata);
1469 result = osd_op.outdata.length();
1470 } else {
1471 // read an archived HitSet.
1472 hobject_t oid;
1473 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1474 p != info.hit_set.history.end();
1475 ++p) {
1476 if (stamp >= p->begin && stamp <= p->end) {
1477 oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1478 break;
1479 }
1480 }
1481 if (oid == hobject_t()) {
1482 result = -ENOENT;
1483 break;
1484 }
1485 if (!pool.info.is_replicated()) {
1486 // FIXME: EC not supported yet
1487 result = -EOPNOTSUPP;
1488 break;
1489 }
1490 if (is_unreadable_object(oid)) {
1491 wait_for_unreadable_object(oid, op);
1492 delete filter;
1493 return;
1494 }
1495 result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1496 }
1497 }
1498 break;
1499
1500 case CEPH_OSD_OP_SCRUBLS:
1501 result = do_scrub_ls(m, &osd_op);
1502 break;
1503
1504 default:
1505 result = -EINVAL;
1506 break;
1507 }
1508
1509 if (result < 0)
1510 break;
1511 }
1512
1513 // reply
1514 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
1515 CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1516 false);
1517 reply->claim_op_out_data(ops);
1518 reply->set_result(result);
1519 reply->set_reply_versions(info.last_update, info.last_user_version);
1520 osd->send_message_osd_client(reply, m->get_connection());
1521 delete filter;
1522}
1523
1524int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
1525{
1526 if (m->get_pg() != info.pgid.pgid) {
1527 dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1528 return -EINVAL; // hmm?
1529 }
1530 auto bp = osd_op->indata.begin();
1531 scrub_ls_arg_t arg;
1532 try {
1533 arg.decode(bp);
1534 } catch (buffer::error&) {
1535 dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1536 return -EINVAL;
1537 }
1538 int r = 0;
1539 scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1540 if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1541 r = -EAGAIN;
1542 } else if (!scrubber.store) {
1543 r = -ENOENT;
1544 } else if (arg.get_snapsets) {
1545 result.vals = scrubber.store->get_snap_errors(osd->store,
1546 get_pgid().pool(),
1547 arg.start_after,
1548 arg.max_return);
1549 } else {
1550 result.vals = scrubber.store->get_object_errors(osd->store,
1551 get_pgid().pool(),
1552 arg.start_after,
1553 arg.max_return);
1554 }
1555 ::encode(result, osd_op->outdata);
1556 return r;
1557}
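// --- Illustrative sketch (editor's addition): building the scrub_ls_arg_t
// that do_scrub_ls() above decodes. Only fields referenced in that function
// are set; the values chosen here are hypothetical.
static scrub_ls_arg_t example_scrub_ls_arg(epoch_t same_interval_since)
{
  scrub_ls_arg_t arg;
  arg.interval = same_interval_since; // must match the PG's interval, else -EAGAIN
  arg.get_snapsets = false;           // object errors rather than snapset errors
  arg.start_after = hobject_t();      // list from the beginning
  arg.max_return = 64;                // cap on the number of returned entries
  return arg;
}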
1558
1559void PrimaryLogPG::calc_trim_to()
1560{
1561 size_t target = cct->_conf->osd_min_pg_log_entries;
1562 if (is_degraded() ||
1563 state_test(PG_STATE_RECOVERING |
1564 PG_STATE_RECOVERY_WAIT |
1565 PG_STATE_BACKFILLING |
1566 PG_STATE_BACKFILL_WAIT |
1567 PG_STATE_BACKFILL_TOOFULL)) {
1568 target = cct->_conf->osd_max_pg_log_entries;
1569 }
1570
1571 eversion_t limit = MIN(
1572 min_last_complete_ondisk,
1573 pg_log.get_can_rollback_to());
1574 if (limit != eversion_t() &&
1575 limit != pg_trim_to &&
1576 pg_log.get_log().approx_size() > target) {
1577 size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
1578 cct->_conf->osd_pg_log_trim_max);
1579 if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
1580 cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
1581 return;
1582 }
1583 list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
1584 eversion_t new_trim_to;
1585 for (size_t i = 0; i < num_to_trim; ++i) {
1586 new_trim_to = it->version;
1587 ++it;
1588 if (new_trim_to > limit) {
1589 new_trim_to = limit;
1590 dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
1591 break;
1592 }
1593 }
1594 dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
1595 pg_trim_to = new_trim_to;
1596 assert(pg_trim_to <= pg_log.get_head());
1597 assert(pg_trim_to <= min_last_complete_ondisk);
1598 }
1599}
1600
1601PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1602 const PGPool &_pool, spg_t p) :
1603 PG(o, curmap, _pool, p),
1604 pgbackend(
1605 PGBackend::build_pg_backend(
1606 _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
1607 object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
1608 snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
1609 new_backfill(false),
1610 temp_seq(0),
1611 snap_trimmer_machine(this)
1612{
1613 missing_loc.set_backend_predicates(
1614 pgbackend->get_is_readable_predicate(),
1615 pgbackend->get_is_recoverable_predicate());
1616 snap_trimmer_machine.initiate();
1617}
1618
1619void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1620{
1621 src_oloc = oloc;
1622 if (oloc.key.empty())
1623 src_oloc.key = oid.name;
1624}
1625
1626void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1627{
1628 const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
1629 SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1630 if (!session)
1631 return; // drop it.
1632 session->put(); // get_priv takes a ref, and so does the SessionRef
1633 hobject_t begin = info.pgid.pgid.get_hobj_start();
1634 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1635 if (begin < m->begin) {
1636 begin = m->begin;
1637 }
1638 if (end > m->end) {
1639 end = m->end;
1640 }
1641 dout(10) << __func__ << " backoff ack id " << m->id
1642 << " [" << begin << "," << end << ")" << dendl;
1643 session->ack_backoff(cct, m->pgid, m->id, begin, end);
1644}
1645
1646void PrimaryLogPG::do_request(
1647 OpRequestRef& op,
1648 ThreadPool::TPHandle &handle)
1649{
1650 if (op->osd_trace) {
1651 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1652 op->pg_trace.event("do request");
1653 }
1654 // make sure we have a new enough map
1655 auto p = waiting_for_map.find(op->get_source());
1656 if (p != waiting_for_map.end()) {
1657 // preserve ordering
1658 dout(20) << __func__ << " waiting_for_map "
1659 << p->first << " not empty, queueing" << dendl;
1660 p->second.push_back(op);
1661 op->mark_delayed("waiting_for_map not empty");
1662 return;
1663 }
1664 if (!have_same_or_newer_map(op->min_epoch)) {
1665 dout(20) << __func__ << " min " << op->min_epoch
1666 << ", queue on waiting_for_map " << op->get_source() << dendl;
1667 waiting_for_map[op->get_source()].push_back(op);
1668 op->mark_delayed("op must wait for map");
1669 osd->request_osdmap_update(op->min_epoch);
1670 return;
1671 }
1672
1673 if (can_discard_request(op)) {
1674 return;
1675 }
1676
1677 // pg-wide backoffs
1678 const Message *m = op->get_req();
1679 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1680 SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1681 if (!session)
1682 return; // drop it.
1683 session->put(); // get_priv takes a ref, and so does the SessionRef
1684
1685 if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
1686 if (session->check_backoff(cct, info.pgid,
1687 info.pgid.pgid.get_hobj_start(), m)) {
1688 return;
1689 }
1690
1691 bool backoff =
1692 is_down() ||
1693 is_incomplete() ||
1694 (!is_active() && is_peered());
1695 if (g_conf->osd_backoff_on_peering && !backoff) {
1696 if (is_peering()) {
1697 backoff = true;
1698 }
1699 }
1700 if (backoff) {
1701 add_pg_backoff(session);
1702 return;
1703 }
1704 }
1705 // pg backoff acks at pg-level
1706 if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
1707 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1708 if (ba->begin != ba->end) {
1709 handle_backoff(op);
1710 return;
1711 }
1712 }
1713 }
1714
1715 if (!is_peered()) {
1716 // Delay unless PGBackend says it's ok
1717 if (pgbackend->can_handle_while_inactive(op)) {
1718 bool handled = pgbackend->handle_message(op);
1719 assert(handled);
1720 return;
1721 } else {
1722 waiting_for_peered.push_back(op);
1723 op->mark_delayed("waiting for peered");
1724 return;
1725 }
1726 }
1727
1728 if (flushes_in_progress > 0) {
1729 dout(20) << flushes_in_progress
1730 << " flushes_in_progress pending "
1731 << "waiting for flush on " << op << dendl;
1732 waiting_for_flush.push_back(op);
1733 op->mark_delayed("waiting for flush");
1734 return;
1735 }
1736
1737 assert(is_peered() && flushes_in_progress == 0);
1738 if (pgbackend->handle_message(op))
1739 return;
1740
1741 switch (op->get_req()->get_type()) {
1742 case CEPH_MSG_OSD_OP:
1743 case CEPH_MSG_OSD_BACKOFF:
1744 if (!is_active()) {
1745 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1746 waiting_for_active.push_back(op);
1747 op->mark_delayed("waiting for active");
1748 return;
1749 }
1750 switch (op->get_req()->get_type()) {
1751 case CEPH_MSG_OSD_OP:
1752 // verify client features
1753 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1754 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1755 osd->reply_op_error(op, -EOPNOTSUPP);
1756 return;
1757 }
1758 do_op(op);
1759 break;
1760 case CEPH_MSG_OSD_BACKOFF:
1761 // object-level backoff acks handled in osdop context
1762 handle_backoff(op);
1763 break;
1764 }
1765 break;
1766
1767 case MSG_OSD_SUBOP:
1768 do_sub_op(op);
1769 break;
1770
1771 case MSG_OSD_SUBOPREPLY:
1772 do_sub_op_reply(op);
1773 break;
1774
1775 case MSG_OSD_PG_SCAN:
1776 do_scan(op, handle);
1777 break;
1778
1779 case MSG_OSD_PG_BACKFILL:
1780 do_backfill(op);
1781 break;
1782
1783 case MSG_OSD_PG_BACKFILL_REMOVE:
1784 do_backfill_remove(op);
1785 break;
1786
1787 case MSG_OSD_SCRUB_RESERVE:
1788 {
1789 const MOSDScrubReserve *m =
1790 static_cast<const MOSDScrubReserve*>(op->get_req());
1791 switch (m->type) {
1792 case MOSDScrubReserve::REQUEST:
1793 handle_scrub_reserve_request(op);
1794 break;
1795 case MOSDScrubReserve::GRANT:
1796 handle_scrub_reserve_grant(op, m->from);
1797 break;
1798 case MOSDScrubReserve::REJECT:
1799 handle_scrub_reserve_reject(op, m->from);
1800 break;
1801 case MOSDScrubReserve::RELEASE:
1802 handle_scrub_reserve_release(op);
1803 break;
1804 }
1805 }
1806 break;
1807
1808 case MSG_OSD_REP_SCRUB:
1809 replica_scrub(op, handle);
1810 break;
1811
1812 case MSG_OSD_REP_SCRUBMAP:
1813 do_replica_scrub_map(op);
1814 break;
1815
1816 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1817 do_update_log_missing(op);
1818 break;
1819
1820 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1821 do_update_log_missing_reply(op);
1822 break;
1823
1824 default:
1825 assert(0 == "bad message type in do_request");
1826 }
1827}
1828
1829hobject_t PrimaryLogPG::earliest_backfill() const
1830{
1831 hobject_t e = hobject_t::get_max();
1832 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1833 i != backfill_targets.end();
1834 ++i) {
1835 pg_shard_t bt = *i;
1836 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1837 assert(iter != peer_info.end());
1838 if (iter->second.last_backfill < e)
1839 e = iter->second.last_backfill;
1840 }
1841 return e;
1842}
1843
1844/** do_op - do an op
1845 * pg lock will be held (if multithreaded)
1846 * osd_lock NOT held.
1847 */
1848void PrimaryLogPG::do_op(OpRequestRef& op)
1849{
1850 FUNCTRACE();
1851 // NOTE: take a non-const pointer here; we must be careful not to
1852 // change anything that will break other reads on m (operator<<).
1853 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1854 assert(m->get_type() == CEPH_MSG_OSD_OP);
1855 if (m->finish_decode()) {
1856 op->reset_desc(); // for TrackedOp
1857 m->clear_payload();
1858 }
1859
1860 dout(20) << __func__ << ": op " << *m << dendl;
1861
1862 hobject_t head = m->get_hobj();
1863 head.snap = CEPH_NOSNAP;
1864
1865 if (!info.pgid.pgid.contains(
1866 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1867 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1868 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1869 << std::hex << head.get_hash() << std::dec << dendl;
1870 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1871 << " op " << *m;
1872 assert(!cct->_conf->osd_debug_misdirected_ops);
1873 return;
1874 }
1875
1876 bool can_backoff =
1877 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1878 SessionRef session;
1879 if (can_backoff) {
1880 session = static_cast<Session*>(m->get_connection()->get_priv());
1881 if (!session.get()) {
1882 dout(10) << __func__ << " no session" << dendl;
1883 return;
1884 }
1885 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1886
1887 if (session->check_backoff(cct, info.pgid, head, m)) {
1888 return;
1889 }
1890 }
1891
1892 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1893 // not implemented.
1894 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1895 osd->reply_op_error(op, -EINVAL);
1896 return;
1897 }
1898
1899 if (op->rmw_flags == 0) {
1900 int r = osd->osd->init_op_flags(op);
1901 if (r) {
1902 osd->reply_op_error(op, r);
1903 return;
1904 }
1905 }
1906
1907 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1908 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1909 op->may_read() &&
1910 !(op->may_write() || op->may_cache())) {
1911 // balanced reads; any replica will do
1912 if (!(is_primary() || is_replica())) {
1913 osd->handle_misdirected_op(this, op);
1914 return;
1915 }
1916 } else {
1917 // normal case; must be primary
1918 if (!is_primary()) {
1919 osd->handle_misdirected_op(this, op);
1920 return;
1921 }
1922 }
1923
1924 if (!op_has_sufficient_caps(op)) {
1925 osd->reply_op_error(op, -EPERM);
1926 return;
1927 }
1928
1929 if (op->includes_pg_op()) {
1930 return do_pg_op(op);
1931 }
1932
1933 // object name too long?
1934 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1935 dout(4) << "do_op name is longer than "
1936 << cct->_conf->osd_max_object_name_len
1937 << " bytes" << dendl;
1938 osd->reply_op_error(op, -ENAMETOOLONG);
1939 return;
1940 }
1941 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1942 dout(4) << "do_op locator is longer than "
1943 << cct->_conf->osd_max_object_name_len
1944 << " bytes" << dendl;
1945 osd->reply_op_error(op, -ENAMETOOLONG);
1946 return;
1947 }
1948 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1949 dout(4) << "do_op namespace is longer than "
1950 << cct->_conf->osd_max_object_namespace_len
1951 << " bytes" << dendl;
1952 osd->reply_op_error(op, -ENAMETOOLONG);
1953 return;
1954 }
1955
1956 if (int r = osd->store->validate_hobject_key(head)) {
1957 dout(4) << "do_op object " << head << " invalid for backing store: "
1958 << r << dendl;
1959 osd->reply_op_error(op, r);
1960 return;
1961 }
1962
1963 // blacklisted?
1964 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1965 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1966 osd->reply_op_error(op, -EBLACKLISTED);
1967 return;
1968 }
1969
1970 // order this op as a write?
1971 bool write_ordered = op->rwordered();
1972
1973 // discard due to cluster full transition? (we discard any op that
1974 // originates before the cluster or pool is marked full; the client
1975 // will resend after the full flag is removed or if they expect the
1976 // op to succeed despite being full). The exceptions are FULL_FORCE and
1977 // FULL_TRY ops, which there is no reason to discard because they
1978 // bypass all full checks anyway. If this op isn't write-ordered, we
1979 // skip the check.
1980 // FIXME: we exclude mds writes for now.
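// Worked example with hypothetical epochs: if last_epoch_marked_full is 120
// and the client stamped this op with map epoch 110, the op is dropped below
// and the client is expected to resend once it sees the full flag change.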
1981 if (write_ordered && !(m->get_source().is_mds() ||
1982 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1983 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1984 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1985 dout(10) << __func__ << " discarding op sent before full " << m << " "
1986 << *m << dendl;
1987 return;
1988 }
1989 // The MDS should have stopped writing before this point. We can't
1990 // allow the OSD to become non-startable, even if the MDS could still
1991 // be writing as part of file removals.
1992 ostringstream ss;
1993 if (write_ordered && osd->check_failsafe_full(ss)) {
1994 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1995 << ss.str()
1996 << dendl;
1997 return;
1998 }
1999 int64_t poolid = get_pgid().pool();
2000 if (op->may_write()) {
2001
2002 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2003 if (!pi) {
2004 return;
2005 }
2006
2007 // invalid?
2008 if (m->get_snapid() != CEPH_NOSNAP) {
2009 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2010 osd->reply_op_error(op, -EINVAL);
2011 return;
2012 }
2013
2014 // too big?
2015 if (cct->_conf->osd_max_write_size &&
2016 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2017 // journal can't hold commit!
2018 derr << "do_op msg data len " << m->get_data_len()
2019 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2020 << " on " << *m << dendl;
2021 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2022 return;
2023 }
2024 }
2025
2026 dout(10) << "do_op " << *m
2027 << (op->may_write() ? " may_write" : "")
2028 << (op->may_read() ? " may_read" : "")
2029 << (op->may_cache() ? " may_cache" : "")
2030 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2031 << " flags " << ceph_osd_flag_string(m->get_flags())
2032 << dendl;
2033
2034 // missing object?
2035 if (is_unreadable_object(head)) {
2036 if (!is_primary()) {
2037 osd->reply_op_error(op, -EAGAIN);
2038 return;
2039 }
2040 if (can_backoff &&
2041 (g_conf->osd_backoff_on_degraded ||
2042 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2043 add_backoff(session, head, head);
2044 maybe_kick_recovery(head);
2045 } else {
2046 wait_for_unreadable_object(head, op);
2047 }
2048 return;
2049 }
2050
2051 // degraded object?
2052 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2053 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2054 add_backoff(session, head, head);
2055 maybe_kick_recovery(head);
2056 } else {
2057 wait_for_degraded_object(head, op);
2058 }
2059 return;
2060 }
2061
2062 if (write_ordered && scrubber.is_chunky_scrub_active() &&
2063 write_blocked_by_scrub(head)) {
2064 dout(20) << __func__ << ": waiting for scrub" << dendl;
2065 waiting_for_scrub.push_back(op);
2066 op->mark_delayed("waiting for scrub");
2067 return;
2068 }
2069
2070 // blocked on snap?
2071 map<hobject_t, snapid_t>::iterator blocked_iter =
2072 objects_blocked_on_degraded_snap.find(head);
2073 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2074 hobject_t to_wait_on(head);
2075 to_wait_on.snap = blocked_iter->second;
2076 wait_for_degraded_object(to_wait_on, op);
2077 return;
2078 }
2079 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2080 objects_blocked_on_snap_promotion.find(head);
2081 if (write_ordered &&
2082 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2083 wait_for_blocked_object(
2084 blocked_snap_promote_iter->second->obs.oi.soid,
2085 op);
2086 return;
2087 }
2088 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2089 block_write_on_full_cache(head, op);
2090 return;
2091 }
2092
2093 // missing snapdir?
2094 hobject_t snapdir = head.get_snapdir();
2095
2096 if (is_unreadable_object(snapdir)) {
2097 wait_for_unreadable_object(snapdir, op);
2098 return;
2099 }
2100
2101 // degraded object?
2102 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2103 wait_for_degraded_object(snapdir, op);
2104 return;
2105 }
2106
2107 // dup/resent?
2108 if (op->may_write() || op->may_cache()) {
2109 // warning: we will get back *a* request for this reqid, but not
2110 // necessarily the most recent. this happens with flush and
2111 // promote ops, but we can't possibly have both in our log where
2112 // the original request is still not stable on disk, so for our
2113 // purposes here it doesn't matter which one we get.
2114 eversion_t version;
2115 version_t user_version;
2116 int return_code = 0;
2117 bool got = check_in_progress_op(
2118 m->get_reqid(), &version, &user_version, &return_code);
2119 if (got) {
2120 dout(3) << __func__ << " dup " << m->get_reqid()
2121 << " version " << version << dendl;
2122 if (already_complete(version)) {
2123 osd->reply_op_error(op, return_code, version, user_version);
2124 } else {
2125 dout(10) << " waiting for " << version << " to commit" << dendl;
2126 // always queue ondisk waiters, so that we can requeue if needed
2127 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2128 op->mark_delayed("waiting for ondisk");
2129 }
2130 return;
2131 }
2132 }
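// Sketch of the dup path above: check_in_progress_op() looks up
// m->get_reqid() among in-flight and logged requests. On a hit we either
// re-ack immediately with the recorded (version, user_version, return_code)
// if the original already committed, or park the resend on
// waiting_for_ondisk so the ack goes out when the original commit lands.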
2133
2134 ObjectContextRef obc;
2135 bool can_create = op->may_write() || op->may_cache();
2136 hobject_t missing_oid;
2137 const hobject_t& oid = m->get_hobj();
2138
2139 // io blocked on obc?
2140 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2141 maybe_await_blocked_snapset(oid, op)) {
2142 return;
2143 }
2144
2145 int r = find_object_context(
2146 oid, &obc, can_create,
2147 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2148 &missing_oid);
2149
2150 if (r == -EAGAIN) {
2151 // If this OSD is not the primary, we just return -EAGAIN; otherwise,
2152 // we have to wait for the object.
2153 if (is_primary()) {
2154 // missing the specific snap we need; requeue and wait.
2155 assert(!op->may_write()); // only happens on a read/cache
2156 wait_for_unreadable_object(missing_oid, op);
2157 return;
2158 }
2159 } else if (r == 0) {
2160 if (is_unreadable_object(obc->obs.oi.soid)) {
2161 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2162 << " is unreadable, waiting" << dendl;
2163 wait_for_unreadable_object(obc->obs.oi.soid, op);
2164 return;
2165 }
2166
2167 // degraded object? (the check above was for head; this could be a clone)
2168 if (write_ordered &&
2169 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2170 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2171 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2172 << " is degraded, waiting" << dendl;
2173 wait_for_degraded_object(obc->obs.oi.soid, op);
2174 return;
2175 }
2176 }
2177
2178 bool in_hit_set = false;
2179 if (hit_set) {
2180 if (obc.get()) {
2181 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2182 in_hit_set = true;
2183 } else {
2184 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2185 in_hit_set = true;
2186 }
2187 if (!op->hitset_inserted) {
2188 hit_set->insert(oid);
2189 op->hitset_inserted = true;
2190 if (hit_set->is_full() ||
2191 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2192 hit_set_persist();
2193 }
2194 }
2195 }
2196
2197 if (agent_state) {
2198 if (agent_choose_mode(false, op))
2199 return;
2200 }
2201
2202 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2203 if (maybe_handle_manifest(op,
2204 write_ordered,
2205 obc))
2206 return;
2207 }
2208
2209 if (maybe_handle_cache(op,
2210 write_ordered,
2211 obc,
2212 r,
2213 missing_oid,
2214 false,
2215 in_hit_set))
2216 return;
2217
2218 if (r && (r != -ENOENT || !obc)) {
2219 // copy the reqids for copy get on ENOENT
2220 if (r == -ENOENT &&
2221 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2222 fill_in_copy_get_noent(op, oid, m->ops[0]);
2223 return;
2224 }
2225 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2226 if (op->may_write() &&
2227 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2228 record_write_error(op, oid, nullptr, r);
2229 } else {
2230 osd->reply_op_error(op, r);
2231 }
2232 return;
2233 }
2234
2235 // make sure locator is consistent
2236 object_locator_t oloc(obc->obs.oi.soid);
2237 if (m->get_object_locator() != oloc) {
2238 dout(10) << " provided locator " << m->get_object_locator()
2239 << " != object's " << obc->obs.oi.soid << dendl;
2240 osd->clog->warn() << "bad locator " << m->get_object_locator()
2241 << " on object " << oloc
2242 << " op " << *m;
2243 }
2244
2245 // io blocked on obc?
2246 if (obc->is_blocked() &&
2247 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2248 wait_for_blocked_object(obc->obs.oi.soid, op);
2249 return;
2250 }
2251
2252 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2253
2254 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2255 OSDOp& osd_op = *p;
2256
2257 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2258 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2259 m->get_snapid() != CEPH_SNAPDIR) {
2260 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2261 osd->reply_op_error(op, -EINVAL);
2262 return;
2263 }
2264 }
2265
2266 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2267
2268 if (!obc->obs.exists)
2269 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2270
2271 /* Due to obc caching, we might have a cached non-existent snapset_obc
2272 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2273 * do_op pipeline make decisions based on whether snapset_obc is
2274 * populated.
2275 */
2276 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2277 ctx->snapset_obc = ObjectContextRef();
2278
2279 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2280 dout(20) << __func__ << ": skipping rw locks" << dendl;
2281 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2282 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2283
2284 // verify there is in fact a flush in progress
2285 // FIXME: we could make this a stronger test.
2286 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2287 if (p == flush_ops.end()) {
2288 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2289 reply_ctx(ctx, -EINVAL);
2290 return;
2291 }
2292 } else if (!get_rw_locks(write_ordered, ctx)) {
2293 dout(20) << __func__ << " waiting for rw locks " << dendl;
2294 op->mark_delayed("waiting for rw locks");
2295 close_op_ctx(ctx);
2296 return;
2297 }
2298 dout(20) << __func__ << " obc " << *obc << dendl;
2299
2300 if (r) {
2301 dout(20) << __func__ << " returned an error: " << r << dendl;
2302 close_op_ctx(ctx);
2303 if (op->may_write() &&
2304 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2305 record_write_error(op, oid, nullptr, r);
2306 } else {
2307 osd->reply_op_error(op, r);
2308 }
2309 return;
2310 }
2311
2312 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2313 ctx->ignore_cache = true;
2314 }
2315
2316 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2317 // This object is lost. Reading from it returns an error.
2318 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2319 << " is lost" << dendl;
2320 reply_ctx(ctx, -ENFILE);
2321 return;
2322 }
2323 if (!op->may_write() &&
2324 !op->may_cache() &&
2325 (!obc->obs.exists ||
2326 ((m->get_snapid() != CEPH_SNAPDIR) &&
2327 obc->obs.oi.is_whiteout()))) {
2328 // copy the reqids for copy get on ENOENT
2329 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2330 fill_in_copy_get_noent(op, oid, m->ops[0]);
2331 close_op_ctx(ctx);
2332 return;
2333 }
2334 reply_ctx(ctx, -ENOENT);
2335 return;
2336 }
2337
2338 op->mark_started();
2339
2340 execute_ctx(ctx);
2341 utime_t prepare_latency = ceph_clock_now();
2342 prepare_latency -= op->get_dequeued_time();
2343 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2344 if (op->may_read() && op->may_write()) {
2345 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2346 } else if (op->may_read()) {
2347 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2348 } else if (op->may_write() || op->may_cache()) {
2349 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2350 }
2351
2352 // force recovery of the oldest missing object if too many logs
2353 maybe_force_recovery();
2354}
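// Summary of do_op() above: validate the target and caps, apply
// full/backoff/scrub/degraded gating, resolve the ObjectContext, give the
// tiering paths (manifest, cache) a chance to intercept, take rw locks, and
// finally build an OpContext and hand it to execute_ctx().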
2355
2356PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2357 OpRequestRef op,
2358 bool write_ordered,
2359 ObjectContextRef obc)
2360{
2361 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2362 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2363 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2364 return cache_result_t::NOOP;
2365 }
2366
2367 if (obc)
2368 dout(10) << __func__ << " " << obc->obs.oi << " "
2369 << (obc->obs.exists ? "exists" : "DNE")
2370 << dendl;
2371
2372 // if it is write-ordered and blocked, stop now
2373 if (obc.get() && obc->is_blocked() && write_ordered) {
2374 // we're already doing something with this object
2375 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2376 return cache_result_t::NOOP;
2377 }
2378
2379 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2380 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2381 OSDOp& osd_op = *p;
2382 ceph_osd_op& op = osd_op.op;
2383 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2384 return cache_result_t::NOOP;
2385 }
2386 }
2387
2388 switch (obc->obs.oi.manifest.type) {
2389 case object_manifest_t::TYPE_REDIRECT:
2390 if (op->may_write() || write_ordered) {
2391 do_proxy_write(op, obc->obs.oi.soid, obc);
2392 } else {
2393 do_proxy_read(op, obc);
2394 }
2395 return cache_result_t::HANDLED_PROXY;
2396 case object_manifest_t::TYPE_CHUNKED:
2397 default:
2398 assert(0 == "unrecognized manifest type");
2399 }
2400
2401 return cache_result_t::NOOP;
2402}
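// Summary of the manifest path above: a TYPE_REDIRECT manifest names a
// redirect_target object; writes and write-ordered ops are proxied via
// do_proxy_write(), plain reads via do_proxy_read(), and the op is reported
// as HANDLED_PROXY. A SET_REDIRECT op is deliberately left to the normal
// pipeline (NOOP).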
2403
2404void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2405 MOSDOpReply *orig_reply, int r)
2406{
2407 dout(20) << __func__ << " r=" << r << dendl;
2408 assert(op->may_write());
2409 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2410 mempool::osd_pglog::list<pg_log_entry_t> entries;
2411 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2412 get_next_version(), eversion_t(), 0,
2413 reqid, utime_t(), r));
2414
2415 struct OnComplete {
2416 PrimaryLogPG *pg;
2417 OpRequestRef op;
2418 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2419 int r;
2420 OnComplete(
2421 PrimaryLogPG *pg,
2422 OpRequestRef op,
2423 MOSDOpReply *orig_reply,
2424 int r)
2425 : pg(pg), op(op),
2426 orig_reply(orig_reply, false /* take over ref */), r(r)
2427 {}
2428 void operator()() {
2429 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2430 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2431 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2432 MOSDOpReply *reply = orig_reply.detach();
2433 if (reply == nullptr) {
2434 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2435 flags, true);
2436 }
2437 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2438 pg->osd->send_message_osd_client(reply, m->get_connection());
2439 }
2440 };
2441
2442 ObcLockManager lock_manager;
2443 submit_log_entries(
2444 entries,
2445 std::move(lock_manager),
2446 boost::optional<std::function<void(void)> >(
2447 OnComplete(this, op, orig_reply, r)),
2448 op,
2449 r);
2450}
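// The ERROR log entry appended above means a later client resend with the
// same reqid hits the dup-detection path in do_op() and is answered with
// the recorded error code; the reply itself is deferred (via the OnComplete
// functor) until the log entry is durable.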
2451
2452PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2453 OpRequestRef op,
2454 bool write_ordered,
2455 ObjectContextRef obc,
2456 int r, hobject_t missing_oid,
2457 bool must_promote,
2458 bool in_hit_set,
2459 ObjectContextRef *promote_obc)
2460{
2461 // return quickly if caching is not enabled
2462 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2463 return cache_result_t::NOOP;
2464
2465 if (op &&
2466 op->get_req() &&
2467 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2468 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2469 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2470 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2471 return cache_result_t::NOOP;
2472 }
2473
2474 must_promote = must_promote || op->need_promote();
2475
2476 if (obc)
2477 dout(25) << __func__ << " " << obc->obs.oi << " "
2478 << (obc->obs.exists ? "exists" : "DNE")
2479 << " missing_oid " << missing_oid
2480 << " must_promote " << (int)must_promote
2481 << " in_hit_set " << (int)in_hit_set
2482 << dendl;
2483 else
2484 dout(25) << __func__ << " (no obc)"
2485 << " missing_oid " << missing_oid
2486 << " must_promote " << (int)must_promote
2487 << " in_hit_set " << (int)in_hit_set
2488 << dendl;
2489
2490 // if it is write-ordered and blocked, stop now
2491 if (obc.get() && obc->is_blocked() && write_ordered) {
2492 // we're already doing something with this object
2493 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2494 return cache_result_t::NOOP;
2495 }
2496
2497 if (r == -ENOENT && missing_oid == hobject_t()) {
2498 // we know this object is logically absent (e.g., an undefined clone)
2499 return cache_result_t::NOOP;
2500 }
2501
2502 if (obc.get() && obc->obs.exists) {
2503 osd->logger->inc(l_osd_op_cache_hit);
2504 return cache_result_t::NOOP;
2505 }
2506 if (!is_primary()) {
2507 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2508 osd->reply_op_error(op, -EAGAIN);
2509 return cache_result_t::REPLIED_WITH_EAGAIN;
2510 }
2511
2512 if (missing_oid == hobject_t() && obc.get()) {
2513 missing_oid = obc->obs.oi.soid;
2514 }
2515
2516 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2517 const object_locator_t oloc = m->get_object_locator();
2518
2519 if (op->need_skip_handle_cache()) {
2520 return cache_result_t::NOOP;
2521 }
2522
2523 // older versions do not proxy the feature bits.
2524 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2525 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2526 OpRequestRef promote_op;
2527
2528 switch (pool.info.cache_mode) {
2529 case pg_pool_t::CACHEMODE_WRITEBACK:
2530 if (agent_state &&
2531 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2532 if (!op->may_write() && !op->may_cache() &&
2533 !write_ordered && !must_promote) {
2534 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2535 do_proxy_read(op);
2536 return cache_result_t::HANDLED_PROXY;
2537 }
2538 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2539 block_write_on_full_cache(missing_oid, op);
2540 return cache_result_t::BLOCKED_FULL;
2541 }
2542
2543 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2544 promote_object(obc, missing_oid, oloc, op, promote_obc);
2545 return cache_result_t::BLOCKED_PROMOTE;
2546 }
2547
2548 if (op->may_write() || op->may_cache()) {
2549 if (can_proxy_write) {
2550 do_proxy_write(op, missing_oid);
2551 } else {
2552 // promote if can't proxy the write
2553 promote_object(obc, missing_oid, oloc, op, promote_obc);
2554 return cache_result_t::BLOCKED_PROMOTE;
2555 }
2556
2557 // Promote too?
2558 if (!op->need_skip_promote() &&
2559 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2560 pool.info.min_write_recency_for_promote,
2561 OpRequestRef(),
2562 promote_obc)) {
2563 return cache_result_t::BLOCKED_PROMOTE;
2564 }
2565 return cache_result_t::HANDLED_PROXY;
2566 } else {
2567 do_proxy_read(op);
2568
2569 // Avoid duplicate promotion
2570 if (obc.get() && obc->is_blocked()) {
2571 if (promote_obc)
2572 *promote_obc = obc;
2573 return cache_result_t::BLOCKED_PROMOTE;
2574 }
2575
2576 // Promote too?
2577 if (!op->need_skip_promote()) {
2578 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2579 pool.info.min_read_recency_for_promote,
2580 promote_op, promote_obc);
2581 }
2582
2583 return cache_result_t::HANDLED_PROXY;
2584 }
2585 assert(0 == "unreachable");
2586 return cache_result_t::NOOP;
2587
2588 case pg_pool_t::CACHEMODE_FORWARD:
2589 // FIXME: this mode allows requests to be reordered.
2590 do_cache_redirect(op);
2591 return cache_result_t::HANDLED_REDIRECT;
2592
2593 case pg_pool_t::CACHEMODE_READONLY:
2594 // TODO: clean this case up
2595 if (!obc.get() && r == -ENOENT) {
2596 // we don't have the object and op's a read
2597 promote_object(obc, missing_oid, oloc, op, promote_obc);
2598 return cache_result_t::BLOCKED_PROMOTE;
2599 }
2600 if (!r) { // it must be a write
2601 do_cache_redirect(op);
2602 return cache_result_t::HANDLED_REDIRECT;
2603 }
2604 // crap, there was a failure of some kind
2605 return cache_result_t::NOOP;
2606
2607 case pg_pool_t::CACHEMODE_READFORWARD:
2608 // Do writeback to the cache tier for writes
2609 if (op->may_write() || write_ordered || must_promote) {
2610 if (agent_state &&
2611 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2612 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2613 block_write_on_full_cache(missing_oid, op);
2614 return cache_result_t::BLOCKED_FULL;
2615 }
2616 promote_object(obc, missing_oid, oloc, op, promote_obc);
2617 return cache_result_t::BLOCKED_PROMOTE;
2618 }
2619
2620 // It is a read; redirect it to the base tier.
2621 do_cache_redirect(op);
2622 return cache_result_t::HANDLED_REDIRECT;
2623
2624 case pg_pool_t::CACHEMODE_PROXY:
2625 if (!must_promote) {
2626 if (op->may_write() || op->may_cache() || write_ordered) {
2627 if (can_proxy_write) {
2628 do_proxy_write(op, missing_oid);
2629 return cache_result_t::HANDLED_PROXY;
2630 }
2631 } else {
2632 do_proxy_read(op);
2633 return cache_result_t::HANDLED_PROXY;
2634 }
2635 }
2636 // ugh, we're forced to promote.
2637 if (agent_state &&
2638 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2639 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2640 block_write_on_full_cache(missing_oid, op);
2641 return cache_result_t::BLOCKED_FULL;
2642 }
2643 promote_object(obc, missing_oid, oloc, op, promote_obc);
2644 return cache_result_t::BLOCKED_PROMOTE;
2645
2646 case pg_pool_t::CACHEMODE_READPROXY:
2647 // Do writeback to the cache tier for writes
2648 if (op->may_write() || write_ordered || must_promote) {
2649 if (agent_state &&
2650 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2651 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2652 block_write_on_full_cache(missing_oid, op);
2653 return cache_result_t::BLOCKED_FULL;
2654 }
2655 promote_object(obc, missing_oid, oloc, op, promote_obc);
2656 return cache_result_t::BLOCKED_PROMOTE;
2657 }
2658
2659 // It is a read; proxy it to the base tier.
2660 do_proxy_read(op);
2661 return cache_result_t::HANDLED_PROXY;
2662
2663 default:
2664 assert(0 == "unrecognized cache_mode");
2665 }
2666 return cache_result_t::NOOP;
2667}
2668
2669bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2670 const hobject_t& missing_oid,
2671 const object_locator_t& oloc,
2672 bool in_hit_set,
2673 uint32_t recency,
2674 OpRequestRef promote_op,
2675 ObjectContextRef *promote_obc)
2676{
2677 dout(20) << __func__ << " missing_oid " << missing_oid
2678 << " in_hit_set " << in_hit_set << dendl;
2679
2680 switch (recency) {
2681 case 0:
2682 break;
2683 case 1:
2684 // Check if in the current hit set
2685 if (in_hit_set) {
2686 break;
2687 } else {
2688 // not promoting
2689 return false;
2690 }
2691 break;
2692 default:
2693 {
2694 unsigned count = (int)in_hit_set;
2695 if (count) {
2696 // Check if in other hit sets
2697 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2698 for (map<time_t,HitSetRef>::reverse_iterator itor =
2699 agent_state->hit_set_map.rbegin();
2700 itor != agent_state->hit_set_map.rend();
2701 ++itor) {
2702 if (!itor->second->contains(oid)) {
2703 break;
2704 }
2705 ++count;
2706 if (count >= recency) {
2707 break;
2708 }
2709 }
2710 }
2711 if (count >= recency) {
2712 break;
2713 }
2714 return false; // not promoting
2715 }
2716 break;
2717 }
2718
2719 if (osd->promote_throttle()) {
2720 dout(10) << __func__ << " promote throttled" << dendl;
2721 return false;
2722 }
2723 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2724 return true;
2725}
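// Worked example of the recency check above, with hypothetical values:
// recency == 3 and the object in the current hit set gives count = 1; we
// then walk the archived hit sets newest-first, and two consecutive hits
// bring count to 3 >= recency, so we promote (subject to
// promote_throttle()). The first archived set that misses stops the scan
// and the object is not promoted.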
2726
2727void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2728{
2729 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2730 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2731 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2732 get_osdmap()->get_epoch(), flags, false);
2733 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2734 reply->set_redirect(redir);
2735 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2736 << op << dendl;
2737 m->get_connection()->send_message(reply);
2738 return;
2739}
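// The -ENOENT reply above carries a request_redirect_t naming
// pool.info.tier_of; a librados client is expected to resend the op to that
// base pool (client-side redirect handling is part of the RADOS protocol
// and not shown here).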
2740
2741struct C_ProxyRead : public Context {
2742 PrimaryLogPGRef pg;
2743 hobject_t oid;
2744 epoch_t last_peering_reset;
2745 ceph_tid_t tid;
2746 PrimaryLogPG::ProxyReadOpRef prdop;
2747 utime_t start;
2748 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2749 const PrimaryLogPG::ProxyReadOpRef& prd)
2750 : pg(p), oid(o), last_peering_reset(lpr),
2751 tid(0), prdop(prd), start(ceph_clock_now())
2752 {}
2753 void finish(int r) override {
2754 if (prdop->canceled)
2755 return;
2756 pg->lock();
2757 if (prdop->canceled) {
2758 pg->unlock();
2759 return;
2760 }
2761 if (last_peering_reset == pg->get_last_peering_reset()) {
2762 pg->finish_proxy_read(oid, tid, r);
2763 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2764 }
2765 pg->unlock();
2766 }
2767};
2768
2769 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2770{
2771 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2772 // stash the result in the request's OSDOp vector
2773 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2774 object_locator_t oloc;
2775 hobject_t soid;
2776 /* extensible tier */
2777 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2778 switch (obc->obs.oi.manifest.type) {
2779 case object_manifest_t::TYPE_REDIRECT:
2780 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2781 soid = obc->obs.oi.manifest.redirect_target;
2782 break;
2783 case object_manifest_t::TYPE_CHUNKED:
2784 default:
2785 assert(0 == "unrecognized manifest type");
2786 }
2787 } else {
2788 /* proxy */
2789 soid = m->get_hobj();
2790 oloc = object_locator_t(m->get_object_locator());
2791 oloc.pool = pool.info.tier_of;
2792 }
2793 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2794
2795 // pass through some original flags that make sense.
2796 // - leave out redirection and balancing flags since we are
2797 // already proxying through the primary
2798 // - leave off read/write/exec flags that are derived from the op
2799 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2800 CEPH_OSD_FLAG_ORDERSNAP |
2801 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2802 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2803
2804 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2805
2806 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2807
2808 ObjectOperation obj_op;
2809 obj_op.dup(prdop->ops);
2810
2811 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2812 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2813 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2814 ceph_osd_op op = obj_op.ops[i].op;
2815 switch (op.op) {
2816 case CEPH_OSD_OP_READ:
2817 case CEPH_OSD_OP_SYNC_READ:
2818 case CEPH_OSD_OP_SPARSE_READ:
2819 case CEPH_OSD_OP_CHECKSUM:
2820 case CEPH_OSD_OP_CMPEXT:
2821 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2822 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2823 }
2824 }
2825 }
2826
2827 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2828 prdop);
2829 ceph_tid_t tid = osd->objecter->read(
2830 soid.oid, oloc, obj_op,
2831 m->get_snapid(), NULL,
2832 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2833 &prdop->user_version,
2834 &prdop->data_offset,
2835 m->get_features());
2836 fin->tid = tid;
2837 prdop->objecter_tid = tid;
2838 proxyread_ops[tid] = prdop;
2839 in_progress_proxy_ops[soid].push_back(op);
2840}
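// Bookkeeping for the proxy read above: the objecter tid is recorded in
// proxyread_ops and the op in in_progress_proxy_ops[soid]; C_ProxyRead
// fires on completion and finish_proxy_read() below unwinds both maps
// before replying to the client.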
2841
2842void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2843{
2844 dout(10) << __func__ << " " << oid << " tid " << tid
2845 << " " << cpp_strerror(r) << dendl;
2846
2847 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2848 if (p == proxyread_ops.end()) {
2849 dout(10) << __func__ << " no proxyread_op found" << dendl;
2850 return;
2851 }
2852 ProxyReadOpRef prdop = p->second;
2853 if (tid != prdop->objecter_tid) {
2854 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2855 << " tid " << prdop->objecter_tid << dendl;
2856 return;
2857 }
2858 if (oid != prdop->soid) {
2859 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2860 << " soid " << prdop->soid << dendl;
2861 return;
2862 }
2863 proxyread_ops.erase(tid);
2864
2865 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2866 if (q == in_progress_proxy_ops.end()) {
2867 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2868 return;
2869 }
2870 assert(q->second.size());
2871 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2872 q->second.end(),
2873 prdop->op);
2874 assert(it != q->second.end());
2875 OpRequestRef op = *it;
2876 q->second.erase(it);
2877 if (q->second.size() == 0) {
2878 in_progress_proxy_ops.erase(oid);
2879 }
2880
2881 osd->logger->inc(l_osd_tier_proxy_read);
2882
2883 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2884 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2885 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2886 ctx->user_at_version = prdop->user_version;
2887 ctx->data_off = prdop->data_offset;
2888 ctx->ignore_log_op_stats = true;
2889 complete_read_ctx(r, ctx);
2890}
2891
2892void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2893{
2894 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2895 if (p == in_progress_proxy_ops.end())
2896 return;
2897
2898 list<OpRequestRef>& ls = p->second;
2899 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2900 requeue_ops(ls);
2901 in_progress_proxy_ops.erase(p);
2902}
2903
2904void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
2905 vector<ceph_tid_t> *tids)
2906{
2907 dout(10) << __func__ << " " << prdop->soid << dendl;
2908 prdop->canceled = true;
2909
2910 // cancel objecter op, if we can
2911 if (prdop->objecter_tid) {
2912 tids->push_back(prdop->objecter_tid);
2913 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2914 prdop->ops[i].outdata.clear();
2915 }
2916 proxyread_ops.erase(prdop->objecter_tid);
2917 prdop->objecter_tid = 0;
2918 }
2919}
2920
2921 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
2922{
2923 dout(10) << __func__ << dendl;
2924
2925 // cancel proxy reads
2926 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2927 while (p != proxyread_ops.end()) {
2928 cancel_proxy_read((p++)->second, tids);
2929 }
2930
2931 // cancel proxy writes
2932 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2933 while (q != proxywrite_ops.end()) {
2934 cancel_proxy_write((q++)->second, tids);
2935 }
2936
2937 if (requeue) {
2938 map<hobject_t, list<OpRequestRef>>::iterator p =
2939 in_progress_proxy_ops.begin();
2940 while (p != in_progress_proxy_ops.end()) {
2941 list<OpRequestRef>& ls = p->second;
2942 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2943 << " requests" << dendl;
2944 requeue_ops(ls);
2945 in_progress_proxy_ops.erase(p++);
2946 }
2947 } else {
2948 in_progress_proxy_ops.clear();
2949 }
2950}
2951
2952struct C_ProxyWrite_Commit : public Context {
2953 PrimaryLogPGRef pg;
2954 hobject_t oid;
2955 epoch_t last_peering_reset;
2956 ceph_tid_t tid;
2957 PrimaryLogPG::ProxyWriteOpRef pwop;
2958 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2959 const PrimaryLogPG::ProxyWriteOpRef& pw)
2960 : pg(p), oid(o), last_peering_reset(lpr),
2961 tid(0), pwop(pw)
2962 {}
2963 void finish(int r) override {
2964 if (pwop->canceled)
2965 return;
2966 pg->lock();
2967 if (pwop->canceled) {
2968 pg->unlock();
2969 return;
2970 }
2971 if (last_peering_reset == pg->get_last_peering_reset()) {
2972 pg->finish_proxy_write(oid, tid, r);
2973 }
2974 pg->unlock();
2975 }
2976};
2977
2978 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2979{
2980 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2981 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2982 object_locator_t oloc;
2983 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2984 hobject_t soid;
2985 /* extensible tier */
2986 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2987 switch (obc->obs.oi.manifest.type) {
2988 case object_manifest_t::TYPE_REDIRECT:
2989 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2990 soid = obc->obs.oi.manifest.redirect_target;
2991 break;
2992 case object_manifest_t::TYPE_CHUNKED:
2993 default:
2994 assert(0 == "unrecognized manifest type");
2995 }
2996 } else {
2997 /* proxy */
2998 soid = m->get_hobj();
2999 oloc = object_locator_t(m->get_object_locator());
3000 oloc.pool = pool.info.tier_of;
3001 }
3002
3003 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3004 if (!(op->may_write() || op->may_cache())) {
3005 flags |= CEPH_OSD_FLAG_RWORDERED;
3006 }
3007 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3008
3009 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3010 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3011 pwop->mtime = m->get_mtime();
3012
3013 ObjectOperation obj_op;
3014 obj_op.dup(pwop->ops);
3015
3016 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3017 this, soid, get_last_peering_reset(), pwop);
3018 ceph_tid_t tid = osd->objecter->mutate(
3019 soid.oid, oloc, obj_op, snapc,
3020 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3021 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3022 &pwop->user_version, pwop->reqid);
3023 fin->tid = tid;
3024 pwop->objecter_tid = tid;
3025 proxywrite_ops[tid] = pwop;
3026 in_progress_proxy_ops[soid].push_back(op);
3027}
3028
3029void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3030{
3031 dout(10) << __func__ << " " << oid << " tid " << tid
3032 << " " << cpp_strerror(r) << dendl;
3033
3034 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3035 if (p == proxywrite_ops.end()) {
3036 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3037 return;
3038 }
3039 ProxyWriteOpRef pwop = p->second;
3040 assert(tid == pwop->objecter_tid);
3041 assert(oid == pwop->soid);
3042
3043 proxywrite_ops.erase(tid);
3044
3045 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3046 if (q == in_progress_proxy_ops.end()) {
3047 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3048 delete pwop->ctx;
3049 pwop->ctx = NULL;
3050 return;
3051 }
3052 list<OpRequestRef>& in_progress_op = q->second;
3053 assert(in_progress_op.size());
3054 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3055 in_progress_op.end(),
3056 pwop->op);
3057 assert(it != in_progress_op.end());
3058 in_progress_op.erase(it);
3059 if (in_progress_op.size() == 0) {
3060 in_progress_proxy_ops.erase(oid);
3061 }
3062
3063 osd->logger->inc(l_osd_tier_proxy_write);
3064
3065 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3066 assert(m != NULL);
3067
3068 if (!pwop->sent_reply) {
3069 // send commit.
3070 MOSDOpReply *reply = pwop->ctx->reply;
3071 if (reply)
3072 pwop->ctx->reply = NULL;
3073 else {
3074 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3075 reply->set_reply_versions(eversion_t(), pwop->user_version);
3076 }
3077 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3078 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3079 osd->send_message_osd_client(reply, m->get_connection());
3080 pwop->sent_reply = true;
3081 pwop->ctx->op->mark_commit_sent();
3082 }
3083
3084 delete pwop->ctx;
3085 pwop->ctx = NULL;
3086}
3087
3088void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3089 vector<ceph_tid_t> *tids)
3090{
3091 dout(10) << __func__ << " " << pwop->soid << dendl;
3092 pwop->canceled = true;
3093
3094 // cancel objecter op, if we can
3095 if (pwop->objecter_tid) {
3096 tids->push_back(pwop->objecter_tid);
3097 delete pwop->ctx;
3098 pwop->ctx = NULL;
3099 proxywrite_ops.erase(pwop->objecter_tid);
3100 pwop->objecter_tid = 0;
3101 }
3102}
3103
3104class PromoteCallback: public PrimaryLogPG::CopyCallback {
3105 ObjectContextRef obc;
3106 PrimaryLogPG *pg;
3107 utime_t start;
3108public:
3109 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3110 : obc(obc_),
3111 pg(pg_),
3112 start(ceph_clock_now()) {}
3113
3114 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3115 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3116 int r = results.get<0>();
3117 pg->finish_promote(r, results_data, obc);
3118 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3119 }
3120};
3121
3122void PrimaryLogPG::promote_object(ObjectContextRef obc,
3123 const hobject_t& missing_oid,
3124 const object_locator_t& oloc,
3125 OpRequestRef op,
3126 ObjectContextRef *promote_obc)
3127{
3128 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3129 assert(hoid != hobject_t());
3130 if (write_blocked_by_scrub(hoid)) {
3131 dout(10) << __func__ << " " << hoid
3132 << " blocked by scrub" << dendl;
3133 if (op) {
3134 waiting_for_scrub.push_back(op);
3135 op->mark_delayed("waiting for scrub");
3136 dout(10) << __func__ << " " << hoid
3137 << " placing op in waiting_for_scrub" << dendl;
3138 } else {
3139 dout(10) << __func__ << " " << hoid
3140 << " no op, dropping on the floor" << dendl;
3141 }
3142 return;
3143 }
3144 if (!obc) { // we need to create an ObjectContext
3145 assert(missing_oid != hobject_t());
3146 obc = get_object_context(missing_oid, true);
3147 }
3148 if (promote_obc)
3149 *promote_obc = obc;
3150
3151 /*
3152 * If there are in-flight proxy-reads for the object while the promote
3153 * is still in progress, we don't use DONTNEED for the source read.
3154 */
3155 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3156 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3157 if (q == in_progress_proxy_ops.end()) {
3158 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3159 }
3160
3161 PromoteCallback *cb = new PromoteCallback(obc, this);
3162 object_locator_t my_oloc = oloc;
3163 my_oloc.pool = pool.info.tier_of;
3164
3165 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3166 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3167 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3168 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3169 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3170 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3171 src_fadvise_flags, 0);
3172
3173 assert(obc->is_blocked());
3174
3175 if (op)
3176 wait_for_blocked_object(obc->obs.oi.soid, op);
3177 info.stats.stats.sum.num_promote++;
3178}
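// Promotion sketch: start_copy() above issues a copy-from of the base-tier
// object into this (cache) pool; the obc stays blocked until
// PromoteCallback fires and finish_promote() installs the result, at which
// point any op parked via wait_for_blocked_object() is requeued.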
3179
3180void PrimaryLogPG::execute_ctx(OpContext *ctx)
3181{
3182 FUNCTRACE();
3183 dout(10) << __func__ << " " << ctx << dendl;
3184 ctx->reset_obs(ctx->obc);
3185 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3186 OpRequestRef op = ctx->op;
3187 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3188 ObjectContextRef obc = ctx->obc;
3189 const hobject_t& soid = obc->obs.oi.soid;
3190
3191 // this method must be idempotent since we may call it several times
3192 // before we finally apply the resulting transaction.
3193 ctx->op_t.reset(new PGTransaction);
3194
3195 if (op->may_write() || op->may_cache()) {
3196 // snap
3197 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3198 pool.info.is_pool_snaps_mode()) {
3199 // use pool's snapc
3200 ctx->snapc = pool.snapc;
3201 } else {
3202 // client specified snapc
3203 ctx->snapc.seq = m->get_snap_seq();
3204 ctx->snapc.snaps = m->get_snaps();
3205 filter_snapc(ctx->snapc.snaps);
3206 }
3207 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3208 ctx->snapc.seq < obc->ssc->snapset.seq) {
3209 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3210 << " < snapset seq " << obc->ssc->snapset.seq
3211 << " on " << obc->obs.oi.soid << dendl;
3212 reply_ctx(ctx, -EOLDSNAPC);
3213 return;
3214 }
3215
3216 // version
3217 ctx->at_version = get_next_version();
3218 ctx->mtime = m->get_mtime();
3219
3220 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3221 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3222 << " snapc " << ctx->snapc
3223 << " snapset " << obc->ssc->snapset
3224 << dendl;
3225 } else {
3226 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3227 << " ov " << obc->obs.oi.version
3228 << dendl;
3229 }
3230
3231 if (!ctx->user_at_version)
3232 ctx->user_at_version = obc->obs.oi.user_version;
3233 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3234
3235 if (op->may_read()) {
3236 dout(10) << " taking ondisk_read_lock" << dendl;
3237 obc->ondisk_read_lock();
3238 }
3239
3240 {
3241#ifdef WITH_LTTNG
3242 osd_reqid_t reqid = ctx->op->get_reqid();
3243#endif
3244 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3245 reqid.name._num, reqid.tid, reqid.inc);
3246 }
3247
3248 int result = prepare_transaction(ctx);
3249
3250 {
3251#ifdef WITH_LTTNG
3252 osd_reqid_t reqid = ctx->op->get_reqid();
3253#endif
3254 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3255 reqid.name._num, reqid.tid, reqid.inc);
3256 }
3257
3258 if (op->may_read()) {
3259 dout(10) << " dropping ondisk_read_lock" << dendl;
3260 obc->ondisk_read_unlock();
3261 }
3262
3263 bool pending_async_reads = !ctx->pending_async_reads.empty();
3264 if (result == -EINPROGRESS || pending_async_reads) {
3265 // come back later.
3266 if (pending_async_reads) {
3267 in_progress_async_reads.push_back(make_pair(op, ctx));
3268 ctx->start_async_reads(this);
3269 }
3270 return;
3271 }
3272
3273 if (result == -EAGAIN) {
3274 // clean up after the ctx
3275 close_op_ctx(ctx);
3276 return;
3277 }
3278
3279 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3280 // prepare the reply
3281 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3282 successful_write);
3283
3284 // Write operations aren't allowed to return a data payload because
3285 // we can't do so reliably. If the client has to resend the request
3286 // and it has already been applied, we will return 0 with no
3287 // payload. Non-deterministic behavior is no good. However, it is
3288 // possible to construct an operation that does a read, does a guard
3289 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3290 // with the write, or return a CMPXATTR and the read value.
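// Illustrative (hypothetical) op vector: [CMPXATTR "v" == 1, WRITE ...].
// If the guard fails, the reply carries the guard's error code plus the
// read value; if the whole op succeeds, the data payload is suppressed and
// the result is normalized to 0 just below.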
3291 if (successful_write) {
3292 // write. normalize the result code.
3293 dout(20) << " zeroing write result code " << result << dendl;
3294 result = 0;
3295 }
3296 ctx->reply->set_result(result);
3297
3298 // read or error?
3299 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3300 // finish side-effects
3301 if (result >= 0)
3302 do_osd_op_effects(ctx, m->get_connection());
3303
3304 complete_read_ctx(result, ctx);
3305 return;
3306 }
3307
3308 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3309
3310 assert(op->may_write() || op->may_cache());
3311
3312 // trim log?
3313 calc_trim_to();
3314
3315 // verify that we are doing this in order?
3316 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3317 !pool.info.is_tier() && !pool.info.has_tiers()) {
3318 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3319 ceph_tid_t t = m->get_tid();
3320 client_t n = m->get_source().num();
3321 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3322 if (p == cm.end()) {
3323 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3324 cm[n] = t;
3325 } else {
3326 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3327 if (p->second > t) {
3328 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3329 assert(0 == "out of order op");
3330 }
3331 p->second = t;
3332 }
3333 }
3334
3335 if (ctx->update_log_only) {
3336 if (result >= 0)
3337 do_osd_op_effects(ctx, m->get_connection());
3338
3339 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3340 // save just what we need from ctx
3341 MOSDOpReply *reply = ctx->reply;
3342 ctx->reply = nullptr;
3343 reply->claim_op_out_data(*ctx->ops);
3344 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3345 close_op_ctx(ctx);
3346
3347 if (result == -ENOENT) {
3348 reply->set_enoent_reply_versions(info.last_update,
3349 info.last_user_version);
3350 }
3351 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3352 // append to pg log for dup detection - don't save buffers for now
3353 record_write_error(op, soid, reply, result);
3354 return;
3355 }
3356
3357 // no need to capture PG ref, repop cancel will handle that
3358 // Can capture the ctx by pointer, it's owned by the repop
3359 ctx->register_on_commit(
3360 [m, ctx, this](){
3361 if (ctx->op)
3362 log_op_stats(
3363 ctx);
3364
3365 if (m && !ctx->sent_reply) {
3366 MOSDOpReply *reply = ctx->reply;
3367 if (reply)
3368 ctx->reply = nullptr;
3369 else {
3370 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3371 reply->set_reply_versions(ctx->at_version,
3372 ctx->user_at_version);
3373 }
3374 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3375 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3376 osd->send_message_osd_client(reply, m->get_connection());
3377 ctx->sent_reply = true;
3378 ctx->op->mark_commit_sent();
3379 }
3380 });
3381 ctx->register_on_success(
3382 [ctx, this]() {
3383 do_osd_op_effects(
3384 ctx,
3385 ctx->op ? ctx->op->get_req()->get_connection() :
3386 ConnectionRef());
3387 });
3388 ctx->register_on_finish(
3389 [ctx, this]() {
3390 delete ctx;
3391 });
3392
3393 // issue replica writes
3394 ceph_tid_t rep_tid = osd->get_tid();
3395
3396 RepGather *repop = new_repop(ctx, obc, rep_tid);
3397
3398 issue_repop(repop, ctx);
3399 eval_repop(repop);
3400 repop->put();
3401}
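// Replicated-write tail of execute_ctx(): new_repop() wraps the ctx in a
// RepGather, issue_repop() ships the transaction to the acting set, and
// eval_repop() evaluates completion; the lambdas registered above run on
// commit (client reply), success (op effects), and finish (ctx deletion)
// respectively.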
3402
3403void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3404 release_object_locks(ctx->lock_manager);
3405
3406 ctx->op_t.reset();
3407
3408 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3409 ctx->on_finish.erase(p++)) {
3410 (*p)();
3411 }
3412 delete ctx;
3413}
3414
3415void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3416{
3417 if (ctx->op)
3418 osd->reply_op_error(ctx->op, r);
3419 close_op_ctx(ctx);
3420}
3421
3422void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3423{
3424 if (ctx->op)
3425 osd->reply_op_error(ctx->op, r, v, uv);
3426 close_op_ctx(ctx);
3427}
3428
3429void PrimaryLogPG::log_op_stats(OpContext *ctx)
3430{
3431 OpRequestRef op = ctx->op;
3432 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3433
3434 utime_t now = ceph_clock_now();
3435 utime_t latency = now;
3436 latency -= ctx->op->get_req()->get_recv_stamp();
3437 utime_t process_latency = now;
3438 process_latency -= ctx->op->get_dequeued_time();
3439
3440 uint64_t inb = ctx->bytes_written;
3441 uint64_t outb = ctx->bytes_read;
3442
3443 osd->logger->inc(l_osd_op);
3444
3445 osd->logger->inc(l_osd_op_outb, outb);
3446 osd->logger->inc(l_osd_op_inb, inb);
3447 osd->logger->tinc(l_osd_op_lat, latency);
3448 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3449
3450 if (op->may_read() && op->may_write()) {
3451 osd->logger->inc(l_osd_op_rw);
3452 osd->logger->inc(l_osd_op_rw_inb, inb);
3453 osd->logger->inc(l_osd_op_rw_outb, outb);
3454 osd->logger->tinc(l_osd_op_rw_lat, latency);
3455 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3456 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3457 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3458 } else if (op->may_read()) {
3459 osd->logger->inc(l_osd_op_r);
3460 osd->logger->inc(l_osd_op_r_outb, outb);
3461 osd->logger->tinc(l_osd_op_r_lat, latency);
3462 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3463 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3464 } else if (op->may_write() || op->may_cache()) {
3465 osd->logger->inc(l_osd_op_w);
3466 osd->logger->inc(l_osd_op_w_inb, inb);
3467 osd->logger->tinc(l_osd_op_w_lat, latency);
3468 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3469 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3470 } else
3471 ceph_abort();
3472
3473 dout(15) << "log_op_stats " << *m
3474 << " inb " << inb
3475 << " outb " << outb
3476 << " lat " << latency << dendl;
3477}
3478
3479void PrimaryLogPG::do_sub_op(OpRequestRef op)
3480{
3481 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3482 assert(have_same_or_newer_map(m->map_epoch));
3483 assert(m->get_type() == MSG_OSD_SUBOP);
3484 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3485
3486 if (!is_peered()) {
3487 waiting_for_peered.push_back(op);
3488 op->mark_delayed("waiting for active");
3489 return;
3490 }
3491
3492 const OSDOp *first = NULL;
3493 if (m->ops.size() >= 1) {
3494 first = &m->ops[0];
3495 }
3496
3497 if (first) {
3498 switch (first->op.op) {
3499 case CEPH_OSD_OP_DELETE:
3500 sub_op_remove(op);
3501 return;
3502 case CEPH_OSD_OP_SCRUB_RESERVE:
3503 handle_scrub_reserve_request(op);
3504 return;
3505 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3506 handle_scrub_reserve_release(op);
3507 return;
3508 case CEPH_OSD_OP_SCRUB_MAP:
3509 sub_op_scrub_map(op);
3510 return;
3511 }
3512 }
3513}
3514
3515void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3516{
3517 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3518 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3519 if (r->ops.size() >= 1) {
3520 const OSDOp& first = r->ops[0];
3521 switch (first.op.op) {
3522 case CEPH_OSD_OP_SCRUB_RESERVE:
3523 {
3524 pg_shard_t from = r->from;
3525 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3526 bool reserved;
3527 ::decode(reserved, p);
3528 if (reserved) {
3529 handle_scrub_reserve_grant(op, from);
3530 } else {
3531 handle_scrub_reserve_reject(op, from);
3532 }
3533 }
3534 return;
3535 }
3536 }
3537}
3538
3539void PrimaryLogPG::do_scan(
3540 OpRequestRef op,
3541 ThreadPool::TPHandle &handle)
3542{
3543 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3544 assert(m->get_type() == MSG_OSD_PG_SCAN);
3545 dout(10) << "do_scan " << *m << dendl;
3546
3547 op->mark_started();
3548
3549 switch (m->op) {
3550 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3551 {
3552 ostringstream ss;
3553 if (osd->check_backfill_full(ss)) {
3554 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3555 queue_peering_event(
3556 CephPeeringEvtRef(
3557 std::make_shared<CephPeeringEvt>(
3558 get_osdmap()->get_epoch(),
3559 get_osdmap()->get_epoch(),
3560 BackfillTooFull())));
3561 return;
3562 }
3563
3564 BackfillInterval bi;
3565 bi.begin = m->begin;
3566 // No need to flush; there won't be any in-progress writes occurring
3567 // past m->begin.
3568 scan_range(
3569 cct->_conf->osd_backfill_scan_min,
3570 cct->_conf->osd_backfill_scan_max,
3571 &bi,
3572 handle);
3573 MOSDPGScan *reply = new MOSDPGScan(
3574 MOSDPGScan::OP_SCAN_DIGEST,
3575 pg_whoami,
3576 get_osdmap()->get_epoch(), m->query_epoch,
3577 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3578 ::encode(bi.objects, reply->get_data());
3579 osd->send_message_osd_cluster(reply, m->get_connection());
3580 }
3581 break;
3582
3583 case MOSDPGScan::OP_SCAN_DIGEST:
3584 {
3585 pg_shard_t from = m->from;
3586
3587 // Check that from is in backfill_targets vector
3588 assert(is_backfill_targets(from));
3589
3590 BackfillInterval& bi = peer_backfill_info[from];
3591 bi.begin = m->begin;
3592 bi.end = m->end;
3593 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3594
3595 // take care to preserve ordering!
3596 bi.clear_objects();
3597 ::decode_noclear(bi.objects, p);
3598
3599 if (waiting_on_backfill.erase(from)) {
3600 if (waiting_on_backfill.empty()) {
3601 assert(peer_backfill_info.size() == backfill_targets.size());
3602 finish_recovery_op(hobject_t::get_max());
3603 }
3604 } else {
3605 // we canceled backfill for a while due to a too-full condition, and
3606 // this is an extra response from a peer that wasn't too full
3607 }
3608 }
3609 break;
3610 }
3611}
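// Backfill scan handshake, as implemented above: the primary sends
// OP_SCAN_GET_DIGEST, the replica scans forward from m->begin and answers
// with OP_SCAN_DIGEST carrying the encoded object list, and the primary
// stores that interval in peer_backfill_info[from], finishing the recovery
// op once every backfill target has responded.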
3612
void PrimaryLogPG::do_backfill(OpRequestRef op)
{
  const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_BACKFILL);
  dout(10) << "do_backfill " << *m << dendl;

  op->mark_started();

  switch (m->op) {
  case MOSDPGBackfill::OP_BACKFILL_FINISH:
    {
      assert(cct->_conf->osd_kill_backfill_at != 1);

      MOSDPGBackfill *reply = new MOSDPGBackfill(
        MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
        get_osdmap()->get_epoch(),
        m->query_epoch,
        spg_t(info.pgid.pgid, get_primary().shard));
      reply->set_priority(get_recovery_op_priority());
      osd->send_message_osd_cluster(reply, m->get_connection());
      queue_peering_event(
        CephPeeringEvtRef(
          std::make_shared<CephPeeringEvt>(
            get_osdmap()->get_epoch(),
            get_osdmap()->get_epoch(),
            RecoveryDone())));
    }
    // fall-thru

  case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
    {
      assert(cct->_conf->osd_kill_backfill_at != 2);

      info.set_last_backfill(m->last_backfill);
      info.stats = m->stats;

      ObjectStore::Transaction t;
      dirty_info = true;
      write_if_dirty(t);
      int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
      assert(tr == 0);
    }
    break;

  case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
    {
      assert(is_primary());
      assert(cct->_conf->osd_kill_backfill_at != 3);
      finish_recovery_op(hobject_t::get_max());
    }
    break;
  }
}

void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
{
  const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
    op->get_req());
  assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
  dout(7) << __func__ << " " << m->ls << dendl;

  op->mark_started();

  ObjectStore::Transaction t;
  for (auto& p : m->ls) {
    remove_snap_mapped_object(t, p.first);
  }
  int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
  assert(r == 0);
}

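// trim_object builds (but does not submit) a transaction that trims one
// clone: it returns 0 and fills *ctxp on success, -ENOENT when the clone or
// snapset metadata is missing or inconsistent (repair needed), and -ENOLCK
// when the snaptrimmer write lock cannot be taken yet.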
int PrimaryLogPG::trim_object(
  bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
{
  *ctxp = NULL;

  // load clone info
  bufferlist bl;
  ObjectContextRef obc = get_object_context(coid, false, NULL);
  if (!obc || !obc->ssc || !obc->ssc->exists) {
    osd->clog->error() << __func__ << ": Can not trim " << coid
      << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
    return -ENOENT;
  }

  hobject_t snapoid(
    coid.oid, coid.get_key(),
    obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
    info.pgid.pool(), coid.get_namespace());
  ObjectContextRef snapset_obc = get_object_context(snapoid, false);
  if (!snapset_obc) {
    osd->clog->error() << __func__ << ": Can not trim " << coid
      << " repair needed, no snapset obc for " << snapoid;
    return -ENOENT;
  }

  SnapSet& snapset = obc->ssc->snapset;

  bool legacy = snapset.is_legacy() ||
    get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;

  object_info_t &coi = obc->obs.oi;
  set<snapid_t> old_snaps;
  if (legacy) {
    old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
  } else {
    auto p = snapset.clone_snaps.find(coid.snap);
    if (p == snapset.clone_snaps.end()) {
      osd->clog->error() << "No clone_snaps in snapset " << snapset
                         << " for object " << coid << "\n";
      return -ENOENT;
    }
    old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
                     snapset.clone_snaps[coid.snap].end());
  }
  if (old_snaps.empty()) {
    osd->clog->error() << "No object info snaps for object " << coid;
    return -ENOENT;
  }

  dout(10) << coid << " old_snaps " << old_snaps
           << " old snapset " << snapset << dendl;
  if (snapset.seq == 0) {
    osd->clog->error() << "No snapset.seq for object " << coid;
    return -ENOENT;
  }

  set<snapid_t> new_snaps;
  for (set<snapid_t>::iterator i = old_snaps.begin();
       i != old_snaps.end();
       ++i) {
    if (!pool.info.is_removed_snap(*i))
      new_snaps.insert(*i);
  }

  vector<snapid_t>::iterator p = snapset.clones.end();

  if (new_snaps.empty()) {
    p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
    if (p == snapset.clones.end()) {
      osd->clog->error() << "Snap " << coid.snap << " not in clones";
      return -ENOENT;
    }
  }

  OpContextUPtr ctx = simple_opc_create(obc);
  ctx->snapset_obc = snapset_obc;

  if (!ctx->lock_manager.get_snaptrimmer_write(
        coid,
        obc,
        first)) {
    close_op_ctx(ctx.release());
    dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
    return -ENOLCK;
  }

  if (!ctx->lock_manager.get_snaptrimmer_write(
        snapoid,
        snapset_obc,
        first)) {
    close_op_ctx(ctx.release());
    dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
    return -ENOLCK;
  }

  ctx->at_version = get_next_version();

  PGTransaction *t = ctx->op_t.get();

  if (new_snaps.empty()) {
    // remove clone
    dout(10) << coid << " snaps " << old_snaps << " -> "
             << new_snaps << " ... deleting" << dendl;

    // ...from snapset
    assert(p != snapset.clones.end());

    snapid_t last = coid.snap;
    ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);

    if (p != snapset.clones.begin()) {
      // not the oldest... merge overlap into next older clone
      vector<snapid_t>::iterator n = p - 1;
      hobject_t prev_coid = coid;
      prev_coid.snap = *n;
      bool adjust_prev_bytes = is_present_clone(prev_coid);

      if (adjust_prev_bytes)
        ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);

      snapset.clone_overlap[*n].intersection_of(
        snapset.clone_overlap[*p]);

      if (adjust_prev_bytes)
        ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
    }
    ctx->delta_stats.num_objects--;
    if (coi.is_dirty())
      ctx->delta_stats.num_objects_dirty--;
    if (coi.is_omap())
      ctx->delta_stats.num_objects_omap--;
    if (coi.is_whiteout()) {
      dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
      ctx->delta_stats.num_whiteouts--;
    }
    ctx->delta_stats.num_object_clones--;
    if (coi.is_cache_pinned())
      ctx->delta_stats.num_objects_pinned--;
    obc->obs.exists = false;

    snapset.clones.erase(p);
    snapset.clone_overlap.erase(last);
    snapset.clone_size.erase(last);
    snapset.clone_snaps.erase(last);

    ctx->log.push_back(
      pg_log_entry_t(
        pg_log_entry_t::DELETE,
        coid,
        ctx->at_version,
        ctx->obs->oi.version,
        0,
        osd_reqid_t(),
        ctx->mtime,
        0)
      );
    t->remove(coid);
    t->update_snaps(
      coid,
      old_snaps,
      new_snaps);

    coi = object_info_t(coid);

    ctx->at_version.version++;
  } else {
    // save adjusted snaps for this object
    dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
    if (legacy) {
      coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
    } else {
      snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
                                                        new_snaps.rend());
      // we still do a 'modify' event on this object just to trigger a
      // snapmapper.update ... :(
    }

    coi.prior_version = coi.version;
    coi.version = ctx->at_version;
    bl.clear();
    ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    t->setattr(coid, OI_ATTR, bl);

    ctx->log.push_back(
      pg_log_entry_t(
        pg_log_entry_t::MODIFY,
        coid,
        coi.version,
        coi.prior_version,
        0,
        osd_reqid_t(),
        ctx->mtime,
        0)
      );
    ctx->at_version.version++;

    t->update_snaps(
      coid,
      old_snaps,
      new_snaps);
  }

  // save head snapset
  dout(10) << coid << " new snapset " << snapset << " on "
           << snapset_obc->obs.oi << dendl;
  if (snapset.clones.empty() &&
      (!snapset.head_exists ||
       (snapset_obc->obs.oi.is_whiteout() &&
        !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
        !snapset_obc->obs.oi.is_cache_pinned()))) {
    // NOTE: this arguably constitutes minor interference with the
    // tiering agent if this is a cache tier since a snap trim event
    // is effectively evicting a whiteout we might otherwise want to
    // keep around.
    dout(10) << coid << " removing " << snapoid << dendl;
    ctx->log.push_back(
      pg_log_entry_t(
        pg_log_entry_t::DELETE,
        snapoid,
        ctx->at_version,
        ctx->snapset_obc->obs.oi.version,
        0,
        osd_reqid_t(),
        ctx->mtime,
        0)
      );
    if (snapoid.is_head()) {
      derr << "removing snap head" << dendl;
      object_info_t& oi = ctx->snapset_obc->obs.oi;
      ctx->delta_stats.num_objects--;
      if (oi.is_dirty()) {
        ctx->delta_stats.num_objects_dirty--;
      }
      if (oi.is_omap())
        ctx->delta_stats.num_objects_omap--;
      if (oi.is_whiteout()) {
        dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
        ctx->delta_stats.num_whiteouts--;
      }
      if (oi.is_cache_pinned()) {
        ctx->delta_stats.num_objects_pinned--;
      }
    }
    ctx->snapset_obc->obs.exists = false;
    ctx->snapset_obc->obs.oi = object_info_t(snapoid);
    t->remove(snapoid);
  } else {
    dout(10) << coid << " filtering snapset on " << snapoid << dendl;
    snapset.filter(pool.info);
    dout(10) << coid << " writing updated snapset on " << snapoid
             << ", snapset is " << snapset << dendl;
    ctx->log.push_back(
      pg_log_entry_t(
        pg_log_entry_t::MODIFY,
        snapoid,
        ctx->at_version,
        ctx->snapset_obc->obs.oi.version,
        0,
        osd_reqid_t(),
        ctx->mtime,
        0)
      );

    ctx->snapset_obc->obs.oi.prior_version =
      ctx->snapset_obc->obs.oi.version;
    ctx->snapset_obc->obs.oi.version = ctx->at_version;

    map <string, bufferlist> attrs;
    bl.clear();
    ::encode(snapset, bl);
    attrs[SS_ATTR].claim(bl);

    bl.clear();
    ::encode(ctx->snapset_obc->obs.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR].claim(bl);
    t->setattrs(snapoid, attrs);
  }

  *ctxp = std::move(ctx);
  return 0;
}

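// The snap trim entry points below only feed events (KickTrim,
// ScrubComplete, DoSnapWork) into snap_trimmer_machine; the actual trimming
// work is driven by that state machine.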
void PrimaryLogPG::kick_snap_trim()
{
  assert(is_active());
  assert(is_primary());
  if (is_clean() && !snap_trimq.empty()) {
    dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
    snap_trimmer_machine.process_event(KickTrim());
  }
}

void PrimaryLogPG::snap_trimmer_scrub_complete()
{
  if (is_primary() && is_active() && is_clean()) {
    assert(!snap_trimq.empty());
    snap_trimmer_machine.process_event(ScrubComplete());
  }
}

void PrimaryLogPG::snap_trimmer(epoch_t queued)
{
  if (deleting || pg_has_reset_since(queued)) {
    return;
  }

  assert(is_primary());

  dout(10) << "snap_trimmer posting" << dendl;
  snap_trimmer_machine.process_event(DoSnapWork());
  dout(10) << "snap_trimmer complete" << dendl;
  return;
}

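// The cmpxattr helpers return 1 when the comparison holds, 0 when it does
// not, and -EINVAL for an unknown comparison op.  Note that the u64 variant
// parses the stored xattr as a decimal string and treats an empty value as 0.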
int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
{
  __u64 v2;

  string v2s(xattr.c_str(), xattr.length());
  if (v2s.length())
    v2 = strtoull(v2s.c_str(), NULL, 10);
  else
    v2 = 0;

  dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;

  switch (op) {
  case CEPH_OSD_CMPXATTR_OP_EQ:
    return (v1 == v2);
  case CEPH_OSD_CMPXATTR_OP_NE:
    return (v1 != v2);
  case CEPH_OSD_CMPXATTR_OP_GT:
    return (v1 > v2);
  case CEPH_OSD_CMPXATTR_OP_GTE:
    return (v1 >= v2);
  case CEPH_OSD_CMPXATTR_OP_LT:
    return (v1 < v2);
  case CEPH_OSD_CMPXATTR_OP_LTE:
    return (v1 <= v2);
  default:
    return -EINVAL;
  }
}

int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
{
  string v2s(xattr.c_str(), xattr.length());

  dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;

  switch (op) {
  case CEPH_OSD_CMPXATTR_OP_EQ:
    return (v1s.compare(v2s) == 0);
  case CEPH_OSD_CMPXATTR_OP_NE:
    return (v1s.compare(v2s) != 0);
  case CEPH_OSD_CMPXATTR_OP_GT:
    return (v1s.compare(v2s) > 0);
  case CEPH_OSD_CMPXATTR_OP_GTE:
    return (v1s.compare(v2s) >= 0);
  case CEPH_OSD_CMPXATTR_OP_LT:
    return (v1s.compare(v2s) < 0);
  case CEPH_OSD_CMPXATTR_OP_LTE:
    return (v1s.compare(v2s) <= 0);
  default:
    return -EINVAL;
  }
}

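// do_writesame validates the WRITESAME parameters (the total length must be
// a non-zero multiple of the supplied data_length) and then expands the
// pattern into a single synthetic CEPH_OSD_OP_WRITE handled by do_osd_ops().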
int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
{
  ceph_osd_op& op = osd_op.op;
  vector<OSDOp> write_ops(1);
  OSDOp& write_op = write_ops[0];
  uint64_t write_length = op.writesame.length;
  int result = 0;

  if (!write_length)
    return 0;

  if (!op.writesame.data_length || write_length % op.writesame.data_length)
    return -EINVAL;

  if (op.writesame.data_length != osd_op.indata.length()) {
    derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
    return -EINVAL;
  }

  while (write_length) {
    write_op.indata.append(osd_op.indata);
    write_length -= op.writesame.data_length;
  }

  write_op.op.op = CEPH_OSD_OP_WRITE;
  write_op.op.extent.offset = op.writesame.offset;
  write_op.op.extent.length = op.writesame.length;
  result = do_osd_ops(ctx, write_ops);
  if (result < 0)
    derr << "do_writesame do_osd_ops failed " << result << dendl;

  return result;
}

// ========================================================================
// low level osd ops

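// do_tmap2omap rewrites a legacy tmap object as omap by issuing three
// synthetic ops: truncate the in-band data to zero, then store the old tmap
// header and key/value map via OMAPSETHEADER/OMAPSETVALS.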
int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
{
  dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
  bufferlist header, vals;
  int r = _get_tmap(ctx, &header, &vals);
  if (r < 0) {
    if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
      r = 0;
    return r;
  }

  vector<OSDOp> ops(3);

  ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
  ops[0].op.extent.offset = 0;
  ops[0].op.extent.length = 0;

  ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
  ops[1].indata.claim(header);

  ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
  ops[2].indata.claim(vals);

  return do_osd_ops(ctx, ops);
}

int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
                                 bufferlist& bl)
{
  // decode
  bufferlist header;
  map<string, bufferlist> m;
  if (bl.length()) {
    bufferlist::iterator p = bl.begin();
    ::decode(header, p);
    ::decode(m, p);
    assert(p.end());
  }

  // do the update(s)
  while (!bp.end()) {
    __u8 op;
    string key;
    ::decode(op, bp);

    switch (op) {
    case CEPH_OSD_TMAP_SET: // insert key
      {
        ::decode(key, bp);
        bufferlist data;
        ::decode(data, bp);
        m[key] = data;
      }
      break;
    case CEPH_OSD_TMAP_RM: // remove key; fails if it is absent
      ::decode(key, bp);
      if (!m.count(key)) {
        return -ENOENT;
      }
      m.erase(key);
      break;
    case CEPH_OSD_TMAP_RMSLOPPY: // remove key, ignoring a missing key
      ::decode(key, bp);
      m.erase(key);
      break;
    case CEPH_OSD_TMAP_HDR: // update header
      {
        ::decode(header, bp);
      }
      break;
    default:
      return -EINVAL;
    }
  }

  // reencode
  bufferlist obl;
  ::encode(header, obl);
  ::encode(m, obl);

  // write it out
  vector<OSDOp> nops(1);
  OSDOp& newop = nops[0];
  newop.op.op = CEPH_OSD_OP_WRITEFULL;
  newop.op.extent.offset = 0;
  newop.op.extent.length = obl.length();
  newop.indata = obl;
  do_osd_ops(ctx, nops);
  osd_op.outdata.claim(newop.outdata);
  return 0;
}

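// do_tmapup applies a TMAPUP update stream against the sorted key/value
// encoding read from the object, merging the two sorted streams in one pass;
// if the update stream turns out not to be sorted, it falls back to
// do_tmapup_slow(), which decodes the whole map first.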
int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
{
  bufferlist::iterator orig_bp = bp;
  int result = 0;
  if (bp.end()) {
    dout(10) << "tmapup is a no-op" << dendl;
  } else {
    // read the whole object
    vector<OSDOp> nops(1);
    OSDOp& newop = nops[0];
    newop.op.op = CEPH_OSD_OP_READ;
    newop.op.extent.offset = 0;
    newop.op.extent.length = 0;
    result = do_osd_ops(ctx, nops);

    dout(10) << "tmapup read " << newop.outdata.length() << dendl;

    dout(30) << " starting is \n";
    newop.outdata.hexdump(*_dout);
    *_dout << dendl;

    bufferlist::iterator ip = newop.outdata.begin();
    bufferlist obl;

    dout(30) << "the update command is: \n";
    osd_op.indata.hexdump(*_dout);
    *_dout << dendl;

    // header
    bufferlist header;
    __u32 nkeys = 0;
    if (newop.outdata.length()) {
      ::decode(header, ip);
      ::decode(nkeys, ip);
    }
    dout(10) << "tmapup header " << header.length() << dendl;

    if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
      ++bp;
      ::decode(header, bp);
      dout(10) << "tmapup new header " << header.length() << dendl;
    }

    ::encode(header, obl);

    dout(20) << "tmapup initial nkeys " << nkeys << dendl;

    // update keys
    bufferlist newkeydata;
    string nextkey, last_in_key;
    bufferlist nextval;
    bool have_next = false;
    if (!ip.end()) {
      have_next = true;
      ::decode(nextkey, ip);
      ::decode(nextval, ip);
    }
    while (!bp.end() && !result) {
      __u8 op;
      string key;
      try {
        ::decode(op, bp);
        ::decode(key, bp);
      }
      catch (buffer::error& e) {
        return -EINVAL;
      }
      if (key < last_in_key) {
        dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
                << "', falling back to an inefficient (unsorted) update" << dendl;
        bp = orig_bp;
        return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
      }
      last_in_key = key;

      dout(10) << "tmapup op " << (int)op << " key " << key << dendl;

      // skip existing intervening keys
      bool key_exists = false;
      while (have_next && !key_exists) {
        dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
        if (nextkey > key)
          break;
        if (nextkey < key) {
          // copy untouched.
          ::encode(nextkey, newkeydata);
          ::encode(nextval, newkeydata);
          dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
        } else {
          // don't copy; discard old value. and stop.
          dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
          key_exists = true;
          nkeys--;
        }
        if (!ip.end()) {
          ::decode(nextkey, ip);
          ::decode(nextval, ip);
        } else {
          have_next = false;
        }
      }

      if (op == CEPH_OSD_TMAP_SET) {
        bufferlist val;
        try {
          ::decode(val, bp);
        }
        catch (buffer::error& e) {
          return -EINVAL;
        }
        ::encode(key, newkeydata);
        ::encode(val, newkeydata);
        dout(20) << " set " << key << " " << val.length() << dendl;
        nkeys++;
      } else if (op == CEPH_OSD_TMAP_CREATE) {
        if (key_exists) {
          return -EEXIST;
        }
        bufferlist val;
        try {
          ::decode(val, bp);
        }
        catch (buffer::error& e) {
          return -EINVAL;
        }
        ::encode(key, newkeydata);
        ::encode(val, newkeydata);
        dout(20) << " create " << key << " " << val.length() << dendl;
        nkeys++;
      } else if (op == CEPH_OSD_TMAP_RM) {
        // do nothing.
        if (!key_exists) {
          return -ENOENT;
        }
      } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
        // do nothing
      } else {
        dout(10) << " invalid tmap op " << (int)op << dendl;
        return -EINVAL;
      }
    }

    // copy remaining
    if (have_next) {
      ::encode(nextkey, newkeydata);
      ::encode(nextval, newkeydata);
      dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
    }
    if (!ip.end()) {
      bufferlist rest;
      rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
      dout(20) << " keep trailing " << rest.length()
               << " at " << newkeydata.length() << dendl;
      newkeydata.claim_append(rest);
    }

    // encode final key count + key data
    dout(20) << "tmapup final nkeys " << nkeys << dendl;
    ::encode(nkeys, obl);
    obl.claim_append(newkeydata);

    if (0) {
      dout(30) << " final is \n";
      obl.hexdump(*_dout);
      *_dout << dendl;

      // sanity check
      bufferlist::iterator tp = obl.begin();
      bufferlist h;
      ::decode(h, tp);
      map<string,bufferlist> d;
      ::decode(d, tp);
      assert(tp.end());
      dout(0) << " **** debug sanity check, looks ok ****" << dendl;
    }

    // write it out
    if (!result) {
      dout(20) << "tmapup write " << obl.length() << dendl;
      newop.op.op = CEPH_OSD_OP_WRITEFULL;
      newop.op.extent.offset = 0;
      newop.op.extent.length = obl.length();
      newop.indata = obl;
      do_osd_ops(ctx, nops);
      osd_op.outdata.claim(newop.outdata);
    }
  }
  return result;
}

static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
  if (offset >= max ||
      length > max ||
      offset + length > max)
    return -EFBIG;

  return 0;
}

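// FillInVerifyExtent is the completion for async extent reads: it copies the
// returned length and error code into the op and, when the read covered the
// whole object and a digest was supplied, verifies the crc32c of the
// returned data (flagging -EIO unless CEPH_OSD_OP_FLAG_FAILOK is set).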
struct FillInVerifyExtent : public Context {
  ceph_le64 *r;
  int32_t *rval;
  bufferlist *outdatap;
  boost::optional<uint32_t> maybe_crc;
  uint64_t size;
  OSDService *osd;
  hobject_t soid;
  __le32 flags;
  FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
                     boost::optional<uint32_t> mc, uint64_t size,
                     OSDService *osd, hobject_t soid, __le32 flags) :
    r(r), rval(rv), outdatap(blp), maybe_crc(mc),
    size(size), osd(osd), soid(soid), flags(flags) {}
  void finish(int len) override {
    *r = len;
    if (len < 0) {
      *rval = len;
      return;
    }
    *rval = 0;

    // whole object? can we verify the checksum?
    if (maybe_crc && *r == size) {
      uint32_t crc = outdatap->crc32c(-1);
      if (maybe_crc != crc) {
        osd->clog->error() << std::hex << " full-object read crc 0x" << crc
                           << " != expected 0x" << *maybe_crc
                           << std::dec << " on " << soid;
        if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
          *rval = -EIO;
          *r = 0;
        }
      }
    }
  }
};

struct ToSparseReadResult : public Context {
  int* result;
  bufferlist* data_bl;
  uint64_t data_offset;
  ceph_le64* len;
  ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
                     ceph_le64* len)
    : result(result), data_bl(bl), data_offset(offset), len(len) {}
  void finish(int r) override {
    if (r < 0) {
      *result = r;
      return;
    }
    *result = 0;
    *len = r;
    bufferlist outdata;
    map<uint64_t, uint64_t> extents = {{data_offset, r}};
    ::encode(extents, outdata);
    ::encode_destructively(*data_bl, outdata);
    data_bl->swap(outdata);
  }
};

template<typename V>
static string list_keys(const map<string, V>& m) {
  string s;
  for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
    if (!s.empty()) {
      s.push_back(',');
    }
    s.append(itr->first);
  }
  return s;
}

template<typename T>
static string list_entries(const T& m) {
  string s;
  for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
    if (!s.empty()) {
      s.push_back(',');
    }
    s.append(*itr);
  }
  return s;
}

void PrimaryLogPG::maybe_create_new_object(
  OpContext *ctx,
  bool ignore_transaction)
{
  ObjectState& obs = ctx->new_obs;
  if (!obs.exists) {
    ctx->delta_stats.num_objects++;
    obs.exists = true;
    assert(!obs.oi.is_whiteout());
    obs.oi.new_object();
    if (!ignore_transaction)
      ctx->op_t->create(obs.oi.soid);
  } else if (obs.oi.is_whiteout()) {
    dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
    ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
    --ctx->delta_stats.num_whiteouts;
  }
}

struct ReadFinisher : public PrimaryLogPG::OpFinisher {
  OSDOp& osd_op;

  ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
  }

  int execute() override {
    return osd_op.rval;
  }
};

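// CEPH_OSD_OP_CHECKSUM support: the client supplies an initial value in
// indata, and the reply encodes a count followed by one checksum per chunk.
// On erasure-coded pools the read is asynchronous and C_ChecksumRead
// finishes the computation once the data arrives.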
struct C_ChecksumRead : public Context {
  PrimaryLogPG *primary_log_pg;
  OSDOp &osd_op;
  Checksummer::CSumType csum_type;
  bufferlist init_value_bl;
  ceph_le64 read_length;
  bufferlist read_bl;
  Context *fill_extent_ctx;

  C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
                 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
                 boost::optional<uint32_t> maybe_crc, uint64_t size,
                 OSDService *osd, hobject_t soid, __le32 flags)
    : primary_log_pg(primary_log_pg), osd_op(osd_op),
      csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
      fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
                                             &read_bl, maybe_crc, size,
                                             osd, soid, flags)) {
  }
  ~C_ChecksumRead() override {
    delete fill_extent_ctx;
  }

  void finish(int r) override {
    fill_extent_ctx->complete(r);
    fill_extent_ctx = nullptr;

    if (osd_op.rval >= 0) {
      bufferlist::iterator init_value_bl_it = init_value_bl.begin();
      osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
                                                    &init_value_bl_it, read_bl);
    }
  }
};

int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
                              bufferlist::iterator *bl_it)
{
  dout(20) << __func__ << dendl;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  auto& op = osd_op.op;
  if (op.checksum.chunk_size > 0) {
    if (op.checksum.length == 0) {
      dout(10) << __func__ << ": length required when chunk size provided"
               << dendl;
      return -EINVAL;
    }
    if (op.checksum.length % op.checksum.chunk_size != 0) {
      dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
      return -EINVAL;
    }
  }

  auto& oi = ctx->new_obs.oi;
  if (op.checksum.offset == 0 && op.checksum.length == 0) {
    // zeroed offset+length implies checksum whole object
    op.checksum.length = oi.size;
  } else if (op.checksum.offset + op.checksum.length > oi.size) {
    return -EOVERFLOW;
  }

  Checksummer::CSumType csum_type;
  switch (op.checksum.type) {
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
    csum_type = Checksummer::CSUM_XXHASH32;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
    csum_type = Checksummer::CSUM_XXHASH64;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
    csum_type = Checksummer::CSUM_CRC32C;
    break;
  default:
    dout(10) << __func__ << ": unknown crc type ("
             << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
    return -EINVAL;
  }

  size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
  if (bl_it->get_remaining() < csum_init_value_size) {
    dout(10) << __func__ << ": init value not provided" << dendl;
    return -EINVAL;
  }

  bufferlist init_value_bl;
  init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
                          csum_init_value_size);
  bl_it->advance(csum_init_value_size);

  if (pool.info.require_rollback() && op.checksum.length > 0) {
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.
    boost::optional<uint32_t> maybe_crc;
    if (!skip_data_digest &&
        oi.is_data_digest() && op.checksum.offset == 0 &&
        op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read
    auto& soid = oi.soid;
    auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
                                           std::move(init_value_bl), maybe_crc,
                                           oi.size, osd, soid, op.flags);

    ctx->pending_async_reads.push_back({
      {op.checksum.offset, op.checksum.length, op.flags},
      {&checksum_ctx->read_bl, checksum_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;
    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read
  std::vector<OSDOp> read_ops(1);
  auto& read_op = read_ops[0];
  if (op.checksum.length > 0) {
    read_op.op.op = CEPH_OSD_OP_READ;
    read_op.op.flags = op.flags;
    read_op.op.extent.offset = op.checksum.offset;
    read_op.op.extent.length = op.checksum.length;
    read_op.op.extent.truncate_size = 0;
    read_op.op.extent.truncate_seq = 0;

    int r = do_osd_ops(ctx, read_ops);
    if (r < 0) {
      derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  bufferlist::iterator init_value_bl_it = init_value_bl.begin();
  return finish_checksum(osd_op, csum_type, &init_value_bl_it,
                         read_op.outdata);
}

int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
                                  Checksummer::CSumType csum_type,
                                  bufferlist::iterator *init_value_bl_it,
                                  const bufferlist &read_bl) {
  dout(20) << __func__ << dendl;

  auto& op = osd_op.op;

  if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
    derr << __func__ << ": bytes read " << read_bl.length() << " != "
         << op.checksum.length << dendl;
    return -EINVAL;
  }

  size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
                              op.checksum.chunk_size : read_bl.length());
  uint32_t csum_count = (csum_chunk_size > 0 ?
                           read_bl.length() / csum_chunk_size : 0);

  bufferlist csum;
  bufferptr csum_data;
  if (csum_count > 0) {
    size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
    csum_data = buffer::create(csum_value_size * csum_count);
    csum_data.zero();
    csum.append(csum_data);

    switch (csum_type) {
    case Checksummer::CSUM_XXHASH32:
      {
        Checksummer::xxhash32::init_value_t init_value;
        ::decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::xxhash32>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    case Checksummer::CSUM_XXHASH64:
      {
        Checksummer::xxhash64::init_value_t init_value;
        ::decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::xxhash64>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    case Checksummer::CSUM_CRC32C:
      {
        Checksummer::crc32c::init_value_t init_value;
        ::decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::crc32c>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    default:
      break;
    }
  }

  ::encode(csum_count, osd_op.outdata);
  osd_op.outdata.claim_append(csum);
  return 0;
}

struct C_ExtentCmpRead : public Context {
  PrimaryLogPG *primary_log_pg;
  OSDOp &osd_op;
  ceph_le64 read_length;
  bufferlist read_bl;
  Context *fill_extent_ctx;

  C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
                  boost::optional<uint32_t> maybe_crc, uint64_t size,
                  OSDService *osd, hobject_t soid, __le32 flags)
    : primary_log_pg(primary_log_pg), osd_op(osd_op),
      fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
                                             &read_bl, maybe_crc, size,
                                             osd, soid, flags)) {
  }
  ~C_ExtentCmpRead() override {
    delete fill_extent_ctx;
  }

  void finish(int r) override {
    if (r == -ENOENT) {
      osd_op.rval = 0;
      read_bl.clear();
      delete fill_extent_ctx;
    } else {
      fill_extent_ctx->complete(r);
    }
    fill_extent_ctx = nullptr;

    if (osd_op.rval >= 0) {
      osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
    }
  }
};

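// do_extent_cmp implements CMPEXT: read the extent (async on erasure-coded
// pools, via SYNC_READ otherwise) and compare it with osd_op.indata.  The
// op.checksum.offset/length fields used in the digest check below appear to
// alias op.extent.offset/length, since ceph_osd_op holds them in a union.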
int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
{
  dout(20) << __func__ << dendl;
  ceph_osd_op& op = osd_op.op;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  auto& oi = ctx->new_obs.oi;
  uint64_t size = oi.size;
  if ((oi.truncate_seq < op.extent.truncate_seq) &&
      (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
    size = op.extent.truncate_size;
  }

  if (op.extent.offset >= size) {
    op.extent.length = 0;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
  }

  if (op.extent.length == 0) {
    dout(20) << __func__ << " zero length extent" << dendl;
    return finish_extent_cmp(osd_op, bufferlist{});
  } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
    dout(20) << __func__ << " object DNE" << dendl;
    return finish_extent_cmp(osd_op, {});
  } else if (pool.info.require_rollback()) {
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.
    boost::optional<uint32_t> maybe_crc;
    if (!skip_data_digest &&
        oi.is_data_digest() && op.checksum.offset == 0 &&
        op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read
    auto& soid = oi.soid;
    auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
                                              osd, soid, op.flags);
    ctx->pending_async_reads.push_back({
      {op.extent.offset, op.extent.length, op.flags},
      {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read
  vector<OSDOp> read_ops(1);
  OSDOp& read_op = read_ops[0];

  read_op.op.op = CEPH_OSD_OP_SYNC_READ;
  read_op.op.extent.offset = op.extent.offset;
  read_op.op.extent.length = op.extent.length;
  read_op.op.extent.truncate_seq = op.extent.truncate_seq;
  read_op.op.extent.truncate_size = op.extent.truncate_size;

  int result = do_osd_ops(ctx, read_ops);
  if (result < 0) {
    derr << __func__ << " failed " << result << dendl;
    return result;
  }
  return finish_extent_cmp(osd_op, read_op.outdata);
}

int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
{
  for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
    char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
    if (osd_op.indata[idx] != read_byte) {
      return (-MAX_ERRNO - idx);
    }
  }

  return 0;
}

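// do_read services READ/SYNC_READ: it clamps the request against the object
// size (honoring the truncate_seq/truncate_size consistency mechanism), then
// either queues an async read (erasure-coded pools) or reads synchronously
// and, for full-object reads, verifies the stored data digest.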
int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;
  __u32 seq = oi.truncate_seq;
  uint64_t size = oi.size;
  bool trimmed_read = false;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  // are we beyond truncate_size?
  if ( (seq < op.extent.truncate_seq) &&
       (op.extent.offset + op.extent.length > op.extent.truncate_size) )
    size = op.extent.truncate_size;

  if (op.extent.length == 0) // a length of zero means read the whole object
    op.extent.length = size;

  if (op.extent.offset >= size) {
    op.extent.length = 0;
    trimmed_read = true;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
    trimmed_read = true;
  }

  // read into a buffer
  int result = 0;
  if (trimmed_read && op.extent.length == 0) {
    // read size was trimmed to zero and it is expected to do nothing;
    // a read operation of 0 bytes does *not* do nothing, which is why
    // the trimmed_read boolean is needed
  } else if (pool.info.require_rollback()) {
    boost::optional<uint32_t> maybe_crc;
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.  FillInVerifyExtent will
    // check the oi.size again.
    if (!skip_data_digest &&
        oi.is_data_digest() && op.extent.offset == 0 &&
        op.extent.length >= oi.size)
      maybe_crc = oi.data_digest;
    ctx->pending_async_reads.push_back(
      make_pair(
        boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
        make_pair(&osd_op.outdata,
                  new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
                                         &osd_op.outdata, maybe_crc, oi.size,
                                         osd, soid, op.flags))));
    dout(10) << " async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
  } else {
    int r = pgbackend->objects_read_sync(
      soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
    if (r == -EIO) {
      r = rep_repair_primary_object(soid, ctx->op);
    }
    if (r >= 0)
      op.extent.length = r;
    else {
      result = r;
      op.extent.length = 0;
    }
    dout(10) << " read got " << r << " / " << op.extent.length
             << " bytes from obj " << soid << dendl;

    // whole object? can we verify the checksum?
    if (!skip_data_digest &&
        op.extent.length == oi.size && oi.is_data_digest()) {
      uint32_t crc = osd_op.outdata.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        // FIXME fall back to replica or something?
        result = -EIO;
      }
    }
  }

  // XXX the op.extent.length is the requested length for async read
  // On error this length is changed to 0 after the error comes back.
  ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
  ctx->delta_stats.num_rd++;
  return result;
}

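// do_sparse_read has two paths: on erasure-coded pools the sparse read is
// translated into a plain read whose result is re-encoded as a
// single-extent map (ToSparseReadResult); on replicated pools fiemap
// provides the extent map and each extent is read individually, with
// optional verification that holes really are zero.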
int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  if (op.extent.truncate_seq) {
    dout(0) << "sparse_read does not support truncation sequence " << dendl;
    return -EINVAL;
  }

  ++ctx->num_read;
  if (pool.info.ec_pool()) {
    // translate sparse read to a normal one if not supported
    uint64_t offset = op.extent.offset;
    uint64_t length = op.extent.length;
    if (offset > oi.size) {
      length = 0;
    } else if (offset + length > oi.size) {
      length = oi.size - offset;
    }

    if (length > 0) {
      ctx->pending_async_reads.push_back(
        make_pair(
          boost::make_tuple(offset, length, op.flags),
          make_pair(
            &osd_op.outdata,
            new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
                                   &op.extent.length))));
      dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;

      ctx->op_finishers[ctx->current_osd_subop_num].reset(
        new ReadFinisher(osd_op));
    } else {
      dout(10) << " sparse read ended up empty for " << soid << dendl;
      map<uint64_t, uint64_t> extents;
      ::encode(extents, osd_op.outdata);
    }
  } else {
    // read into a buffer
    map<uint64_t, uint64_t> m;
    uint32_t total_read = 0;
    int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
                                              info.pgid.shard),
                               op.extent.offset, op.extent.length, m);
    if (r < 0) {
      return r;
    }

    map<uint64_t, uint64_t>::iterator miter;
    bufferlist data_bl;
    uint64_t last = op.extent.offset;
    for (miter = m.begin(); miter != m.end(); ++miter) {
      // verify hole?
      if (cct->_conf->osd_verify_sparse_read_holes &&
          last < miter->first) {
        bufferlist t;
        uint64_t len = miter->first - last;
        r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
        if (r < 0) {
          osd->clog->error() << coll << " " << soid
                             << " sparse-read failed to read: "
                             << r;
        } else if (!t.is_zero()) {
          osd->clog->error() << coll << " " << soid
                             << " sparse-read found data in hole "
                             << last << "~" << len;
        }
      }

      bufferlist tmpbl;
      r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
                                       op.flags, &tmpbl);
      if (r == -EIO) {
        r = rep_repair_primary_object(soid, ctx->op);
      }
      if (r < 0) {
        return r;
      }

      // this usually happens when the extent returned by fiemap exceeds
      // the actual file size
      if (r < (int)miter->second)
        miter->second = r;
      total_read += r;
      dout(10) << "sparse-read " << miter->first << "@" << miter->second
               << dendl;
      data_bl.claim_append(tmpbl);
      last = miter->first + r;
    }

    if (r < 0) {
      return r;
    }

    // verify trailing hole?
    if (cct->_conf->osd_verify_sparse_read_holes) {
      uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
      if (last < end) {
        bufferlist t;
        uint64_t len = end - last;
        r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
        if (r < 0) {
          osd->clog->error() << coll << " " << soid
                             << " sparse-read failed to read: " << r;
        } else if (!t.is_zero()) {
          osd->clog->error() << coll << " " << soid
                             << " sparse-read found data in hole "
                             << last << "~" << len;
        }
      }
    }

    // Why does SPARSE_READ need a checksum?  librbd always uses
    // sparse-read.  At first there may not be many whole objects, but
    // with continued use more and more whole objects come to exist, so
    // from that point on verifying the full-object digest on sparse-read
    // makes sense.
    if (!skip_data_digest &&
        total_read == oi.size && oi.is_data_digest()) {
      uint32_t crc = data_bl.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        // FIXME fall back to replica or something?
        return -EIO;
      }
    }

    op.extent.length = total_read;

    ::encode(m, osd_op.outdata); // re-encode since it might be modified
    ::encode_destructively(data_bl, osd_op.outdata);

    dout(10) << " sparse_read got " << total_read << " bytes from object "
             << soid << dendl;
  }

  ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
  ctx->delta_stats.num_rd++;
  return 0;
}

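// do_osd_ops executes each OSDOp in sequence against the op context.  Ops
// that returned -EINPROGRESS earlier leave an OpFinisher behind in
// ctx->op_finishers, keyed by subop number, so a re-invocation resumes by
// calling execute() instead of redoing the op; write-class ops also mark
// ctx->user_modify unless explicitly exempted below.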
int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
{
  int result = 0;
  SnapSetContext *ssc = ctx->obc->ssc;
  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  PGTransaction* t = ctx->op_t.get();

  dout(10) << "do_osd_op " << soid << " " << ops << dendl;

  ctx->current_osd_subop_num = 0;
  for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
    OSDOp& osd_op = *p;
    ceph_osd_op& op = osd_op.op;

    OpFinisher* op_finisher = nullptr;
    {
      auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
      if (op_finisher_it != ctx->op_finishers.end()) {
        op_finisher = op_finisher_it->second.get();
      }
    }

    // TODO: check endianness (__le32 vs uint32_t, etc.)
    // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
    // but the code in this function seems to treat them as native-endian.  What should the
    // tracepoints do?
    tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);

    dout(10) << "do_osd_op " << osd_op << dendl;

    bufferlist::iterator bp = osd_op.indata.begin();

    // user-visible modification?
    switch (op.op) {
      // non user-visible modifications
    case CEPH_OSD_OP_WATCH:
    case CEPH_OSD_OP_CACHE_EVICT:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_UNDIRTY:
    case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
    case CEPH_OSD_OP_CACHE_PIN:
    case CEPH_OSD_OP_CACHE_UNPIN:
    case CEPH_OSD_OP_SET_REDIRECT:
      break;
    default:
      if (op.op & CEPH_OSD_OP_MODE_WR)
        ctx->user_modify = true;
    }

    // munge -1 truncate to 0 truncate
    if (ceph_osd_op_uses_extent(op.op) &&
        op.extent.truncate_seq == 1 &&
        op.extent.truncate_size == (-1ULL)) {
      op.extent.truncate_size = 0;
      op.extent.truncate_seq = 0;
    }

    // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
    if (op.op == CEPH_OSD_OP_ZERO &&
        obs.exists &&
        op.extent.offset < cct->_conf->osd_max_object_size &&
        op.extent.length >= 1 &&
        op.extent.length <= cct->_conf->osd_max_object_size &&
        op.extent.offset + op.extent.length >= oi.size) {
      if (op.extent.offset >= oi.size) {
        // no-op
        goto fail;
      }
      dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
               << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
      op.op = CEPH_OSD_OP_TRUNCATE;
    }

    switch (op.op) {

      // --- READS ---

    case CEPH_OSD_OP_CMPEXT:
      ++ctx->num_read;
      tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);

      if (op_finisher == nullptr) {
        result = do_extent_cmp(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

    case CEPH_OSD_OP_SYNC_READ:
      if (pool.info.require_rollback()) {
        result = -EOPNOTSUPP;
        break;
      }
      // fall through
    case CEPH_OSD_OP_READ:
      ++ctx->num_read;
      tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);
      if (op_finisher == nullptr) {
        if (!ctx->data_off) {
          ctx->data_off = op.extent.offset;
        }
        result = do_read(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

    case CEPH_OSD_OP_CHECKSUM:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
                   soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
                   op.checksum.offset, op.checksum.length,
                   op.checksum.chunk_size);

        if (op_finisher == nullptr) {
          result = do_checksum(ctx, osd_op, &bp);
        } else {
          result = op_finisher->execute();
        }
      }
      break;

      /* map extents */
    case CEPH_OSD_OP_MAPEXT:
      tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
      if (pool.info.require_rollback()) {
        result = -EOPNOTSUPP;
        break;
      }
      ++ctx->num_read;
      {
        // read into a buffer
        bufferlist bl;
        int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
                                                  info.pgid.shard),
                                   op.extent.offset, op.extent.length, bl);
        osd_op.outdata.claim(bl);
        if (r < 0)
          result = r;
        else
          ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
        ctx->delta_stats.num_rd++;
        dout(10) << " map_extents done on object " << soid << dendl;
      }
      break;

      /* map extents */
    case CEPH_OSD_OP_SPARSE_READ:
      tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);
      if (op_finisher == nullptr) {
        result = do_sparse_read(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

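      // CEPH_OSD_OP_CALL dispatches into a loaded rados object class: the
      // class must already be open (init_op_flags() verified this), and
      // after exec() the observed read/write counters are checked against
      // the method's declared CLS_METHOD_RD/WR flags.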
    case CEPH_OSD_OP_CALL:
      {
        string cname, mname;
        bufferlist indata;
        try {
          bp.copy(op.cls.class_len, cname);
          bp.copy(op.cls.method_len, mname);
          bp.copy(op.cls.indata_len, indata);
        } catch (buffer::error& e) {
          dout(10) << "call unable to decode class + method + indata" << dendl;
          dout(30) << "in dump: ";
          osd_op.indata.hexdump(*_dout);
          *_dout << dendl;
          result = -EINVAL;
          tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
          break;
        }
        tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());

        ClassHandler::ClassData *cls;
        result = osd->class_handler->open_class(cname, &cls);
        assert(result == 0);   // init_op_flags() already verified this works.

        ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
        if (!method) {
          dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
          result = -EOPNOTSUPP;
          break;
        }

        int flags = method->get_flags();
        if (flags & CLS_METHOD_WR)
          ctx->user_modify = true;

        bufferlist outdata;
        dout(10) << "call method " << cname << "." << mname << dendl;
        int prev_rd = ctx->num_read;
        int prev_wr = ctx->num_write;
        result = method->exec((cls_method_context_t)&ctx, indata, outdata);

        if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
          derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
          result = -EIO;
          break;
        }
        if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
          derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
          result = -EIO;
          break;
        }

        dout(10) << "method called response length=" << outdata.length() << dendl;
        op.extent.length = outdata.length();
        osd_op.outdata.claim_append(outdata);
        dout(30) << "out dump: ";
        osd_op.outdata.hexdump(*_dout);
        *_dout << dendl;
      }
      break;

5291 case CEPH_OSD_OP_STAT:
5292 // note: stat does not require RD
5293 {
5294 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5295
5296 if (obs.exists && !oi.is_whiteout()) {
5297 ::encode(oi.size, osd_op.outdata);
5298 ::encode(oi.mtime, osd_op.outdata);
5299 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5300 } else {
5301 result = -ENOENT;
5302 dout(10) << "stat oi object does not exist" << dendl;
5303 }
5304
5305 ctx->delta_stats.num_rd++;
5306 }
5307 break;
5308
5309 case CEPH_OSD_OP_ISDIRTY:
5310 ++ctx->num_read;
5311 {
5312 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5313 bool is_dirty = obs.oi.is_dirty();
5314 ::encode(is_dirty, osd_op.outdata);
5315 ctx->delta_stats.num_rd++;
5316 result = 0;
5317 }
5318 break;
5319
5320 case CEPH_OSD_OP_UNDIRTY:
5321 ++ctx->num_write;
5322 {
5323 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5324 if (oi.is_dirty()) {
5325 ctx->undirty = true; // see make_writeable()
5326 ctx->modify = true;
5327 ctx->delta_stats.num_wr++;
5328 }
5329 result = 0;
5330 }
5331 break;
5332
5333 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5334 ++ctx->num_write;
5335 {
5336 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5337 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5338 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5339 result = -EINVAL;
5340 break;
5341 }
5342 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5343 result = -EINVAL;
5344 break;
5345 }
5346 if (!obs.exists) {
5347 result = 0;
5348 break;
5349 }
5350 if (oi.is_cache_pinned()) {
5351 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5352 result = -EPERM;
5353 break;
5354 }
5355 if (oi.is_dirty()) {
5356 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5357 if (result == -EINPROGRESS)
5358 result = -EAGAIN;
5359 } else {
5360 result = 0;
5361 }
5362 }
5363 break;
5364
5365 case CEPH_OSD_OP_CACHE_FLUSH:
5366 ++ctx->num_write;
5367 {
5368 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5369 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5370 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5371 result = -EINVAL;
5372 break;
5373 }
5374 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5375 result = -EINVAL;
5376 break;
5377 }
5378 if (!obs.exists) {
5379 result = 0;
5380 break;
5381 }
5382 if (oi.is_cache_pinned()) {
5383 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5384 result = -EPERM;
5385 break;
5386 }
5387 hobject_t missing;
5388 if (oi.is_dirty()) {
5389 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5390 if (result == -EINPROGRESS)
5391 result = -EAGAIN;
5392 } else {
5393 result = 0;
5394 }
5395 // Check special return value which has set missing_return
5396 if (result == -ENOENT) {
5397 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5398 assert(!missing.is_min());
5399 wait_for_unreadable_object(missing, ctx->op);
5400 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5401 result = -EAGAIN;
5402 }
5403 }
5404 break;
5405
5406 case CEPH_OSD_OP_CACHE_EVICT:
5407 ++ctx->num_write;
5408 {
5409 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5410 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5411 result = -EINVAL;
5412 break;
5413 }
5414 if (!obs.exists) {
5415 result = 0;
5416 break;
5417 }
5418 if (oi.is_cache_pinned()) {
5419 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5420 result = -EPERM;
5421 break;
5422 }
5423 if (oi.is_dirty()) {
5424 result = -EBUSY;
5425 break;
5426 }
5427 if (!oi.watchers.empty()) {
5428 result = -EBUSY;
5429 break;
5430 }
5431 if (soid.snap == CEPH_NOSNAP) {
5432 result = _verify_no_head_clones(soid, ssc->snapset);
5433 if (result < 0)
5434 break;
5435 }
5436 result = _delete_oid(ctx, true, false);
5437 if (result >= 0) {
5438 // mark that this is a cache eviction to avoid triggering normal
5439 // make_writeable() clone or snapdir object creation in finish_ctx()
5440 ctx->cache_evict = true;
5441 }
5442 osd->logger->inc(l_osd_tier_evict);
5443 }
5444 break;
5445
5446 case CEPH_OSD_OP_GETXATTR:
5447 ++ctx->num_read;
5448 {
5449 string aname;
5450 bp.copy(op.xattr.name_len, aname);
5451 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5452 string name = "_" + aname;
5453 int r = getattr_maybe_cache(
5454 ctx->obc,
5455 name,
5456 &(osd_op.outdata));
5457 if (r >= 0) {
5458 op.xattr.value_len = osd_op.outdata.length();
5459 result = 0;
5460 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5461 } else
5462 result = r;
5463
5464 ctx->delta_stats.num_rd++;
5465 }
5466 break;
5467
5468 case CEPH_OSD_OP_GETXATTRS:
5469 ++ctx->num_read;
5470 {
5471 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5472 map<string, bufferlist> out;
5473 result = getattrs_maybe_cache(
5474 ctx->obc,
5475 	&out);
5476
5477 bufferlist bl;
5478 ::encode(out, bl);
5479 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5480 ctx->delta_stats.num_rd++;
5481 osd_op.outdata.claim_append(bl);
5482 }
5483 break;
5484
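  // CMPXATTR compares a stored xattr against a client-supplied operand,
  // either as a string or as a u64.  The compare helpers return 1 for true
  // and 0 for false; false is mapped to -ECANCELED so the remainder of the
  // op vector is aborted, which is what client-side assertions rely on.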
5485 case CEPH_OSD_OP_CMPXATTR:
5486 ++ctx->num_read;
5487 {
5488 string aname;
5489 bp.copy(op.xattr.name_len, aname);
5490 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5491 string name = "_" + aname;
5492 name[op.xattr.name_len + 1] = 0;
5493
5494 bufferlist xattr;
5495 result = getattr_maybe_cache(
5496 ctx->obc,
5497 name,
5498 &xattr);
5499 if (result < 0 && result != -EEXIST && result != -ENODATA)
5500 break;
5501
5502 ctx->delta_stats.num_rd++;
5503 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5504
5505 switch (op.xattr.cmp_mode) {
5506 case CEPH_OSD_CMPXATTR_MODE_STRING:
5507 {
5508 string val;
5509 bp.copy(op.xattr.value_len, val);
5510 val[op.xattr.value_len] = 0;
5511 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5512 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5513 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5514 }
5515 break;
5516
5517 case CEPH_OSD_CMPXATTR_MODE_U64:
5518 {
5519 uint64_t u64val;
5520 try {
5521 ::decode(u64val, bp);
5522 }
5523 catch (buffer::error& e) {
5524 result = -EINVAL;
5525 goto fail;
5526 }
5527 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5528 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5529 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5530 }
5531 break;
5532
5533 default:
5534 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5535 result = -EINVAL;
5536 }
5537
5538 if (!result) {
5539 dout(10) << "comparison returned false" << dendl;
5540 result = -ECANCELED;
5541 break;
5542 }
5543 if (result < 0) {
5544 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5545 break;
5546 }
5547
5548 dout(10) << "comparison returned true" << dendl;
5549 }
5550 break;
5551
5552 case CEPH_OSD_OP_ASSERT_VER:
5553 ++ctx->num_read;
5554 {
5555 uint64_t ver = op.assert_ver.ver;
5556 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5557 if (!ver)
5558 result = -EINVAL;
5559 else if (ver < oi.user_version)
5560 result = -ERANGE;
5561 else if (ver > oi.user_version)
5562 result = -EOVERFLOW;
5563 }
5564 break;
5565
5566 case CEPH_OSD_OP_LIST_WATCHERS:
5567 ++ctx->num_read;
5568 {
5569 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5570 obj_list_watch_response_t resp;
5571
5572 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5573 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5574 ++oi_iter) {
5575 dout(20) << "key cookie=" << oi_iter->first.first
5576 << " entity=" << oi_iter->first.second << " "
5577 << oi_iter->second << dendl;
5578 assert(oi_iter->first.first == oi_iter->second.cookie);
5579 assert(oi_iter->first.second.is_client());
5580
5581 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5582 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5583 resp.entries.push_back(wi);
5584 }
5585
5586 resp.encode(osd_op.outdata, ctx->get_features());
5587 result = 0;
5588
5589 ctx->delta_stats.num_rd++;
5590 break;
5591 }
5592
5593 case CEPH_OSD_OP_LIST_SNAPS:
5594 ++ctx->num_read;
5595 {
5596 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5597 obj_list_snap_response_t resp;
5598
5599 if (!ssc) {
5600 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5601 }
5602 assert(ssc);
5603
5604 int clonecount = ssc->snapset.clones.size();
5605 if (ssc->snapset.head_exists)
5606 clonecount++;
5607 resp.clones.reserve(clonecount);
5608 for (auto clone_iter = ssc->snapset.clones.begin();
5609 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5610 clone_info ci;
5611 ci.cloneid = *clone_iter;
5612
5613 hobject_t clone_oid = soid;
5614 clone_oid.snap = *clone_iter;
5615
5616 if (!ssc->snapset.is_legacy()) {
5617 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5618 if (p == ssc->snapset.clone_snaps.end()) {
5619 osd->clog->error() << "osd." << osd->whoami
5620 << ": inconsistent clone_snaps found for oid "
5621 << soid << " clone " << *clone_iter
5622 << " snapset " << ssc->snapset;
5623 result = -EINVAL;
5624 break;
5625 }
5626 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5627 ci.snaps.push_back(*q);
5628 }
5629 } else {
5630 	  /* No need to take a lock here.  We are only inspecting state cached
5631 * in the ObjectContext, so we aren't performing an actual read unless
5632 * the clone obc is not already loaded (in which case, it cannot have
5633 * an in progress write). We also do not risk exposing uncommitted
5634 * state since we do have a read lock on the head object or snapdir,
5635 * which we would have to write lock in order to make user visible
5636 * modifications to the snapshot state (snap trim related mutations
5637 * are not user visible).
5638 */
5639 if (is_missing_object(clone_oid)) {
5640 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5641 wait_for_unreadable_object(clone_oid, ctx->op);
5642 result = -EAGAIN;
5643 break;
5644 }
5645
5646 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5647 if (!clone_obc) {
5648 if (maybe_handle_cache(
5649 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5650 // promoting the clone
5651 result = -EAGAIN;
5652 } else {
5653 osd->clog->error() << "osd." << osd->whoami
5654 << ": missing clone " << clone_oid
5655 << " for oid "
5656 << soid;
5657 // should not happen
5658 result = -ENOENT;
5659 }
5660 break;
5661 }
5662 for (vector<snapid_t>::reverse_iterator p =
5663 clone_obc->obs.oi.legacy_snaps.rbegin();
5664 p != clone_obc->obs.oi.legacy_snaps.rend();
5665 ++p) {
5666 ci.snaps.push_back(*p);
5667 }
5668 }
5669
5670 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5671
5672 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5673 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5674 if (coi == ssc->snapset.clone_overlap.end()) {
5675 osd->clog->error() << "osd." << osd->whoami
5676 << ": inconsistent clone_overlap found for oid "
5677 << soid << " clone " << *clone_iter;
5678 result = -EINVAL;
5679 break;
5680 }
5681 const interval_set<uint64_t> &o = coi->second;
5682 ci.overlap.reserve(o.num_intervals());
5683 for (interval_set<uint64_t>::const_iterator r = o.begin();
5684 r != o.end(); ++r) {
5685 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5686 r.get_len()));
5687 }
5688
5689 map<snapid_t, uint64_t>::const_iterator si;
5690 si = ssc->snapset.clone_size.find(ci.cloneid);
5691 if (si == ssc->snapset.clone_size.end()) {
5692 osd->clog->error() << "osd." << osd->whoami
5693 << ": inconsistent clone_size found for oid "
5694 << soid << " clone " << *clone_iter;
5695 result = -EINVAL;
5696 break;
5697 }
5698 ci.size = si->second;
5699
5700 resp.clones.push_back(ci);
5701 }
5702 if (result < 0) {
5703 break;
5704 }
5705 if (ssc->snapset.head_exists &&
5706 !ctx->obc->obs.oi.is_whiteout()) {
5707 assert(obs.exists);
5708 clone_info ci;
5709 ci.cloneid = CEPH_NOSNAP;
5710
5711 //Size for HEAD is oi.size
5712 ci.size = oi.size;
5713
5714 resp.clones.push_back(ci);
5715 }
5716 resp.seq = ssc->snapset.seq;
5717
5718 resp.encode(osd_op.outdata);
5719 result = 0;
5720
5721 ctx->delta_stats.num_rd++;
5722 break;
5723 }
5724
5725 case CEPH_OSD_OP_NOTIFY:
5726 ++ctx->num_read;
5727 {
5728 uint32_t timeout;
5729 bufferlist bl;
5730
5731 try {
5732 uint32_t ver; // obsolete
5733 ::decode(ver, bp);
5734 ::decode(timeout, bp);
5735 ::decode(bl, bp);
5736 } catch (const buffer::error &e) {
5737 timeout = 0;
5738 }
5739 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5740 if (!timeout)
5741 timeout = cct->_conf->osd_default_notify_timeout;
5742
5743 notify_info_t n;
5744 n.timeout = timeout;
5745 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5746 n.cookie = op.watch.cookie;
5747 n.bl = bl;
5748 ctx->notifies.push_back(n);
5749
5750 // return our unique notify id to the client
5751 ::encode(n.notify_id, osd_op.outdata);
5752 }
5753 break;
5754
5755 case CEPH_OSD_OP_NOTIFY_ACK:
5756 ++ctx->num_read;
5757 {
5758 try {
5759 uint64_t notify_id = 0;
5760 uint64_t watch_cookie = 0;
5761 ::decode(notify_id, bp);
5762 ::decode(watch_cookie, bp);
5763 bufferlist reply_bl;
5764 if (!bp.end()) {
5765 ::decode(reply_bl, bp);
5766 }
5767 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5768 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5769 ctx->notify_acks.push_back(ack);
5770 } catch (const buffer::error &e) {
5771 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5772 OpContext::NotifyAck ack(
5773 // op.watch.cookie is actually the notify_id for historical reasons
5774 op.watch.cookie
5775 );
5776 ctx->notify_acks.push_back(ack);
5777 }
5778 }
5779 break;
5780
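  // SETALLOCHINT records the client's expected object size and write size
  // in the object info and forwards them to the ObjectStore, which may use
  // them to pick allocation or blob sizes.  The hint is purely advisory.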
5781 case CEPH_OSD_OP_SETALLOCHINT:
5782 ++ctx->num_write;
5783 {
5784 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5785 maybe_create_new_object(ctx);
5786 oi.expected_object_size = op.alloc_hint.expected_object_size;
5787 oi.expected_write_size = op.alloc_hint.expected_write_size;
5788 oi.alloc_hint_flags = op.alloc_hint.flags;
5789 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5790 op.alloc_hint.expected_write_size,
5791 op.alloc_hint.flags);
5792 ctx->delta_stats.num_wr++;
5793 result = 0;
5794 }
5795 break;
5796
5797
5798 // --- WRITES ---
5799
5800 // -- object data --
5801
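  // The truncate_seq/truncate_size handling below orders racing writes and
  // trimtruncs (e.g. from CephFS): a write carrying an older truncate_seq
  // than the object's is clipped to the current size, while one carrying a
  // newer seq applies the pending truncate before the data is written.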
5802 case CEPH_OSD_OP_WRITE:
5803 ++ctx->num_write;
5804 { // write
5805 __u32 seq = oi.truncate_seq;
5806 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5807 if (op.extent.length != osd_op.indata.length()) {
5808 result = -EINVAL;
5809 break;
5810 }
5811
5812 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5813 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5814
5815 if (pool.info.requires_aligned_append() &&
5816 (op.extent.offset % pool.info.required_alignment() != 0)) {
5817 result = -EOPNOTSUPP;
5818 break;
5819 }
5820
5821 if (!obs.exists) {
5822 if (pool.info.requires_aligned_append() && op.extent.offset) {
5823 result = -EOPNOTSUPP;
5824 break;
5825 }
5826 } else if (op.extent.offset != oi.size &&
5827 pool.info.requires_aligned_append()) {
5828 result = -EOPNOTSUPP;
5829 break;
5830 }
5831
5832 if (seq && (seq > op.extent.truncate_seq) &&
5833 (op.extent.offset + op.extent.length > oi.size)) {
5834 // old write, arrived after trimtrunc
5835 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5836 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5837 << ", adjusting write length to " << op.extent.length << dendl;
5838 bufferlist t;
5839 t.substr_of(osd_op.indata, 0, op.extent.length);
5840 osd_op.indata.swap(t);
5841 }
5842 if (op.extent.truncate_seq > seq) {
5843 // write arrives before trimtrunc
5844 if (obs.exists && !oi.is_whiteout()) {
5845 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5846 << ", truncating to " << op.extent.truncate_size << dendl;
5847 t->truncate(soid, op.extent.truncate_size);
5848 oi.truncate_seq = op.extent.truncate_seq;
5849 oi.truncate_size = op.extent.truncate_size;
5850 if (op.extent.truncate_size != oi.size) {
5851 ctx->delta_stats.num_bytes -= oi.size;
5852 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5853 oi.size = op.extent.truncate_size;
5854 }
5855 } else {
5856 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5857 << ", but object is new" << dendl;
5858 oi.truncate_seq = op.extent.truncate_seq;
5859 oi.truncate_size = op.extent.truncate_size;
5860 }
5861 }
5862 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5863 if (result < 0)
5864 break;
5865
5866 maybe_create_new_object(ctx);
5867
5868 if (op.extent.length == 0) {
5869 if (op.extent.offset > oi.size) {
5870 t->truncate(
5871 soid, op.extent.offset);
5872 } else {
5873 t->nop(soid);
5874 }
5875 } else {
5876 t->write(
5877 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5878 }
5879
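	// Maintain the whole-object data digest only when it stays cheap to
	// compute: a full overwrite recomputes the crc32c from scratch, an
	// append to an object with a known digest extends it incrementally,
	// and any other overlapping write invalidates the digest.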
5880 if (op.extent.offset == 0 && op.extent.length >= oi.size
5881 && !skip_data_digest) {
5882 	  obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5883 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
5884 if (skip_data_digest) {
5885 obs.oi.clear_data_digest();
5886 } else {
5887 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5888 }
5889 } else {
5890 	  obs.oi.clear_data_digest();
5891 	}
5892 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5893 op.extent.offset, op.extent.length);
5894
5895 }
5896 break;
5897
5898 case CEPH_OSD_OP_WRITEFULL:
5899 ++ctx->num_write;
5900 { // write full object
5901 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5902
5903 if (op.extent.length != osd_op.indata.length()) {
5904 result = -EINVAL;
5905 break;
5906 }
5907 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5908 if (result < 0)
5909 break;
5910
5911 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5912 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5913
5914 maybe_create_new_object(ctx);
5915 if (pool.info.require_rollback()) {
5916 t->truncate(soid, 0);
5917 } else if (obs.exists && op.extent.length < oi.size) {
5918 t->truncate(soid, op.extent.length);
5919 }
5920 if (op.extent.length) {
5921 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5922 }
5923 if (!skip_data_digest) {
5924 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5925 } else {
5926 obs.oi.clear_data_digest();
5927 }
5928
5929 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5930 0, op.extent.length, true);
5931 }
5932 break;
5933
5934 case CEPH_OSD_OP_WRITESAME:
5935 ++ctx->num_write;
5936 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5937 result = do_writesame(ctx, osd_op);
5938 break;
5939
5940 case CEPH_OSD_OP_ROLLBACK :
5941 ++ctx->num_write;
5942 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5943 result = _rollback_to(ctx, op);
5944 break;
5945
5946 case CEPH_OSD_OP_ZERO:
5947 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5948 if (pool.info.requires_aligned_append()) {
5949 result = -EOPNOTSUPP;
5950 break;
5951 }
5952 ++ctx->num_write;
5953 { // zero
5954 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5955 if (result < 0)
5956 break;
5957 assert(op.extent.length);
5958 if (obs.exists && !oi.is_whiteout()) {
5959 t->zero(soid, op.extent.offset, op.extent.length);
5960 interval_set<uint64_t> ch;
5961 ch.insert(op.extent.offset, op.extent.length);
5962 ctx->modified_ranges.union_of(ch);
5963 ctx->delta_stats.num_wr++;
5964 oi.clear_data_digest();
5965 } else {
5966 // no-op
5967 }
5968 }
5969 break;
5970 case CEPH_OSD_OP_CREATE:
5971 ++ctx->num_write;
5972 {
5973 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5974 int flags = le32_to_cpu(op.flags);
5975 if (obs.exists && !oi.is_whiteout() &&
5976 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5977 result = -EEXIST; /* this is an exclusive create */
5978 } else {
5979 if (osd_op.indata.length()) {
5980 bufferlist::iterator p = osd_op.indata.begin();
5981 string category;
5982 try {
5983 ::decode(category, p);
5984 }
5985 catch (buffer::error& e) {
5986 result = -EINVAL;
5987 goto fail;
5988 }
5989 // category is no longer implemented.
5990 }
5991 if (result >= 0) {
5992 maybe_create_new_object(ctx);
5993 t->nop(soid);
5994 }
5995 }
5996 }
5997 break;
5998
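  // TRIMTRUNC is TRUNCATE with the target size carried in truncate_size;
  // normalize the offset here and fall through to the shared truncate path.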
5999 case CEPH_OSD_OP_TRIMTRUNC:
6000 op.extent.offset = op.extent.truncate_size;
6001 	// fall through
6002
6003 case CEPH_OSD_OP_TRUNCATE:
6004 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6005 if (pool.info.requires_aligned_append()) {
6006 result = -EOPNOTSUPP;
6007 break;
6008 }
6009 ++ctx->num_write;
6010 {
6011 // truncate
6012 if (!obs.exists || oi.is_whiteout()) {
6013 dout(10) << " object dne, truncate is a no-op" << dendl;
6014 break;
6015 }
6016
6017 if (op.extent.offset > cct->_conf->osd_max_object_size) {
6018 result = -EFBIG;
6019 break;
6020 }
6021
6022 if (op.extent.truncate_seq) {
6023 assert(op.extent.offset == op.extent.truncate_size);
6024 if (op.extent.truncate_seq <= oi.truncate_seq) {
6025 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6026 << ", no-op" << dendl;
6027 break; // old
6028 }
6029 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6030 << ", truncating" << dendl;
6031 oi.truncate_seq = op.extent.truncate_seq;
6032 oi.truncate_size = op.extent.truncate_size;
6033 }
6034
6035 maybe_create_new_object(ctx);
6036 t->truncate(soid, op.extent.offset);
6037 if (oi.size > op.extent.offset) {
6038 interval_set<uint64_t> trim;
6039 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6040 ctx->modified_ranges.union_of(trim);
6041 }
6042 if (op.extent.offset != oi.size) {
6043 ctx->delta_stats.num_bytes -= oi.size;
6044 ctx->delta_stats.num_bytes += op.extent.offset;
6045 oi.size = op.extent.offset;
6046 }
6047 ctx->delta_stats.num_wr++;
6048 	// do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6049
6050 oi.clear_data_digest();
6051 }
6052 break;
6053
6054 case CEPH_OSD_OP_DELETE:
6055 ++ctx->num_write;
6056 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6057 {
6058 result = _delete_oid(ctx, false, ctx->ignore_cache);
6059 }
6060 break;
6061
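  // WATCH multiplexes several subops: register (WATCH/LEGACY_WATCH),
  // RECONNECT, PING and UNWATCH.  Registration records the watch in the
  // object_info (the t->nop() forces the oi to be rewritten) and queues a
  // watch_connect side effect that is applied once the transaction is
  // done (see do_osd_op_effects).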
6062 case CEPH_OSD_OP_WATCH:
6063 ++ctx->num_write;
6064 {
6065 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6066 op.watch.cookie, op.watch.op);
6067 if (!obs.exists) {
6068 result = -ENOENT;
6069 break;
6070 }
6071 uint64_t cookie = op.watch.cookie;
6072 entity_name_t entity = ctx->reqid.name;
6073 ObjectContextRef obc = ctx->obc;
6074
6075 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6076 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6077 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6078 	dout(10) << "watch: oi.user_version=" << oi.user_version << dendl;
6079 dout(10) << "watch: peer_addr="
6080 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6081
6082 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6083 if (op.watch.timeout != 0) {
6084 timeout = op.watch.timeout;
6085 }
6086
6087 watch_info_t w(cookie, timeout,
6088 ctx->op->get_req()->get_connection()->get_peer_addr());
6089 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6090 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6091 if (oi.watchers.count(make_pair(cookie, entity))) {
6092 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6093 } else {
6094 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6095 oi.watchers[make_pair(cookie, entity)] = w;
6096 	  t->nop(soid); // make sure we update the object_info on disk!
6097 }
6098 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6099 ctx->watch_connects.push_back(make_pair(w, will_ping));
6100 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6101 if (!oi.watchers.count(make_pair(cookie, entity))) {
6102 result = -ENOTCONN;
6103 break;
6104 }
6105 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6106 ctx->watch_connects.push_back(make_pair(w, true));
6107 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6108 /* Note: WATCH with PING doesn't cause may_write() to return true,
6109 * so if there is nothing else in the transaction, this is going
6110 * to run do_osd_op_effects, but not write out a log entry */
6111 if (!oi.watchers.count(make_pair(cookie, entity))) {
6112 result = -ENOTCONN;
6113 break;
6114 }
6115 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6116 obc->watchers.find(make_pair(cookie, entity));
6117 if (p == obc->watchers.end() ||
6118 !p->second->is_connected()) {
6119 // client needs to reconnect
6120 result = -ETIMEDOUT;
6121 break;
6122 }
6123 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6124 p->second->got_ping(ceph_clock_now());
6125 result = 0;
6126 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6127 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6128 oi.watchers.find(make_pair(cookie, entity));
6129 if (oi_iter != oi.watchers.end()) {
6130 dout(10) << " removed watch " << oi_iter->second << " by "
6131 << entity << dendl;
6132 oi.watchers.erase(oi_iter);
6133 t->nop(soid); // update oi on disk
6134 ctx->watch_disconnects.push_back(
6135 watch_disconnect_t(cookie, entity, false));
6136 } else {
6137 dout(10) << " can't remove: no watch by " << entity << dendl;
6138 }
6139 }
6140 }
6141 break;
6142
6143 case CEPH_OSD_OP_CACHE_PIN:
6144 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6145 if ((!pool.info.is_tier() ||
6146 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6147 result = -EINVAL;
6148 	dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6149 break;
6150 }
6151 ++ctx->num_write;
6152 {
6153 if (!obs.exists || oi.is_whiteout()) {
6154 result = -ENOENT;
6155 break;
6156 }
6157
6158 if (!oi.is_cache_pinned()) {
6159 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6160 ctx->modify = true;
6161 ctx->delta_stats.num_objects_pinned++;
6162 ctx->delta_stats.num_wr++;
6163 }
6164 result = 0;
6165 }
6166 break;
6167
6168 case CEPH_OSD_OP_CACHE_UNPIN:
6169 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6170 if ((!pool.info.is_tier() ||
6171 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6172 result = -EINVAL;
6173 	dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6174 break;
6175 }
6176 ++ctx->num_write;
6177 {
6178 if (!obs.exists || oi.is_whiteout()) {
6179 result = -ENOENT;
6180 break;
6181 }
6182
6183 if (oi.is_cache_pinned()) {
6184 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6185 ctx->modify = true;
6186 ctx->delta_stats.num_objects_pinned--;
6187 ctx->delta_stats.num_wr++;
6188 }
6189 result = 0;
6190 }
6191 break;
6192
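  // SET_REDIRECT turns this object into a manifest object that redirects
  // to a target object in another (non-tier) pool: local data, omap and
  // user xattrs are dropped, and only the redirect metadata is kept.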
6193 case CEPH_OSD_OP_SET_REDIRECT:
6194 ++ctx->num_write;
6195 {
6196 if (pool.info.is_tier()) {
6197 result = -EINVAL;
6198 break;
6199 }
6200 if (!obs.exists) {
6201 result = -ENOENT;
6202 break;
6203 }
6204 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6205 result = -EOPNOTSUPP;
6206 break;
6207 }
6208
6209 object_t target_name;
6210 object_locator_t target_oloc;
6211 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6212 version_t target_version = op.copy_from.src_version;
6213 try {
6214 ::decode(target_name, bp);
6215 ::decode(target_oloc, bp);
6216 }
6217 catch (buffer::error& e) {
6218 result = -EINVAL;
6219 goto fail;
6220 }
6221 pg_t raw_pg;
6222 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6223 hobject_t target(target_name, target_oloc.key, target_snapid,
6224 raw_pg.ps(), raw_pg.pool(),
6225 target_oloc.nspace);
6226 if (target == soid) {
6227 	dout(20) << " set-redirect to self is invalid" << dendl;
6228 result = -EINVAL;
6229 break;
6230 }
6231 oi.set_flag(object_info_t::FLAG_MANIFEST);
6232 oi.manifest.redirect_target = target;
6233 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6234 t->truncate(soid, 0);
6235 if (oi.is_omap() && pool.info.supports_omap()) {
6236 t->omap_clear(soid);
6237 obs.oi.clear_omap_digest();
6238 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6239 }
6240 ctx->delta_stats.num_bytes -= oi.size;
6241 oi.size = 0;
6242 oi.new_object();
6243 oi.user_version = target_version;
6244 ctx->user_at_version = target_version;
6245 /* rm_attrs */
6246 map<string,bufferlist> rmattrs;
6247 result = getattrs_maybe_cache(ctx->obc,
6248 	&rmattrs);
6249 if (result < 0) {
6250 return result;
6251 }
6252 map<string, bufferlist>::iterator iter;
6253 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6254 const string& name = iter->first;
6255 t->rmattr(soid, name);
6256 }
6257 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6258 }
6259
6260 break;
6261
6262 // -- object attrs --
6263
6264 case CEPH_OSD_OP_SETXATTR:
6265 ++ctx->num_write;
6266 {
6267 if (cct->_conf->osd_max_attr_size > 0 &&
6268 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6269 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6270 result = -EFBIG;
6271 break;
6272 }
6273 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6274 cct->_conf->osd_max_attr_name_len);
6275 if (op.xattr.name_len > max_name_len) {
6276 result = -ENAMETOOLONG;
6277 break;
6278 }
6279 maybe_create_new_object(ctx);
6280 string aname;
6281 bp.copy(op.xattr.name_len, aname);
6282 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6283 string name = "_" + aname;
6284 bufferlist bl;
6285 bp.copy(op.xattr.value_len, bl);
6286 t->setattr(soid, name, bl);
6287 ctx->delta_stats.num_wr++;
6288 }
6289 break;
6290
6291 case CEPH_OSD_OP_RMXATTR:
6292 ++ctx->num_write;
6293 {
6294 string aname;
6295 bp.copy(op.xattr.name_len, aname);
6296 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6297 if (!obs.exists || oi.is_whiteout()) {
6298 result = -ENOENT;
6299 break;
6300 }
6301 string name = "_" + aname;
6302 t->rmattr(soid, name);
6303 ctx->delta_stats.num_wr++;
6304 }
6305 break;
6306
6307
6308 // -- fancy writers --
6309 case CEPH_OSD_OP_APPEND:
6310 {
6311 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6312 // just do it inline; this works because we are happy to execute
6313 	// a fancy op on replicas as well.
6314 vector<OSDOp> nops(1);
6315 OSDOp& newop = nops[0];
6316 newop.op.op = CEPH_OSD_OP_WRITE;
6317 newop.op.extent.offset = oi.size;
6318 newop.op.extent.length = op.extent.length;
6319 newop.op.extent.truncate_seq = oi.truncate_seq;
6320 newop.indata = osd_op.indata;
6321 result = do_osd_ops(ctx, nops);
6322 osd_op.outdata.claim(newop.outdata);
6323 }
6324 break;
6325
6326 case CEPH_OSD_OP_STARTSYNC:
6327 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6328 t->nop(soid);
6329 break;
6330
6331
6332 // -- trivial map --
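  // The legacy 'trivial map' (tmap) ops are layered on plain object data:
  // TMAPGET is a sync read of the whole object, and TMAPPUT re-encodes
  // (re-sorting if the client sent unordered keys) and rewrites it via
  // WRITEFULL.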
6333 case CEPH_OSD_OP_TMAPGET:
6334 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6335 if (pool.info.require_rollback()) {
6336 result = -EOPNOTSUPP;
6337 break;
6338 }
6339 {
6340 vector<OSDOp> nops(1);
6341 OSDOp& newop = nops[0];
6342 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6343 newop.op.extent.offset = 0;
6344 newop.op.extent.length = 0;
6345 do_osd_ops(ctx, nops);
6346 osd_op.outdata.claim(newop.outdata);
6347 }
6348 break;
6349
6350 case CEPH_OSD_OP_TMAPPUT:
6351 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6352 if (pool.info.require_rollback()) {
6353 result = -EOPNOTSUPP;
6354 break;
6355 }
6356 {
6357 //_dout_lock.Lock();
6358 //osd_op.data.hexdump(*_dout);
6359 //_dout_lock.Unlock();
6360
6361 // verify sort order
6362 bool unsorted = false;
6363 if (true) {
6364 bufferlist header;
6365 ::decode(header, bp);
6366 uint32_t n;
6367 ::decode(n, bp);
6368 string last_key;
6369 while (n--) {
6370 string key;
6371 ::decode(key, bp);
6372 dout(10) << "tmapput key " << key << dendl;
6373 bufferlist val;
6374 ::decode(val, bp);
6375 if (key < last_key) {
6376 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6377 unsorted = true;
6378 break;
6379 }
6380 last_key = key;
6381 }
6382 }
6383
6384 // write it
6385 vector<OSDOp> nops(1);
6386 OSDOp& newop = nops[0];
6387 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6388 newop.op.extent.offset = 0;
6389 newop.op.extent.length = osd_op.indata.length();
6390 newop.indata = osd_op.indata;
6391
6392 if (unsorted) {
6393 bp = osd_op.indata.begin();
6394 bufferlist header;
6395 map<string, bufferlist> m;
6396 ::decode(header, bp);
6397 ::decode(m, bp);
6398 assert(bp.end());
6399 bufferlist newbl;
6400 ::encode(header, newbl);
6401 ::encode(m, newbl);
6402 newop.indata = newbl;
6403 }
6404 result = do_osd_ops(ctx, nops);
6405 assert(result == 0);
6406 }
6407 break;
6408
6409 case CEPH_OSD_OP_TMAPUP:
6410 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6411 if (pool.info.require_rollback()) {
6412 result = -EOPNOTSUPP;
6413 break;
6414 }
6415 ++ctx->num_write;
6416 result = do_tmapup(ctx, bp, osd_op);
6417 break;
6418
6419 case CEPH_OSD_OP_TMAP2OMAP:
6420 ++ctx->num_write;
6421 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6422 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6423 break;
6424
6425 // OMAP Read ops
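  // OMAP reads are bounded by osd_max_omap_entries_per_request and
  // osd_max_omap_bytes_per_request; when a bound is hit the reply is
  // flagged as truncated and the client is expected to resume with
  // start_after set to the last key it received.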
6426 case CEPH_OSD_OP_OMAPGETKEYS:
6427 ++ctx->num_read;
6428 {
6429 string start_after;
6430 uint64_t max_return;
6431 try {
6432 ::decode(start_after, bp);
6433 ::decode(max_return, bp);
6434 }
6435 catch (buffer::error& e) {
6436 result = -EINVAL;
6437 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6438 goto fail;
6439 }
6440 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6441 max_return = cct->_conf->osd_max_omap_entries_per_request;
6442 }
6443 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6444
6445 bufferlist bl;
6446 uint32_t num = 0;
6447 bool truncated = false;
6448 if (oi.is_omap()) {
6449 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6450 coll, ghobject_t(soid)
6451 );
6452 assert(iter);
6453 iter->upper_bound(start_after);
6454 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6455 if (num >= max_return ||
6456 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6457 truncated = true;
6458 break;
6459 }
6460 ::encode(iter->key(), bl);
6461 }
6462 } // else return empty out_set
6463 ::encode(num, osd_op.outdata);
6464 osd_op.outdata.claim_append(bl);
6465 ::encode(truncated, osd_op.outdata);
6466 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6467 ctx->delta_stats.num_rd++;
6468 }
6469 break;
6470
6471 case CEPH_OSD_OP_OMAPGETVALS:
6472 ++ctx->num_read;
6473 {
6474 string start_after;
6475 uint64_t max_return;
6476 string filter_prefix;
6477 try {
6478 ::decode(start_after, bp);
6479 ::decode(max_return, bp);
6480 ::decode(filter_prefix, bp);
6481 }
6482 catch (buffer::error& e) {
6483 result = -EINVAL;
6484 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6485 goto fail;
6486 }
6487 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6488 max_return = cct->_conf->osd_max_omap_entries_per_request;
6489 }
6490 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6491
6492 uint32_t num = 0;
6493 bool truncated = false;
6494 bufferlist bl;
6495 if (oi.is_omap()) {
6496 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6497 coll, ghobject_t(soid)
6498 );
6499 if (!iter) {
6500 result = -ENOENT;
6501 goto fail;
6502 }
6503 iter->upper_bound(start_after);
6504 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6505 for (num = 0;
6506 iter->valid() &&
6507 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6508 ++num, iter->next(false)) {
6509 dout(20) << "Found key " << iter->key() << dendl;
6510 if (num >= max_return ||
6511 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6512 truncated = true;
6513 break;
6514 }
6515 ::encode(iter->key(), bl);
6516 ::encode(iter->value(), bl);
6517 }
6518 } // else return empty out_set
6519 ::encode(num, osd_op.outdata);
6520 osd_op.outdata.claim_append(bl);
6521 ::encode(truncated, osd_op.outdata);
6522 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6523 ctx->delta_stats.num_rd++;
6524 }
6525 break;
6526
6527 case CEPH_OSD_OP_OMAPGETHEADER:
6528 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6529 if (!oi.is_omap()) {
6530 // return empty header
6531 break;
6532 }
6533 ++ctx->num_read;
6534 {
6535 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6536 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6537 ctx->delta_stats.num_rd++;
6538 }
6539 break;
6540
6541 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6542 ++ctx->num_read;
6543 {
6544 set<string> keys_to_get;
6545 try {
6546 ::decode(keys_to_get, bp);
6547 }
6548 catch (buffer::error& e) {
6549 result = -EINVAL;
6550 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6551 goto fail;
6552 }
6553 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6554 map<string, bufferlist> out;
6555 if (oi.is_omap()) {
6556 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6557 } // else return empty omap entries
6558 ::encode(out, osd_op.outdata);
6559 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6560 ctx->delta_stats.num_rd++;
6561 }
6562 break;
6563
6564 case CEPH_OSD_OP_OMAP_CMP:
6565 ++ctx->num_read;
6566 {
6567 if (!obs.exists || oi.is_whiteout()) {
6568 result = -ENOENT;
6569 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6570 break;
6571 }
6572 map<string, pair<bufferlist, int> > assertions;
6573 try {
6574 ::decode(assertions, bp);
6575 }
6576 catch (buffer::error& e) {
6577 result = -EINVAL;
6578 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6579 goto fail;
6580 }
6581 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6582
6583 map<string, bufferlist> out;
6584
6585 if (oi.is_omap()) {
6586 set<string> to_get;
6587 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6588 i != assertions.end();
6589 ++i)
6590 to_get.insert(i->first);
6591 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6592 to_get, &out);
6593 if (r < 0) {
6594 result = r;
6595 break;
6596 }
6597 } // else leave out empty
6598
6599 //Should set num_rd_kb based on encode length of map
6600 ctx->delta_stats.num_rd++;
6601
6602 int r = 0;
6603 bufferlist empty;
6604 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6605 i != assertions.end();
6606 ++i) {
6607 auto out_entry = out.find(i->first);
6608 bufferlist &bl = (out_entry != out.end()) ?
6609 out_entry->second : empty;
6610 switch (i->second.second) {
6611 case CEPH_OSD_CMPXATTR_OP_EQ:
6612 if (!(bl == i->second.first)) {
6613 r = -ECANCELED;
6614 }
6615 break;
6616 case CEPH_OSD_CMPXATTR_OP_LT:
6617 if (!(bl < i->second.first)) {
6618 r = -ECANCELED;
6619 }
6620 break;
6621 case CEPH_OSD_CMPXATTR_OP_GT:
6622 if (!(bl > i->second.first)) {
6623 r = -ECANCELED;
6624 }
6625 break;
6626 default:
6627 r = -EINVAL;
6628 break;
6629 }
6630 if (r < 0)
6631 break;
6632 }
6633 if (r < 0) {
6634 result = r;
6635 }
6636 }
6637 break;
6638
6639 // OMAP Write ops
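  // Client side, these correspond to librados write-op hooks; a minimal
  // sketch using the C++ API (object name and keys are illustrative):
  //
  //   librados::ObjectWriteOperation wop;
  //   std::map<std::string, librados::bufferlist> kv;
  //   kv["key"].append("value");
  //   wop.omap_set(kv);             // -> CEPH_OSD_OP_OMAPSETVALS
  //   wop.omap_rm_keys({"stale"});  // -> CEPH_OSD_OP_OMAPRMKEYS
  //   ioctx.operate("oid", &wop);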
6640 case CEPH_OSD_OP_OMAPSETVALS:
6641 if (!pool.info.supports_omap()) {
6642 result = -EOPNOTSUPP;
6643 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6644 break;
6645 }
6646 ++ctx->num_write;
6647 {
6648 maybe_create_new_object(ctx);
6649 bufferlist to_set_bl;
6650 try {
6651 decode_str_str_map_to_bl(bp, &to_set_bl);
6652 }
6653 catch (buffer::error& e) {
6654 result = -EINVAL;
6655 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6656 goto fail;
6657 }
6658 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6659 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6660 dout(20) << "setting vals: " << dendl;
6661 map<string,bufferlist> to_set;
6662 bufferlist::iterator pt = to_set_bl.begin();
6663 ::decode(to_set, pt);
6664 for (map<string, bufferlist>::iterator i = to_set.begin();
6665 i != to_set.end();
6666 ++i) {
6667 dout(20) << "\t" << i->first << dendl;
6668 }
6669 }
6670 t->omap_setkeys(soid, to_set_bl);
6671 ctx->delta_stats.num_wr++;
6672 }
6673 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6674 obs.oi.clear_omap_digest();
6675 break;
6676
6677 case CEPH_OSD_OP_OMAPSETHEADER:
6678 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6679 if (!pool.info.supports_omap()) {
6680 result = -EOPNOTSUPP;
6681 break;
6682 }
6683 ++ctx->num_write;
6684 {
6685 maybe_create_new_object(ctx);
6686 t->omap_setheader(soid, osd_op.indata);
6687 ctx->delta_stats.num_wr++;
6688 }
6689 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6690 obs.oi.clear_omap_digest();
6691 break;
6692
6693 case CEPH_OSD_OP_OMAPCLEAR:
6694 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6695 if (!pool.info.supports_omap()) {
6696 result = -EOPNOTSUPP;
6697 break;
6698 }
6699 ++ctx->num_write;
6700 {
6701 if (!obs.exists || oi.is_whiteout()) {
6702 result = -ENOENT;
6703 break;
6704 }
6705 if (oi.is_omap()) {
6706 t->omap_clear(soid);
6707 ctx->delta_stats.num_wr++;
6708 obs.oi.clear_omap_digest();
6709 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6710 }
6711 }
6712 break;
6713
6714 case CEPH_OSD_OP_OMAPRMKEYS:
6715 if (!pool.info.supports_omap()) {
6716 result = -EOPNOTSUPP;
6717 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6718 break;
6719 }
6720 ++ctx->num_write;
6721 {
6722 if (!obs.exists || oi.is_whiteout()) {
6723 result = -ENOENT;
6724 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6725 break;
6726 }
6727 bufferlist to_rm_bl;
6728 try {
6729 decode_str_set_to_bl(bp, &to_rm_bl);
6730 }
6731 catch (buffer::error& e) {
6732 result = -EINVAL;
6733 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6734 goto fail;
6735 }
6736 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6737 t->omap_rmkeys(soid, to_rm_bl);
6738 ctx->delta_stats.num_wr++;
6739 }
6740 obs.oi.clear_omap_digest();
6741 break;
6742
6743 case CEPH_OSD_OP_COPY_GET:
6744 ++ctx->num_read;
6745 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6746 soid.snap.val);
6747 if (op_finisher == nullptr) {
6748 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6749 } else {
6750 result = op_finisher->execute();
6751 }
6752 break;
6753
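  // COPY_FROM runs in two phases: the first pass starts an asynchronous
  // start_copy() and parks the op with -EINPROGRESS; when the copy
  // completes, the op is re-executed and the OpFinisher registered below
  // applies the buffered result.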
6754 case CEPH_OSD_OP_COPY_FROM:
6755 ++ctx->num_write;
6756 {
6757 object_t src_name;
6758 object_locator_t src_oloc;
6759 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6760 version_t src_version = op.copy_from.src_version;
6761 try {
6762 ::decode(src_name, bp);
6763 ::decode(src_oloc, bp);
6764 }
6765 catch (buffer::error& e) {
6766 result = -EINVAL;
6767 tracepoint(osd,
6768 do_osd_op_pre_copy_from,
6769 soid.oid.name.c_str(),
6770 soid.snap.val,
6771 "???",
6772 0,
6773 "???",
6774 "???",
6775 0,
6776 src_snapid,
6777 src_version);
6778 goto fail;
6779 }
6780 tracepoint(osd,
6781 do_osd_op_pre_copy_from,
6782 soid.oid.name.c_str(),
6783 soid.snap.val,
6784 src_name.name.c_str(),
6785 src_oloc.pool,
6786 src_oloc.key.c_str(),
6787 src_oloc.nspace.c_str(),
6788 src_oloc.hash,
6789 src_snapid,
6790 src_version);
6791 	if (op_finisher == nullptr) {
6792 // start
6793 pg_t raw_pg;
6794 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6795 hobject_t src(src_name, src_oloc.key, src_snapid,
6796 raw_pg.ps(), raw_pg.pool(),
6797 src_oloc.nspace);
6798 if (src == soid) {
6799 dout(20) << " copy from self is invalid" << dendl;
6800 result = -EINVAL;
6801 break;
6802 }
6803 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6804 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6805 new CopyFromFinisher(cb));
6806 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6807 op.copy_from.flags,
6808 false,
6809 op.copy_from.src_fadvise_flags,
6810 op.flags);
6811 result = -EINPROGRESS;
6812 } else {
6813 // finish
6814 result = op_finisher->execute();
6815 assert(result == 0);
6816
6817 // COPY_FROM cannot be executed multiple times -- it must restart
6818 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6819 }
6820 }
6821 break;
6822
6823 default:
6824 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6825 dout(1) << "unrecognized osd op " << op.op
6826 << " " << ceph_osd_op_name(op.op)
6827 << dendl;
6828 result = -EOPNOTSUPP;
6829 }
6830
6831 fail:
6832 osd_op.rval = result;
6833 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6834 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6835 result = 0;
6836
6837 if (result < 0)
6838 break;
6839 }
6840 return result;
6841}
6842
6843int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6844{
6845 if (ctx->new_obs.oi.size == 0) {
6846 	dout(20) << "unable to get tmap for zero-sized " << ctx->new_obs.oi.soid << dendl;
6847 return -ENODATA;
6848 }
6849 vector<OSDOp> nops(1);
6850 OSDOp &newop = nops[0];
6851 newop.op.op = CEPH_OSD_OP_TMAPGET;
6852 do_osd_ops(ctx, nops);
6853 try {
6854 bufferlist::iterator i = newop.outdata.begin();
6855 ::decode(*header, i);
6856 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6857 } catch (...) {
6858 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6859 << dendl;
6860 return -EINVAL;
6861 }
6862 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6863 << dendl;
6864 return 0;
6865}
6866
6867int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6868 const SnapSet& ss)
6869{
6870 // verify that all clones have been evicted
6871 dout(20) << __func__ << " verifying clones are absent "
6872 << ss << dendl;
6873 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6874 p != ss.clones.end();
6875 ++p) {
6876 hobject_t clone_oid = soid;
6877 clone_oid.snap = *p;
6878 if (is_missing_object(clone_oid))
6879 return -EBUSY;
6880 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6881 if (clone_obc && clone_obc->obs.exists) {
6882 dout(10) << __func__ << " cannot evict head before clone "
6883 << clone_oid << dendl;
6884 return -EBUSY;
6885 }
6886 if (copy_ops.count(clone_oid)) {
6887 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6888 << clone_oid << dendl;
6889 return -EBUSY;
6890 }
6891 }
6892 return 0;
6893}
6894
6895inline int PrimaryLogPG::_delete_oid(
6896 OpContext *ctx,
6897 bool no_whiteout, // no whiteouts, no matter what.
6898 bool try_no_whiteout) // try not to whiteout
6899{
6900 SnapSet& snapset = ctx->new_snapset;
6901 ObjectState& obs = ctx->new_obs;
6902 object_info_t& oi = obs.oi;
6903 const hobject_t& soid = oi.soid;
6904 PGTransaction* t = ctx->op_t.get();
6905
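  // In a cache tier a delete usually leaves a whiteout marker so reads do
  // not fall through to the still-existing object in the base pool.
  // no_whiteout forces a real delete (e.g. cache-evict), while
  // try_no_whiteout merely prefers one when no clones require a whiteout.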
6906 	// cache: set whiteout on delete?
6907 bool whiteout = false;
6908 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6909 && !no_whiteout
6910 && !try_no_whiteout) {
6911 whiteout = true;
6912 }
6913 bool legacy;
6914 	if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6915 legacy = false;
6916 // in luminous or later, we can't delete the head if there are
6917 // clones. we trust the caller passing no_whiteout has already
6918 // verified they don't exist.
6919 if (!snapset.clones.empty() ||
6920 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6921 if (no_whiteout) {
6922 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6923 << dendl;
6924 } else {
6925 dout(20) << __func__ << " has or will have clones; will whiteout"
6926 << dendl;
6927 whiteout = true;
6928 }
6929 }
6930 } else {
6931 	legacy = true;
6932 }
6933 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6934 << " no_whiteout=" << (int)no_whiteout
6935 << " try_no_whiteout=" << (int)try_no_whiteout
6936 << dendl;
6937 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6938 return -ENOENT;
6939
6940 t->remove(soid);
6941
6942 if (oi.size > 0) {
6943 interval_set<uint64_t> ch;
6944 ch.insert(0, oi.size);
6945 ctx->modified_ranges.union_of(ch);
6946 }
6947
6948 ctx->delta_stats.num_wr++;
6949 if (soid.is_snap()) {
6950 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6951 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6952 } else {
6953 ctx->delta_stats.num_bytes -= oi.size;
6954 }
6955 oi.size = 0;
6956 oi.new_object();
6957
6958 // disconnect all watchers
6959 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6960 oi.watchers.begin();
6961 p != oi.watchers.end();
6962 ++p) {
6963 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6964 ctx->watch_disconnects.push_back(
6965 watch_disconnect_t(p->first.first, p->first.second, true));
6966 }
6967 oi.watchers.clear();
6968
6969 if (whiteout) {
6970 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6971 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6972 ctx->delta_stats.num_whiteouts++;
6973 t->create(soid);
6974 osd->logger->inc(l_osd_tier_whiteout);
6975 return 0;
6976 }
6977
6978 // delete the head
6979 ctx->delta_stats.num_objects--;
6980 if (soid.is_snap())
6981 ctx->delta_stats.num_object_clones--;
6982 if (oi.is_whiteout()) {
6983 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6984 ctx->delta_stats.num_whiteouts--;
6985 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6986 }
6987 if (oi.is_cache_pinned()) {
6988 ctx->delta_stats.num_objects_pinned--;
6989 }
6990 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6991 snapset.head_exists = false;
6992 }
6993 obs.exists = false;
6994 return 0;
6995}
6996
6997int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6998{
6999 SnapSet& snapset = ctx->new_snapset;
7000 ObjectState& obs = ctx->new_obs;
7001 object_info_t& oi = obs.oi;
7002 const hobject_t& soid = oi.soid;
7003 PGTransaction* t = ctx->op_t.get();
7004 snapid_t snapid = (uint64_t)op.snap.snapid;
7005 hobject_t missing_oid;
7006
7007 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7008
7009 ObjectContextRef rollback_to;
7010 int ret = find_object_context(
7011 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7012 soid.get_namespace()),
7013 &rollback_to, false, false, &missing_oid);
7014 if (ret == -EAGAIN) {
7015 /* clone must be missing */
7016 assert(is_degraded_or_backfilling_object(missing_oid));
7017 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7018 	     << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7019 block_write_on_degraded_snap(missing_oid, ctx->op);
7020 return ret;
7021 }
7022 {
7023 ObjectContextRef promote_obc;
7024 cache_result_t tier_mode_result;
7025 if (obs.exists && obs.oi.has_manifest()) {
7026 tier_mode_result =
7027 maybe_handle_manifest_detail(
7028 ctx->op,
7029 true,
7030 rollback_to);
7031 } else {
7032 tier_mode_result =
7033 maybe_handle_cache_detail(
7034 ctx->op,
7035 true,
7036 rollback_to,
7037 ret,
7038 missing_oid,
7039 true,
7040 false,
7041 &promote_obc);
7042 }
7043 switch (tier_mode_result) {
7044 case cache_result_t::NOOP:
7045 break;
7046 case cache_result_t::BLOCKED_PROMOTE:
7047 assert(promote_obc);
7048 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7049 return -EAGAIN;
7050 case cache_result_t::BLOCKED_FULL:
7051 block_write_on_full_cache(soid, ctx->op);
7052 return -EAGAIN;
7053 case cache_result_t::REPLIED_WITH_EAGAIN:
7054 assert(0 == "this can't happen, no rollback on replica");
7055 default:
7056 assert(0 == "must promote was set, other values are not valid");
7057 return -EAGAIN;
7058 }
7059 }
7060
7061 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7062 // there's no snapshot here, or there's no object.
7063 // if there's no snapshot, we delete the object; otherwise, do nothing.
7064 dout(20) << "_rollback_to deleting head on " << soid.oid
7065 << " because got ENOENT|whiteout on find_object_context" << dendl;
7066 if (ctx->obc->obs.oi.watchers.size()) {
7067 // Cannot delete an object with watchers
7068 ret = -EBUSY;
7069 } else {
7070 _delete_oid(ctx, false, false);
7071 ret = 0;
7072 }
7073 } else if (ret) {
7074 // ummm....huh? It *can't* return anything else at time of writing.
7075 assert(0 == "unexpected error code in _rollback_to");
7076 } else { //we got our context, let's use it to do the rollback!
7077 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7078 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7079 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7080 	       << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7081 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7082 ret = -EAGAIN;
7083 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7084 // rolling back to the head; we just need to clone it.
7085 ctx->modify = true;
7086 } else {
7087 /* 1) Delete current head
7088 * 2) Clone correct snapshot into head
7089 * 3) Calculate clone_overlaps by following overlaps
7090 * forward from rollback snapshot */
7091 dout(10) << "_rollback_to deleting " << soid.oid
7092 << " and rolling back to old snap" << dendl;
7093
7094 if (obs.exists) {
7095 t->remove(soid);
7096 }
7097 t->clone(soid, rollback_to_sobject);
7098 snapset.head_exists = true;
7099 t->add_obc(rollback_to);
7100
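	// Intersecting clone_overlap from the rollback clone forward yields
	// the bytes that are identical between that clone and head; anything
	// outside the intersection has been modified since the snapshot and
	// is added to modified_ranges below.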
7101 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7102 snapset.clone_overlap.lower_bound(snapid);
7103 	assert(iter != snapset.clone_overlap.end());
7104 	interval_set<uint64_t> overlaps = iter->second;
7105 for ( ;
7106 iter != snapset.clone_overlap.end();
7107 ++iter)
7108 overlaps.intersection_of(iter->second);
7109
7110 if (obs.oi.size > 0) {
7111 interval_set<uint64_t> modified;
7112 modified.insert(0, obs.oi.size);
7113 overlaps.intersection_of(modified);
7114 modified.subtract(overlaps);
7115 ctx->modified_ranges.union_of(modified);
7116 }
7117
7118 // Adjust the cached objectcontext
7119 maybe_create_new_object(ctx, true);
7120 ctx->delta_stats.num_bytes -= obs.oi.size;
7121 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7122 obs.oi.size = rollback_to->obs.oi.size;
7123 if (rollback_to->obs.oi.is_data_digest())
7124 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7125 else
7126 obs.oi.clear_data_digest();
7127 if (rollback_to->obs.oi.is_omap_digest())
7128 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7129 else
7130 obs.oi.clear_omap_digest();
7131
7132 if (rollback_to->obs.oi.is_omap()) {
7133 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7134 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7135 } else {
7136 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7137 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7138 }
7139
7140 snapset.head_exists = true;
7141 }
7142 }
7143 return ret;
7144}
7145
7146void PrimaryLogPG::_make_clone(
7147 OpContext *ctx,
7148 PGTransaction* t,
7149 ObjectContextRef obc,
7150 const hobject_t& head, const hobject_t& coid,
7151 object_info_t *poi)
7152{
7153 bufferlist bv;
7154 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7155
7156 t->clone(coid, head);
7157 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7158 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7159}
7160
7161void PrimaryLogPG::make_writeable(OpContext *ctx)
7162{
7163 const hobject_t& soid = ctx->obs->oi.soid;
7164 SnapContext& snapc = ctx->snapc;
7165
7166 // clone?
7167 assert(soid.snap == CEPH_NOSNAP);
7168 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7169 << " snapc=" << snapc << dendl;
7170
7171 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7172 if (ctx->new_obs.exists) {
7173 // we will mark the object dirty
7174 if (ctx->undirty && was_dirty) {
7175 dout(20) << " clearing DIRTY flag" << dendl;
7176 assert(ctx->new_obs.oi.is_dirty());
7177 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7178 --ctx->delta_stats.num_objects_dirty;
7179 osd->logger->inc(l_osd_tier_clean);
7180 } else if (!was_dirty && !ctx->undirty) {
7181 dout(20) << " setting DIRTY flag" << dendl;
7182 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7183 ++ctx->delta_stats.num_objects_dirty;
7184 osd->logger->inc(l_osd_tier_dirty);
7185 }
7186 } else {
7187 if (was_dirty) {
7188 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7189 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7190 --ctx->delta_stats.num_objects_dirty;
7191 }
7192 }
7193
7194 if ((ctx->new_obs.exists &&
7195 ctx->new_obs.oi.is_omap()) &&
7196 (!ctx->obc->obs.exists ||
7197 !ctx->obc->obs.oi.is_omap())) {
7198 ++ctx->delta_stats.num_objects_omap;
7199 }
7200 if ((!ctx->new_obs.exists ||
7201 !ctx->new_obs.oi.is_omap()) &&
7202 (ctx->obc->obs.exists &&
7203 ctx->obc->obs.oi.is_omap())) {
7204 --ctx->delta_stats.num_objects_omap;
7205 }
7206
7207 // use newer snapc?
7208 if (ctx->new_snapset.seq > snapc.seq) {
7209 snapc.seq = ctx->new_snapset.seq;
7210 snapc.snaps = ctx->new_snapset.snaps;
7211 filter_snapc(snapc.snaps);
7212 dout(10) << " using newer snapc " << snapc << dendl;
7213 }
7214
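  // A clone is taken when the head predates the newest snapshot:
  // snapc.snaps[0] is the most recent snap, so snaps[0] > snapset.seq
  // means a snapshot was created since the last write and the current
  // head contents must be preserved as a clone before this mutation.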
7215 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7216 snapc.snaps.size() && // there are snaps
7217 !ctx->cache_evict &&
7218 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7219 // clone
7220 hobject_t coid = soid;
7221 coid.snap = snapc.seq;
7222
7223 unsigned l;
7224 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7225
7226 vector<snapid_t> snaps(l);
7227 for (unsigned i=0; i<l; i++)
7228 snaps[i] = snapc.snaps[i];
7229
7230 // prepare clone
7231 object_info_t static_snap_oi(coid);
7232 object_info_t *snap_oi;
7233 if (is_primary()) {
7234 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7235 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7236 ctx->clone_obc->obs.oi = static_snap_oi;
7237 ctx->clone_obc->obs.exists = true;
7238 ctx->clone_obc->ssc = ctx->obc->ssc;
7239 ctx->clone_obc->ssc->ref++;
7240 if (pool.info.require_rollback())
7241 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7242 snap_oi = &ctx->clone_obc->obs.oi;
7243 bool got = ctx->lock_manager.get_write_greedy(
7244 coid,
7245 ctx->clone_obc,
7246 ctx->op);
7247 assert(got);
7248 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7249 } else {
7250 snap_oi = &static_snap_oi;
7251 }
7252 snap_oi->version = ctx->at_version;
7253 snap_oi->prior_version = ctx->obs->oi.version;
7254 snap_oi->copy_user_bits(ctx->obs->oi);
7255
7256 bool legacy = ctx->new_snapset.is_legacy() ||
7257 	  get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7258 if (legacy) {
7259 snap_oi->legacy_snaps = snaps;
7260 }
7261
7262 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7263
7264 ctx->delta_stats.num_objects++;
7265 if (snap_oi->is_dirty()) {
7266 ctx->delta_stats.num_objects_dirty++;
7267 osd->logger->inc(l_osd_tier_dirty);
7268 }
7269 if (snap_oi->is_omap())
7270 ctx->delta_stats.num_objects_omap++;
7271 if (snap_oi->is_cache_pinned())
7272 ctx->delta_stats.num_objects_pinned++;
7273 ctx->delta_stats.num_object_clones++;
7274 ctx->new_snapset.clones.push_back(coid.snap);
7275 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7276 if (!legacy) {
7277 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7278 }
7279
7280 // clone_overlap should contain an entry for each clone
7281 // (an empty interval_set if there is no overlap)
7282 ctx->new_snapset.clone_overlap[coid.snap];
7283 if (ctx->obs->oi.size)
7284 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7285
7286 // log clone
7287 dout(10) << " cloning v " << ctx->obs->oi.version
7288 << " to " << coid << " v " << ctx->at_version
7289 << " snaps=" << snaps
7290 << " snapset=" << ctx->new_snapset << dendl;
7291 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7292 ctx->obs->oi.version,
7293 ctx->obs->oi.user_version,
7294 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7295 ::encode(snaps, ctx->log.back().snaps);
7296
7297 ctx->at_version.version++;
7298 }

  // update most recent clone_overlap and usage stats
  if (ctx->new_snapset.clones.size() > 0) {
    /* we need to check whether the most recent clone exists; if it has
     * been evicted, it's not included in the stats */
    hobject_t last_clone_oid = soid;
    last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
    if (is_present_clone(last_clone_oid)) {
      interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
      ctx->modified_ranges.intersection_of(newest_overlap);
      // modified_ranges is still in use by the clone
      add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
      newest_overlap.subtract(ctx->modified_ranges);
    }
  }

  // update snapset with latest snap context
  ctx->new_snapset.seq = snapc.seq;
  ctx->new_snapset.snaps = snapc.snaps;
  if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // pessimistic assumption that this is a net-new legacy SnapSet
    ctx->delta_stats.num_legacy_snapsets++;
    ctx->new_snapset.head_exists = ctx->new_obs.exists;
  } else if (ctx->new_snapset.is_legacy()) {
    ctx->new_snapset.head_exists = ctx->new_obs.exists;
  }
  dout(20) << "make_writeable " << soid
           << " done, snapset=" << ctx->new_snapset << dendl;
}
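
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the snap-selection
// rule used by make_writeable above.  Given the SnapContext's snap list
// (sorted in descending order) and the SnapSet's seq (the most recent snap
// already accounted for), the new clone records exactly the leading snaps
// that are newer than that seq.  The helper name is hypothetical.
#if 0
#include <cstdint>
#include <vector>

static std::vector<uint64_t> snaps_for_new_clone(
  const std::vector<uint64_t>& snapc_snaps,  // descending, e.g. {10, 9, 4}
  uint64_t snapset_seq)                      // e.g. 8
{
  // make_writeable only clones when snapc_snaps[0] > snapset_seq, so the
  // scan can start at index 1, mirroring the 'l' loop above
  size_t l = 1;
  while (l < snapc_snaps.size() && snapc_snaps[l] > snapset_seq)
    ++l;
  // with the example inputs this returns {10, 9}: snap 4 predates the clone
  return std::vector<uint64_t>(snapc_snaps.begin(), snapc_snaps.begin() + l);
}
#endif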


void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
                                               interval_set<uint64_t>& modified, uint64_t offset,
                                               uint64_t length, bool write_full)
{
  interval_set<uint64_t> ch;
  if (write_full) {
    if (oi.size)
      ch.insert(0, oi.size);
  } else if (length)
    ch.insert(offset, length);
  modified.union_of(ch);
  if (write_full || offset + length > oi.size) {
    uint64_t new_size = offset + length;
    delta_stats.num_bytes -= oi.size;
    delta_stats.num_bytes += new_size;
    oi.size = new_size;
  }
  delta_stats.num_wr++;
  delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
}
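
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the stat deltas
// computed by write_update_size_and_usage above, reduced to plain
// arithmetic.  num_bytes changes only when the write extends the object;
// num_wr_kb rounds the write length up to whole KiB, which is what
// SHIFT_ROUND_UP(length, 10) does.
#if 0
#include <cstdint>

struct WriteDelta {
  int64_t num_bytes;   // change in logical object bytes
  uint64_t num_wr_kb;  // KiB charged for this write
};

static WriteDelta write_delta(uint64_t old_size, uint64_t offset,
                              uint64_t length, bool write_full)
{
  WriteDelta d = {0, (length + 1023) >> 10};  // SHIFT_ROUND_UP(length, 10)
  if (write_full || offset + length > old_size)
    d.num_bytes = int64_t(offset + length) - int64_t(old_size);
  return d;
  // e.g. old_size=4096, offset=4096, length=512 -> num_bytes=+512, num_wr_kb=1
}
#endif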

void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
{
  for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
    delta_stats.num_bytes += p.get_len();
  }
}

void PrimaryLogPG::complete_disconnect_watches(
  ObjectContextRef obc,
  const list<watch_disconnect_t> &to_disconnect)
{
  for (list<watch_disconnect_t>::const_iterator i =
         to_disconnect.begin();
       i != to_disconnect.end();
       ++i) {
    pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
    auto watchers_entry = obc->watchers.find(watcher);
    if (watchers_entry != obc->watchers.end()) {
      WatchRef watch = watchers_entry->second;
      dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
      obc->watchers.erase(watcher);
      watch->remove(i->send_disconnect);
    } else {
      dout(10) << "do_osd_op_effects disconnect failed to find watcher "
               << watcher << dendl;
    }
  }
}

void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
{
  entity_name_t entity = ctx->reqid.name;
  dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;

  // disconnects first
  complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);

  assert(conn);

  boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
  if (!session.get())
    return;
  session->put();  // get_priv() takes a ref, and so does the intrusive_ptr

  for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
       i != ctx->watch_connects.end();
       ++i) {
    pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
    dout(15) << "do_osd_op_effects applying watch connect on session "
             << session.get() << " watcher " << watcher << dendl;
    WatchRef watch;
    if (ctx->obc->watchers.count(watcher)) {
      dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
               << dendl;
      watch = ctx->obc->watchers[watcher];
    } else {
      dout(15) << "do_osd_op_effects new watcher " << watcher
               << dendl;
      watch = Watch::makeWatchRef(
        this, osd, ctx->obc, i->first.timeout_seconds,
        i->first.cookie, entity, conn->get_peer_addr());
      ctx->obc->watchers.insert(
        make_pair(
          watcher,
          watch));
    }
    watch->connect(conn, i->second);
  }

  for (list<notify_info_t>::iterator p = ctx->notifies.begin();
       p != ctx->notifies.end();
       ++p) {
    dout(10) << "do_osd_op_effects, notify " << *p << dendl;
    ConnectionRef conn(ctx->op->get_req()->get_connection());
    NotifyRef notif(
      Notify::makeNotifyRef(
        conn,
        ctx->reqid.name.num(),
        p->bl,
        p->timeout,
        p->cookie,
        p->notify_id,
        ctx->obc->obs.oi.user_version,
        osd));
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
           ctx->obc->watchers.begin();
         i != ctx->obc->watchers.end();
         ++i) {
      dout(10) << "starting notify on watch " << i->first << dendl;
      i->second->start_notify(notif);
    }
    notif->init();
  }

  for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
       p != ctx->notify_acks.end();
       ++p) {
    if (p->watch_cookie)
      dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
    else
      dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
           ctx->obc->watchers.begin();
         i != ctx->obc->watchers.end();
         ++i) {
      if (i->first.second != entity) continue;
      if (p->watch_cookie &&
          p->watch_cookie.get() != i->first.first) continue;
      dout(10) << "acking notify on watch " << i->first << dendl;
      i->second->notify_ack(p->notify_id, p->reply_bl);
    }
  }
}
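
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): why obc->watchers
// is keyed by (cookie, entity) in do_osd_op_effects above.  A client that
// reconnects and re-sends a watch with the same cookie must land on its
// existing Watch rather than create a duplicate; a different client (or a
// different cookie) gets its own entry.  Types here are simplified stand-ins.
#if 0
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <utility>

struct ToyWatch { std::string peer_addr; };
using WatcherKey = std::pair<uint64_t, std::string>;  // (cookie, entity name)

static std::shared_ptr<ToyWatch> connect_watch(
  std::map<WatcherKey, std::shared_ptr<ToyWatch>>& watchers,
  uint64_t cookie, const std::string& entity, const std::string& addr)
{
  WatcherKey key(cookie, entity);
  auto it = watchers.find(key);
  if (it != watchers.end())
    return it->second;                       // existing watch: just reconnect
  auto w = std::make_shared<ToyWatch>(ToyWatch{addr});
  watchers.emplace(key, w);                  // genuinely new watcher
  return w;
}
#endif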

hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
{
  ostringstream ss;
  ss << "temp_" << info.pgid << "_" << get_role()
     << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
  hobject_t hoid = target.make_temp_hobject(ss.str());
  dout(20) << __func__ << " " << hoid << dendl;
  return hoid;
}

hobject_t PrimaryLogPG::get_temp_recovery_object(
  const hobject_t& target,
  eversion_t version)
{
  ostringstream ss;
  ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
     << "_" << version
     << "_" << info.history.same_interval_since
     << "_" << target.snap;
  // pgid + version + interval + snapid is unique, and short
  hobject_t hoid = target.make_temp_hobject(ss.str());
  dout(20) << __func__ << " " << hoid << dendl;
  return hoid;
}

int PrimaryLogPG::prepare_transaction(OpContext *ctx)
{
  assert(!ctx->ops->empty());

  const hobject_t& soid = ctx->obs->oi.soid;

  // valid snap context?
  if (!ctx->snapc.is_valid()) {
    dout(10) << " invalid snapc " << ctx->snapc << dendl;
    return -EINVAL;
  }

  // prepare the actual mutation
  int result = do_osd_ops(ctx, *ctx->ops);
  if (result < 0) {
    if (ctx->op->may_write() &&
        get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
      // need to save the error code in the pg log, to detect dup ops,
      // but do nothing else
      ctx->update_log_only = true;
    }
    return result;
  }

  // read-op?  write-op noop?  done?
  if (ctx->op_t->empty() && !ctx->modify) {
    unstable_stats.add(ctx->delta_stats);
    if (ctx->op->may_write() &&
        get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
      ctx->update_log_only = true;
    }
    return result;
  }

  // check for full
  if ((ctx->delta_stats.num_bytes > 0 ||
       ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
      (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
       get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
    const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
    if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
        m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
      dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
               << dendl;
    } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
      // they tried, they failed.
      dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
      return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
    } else {
      // drop request
      dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
      return -EAGAIN;
    }
  }

  // clone, if necessary
  if (soid.snap == CEPH_NOSNAP)
    make_writeable(ctx);

  finish_ctx(ctx,
             ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
             pg_log_entry_t::DELETE);

  return result;
}

void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(20) << __func__ << " " << soid << " " << ctx
           << " op " << pg_log_entry_t::get_op_name(log_op_type)
           << dendl;
  utime_t now = ceph_clock_now();

  // snapset
  bufferlist bss;

  if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
    ::encode(ctx->new_snapset, bss);
    assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
           !ctx->new_snapset.is_legacy());

    if (ctx->new_obs.exists) {
      if (!ctx->obs->exists) {
        if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
          hobject_t snapoid = soid.get_snapdir();
          dout(10) << " removing unneeded snapdir " << snapoid << dendl;
          ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
                                            ctx->at_version,
                                            ctx->snapset_obc->obs.oi.version,
                                            0, osd_reqid_t(), ctx->mtime, 0));
          ctx->op_t->remove(snapoid);

          ctx->at_version.version++;

          ctx->snapset_obc->obs.exists = false;
        }
      }
    } else if (!ctx->new_snapset.clones.empty() &&
               !ctx->cache_evict &&
               !ctx->new_snapset.head_exists &&
               (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
      // save snapset on _snap
      hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
                        info.pgid.pool(), soid.get_namespace());
      dout(10) << " final snapset " << ctx->new_snapset
               << " in " << snapoid << dendl;
      assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
      ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
                                        ctx->at_version,
                                        eversion_t(),
                                        0, osd_reqid_t(), ctx->mtime, 0));

      if (!ctx->snapset_obc)
        ctx->snapset_obc = get_object_context(snapoid, true);
      bool got = false;
      if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
        got = ctx->lock_manager.get_write_greedy(
          snapoid,
          ctx->snapset_obc,
          ctx->op);
      } else {
        assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
        got = ctx->lock_manager.get_lock_type(
          ObjectContext::RWState::RWEXCL,
          snapoid,
          ctx->snapset_obc,
          ctx->op);
      }
      assert(got);
      dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
      ctx->snapset_obc->obs.exists = true;
      ctx->snapset_obc->obs.oi.version = ctx->at_version;
      ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
      ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
      ctx->snapset_obc->obs.oi.local_mtime = now;

      map<string, bufferlist> attrs;
      bufferlist bv(sizeof(ctx->new_obs.oi));
      ::encode(ctx->snapset_obc->obs.oi, bv,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ctx->op_t->create(snapoid);
      attrs[OI_ATTR].claim(bv);
      attrs[SS_ATTR].claim(bss);
      setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
      ctx->at_version.version++;
    }
  }

  // finish and log the op.
  if (ctx->user_modify) {
    // update the user_version for any modify ops, except for the watch op
    ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
    /* In order for new clients and old clients to interoperate properly
     * when exchanging versions, we need to lower bound the user_version
     * (which our new clients pay proper attention to)
     * by the at_version (which is all the old clients can ever see). */
    if (ctx->at_version.version > ctx->user_at_version)
      ctx->user_at_version = ctx->at_version.version;
    ctx->new_obs.oi.user_version = ctx->user_at_version;
  }
  ctx->bytes_written = ctx->op_t->get_bytes_written();

  if (ctx->new_obs.exists) {
    // on the head object
    ctx->new_obs.oi.version = ctx->at_version;
    ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
    ctx->new_obs.oi.last_reqid = ctx->reqid;
    if (ctx->mtime != utime_t()) {
      ctx->new_obs.oi.mtime = ctx->mtime;
      dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
      ctx->new_obs.oi.local_mtime = now;
    } else {
      dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
    }

    map <string, bufferlist> attrs;
    bufferlist bv(sizeof(ctx->new_obs.oi));
    ::encode(ctx->new_obs.oi, bv,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR].claim(bv);

    if (soid.snap == CEPH_NOSNAP) {
      dout(10) << " final snapset " << ctx->new_snapset
               << " in " << soid << dendl;
      attrs[SS_ATTR].claim(bss);
    } else {
      dout(10) << " no snapset (this is a clone)" << dendl;
    }
    ctx->op_t->setattrs(soid, attrs);
  } else {
    ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
  }

  bool legacy_snapset = ctx->new_snapset.is_legacy() ||
    get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;

  // append to log
  ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
                                    ctx->obs->oi.version,
                                    ctx->user_at_version, ctx->reqid,
                                    ctx->mtime, 0));
  if (soid.snap < CEPH_NOSNAP) {
    switch (log_op_type) {
    case pg_log_entry_t::MODIFY:
    case pg_log_entry_t::PROMOTE:
    case pg_log_entry_t::CLEAN:
      if (legacy_snapset) {
        dout(20) << __func__ << " encoding legacy_snaps "
                 << ctx->new_obs.oi.legacy_snaps
                 << dendl;
        ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
      } else {
        dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
                 << dendl;
        ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
      }
      break;
    default:
      break;
    }
  }

  if (!ctx->extra_reqids.empty()) {
    dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
    ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
  }

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  if (soid.is_head() && !ctx->obc->obs.exists &&
      (!maintain_ssc || ctx->cache_evict)) {
    ctx->obc->ssc->exists = false;
    ctx->obc->ssc->snapset = SnapSet();
  } else {
    ctx->obc->ssc->exists = true;
    ctx->obc->ssc->snapset = ctx->new_snapset;
  }
}

void PrimaryLogPG::apply_stats(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats) {

  info.stats.stats.add(delta_stats);

  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    pg_info_t& pinfo = peer_info[bt];
    if (soid <= pinfo.last_backfill)
      pinfo.stats.stats.add(delta_stats);
    else if (soid <= last_backfill_started)
      pending_backfill_updates[soid].stats.add(delta_stats);
  }

  if (is_primary() && scrubber.active) {
    if (soid < scrubber.start) {
      dout(20) << __func__ << " " << soid << " < [" << scrubber.start
               << "," << scrubber.end << ")" << dendl;
      scrub_cstat.add(delta_stats);
    } else {
      dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
               << "," << scrubber.end << ")" << dendl;
    }
  }
}

void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
  assert(ctx->async_reads_complete());

  for (vector<OSDOp>::iterator p = ctx->ops->begin();
       p != ctx->ops->end() && result >= 0; ++p) {
    if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
      result = p->rval;
      break;
    }
    ctx->bytes_read += p->outdata.length();
  }
  ctx->reply->claim_op_out_data(*ctx->ops);
  ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);

  MOSDOpReply *reply = ctx->reply;
  ctx->reply = nullptr;

  if (result >= 0) {
    if (!ctx->ignore_log_op_stats) {
      log_op_stats(ctx);
      publish_stats_to_osd();
    }

    // on read, return the current object version
    if (ctx->obs) {
      reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
    } else {
      reply->set_reply_versions(eversion_t(), ctx->user_at_version);
    }
  } else if (result == -ENOENT) {
    // on ENOENT, set a floor for what the next user version will be.
    reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
  }

  reply->set_result(result);
  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  osd->send_message_osd_client(reply, m->get_connection());
  close_op_ctx(ctx);
}

// ========================================================================
// copyfrom

struct C_Copyfrom : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;
  ceph_tid_t tid;
  PrimaryLogPG::CopyOpRef cop;
  C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
             const PrimaryLogPG::CopyOpRef& c)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), cop(c)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    pg->lock();
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->process_copy_chunk(oid, tid, r);
    }
    pg->unlock();
  }
};

struct C_CopyFrom_AsyncReadCb : public Context {
  OSDOp *osd_op;
  object_copy_data_t reply_obj;
  uint64_t features;
  size_t len;
  C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
    osd_op(osd_op), features(features), len(0) {}
  void finish(int r) override {
    osd_op->rval = r;
    if (r < 0) {
      return;
    }

    assert(len > 0);
    assert(len <= reply_obj.data.length());
    bufferlist bl;
    bl.substr_of(reply_obj.data, 0, len);
    reply_obj.data.swap(bl);
    ::encode(reply_obj, osd_op->outdata, features);
  }
};

int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
                              OSDOp& osd_op, ObjectContextRef &obc)
{
  object_info_t& oi = obc->obs.oi;
  hobject_t& soid = oi.soid;
  int result = 0;
  object_copy_cursor_t cursor;
  uint64_t out_max;
  bool skip_data_digest =
    (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
    g_conf->osd_distrust_data_digest;

  try {
    ::decode(cursor, bp);
    ::decode(out_max, bp);
  }
  catch (buffer::error& e) {
    result = -EINVAL;
    return result;
  }

  const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
  uint64_t features = op->get_features();

  bool async_read_started = false;
  object_copy_data_t _reply_obj;
  C_CopyFrom_AsyncReadCb *cb = NULL;
  if (pool.info.require_rollback()) {
    cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
  }
  object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
  // size, mtime
  reply_obj.size = oi.size;
  reply_obj.mtime = oi.mtime;
  assert(obc->ssc);
  if (soid.snap < CEPH_NOSNAP) {
    if (obc->ssc->snapset.is_legacy()) {
      reply_obj.snaps = oi.legacy_snaps;
    } else {
      auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
      assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
      reply_obj.snaps = p->second;
    }
  } else {
    reply_obj.snap_seq = obc->ssc->snapset.seq;
  }
  if (!skip_data_digest && oi.is_data_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
    reply_obj.data_digest = oi.data_digest;
  }
  if (oi.is_omap_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
    reply_obj.omap_digest = oi.omap_digest;
  }
  reply_obj.truncate_seq = oi.truncate_seq;
  reply_obj.truncate_size = oi.truncate_size;

  // attrs
  map<string,bufferlist>& out_attrs = reply_obj.attrs;
  if (!cursor.attr_complete) {
    result = getattrs_maybe_cache(
      ctx->obc,
      &out_attrs);
    if (result < 0) {
      if (cb) {
        delete cb;
      }
      return result;
    }
    cursor.attr_complete = true;
    dout(20) << " got attrs" << dendl;
  }

  int64_t left = out_max - osd_op.outdata.length();

  // data
  bufferlist& bl = reply_obj.data;
  if (left > 0 && !cursor.data_complete) {
    if (cursor.data_offset < oi.size) {
      uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
      if (cb) {
        async_read_started = true;
        ctx->pending_async_reads.push_back(
          make_pair(
            boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
            make_pair(&bl, cb)));
        cb->len = max_read;

        ctx->op_finishers[ctx->current_osd_subop_num].reset(
          new ReadFinisher(osd_op));
        result = -EINPROGRESS;

        dout(10) << __func__ << ": async_read noted for " << soid << dendl;
      } else {
        result = pgbackend->objects_read_sync(
          oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
        if (result < 0)
          return result;
      }
      left -= max_read;
      cursor.data_offset += max_read;
    }
    if (cursor.data_offset == oi.size) {
      cursor.data_complete = true;
      dout(20) << " got data" << dendl;
    }
    assert(cursor.data_offset <= oi.size);
  }

  // omap
  uint32_t omap_keys = 0;
  if (!pool.info.supports_omap() || !oi.is_omap()) {
    cursor.omap_complete = true;
  } else {
    if (left > 0 && !cursor.omap_complete) {
      assert(cursor.data_complete);
      if (cursor.omap_offset.empty()) {
        osd->store->omap_get_header(ch, ghobject_t(oi.soid),
                                    &reply_obj.omap_header);
      }
      bufferlist omap_data;
      ObjectMap::ObjectMapIterator iter =
        osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
      assert(iter);
      iter->upper_bound(cursor.omap_offset);
      for (; iter->valid(); iter->next(false)) {
        ++omap_keys;
        ::encode(iter->key(), omap_data);
        ::encode(iter->value(), omap_data);
        // charge the 32-bit length prefix that ::encode adds to each key
        // and value against the budget, in addition to the payload bytes
        left -= iter->key().length() + 4 + iter->value().length() + 4;
        if (left <= 0)
          break;
      }
      if (omap_keys) {
        ::encode(omap_keys, reply_obj.omap_data);
        reply_obj.omap_data.claim_append(omap_data);
      }
      if (iter->valid()) {
        cursor.omap_offset = iter->key();
      } else {
        cursor.omap_complete = true;
        dout(20) << " got omap" << dendl;
      }
    }
  }

  if (cursor.is_complete()) {
    // include reqids only in the final step.  this is a bit fragile
    // but it works...
    pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
    dout(20) << " got reqids" << dendl;
  }

  dout(20) << " cursor.is_complete=" << cursor.is_complete()
           << " " << out_attrs.size() << " attrs"
           << " " << bl.length() << " bytes"
           << " " << reply_obj.omap_header.length() << " omap header bytes"
           << " " << reply_obj.omap_data.length() << " omap data bytes in "
           << omap_keys << " keys"
           << " " << reply_obj.reqids.size() << " reqids"
           << dendl;
  reply_obj.cursor = cursor;
  if (!async_read_started) {
    ::encode(reply_obj, osd_op.outdata, features);
  }
  if (cb && !async_read_started) {
    delete cb;
  }

  if (result > 0) {
    result = 0;
  }
  return result;
}
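
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the cursor
// protocol implemented by do_copy_get above.  Each call consumes a byte
// budget (out_max) and advances through three stages -- attrs, then data,
// then omap -- returning an updated cursor that the peer echoes back
// until is_complete().  This toy version copies only a byte payload.
#if 0
#include <algorithm>
#include <cstdint>
#include <vector>

struct ToyCursor {
  bool attr_complete = false;
  uint64_t data_offset = 0;
  bool data_complete = false;
  bool omap_complete = false;
  bool is_complete() const {
    return attr_complete && data_complete && omap_complete;
  }
};

// one round trip: fill 'out' with up to out_max bytes, advance the cursor
static void toy_copy_get(const std::vector<char>& src, ToyCursor& cur,
                         uint64_t out_max, std::vector<char>& out)
{
  cur.attr_complete = true;                  // stage 1: attrs always fit here
  uint64_t left = out_max;
  if (left > 0 && !cur.data_complete) {      // stage 2: data, budgeted
    uint64_t n = std::min<uint64_t>(src.size() - cur.data_offset, left);
    out.insert(out.end(), src.begin() + cur.data_offset,
               src.begin() + cur.data_offset + n);
    cur.data_offset += n;
    if (cur.data_offset == src.size())
      cur.data_complete = true;
  }
  if (cur.data_complete)                     // stage 3: omap (empty in the toy)
    cur.omap_complete = true;
}
#endif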

void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
                                          OSDOp& osd_op)
{
  // NOTE: we take non-const ref here for claim_op_out_data below; we must
  // be careful not to modify anything else that will upset a racing
  // operator<<
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  uint64_t features = m->get_features();
  object_copy_data_t reply_obj;

  pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
  dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
  ::encode(reply_obj, osd_op.outdata, features);
  osd_op.rval = -ENOENT;
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
  reply->claim_op_out_data(m->ops);
  reply->set_result(-ENOENT);
  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  osd->send_message_osd_client(reply, m->get_connection());
}

void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
                              hobject_t src, object_locator_t oloc,
                              version_t version, unsigned flags,
                              bool mirror_snapset,
                              unsigned src_obj_fadvise_flags,
                              unsigned dest_obj_fadvise_flags)
{
  const hobject_t& dest = obc->obs.oi.soid;
  dout(10) << __func__ << " " << dest
           << " from " << src << " " << oloc << " v" << version
           << " flags " << flags
           << (mirror_snapset ? " mirror_snapset" : "")
           << dendl;

  assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
                             src.snap == CEPH_SNAPDIR));

  // cancel a previous in-progress copy?
  if (copy_ops.count(dest)) {
    // FIXME: if the src etc match, we could avoid restarting from the
    // beginning.
    CopyOpRef cop = copy_ops[dest];
    vector<ceph_tid_t> tids;
    cancel_copy(cop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
                                         mirror_snapset, src_obj_fadvise_flags,
                                         dest_obj_fadvise_flags));
  copy_ops[dest] = cop;
  obc->start_block();

  _copy_some(obc, cop);
}

void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
  dout(10) << __func__ << " " << obc << " " << cop << dendl;

  unsigned flags = 0;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
    flags |= CEPH_OSD_FLAG_FLUSH;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
    flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
    flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
    flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
    flags |= CEPH_OSD_FLAG_RWORDERED;

  C_GatherBuilder gather(cct);

  if (cop->cursor.is_initial() && cop->mirror_snapset) {
    // list snaps too.
    assert(cop->src.snap == CEPH_NOSNAP);
    ObjectOperation op;
    op.list_snaps(&cop->results.snapset, NULL);
    ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
                                         CEPH_SNAPDIR, NULL,
                                         flags, gather.new_sub(), NULL);
    cop->objecter_tid2 = tid;
  }

  ObjectOperation op;
  if (cop->results.user_version) {
    op.assert_version(cop->results.user_version);
  } else {
    // we should learn the version after the first chunk, if we didn't know
    // it already!
    assert(cop->cursor.is_initial());
  }
  op.copy_get(&cop->cursor, get_copy_chunk_size(),
              &cop->results.object_size, &cop->results.mtime,
              &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
              &cop->results.snaps, &cop->results.snap_seq,
              &cop->results.flags,
              &cop->results.source_data_digest,
              &cop->results.source_omap_digest,
              &cop->results.reqids,
              &cop->results.truncate_seq,
              &cop->results.truncate_size,
              &cop->rval);
  op.set_last_op_flags(cop->src_obj_fadvise_flags);

  C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
                                   get_last_peering_reset(), cop);
  gather.set_finisher(new C_OnFinisher(fin,
                                       &osd->objecter_finisher));

  ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
                                       cop->src.snap, NULL,
                                       flags,
                                       gather.new_sub(),
                                       // discover the object version if we don't know it yet
                                       cop->results.user_version ? NULL : &cop->results.user_version);
  fin->tid = tid;
  cop->objecter_tid = tid;
  gather.activate();
}

void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
{
  vector<ceph_tid_t> tids;
  dout(10) << __func__ << " " << oid << " tid " << tid
           << " " << cpp_strerror(r) << dendl;
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
  if (p == copy_ops.end()) {
    dout(10) << __func__ << " no copy_op found" << dendl;
    return;
  }
  CopyOpRef cop = p->second;
  if (tid != cop->objecter_tid) {
    dout(10) << __func__ << " tid " << tid << " != cop " << cop
             << " tid " << cop->objecter_tid << dendl;
    return;
  }

  if (cop->omap_data.length() || cop->omap_header.length())
    cop->results.has_omap = true;

  if (r >= 0 && !pool.info.supports_omap() &&
      (cop->omap_data.length() || cop->omap_header.length())) {
    r = -EOPNOTSUPP;
  }
  cop->objecter_tid = 0;
  cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
  ObjectContextRef& cobc = cop->obc;

  if (r < 0)
    goto out;

  assert(cop->rval >= 0);

  if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
    // verify snap hasn't been deleted
    vector<snapid_t>::iterator p = cop->results.snaps.begin();
    while (p != cop->results.snaps.end()) {
      if (pool.info.is_removed_snap(*p)) {
        dout(10) << __func__ << " clone snap " << *p << " has been deleted"
                 << dendl;
        // erase *p in place: shift the tail left one slot, then shrink
        for (vector<snapid_t>::iterator q = p + 1;
             q != cop->results.snaps.end();
             ++q)
          *(q - 1) = *q;
        cop->results.snaps.resize(cop->results.snaps.size() - 1);
      } else {
        ++p;
      }
    }
    if (cop->results.snaps.empty()) {
      dout(10) << __func__ << " no more snaps for " << oid << dendl;
      r = -ENOENT;
      goto out;
    }
  }

  assert(cop->rval >= 0);

  if (!cop->temp_cursor.data_complete) {
    cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
  }
  if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
    if (cop->omap_header.length()) {
      cop->results.omap_digest =
        cop->omap_header.crc32c(cop->results.omap_digest);
    }
    if (cop->omap_data.length()) {
      bufferlist keys;
      keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
      cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
    }
  }

  if (!cop->temp_cursor.attr_complete) {
    for (map<string,bufferlist>::iterator p = cop->attrs.begin();
         p != cop->attrs.end();
         ++p) {
      cop->results.attrs[string("_") + p->first] = p->second;
    }
    cop->attrs.clear();
  }

  if (!cop->cursor.is_complete()) {
    // write out what we have so far
    if (cop->temp_cursor.is_initial()) {
      assert(!cop->results.started_temp_obj);
      cop->results.started_temp_obj = true;
      cop->results.temp_oid = generate_temp_object(oid);
      dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
    }
    ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    if (cop->temp_cursor.is_initial()) {
      ctx->new_temp_oid = cop->results.temp_oid;
    }
    _write_copy_chunk(cop, ctx->op_t.get());
    simple_opc_submit(std::move(ctx));
    dout(10) << __func__ << " fetching more" << dendl;
    _copy_some(cobc, cop);
    return;
  }

  // verify digests?
  if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
    dout(20) << __func__ << std::hex
             << " got digest: rx data 0x" << cop->results.data_digest
             << " omap 0x" << cop->results.omap_digest
             << ", source: data 0x" << cop->results.source_data_digest
             << " omap 0x" << cop->results.source_omap_digest
             << std::dec
             << " flags " << cop->results.flags
             << dendl;
  }
  if (cop->results.is_data_digest() &&
      cop->results.data_digest != cop->results.source_data_digest) {
    derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
         << " != source 0x" << cop->results.source_data_digest << std::dec
         << dendl;
    osd->clog->error() << info.pgid << " copy from " << cop->src
                       << " to " << cop->obc->obs.oi.soid << std::hex
                       << " data digest 0x" << cop->results.data_digest
                       << " != source 0x" << cop->results.source_data_digest
                       << std::dec;
    r = -EIO;
    goto out;
  }
  if (cop->results.is_omap_digest() &&
      cop->results.omap_digest != cop->results.source_omap_digest) {
    derr << __func__ << std::hex
         << " omap digest 0x" << cop->results.omap_digest
         << " != source 0x" << cop->results.source_omap_digest
         << std::dec << dendl;
    osd->clog->error() << info.pgid << " copy from " << cop->src
                       << " to " << cop->obc->obs.oi.soid << std::hex
                       << " omap digest 0x" << cop->results.omap_digest
                       << " != source 0x" << cop->results.source_omap_digest
                       << std::dec;
    r = -EIO;
    goto out;
  }
  if (cct->_conf->osd_debug_inject_copyfrom_error) {
    derr << __func__ << " injecting copyfrom failure" << dendl;
    r = -EIO;
    goto out;
  }

  cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
    [this, &cop /* avoid ref cycle */](PGTransaction *t) {
      ObjectState& obs = cop->obc->obs;
      if (cop->temp_cursor.is_initial()) {
        dout(20) << "fill_in_final_tx: writing "
                 << "directly to final object" << dendl;
        // write directly to final object
        cop->results.temp_oid = obs.oi.soid;
        _write_copy_chunk(cop, t);
      } else {
        // finish writing to temp object, then move into place
        dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
        _write_copy_chunk(cop, t);
        t->rename(obs.oi.soid, cop->results.temp_oid);
      }
      t->setattrs(obs.oi.soid, cop->results.attrs);
    });

  dout(20) << __func__ << " success; committing" << dendl;

 out:
  dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
  CopyCallbackResults results(r, &cop->results);
  cop->cb->complete(results);

  copy_ops.erase(cobc->obs.oi.soid);
  cobc->stop_block();

  if (r < 0 && cop->results.started_temp_obj) {
    dout(10) << __func__ << " deleting partial temp object "
             << cop->results.temp_oid << dendl;
    ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    ctx->op_t->remove(cop->results.temp_oid);
    ctx->discard_temp_oid = cop->results.temp_oid;
    simple_opc_submit(std::move(ctx));
  }

  // cancel and requeue proxy ops on this object
  if (!r) {
    for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
         it != proxyread_ops.end();) {
      if (it->second->soid == cobc->obs.oi.soid) {
        cancel_proxy_read((it++)->second, &tids);
      } else {
        ++it;
      }
    }
    for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
         it != proxywrite_ops.end();) {
      if (it->second->soid == cobc->obs.oi.soid) {
        cancel_proxy_write((it++)->second, &tids);
      } else {
        ++it;
      }
    }
    osd->objecter->op_cancel(tids, -ECANCELED);
    kick_proxy_ops_blocked(cobc->obs.oi.soid);
  }

  kick_object_context_blocked(cobc);
}
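
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the incremental
// digest pattern used in process_copy_chunk above, where each chunk's
// checksum call is seeded with the running result
// (e.g. data_digest = chunk.crc32c(data_digest)).  The toy hash below is
// FNV-1a, chosen only because its running state makes the chaining
// property obvious; Ceph itself uses crc32c with the same seeding idiom.
#if 0
#include <cstddef>
#include <cstdint>

static uint64_t fnv1a(const void *data, size_t len,
                      uint64_t seed = 1469598103934665603ULL)  // FNV offset basis
{
  const unsigned char *p = static_cast<const unsigned char *>(data);
  uint64_t h = seed;
  for (size_t i = 0; i < len; ++i) {
    h ^= p[i];
    h *= 1099511628211ULL;  // FNV prime
  }
  return h;
}

// Hashing chunk-by-chunk, feeding each result in as the next seed, yields
// the same digest as hashing the whole buffer at once:
//   uint64_t d = fnv1a("hello", 5);
//   d = fnv1a(" world", 6, d);        // == fnv1a("hello world", 11)
#endif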

void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
  vector<ceph_tid_t> tids;
  for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
       it != proxyread_ops.end();) {
    if (it->second->soid == oid) {
      cancel_proxy_read((it++)->second, &tids);
    } else {
      ++it;
    }
  }
  for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
       it != proxywrite_ops.end();) {
    if (it->second->soid == oid) {
      cancel_proxy_write((it++)->second, &tids);
    } else {
      ++it;
    }
  }
  osd->objecter->op_cancel(tids, -ECANCELED);
  kick_proxy_ops_blocked(oid);
}

void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
{
  dout(20) << __func__ << " " << cop
           << " " << cop->attrs.size() << " attrs"
           << " " << cop->data.length() << " bytes"
           << " " << cop->omap_header.length() << " omap header bytes"
           << " " << cop->omap_data.length() << " omap data bytes"
           << dendl;
  if (!cop->temp_cursor.attr_complete) {
    t->create(cop->results.temp_oid);
  }
  if (!cop->temp_cursor.data_complete) {
    assert(cop->data.length() + cop->temp_cursor.data_offset ==
           cop->cursor.data_offset);
    if (pool.info.requires_aligned_append() &&
        !cop->cursor.data_complete) {
      /**
       * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
       * to pick it up on the next pass.
       */
      assert(cop->temp_cursor.data_offset %
             pool.info.required_alignment() == 0);
      if (cop->data.length() % pool.info.required_alignment() != 0) {
        uint64_t to_trim =
          cop->data.length() % pool.info.required_alignment();
        bufferlist bl;
        bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
        cop->data.swap(bl);
        cop->cursor.data_offset -= to_trim;
        assert(cop->data.length() + cop->temp_cursor.data_offset ==
               cop->cursor.data_offset);
      }
    }
    if (cop->data.length()) {
      t->write(
        cop->results.temp_oid,
        cop->temp_cursor.data_offset,
        cop->data.length(),
        cop->data,
        cop->dest_obj_fadvise_flags);
    }
    cop->data.clear();
  }
  if (pool.info.supports_omap()) {
    if (!cop->temp_cursor.omap_complete) {
      if (cop->omap_header.length()) {
        t->omap_setheader(
          cop->results.temp_oid,
          cop->omap_header);
        cop->omap_header.clear();
      }
      if (cop->omap_data.length()) {
        map<string,bufferlist> omap;
        bufferlist::iterator p = cop->omap_data.begin();
        ::decode(omap, p);
        t->omap_setkeys(cop->results.temp_oid, omap);
        cop->omap_data.clear();
      }
    }
  } else {
    assert(cop->omap_header.length() == 0);
    assert(cop->omap_data.length() == 0);
  }
  cop->temp_cursor = cop->cursor;
}
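
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the tail-trimming
// arithmetic in _write_copy_chunk above.  For pools that require aligned
// appends, any unaligned tail of the fetched chunk is dropped and the
// cursor rewound, so the next copy-get re-reads those bytes.
#if 0
#include <cassert>
#include <cstdint>

struct TrimResult {
  uint64_t keep;    // bytes of this chunk to write now
  uint64_t rewind;  // bytes to subtract from cursor.data_offset
};

static TrimResult trim_for_alignment(uint64_t chunk_len, uint64_t alignment)
{
  assert(alignment > 0);
  uint64_t to_trim = chunk_len % alignment;
  return TrimResult{chunk_len - to_trim, to_trim};
  // e.g. chunk_len=10000, alignment=4096 -> keep 8192, re-fetch 1808 later
}
#endif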

void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
{
  OpContext *ctx = cb->ctx;
  dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;

  ObjectState& obs = ctx->new_obs;
  if (obs.exists) {
    dout(20) << __func__ << ": exists, removing" << dendl;
    ctx->op_t->remove(obs.oi.soid);
  } else {
    ctx->delta_stats.num_objects++;
    obs.exists = true;
  }
  if (cb->is_temp_obj_used()) {
    ctx->discard_temp_oid = cb->results->temp_oid;
  }
  cb->results->fill_in_final_tx(ctx->op_t.get());

  // CopyFromCallback fills this in for us
  obs.oi.user_version = ctx->user_at_version;

  if (cb->results->is_data_digest()) {
    obs.oi.set_data_digest(cb->results->data_digest);
  } else {
    obs.oi.clear_data_digest();
  }
  if (cb->results->is_omap_digest()) {
    obs.oi.set_omap_digest(cb->results->omap_digest);
  } else {
    obs.oi.clear_omap_digest();
  }

  obs.oi.truncate_seq = cb->results->truncate_seq;
  obs.oi.truncate_size = cb->results->truncate_size;

  ctx->extra_reqids = cb->results->reqids;

  // cache: clear whiteout?
  if (obs.oi.is_whiteout()) {
    dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
    --ctx->delta_stats.num_whiteouts;
  }

  if (cb->results->has_omap) {
    dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
    obs.oi.set_flag(object_info_t::FLAG_OMAP);
  } else {
    dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_OMAP);
  }

  interval_set<uint64_t> ch;
  if (obs.oi.size > 0)
    ch.insert(0, obs.oi.size);
  ctx->modified_ranges.union_of(ch);

  if (cb->get_data_size() != obs.oi.size) {
    ctx->delta_stats.num_bytes -= obs.oi.size;
    obs.oi.size = cb->get_data_size();
    ctx->delta_stats.num_bytes += obs.oi.size;
  }
  ctx->delta_stats.num_wr++;
  ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);

  osd->logger->inc(l_osd_copyfrom);
}

void PrimaryLogPG::finish_promote(int r, CopyResults *results,
                                  ObjectContextRef obc)
{
  const hobject_t& soid = obc->obs.oi.soid;
  dout(10) << __func__ << " " << soid << " r=" << r
           << " uv" << results->user_version << dendl;

  if (r == -ECANCELED) {
    return;
  }

  if (r != -ENOENT && soid.is_snap()) {
    if (results->snaps.empty()) {
      // we must have read "snap" content from the head object in
      // the base pool.  use snap_seq to construct what snaps should
      // be for this clone (what it was before we evicted the clean
      // clone from this pool, and what it will be when we flush and
      // the clone eventually happens in the base pool).
      SnapSet& snapset = obc->ssc->snapset;
      vector<snapid_t>::iterator p = snapset.snaps.begin();
      while (p != snapset.snaps.end() && *p > soid.snap)
        ++p;
      while (p != snapset.snaps.end() && *p > results->snap_seq) {
        results->snaps.push_back(*p);
        ++p;
      }
    }

    dout(20) << __func__ << " snaps " << results->snaps << dendl;
    filter_snapc(results->snaps);

    dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
    if (results->snaps.empty()) {
      dout(20) << __func__
               << " snaps are empty, clone is invalid,"
               << " setting r to ENOENT" << dendl;
      r = -ENOENT;
    }
  }

  if (r < 0 && results->started_temp_obj) {
    dout(10) << __func__ << " abort; will clean up partial work" << dendl;
    ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
    assert(tempobc);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    ctx->op_t->remove(results->temp_oid);
    simple_opc_submit(std::move(ctx));
    results->started_temp_obj = false;
  }

  if (r == -ENOENT && soid.is_snap()) {
    dout(10) << __func__
             << ": enoent while trying to promote clone, " << soid
             << " must have been trimmed, removing from snapset"
             << dendl;
    hobject_t head(soid.get_head());
    ObjectContextRef obc = get_object_context(head, false);
    assert(obc);

    OpContextUPtr tctx = simple_opc_create(obc);
    tctx->at_version = get_next_version();
    filter_snapc(tctx->new_snapset.snaps);
    vector<snapid_t> new_clones;
    map<snapid_t, vector<snapid_t>> new_clone_snaps;
    for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
         i != tctx->new_snapset.clones.end();
         ++i) {
      if (*i != soid.snap) {
        new_clones.push_back(*i);
        auto p = tctx->new_snapset.clone_snaps.find(*i);
        if (p != tctx->new_snapset.clone_snaps.end()) {
          new_clone_snaps[*i] = p->second;
        }
      }
    }
    tctx->new_snapset.clones.swap(new_clones);
    tctx->new_snapset.clone_overlap.erase(soid.snap);
    tctx->new_snapset.clone_size.erase(soid.snap);
    tctx->new_snapset.clone_snaps.swap(new_clone_snaps);

    // take RWWRITE lock for duration of our local write.  ignore starvation.
    if (!tctx->lock_manager.take_write_lock(
          head,
          obc)) {
      assert(0 == "problem!");
    }
    dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;

    finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);

    simple_opc_submit(std::move(tctx));
    return;
  }

  bool whiteout = false;
  if (r == -ENOENT) {
    assert(soid.snap == CEPH_NOSNAP);  // snap case is above
    dout(10) << __func__ << " whiteout " << soid << dendl;
    whiteout = true;
  }

  if (r < 0 && !whiteout) {
    derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
    // pass error to everyone blocked on this object
    // FIXME: this is pretty sloppy, but at this point we got
    // something unexpected and don't have many other options.
    map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
      waiting_for_blocked_object.find(soid);
    if (blocked_iter != waiting_for_blocked_object.end()) {
      while (!blocked_iter->second.empty()) {
        osd->reply_op_error(blocked_iter->second.front(), r);
        blocked_iter->second.pop_front();
      }
      waiting_for_blocked_object.erase(blocked_iter);
    }
    return;
  }

  osd->promote_finish(results->object_size);

  OpContextUPtr tctx = simple_opc_create(obc);
  tctx->at_version = get_next_version();

  ++tctx->delta_stats.num_objects;
  if (soid.snap < CEPH_NOSNAP)
    ++tctx->delta_stats.num_object_clones;
  tctx->new_obs.exists = true;

  tctx->extra_reqids = results->reqids;

  bool legacy_snapset = tctx->new_snapset.is_legacy() ||
    get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;

  if (whiteout) {
    // create a whiteout
    tctx->op_t->create(soid);
    tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
    ++tctx->delta_stats.num_whiteouts;
    dout(20) << __func__ << " creating whiteout on " << soid << dendl;
    osd->logger->inc(l_osd_tier_whiteout);
  } else {
    if (results->has_omap) {
      dout(10) << __func__ << " setting omap flag on " << soid << dendl;
      tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
      ++tctx->delta_stats.num_objects_omap;
    }

    results->fill_in_final_tx(tctx->op_t.get());
    if (results->started_temp_obj) {
      tctx->discard_temp_oid = results->temp_oid;
    }
    tctx->new_obs.oi.size = results->object_size;
    tctx->new_obs.oi.user_version = results->user_version;
    if (results->is_data_digest()) {
      tctx->new_obs.oi.set_data_digest(results->data_digest);
    } else {
      tctx->new_obs.oi.clear_data_digest();
    }
    if (results->is_omap_digest()) {
      tctx->new_obs.oi.set_omap_digest(results->omap_digest);
    } else {
      tctx->new_obs.oi.clear_omap_digest();
    }
    tctx->new_obs.oi.truncate_seq = results->truncate_seq;
    tctx->new_obs.oi.truncate_size = results->truncate_size;

    if (soid.snap != CEPH_NOSNAP) {
      if (legacy_snapset) {
        tctx->new_obs.oi.legacy_snaps = results->snaps;
        assert(!tctx->new_obs.oi.legacy_snaps.empty());
      } else {
        // it's already in the snapset
        assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
      }
      assert(obc->ssc->snapset.clone_size.count(soid.snap));
      assert(obc->ssc->snapset.clone_size[soid.snap] ==
             results->object_size);
      assert(obc->ssc->snapset.clone_overlap.count(soid.snap));

      tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
    } else {
      tctx->delta_stats.num_bytes += results->object_size;
    }
  }

  if (results->mirror_snapset) {
    assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
    tctx->new_snapset.from_snap_set(
      results->snapset,
      get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
  }
  tctx->new_snapset.head_exists = true;
  dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;

  // take RWWRITE lock for duration of our local write.  ignore starvation.
  if (!tctx->lock_manager.take_write_lock(
        obc->obs.oi.soid,
        obc)) {
    assert(0 == "problem!");
  }
  dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;

  finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);

  simple_opc_submit(std::move(tctx));

  osd->logger->inc(l_osd_tier_promote);

  if (agent_state &&
      agent_state->is_idle())
    agent_choose_mode();
}

void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
                               vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << " " << cop->obc->obs.oi.soid
           << " from " << cop->src << " " << cop->oloc
           << " v" << cop->results.user_version << dendl;

  // cancel objecter op, if we can
  if (cop->objecter_tid) {
    tids->push_back(cop->objecter_tid);
    cop->objecter_tid = 0;
    if (cop->objecter_tid2) {
      tids->push_back(cop->objecter_tid2);
      cop->objecter_tid2 = 0;
    }
  }

  copy_ops.erase(cop->obc->obs.oi.soid);
  cop->obc->stop_block();

  kick_object_context_blocked(cop->obc);
  cop->results.should_requeue = requeue;
  CopyCallbackResults result(-ECANCELED, &cop->results);
  cop->cb->complete(result);

  // There may still be an objecter callback referencing this copy op.
  // That callback will not need the obc since it's been canceled, and
  // we need the obc reference to go away prior to flush.
  cop->obc = ObjectContextRef();
}

void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << dendl;
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
  while (p != copy_ops.end()) {
    // requeue this op? can I queue up all of them?
    cancel_copy((p++)->second, requeue, tids);
  }
}


// ========================================================================
// flush
//
// Flush a dirty object in the cache tier by writing it back to the
// base tier.  The sequence looks like:
//
// * send a copy-from operation to the base tier to copy the current
//   version of the object
// * base tier will pull the object via (perhaps multiple) copy-get(s)
// * on completion, we check if the object has been modified.  if so,
//   just reply with -EAGAIN.
// * try to take a write lock so we can clear the dirty flag.  if this
//   fails, wait and retry
// * start a repop that clears the bit.
//
// If we have to wait, we will retry by coming back through the
// start_flush method.  We check if a flush is already in progress
// and, if so, try to finish it by rechecking the version and trying
// to clear the dirty bit.
//
// In order for the cache-flush (a write op) to not block the copy-get
// from reading the object, the client *must* set the SKIPRWLOCKS
// flag.
//
// NOTE: normally writes are strictly ordered for the client, but
// flushes are special in that they can be reordered with respect to
// other writes.  In particular, we can't have a flush request block
// an update to the cache pool object!
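
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the client-side
// counterpart of the SKIPRWLOCKS note above, roughly what a cache-tier
// flush looks like from librados.  Assumes the librados C++ API; error
// handling is minimal.
#if 0
#include "include/rados/librados.hpp"
#include <string>

static int try_flush_object(librados::IoCtx& cache_pool, const std::string& oid)
{
  librados::ObjectReadOperation op;
  op.cache_try_flush();
  librados::AioCompletion *c = librados::Rados::aio_create_completion();
  // SKIPRWLOCKS lets the base tier's copy-get read the object while this
  // flush (a write op) is in flight, per the comment block above
  int r = cache_pool.aio_operate(
    oid, c, &op,
    librados::OPERATION_IGNORE_CACHE | librados::OPERATION_SKIPRWLOCKS,
    nullptr);
  if (r == 0) {
    c->wait_for_complete();
    r = c->get_return_value();   // e.g. -EBUSY if an older clone is dirty
  }
  c->release();
  return r;
}
#endif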

struct C_Flush : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;
  ceph_tid_t tid;
  utime_t start;
  C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), start(ceph_clock_now())
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    pg->lock();
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_flush(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
    }
    pg->unlock();
  }
};

int PrimaryLogPG::start_flush(
  OpRequestRef op, ObjectContextRef obc,
  bool blocking, hobject_t *pmissing,
  boost::optional<std::function<void()>> &&on_flush)
{
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  dout(10) << __func__ << " " << soid
           << " v" << oi.version
           << " uv" << oi.user_version
           << " " << (blocking ? "blocking" : "non-blocking/best-effort")
           << dendl;

  // get a filtered snapset, need to remove removed snaps
  SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);

  // verify there are no older dirty clones
  {
    dout(20) << " snapset " << snapset << dendl;
    vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
    while (p != snapset.clones.rend() && *p >= soid.snap)
      ++p;
    if (p != snapset.clones.rend()) {
      hobject_t next = soid;
      next.snap = *p;
      assert(next.snap < soid.snap);
      if (pg_log.get_missing().is_missing(next)) {
        dout(10) << __func__ << " missing clone is " << next << dendl;
        if (pmissing)
          *pmissing = next;
        return -ENOENT;
      }
      ObjectContextRef older_obc = get_object_context(next, false);
      if (older_obc) {
        dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
                 << dendl;
        if (older_obc->obs.oi.is_dirty()) {
          dout(10) << __func__ << " next oldest clone is dirty: "
                   << older_obc->obs.oi << dendl;
          return -EBUSY;
        }
      } else {
        dout(20) << __func__ << " next oldest clone " << next
                 << " is not present; implicitly clean" << dendl;
      }
    } else {
      dout(20) << __func__ << " no older clones" << dendl;
    }
  }

  if (blocking)
    obc->start_block();

  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
  if (p != flush_ops.end()) {
    FlushOpRef fop = p->second;
    if (fop->op == op) {
      // we couldn't take the write lock on a cache-try-flush before;
      // now we are trying again for the lock.
      return try_flush_mark_clean(fop);
    }
    if (fop->flushed_version == obc->obs.oi.user_version &&
        (fop->blocking || !blocking)) {
      // nonblocking can join anything
      // blocking can only join a blocking flush
      dout(20) << __func__ << " piggybacking on existing flush " << dendl;
      if (op)
        fop->dup_ops.push_back(op);
      return -EAGAIN;   // clean up this ctx; op will retry later
    }

    // cancel current flush since it will fail anyway, or because we
    // are blocking and the existing flush is nonblocking.
    dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    while (!fop->dup_ops.empty()) {
      osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
      fop->dup_ops.pop_front();
    }
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }
8902
8903 /**
8904 * In general, we need to send a delete and a copyfrom.
8905 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8906 * where 4 is marked as clean. To flush 10, we have to:
8907 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8908 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8909 *
8910 * There is a complicating case. Suppose there had been a clone 7
8911 * for snaps [7, 6], since trimmed because those snaps no longer exist.
8912 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8913 * the delete, the snap will be promoted to 5, and the head will become
8914 * a snapdir. When the copy-from goes through, we'll end up with
8915 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8916 *
8917 * Another complication is the case where there is an interval change
8918 * after doing the delete and the flush but before marking the object
8919 * clean. We'll happily delete head and then recreate it at the same
8920 * sequence number, which works out ok.
8921 */
8922
8923 SnapContext snapc, dsnapc;
8924 if (snapset.seq != 0) {
8925 if (soid.snap == CEPH_NOSNAP) {
8926 snapc.seq = snapset.seq;
8927 snapc.snaps = snapset.snaps;
8928 } else {
8929 snapid_t min_included_snap;
8930 if (snapset.is_legacy()) {
8931 min_included_snap = oi.legacy_snaps.back();
8932 } else {
8933 auto p = snapset.clone_snaps.find(soid.snap);
8934 assert(p != snapset.clone_snaps.end());
8935 min_included_snap = p->second.back();
8936 }
8937 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8938 }
8939
8940 snapid_t prev_snapc = 0;
8941 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8942 citer != snapset.clones.rend();
8943 ++citer) {
8944 if (*citer < soid.snap) {
8945 prev_snapc = *citer;
8946 break;
8947 }
8948 }
8949
8950 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8951 }
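      // Worked through for the example in the block comment above (flushing
      // clone 10 of snapset 10:[10,9,8,4,3,2]:[10(10,9), 4(4,3,2)]): the
      // oldest snap included in clone 10 is 9, so snapc becomes 8:[8,4,3,2];
      // the newest older clone is 4, so dsnapc becomes 4:[4,3,2]. The delete
      // below is submitted only when dsnapc.seq < snapc.seq.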
8952
8953 object_locator_t base_oloc(soid);
8954 base_oloc.pool = pool.info.tier_of;
8955
8956 if (dsnapc.seq < snapc.seq) {
8957 ObjectOperation o;
8958 o.remove();
8959 osd->objecter->mutate(
8960 soid.oid,
8961 base_oloc,
8962 o,
8963 dsnapc,
8964 ceph::real_clock::from_ceph_timespec(oi.mtime),
8965 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8966 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8967 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8968 }
8969
8970 FlushOpRef fop(std::make_shared<FlushOp>());
8971 fop->obc = obc;
8972 fop->flushed_version = oi.user_version;
8973 fop->blocking = blocking;
8974 fop->on_flush = std::move(on_flush);
8975 fop->op = op;
8976
8977 ObjectOperation o;
8978 if (oi.is_whiteout()) {
8979 fop->removal = true;
8980 o.remove();
8981 } else {
8982 object_locator_t oloc(soid);
8983 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8984 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8985 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8986 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8987 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8988 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8989
8990 // hint to the base tier that it need not cache this data after the flush
8991 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8992 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8993 }
8994 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8995
8996 ceph_tid_t tid = osd->objecter->mutate(
8997 soid.oid, base_oloc, o, snapc,
8998 ceph::real_clock::from_ceph_timespec(oi.mtime),
8999 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
9000 new C_OnFinisher(fin,
9001 &osd->objecter_finisher));
9002 /* we're under the pg lock, and fin->finish() grabs it too, so it must run via the finisher */
9003 fin->tid = tid;
9004 fop->objecter_tid = tid;
9005
9006 flush_ops[soid] = fop;
9007 info.stats.stats.sum.num_flush++;
9008 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
9009 return -EINPROGRESS;
9010}
9011
9012void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
9013{
9014 dout(10) << __func__ << " " << oid << " tid " << tid
9015 << " " << cpp_strerror(r) << dendl;
9016 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
9017 if (p == flush_ops.end()) {
9018 dout(10) << __func__ << " no flush_op found" << dendl;
9019 return;
9020 }
9021 FlushOpRef fop = p->second;
9022 if (tid != fop->objecter_tid) {
9023 dout(10) << __func__ << " tid " << tid << " != fop " << fop
9024 << " tid " << fop->objecter_tid << dendl;
9025 return;
9026 }
9027 ObjectContextRef obc = fop->obc;
9028 fop->objecter_tid = 0;
9029
9030 if (r < 0 && !(r == -ENOENT && fop->removal)) {
9031 if (fop->op)
9032 osd->reply_op_error(fop->op, -EBUSY);
9033 if (fop->blocking) {
9034 obc->stop_block();
9035 kick_object_context_blocked(obc);
9036 }
9037
9038 if (!fop->dup_ops.empty()) {
9039 dout(20) << __func__ << " requeueing dups" << dendl;
9040 requeue_ops(fop->dup_ops);
9041 }
9042 if (fop->on_flush) {
9043 (*(fop->on_flush))();
9044 fop->on_flush = boost::none;
9045 }
9046 flush_ops.erase(oid);
9047 return;
9048 }
9049
9050 r = try_flush_mark_clean(fop);
9051 if (r == -EBUSY && fop->op) {
9052 osd->reply_op_error(fop->op, r);
9053 }
9054}
9055
9056int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
9057{
9058 ObjectContextRef obc = fop->obc;
9059 const hobject_t& oid = obc->obs.oi.soid;
9060
9061 if (fop->blocking) {
9062 obc->stop_block();
9063 kick_object_context_blocked(obc);
9064 }
9065
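      // The object may have been re-dirtied or deleted while the flush was
      // in flight; the flushed_version/user_version comparison below catches
      // that and fails the flush with -EBUSY.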
9066 if (fop->flushed_version != obc->obs.oi.user_version ||
9067 !obc->obs.exists) {
9068 if (obc->obs.exists)
9069 dout(10) << __func__ << " flushed_version " << fop->flushed_version
9070 << " != current " << obc->obs.oi.user_version
9071 << dendl;
9072 else
9073 dout(10) << __func__ << " object no longer exists" << dendl;
9074
9075 if (!fop->dup_ops.empty()) {
9076 dout(20) << __func__ << " requeueing dups" << dendl;
9077 requeue_ops(fop->dup_ops);
9078 }
9079 if (fop->on_flush) {
9080 (*(fop->on_flush))();
9081 fop->on_flush = boost::none;
9082 }
9083 flush_ops.erase(oid);
9084 if (fop->blocking)
9085 osd->logger->inc(l_osd_tier_flush_fail);
9086 else
9087 osd->logger->inc(l_osd_tier_try_flush_fail);
9088 return -EBUSY;
9089 }
9090
9091 if (!fop->blocking &&
9092 write_blocked_by_scrub(oid)) {
9093 if (fop->op) {
9094 dout(10) << __func__ << " blocked by scrub" << dendl;
9095 requeue_op(fop->op);
9096 requeue_ops(fop->dup_ops);
9097 return -EAGAIN; // will retry
9098 } else {
9099 osd->logger->inc(l_osd_tier_try_flush_fail);
9100 vector<ceph_tid_t> tids;
9101 cancel_flush(fop, false, &tids);
9102 osd->objecter->op_cancel(tids, -ECANCELED);
9103 return -ECANCELED;
9104 }
9105 }
9106
9107 // successfully flushed, can we evict this object?
9108 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9109 agent_maybe_evict(obc, true)) {
9110 osd->logger->inc(l_osd_tier_clean);
9111 if (fop->on_flush) {
9112 (*(fop->on_flush))();
9113 fop->on_flush = boost::none;
9114 }
9115 flush_ops.erase(oid);
9116 return 0;
9117 }
9118
9119 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9120 OpContextUPtr ctx = simple_opc_create(fop->obc);
9121
9122 // successfully flushed; can we clear the dirty bit?
9123 // try to take the lock manually, since we don't
9124 // have a ctx yet.
9125 if (ctx->lock_manager.get_lock_type(
9126 ObjectContext::RWState::RWWRITE,
9127 oid,
9128 obc,
9129 fop->op)) {
9130 dout(20) << __func__ << " took write lock" << dendl;
9131 } else if (fop->op) {
9132 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
9133 << fop->dup_ops << dendl;
9134 close_op_ctx(ctx.release());
9135 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
9136 for (auto op : fop->dup_ops) {
9137 bool locked = ctx->lock_manager.get_lock_type(
9138 ObjectContext::RWState::RWWRITE,
9139 oid,
9140 obc,
9141 op);
9142 assert(!locked);
9143 }
9144 return -EAGAIN; // will retry
9145 } else {
9146 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9147 close_op_ctx(ctx.release());
9148 osd->logger->inc(l_osd_tier_try_flush_fail);
9149 vector<ceph_tid_t> tids;
9150 cancel_flush(fop, false, &tids);
9151 osd->objecter->op_cancel(tids, -ECANCELED);
9152 return -ECANCELED;
9153 }
9154
9155 if (fop->on_flush) {
9156 ctx->register_on_finish(*(fop->on_flush));
9157 fop->on_flush = boost::none;
9158 }
9159
9160 ctx->at_version = get_next_version();
9161
9162 ctx->new_obs = obc->obs;
9163 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9164 --ctx->delta_stats.num_objects_dirty;
9165
9166 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9167
9168 osd->logger->inc(l_osd_tier_clean);
9169
9170 if (!fop->dup_ops.empty() || fop->op) {
9171 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9172 list<OpRequestRef> ls;
9173 if (fop->op)
9174 ls.push_back(fop->op);
9175 ls.splice(ls.end(), fop->dup_ops);
9176 requeue_ops(ls);
9177 }
9178
9179 simple_opc_submit(std::move(ctx));
9180
9181 flush_ops.erase(oid);
9182
9183 if (fop->blocking)
9184 osd->logger->inc(l_osd_tier_flush);
9185 else
9186 osd->logger->inc(l_osd_tier_try_flush);
9187
9188 return -EINPROGRESS;
9189}
9190
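// cancel_flush() collects outstanding objecter tids into *tids rather than
// cancelling them itself; callers pass the collected tids to
// Objecter::op_cancel afterwards (see the call sites above), presumably so
// the objecter is not re-entered from inside this helper.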
9191void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
9192 vector<ceph_tid_t> *tids)
9193{
9194 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9195 << fop->objecter_tid << dendl;
9196 if (fop->objecter_tid) {
9197 tids->push_back(fop->objecter_tid);
9198 fop->objecter_tid = 0;
9199 }
9200 if (fop->io_tids.size()) {
9201 for (auto &p : fop->io_tids) {
9202 tids->push_back(p.second);
9203 p.second = 0;
9204 }
9205 }
9206 if (fop->blocking && fop->obc->is_blocked()) {
9207 fop->obc->stop_block();
9208 kick_object_context_blocked(fop->obc);
9209 }
9210 if (requeue) {
9211 if (fop->op)
9212 requeue_op(fop->op);
9213 requeue_ops(fop->dup_ops);
9214 }
9215 if (fop->on_flush) {
9216 (*(fop->on_flush))();
9217 fop->on_flush = boost::none;
9218 }
9219 flush_ops.erase(fop->obc->obs.oi.soid);
9220}
9221
9222void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
9223{
9224 dout(10) << __func__ << dendl;
9225 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9226 while (p != flush_ops.end()) {
9227 cancel_flush((p++)->second, requeue, tids);
9228 }
9229}
9230
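// A clone may legitimately be absent only in pools that allow incomplete
// clones (e.g. cache tiers); everywhere else a clone listed in the snapset
// is assumed to be present.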
9231bool PrimaryLogPG::is_present_clone(hobject_t coid)
9232{
9233 if (!pool.info.allow_incomplete_clones())
9234 return true;
9235 if (is_missing_object(coid))
9236 return true;
9237 ObjectContextRef obc = get_object_context(coid, false);
9238 return obc && obc->obs.exists;
9239}
9240
9241// ========================================================================
9242// rep op gather
9243
9244class C_OSD_RepopApplied : public Context {
9245 PrimaryLogPGRef pg;
9246 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9247public:
9248 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9249 : pg(pg), repop(repop) {}
9250 void finish(int) override {
9251 pg->repop_all_applied(repop.get());
9252 }
9253};
9254
9255
9256void PrimaryLogPG::repop_all_applied(RepGather *repop)
9257{
9258 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9259 << dendl;
9260 assert(!repop->applies_with_commit);
9261 repop->all_applied = true;
9262 if (!repop->rep_aborted) {
9263 eval_repop(repop);
9264 }
9265}
9266
9267class C_OSD_RepopCommit : public Context {
9268 PrimaryLogPGRef pg;
9269 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9270public:
9271 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9272 : pg(pg), repop(repop) {}
9273 void finish(int) override {
9274 pg->repop_all_committed(repop.get());
9275 }
9276};
9277
9278void PrimaryLogPG::repop_all_committed(RepGather *repop)
9279{
9280 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9281 << dendl;
9282 repop->all_committed = true;
9283 if (repop->applies_with_commit) {
9284 assert(!repop->all_applied);
9285 repop->all_applied = true;
9286 }
9287
9288 if (!repop->rep_aborted) {
9289 if (repop->v != eversion_t()) {
9290 last_update_ondisk = repop->v;
9291 last_complete_ondisk = repop->pg_local_last_complete;
9292 }
9293 eval_repop(repop);
9294 }
9295}
9296
9297void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9298{
9299 dout(10) << "op_applied version " << applied_version << dendl;
9300 if (applied_version == eversion_t())
9301 return;
9302 assert(applied_version > last_update_applied);
9303 assert(applied_version <= info.last_update);
9304 last_update_applied = applied_version;
9305 if (is_primary()) {
9306 if (scrubber.active) {
9307 if (last_update_applied >= scrubber.subset_last_update) {
9308 if (ops_blocked_by_scrub()) {
9309 requeue_scrub(true);
9310 } else {
9311 requeue_scrub(false);
9312 }
9313
9314 }
9315 } else {
9316 assert(scrubber.start == scrubber.end);
9317 }
9318 } else {
9319 if (scrubber.active_rep_scrub) {
9320 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9321 scrubber.active_rep_scrub->get_req())->scrub_to) {
9322 osd->enqueue_back(
9323 info.pgid,
9324 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9325 scrubber.active_rep_scrub = OpRequestRef();
9326 }
9327 }
9328 }
9329}
9330
9331void PrimaryLogPG::eval_repop(RepGather *repop)
9332{
9333 const MOSDOp *m = NULL;
9334 if (repop->op)
9335 m = static_cast<const MOSDOp *>(repop->op->get_req());
9336
9337 if (m)
9338 dout(10) << "eval_repop " << *repop
9339 << (repop->rep_done ? " DONE" : "")
9340 << dendl;
9341 else
9342 dout(10) << "eval_repop " << *repop << " (no op)"
9343 << (repop->rep_done ? " DONE" : "")
9344 << dendl;
9345
9346 if (repop->rep_done)
9347 return;
9348
9349 // ondisk?
9350 if (repop->all_committed) {
9351 dout(10) << " commit: " << *repop << dendl;
9352 for (auto p = repop->on_committed.begin();
9353 p != repop->on_committed.end();
9354 repop->on_committed.erase(p++)) {
9355 (*p)();
9356 }
9357 // send dup commits, in order
9358 if (waiting_for_ondisk.count(repop->v)) {
9359 assert(waiting_for_ondisk.begin()->first == repop->v);
9360 for (list<pair<OpRequestRef, version_t> >::iterator i =
9361 waiting_for_ondisk[repop->v].begin();
9362 i != waiting_for_ondisk[repop->v].end();
9363 ++i) {
9364 osd->reply_op_error(i->first, repop->r, repop->v,
9365 i->second);
9366 }
9367 waiting_for_ondisk.erase(repop->v);
9368 }
9369 }
9370
9371 // applied?
9372 if (repop->all_applied) {
9373 if (repop->applies_with_commit) {
9374 assert(repop->on_applied.empty());
9375 }
9376 dout(10) << " applied: " << *repop << " " << dendl;
9377 for (auto p = repop->on_applied.begin();
9378 p != repop->on_applied.end();
9379 repop->on_applied.erase(p++)) {
9380 (*p)();
9381 }
9382 }
9383
9384 // done.
9385 if (repop->all_applied && repop->all_committed) {
9386 repop->rep_done = true;
9387
9388 publish_stats_to_osd();
9389 calc_min_last_complete_ondisk();
9390
9391 dout(10) << " removing " << *repop << dendl;
9392 assert(!repop_queue.empty());
9393 dout(20) << " q front is " << *repop_queue.front() << dendl;
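      // Out-of-order completion is tolerated only for applies_with_commit
      // repops; otherwise the queue head must be this repop, which the
      // asserts below enforce.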
9394 if (repop_queue.front() != repop) {
9395 if (!repop->applies_with_commit) {
9396 dout(0) << " removing " << *repop << dendl;
9397 dout(0) << " q front is " << *repop_queue.front() << dendl;
9398 assert(repop_queue.front() == repop);
9399 }
9400 } else {
9401 RepGather *to_remove = nullptr;
9402 while (!repop_queue.empty() &&
9403 (to_remove = repop_queue.front())->rep_done) {
9404 repop_queue.pop_front();
9405 for (auto p = to_remove->on_success.begin();
9406 p != to_remove->on_success.end();
9407 to_remove->on_success.erase(p++)) {
9408 (*p)();
9409 }
9410 remove_repop(to_remove);
9411 }
9412 }
9413 }
9414}
9415
9416void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9417{
9418 FUNCTRACE();
9419 const hobject_t& soid = ctx->obs->oi.soid;
9420 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9421 << " o " << soid
9422 << dendl;
9423
9424 repop->v = ctx->at_version;
9425 if (ctx->at_version > eversion_t()) {
9426 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9427 i != actingbackfill.end();
9428 ++i) {
9429 if (*i == get_primary()) continue;
9430 pg_info_t &pinfo = peer_info[*i];
9431 // keep peer_info up to date
9432 if (pinfo.last_complete == pinfo.last_update)
9433 pinfo.last_complete = ctx->at_version;
9434 pinfo.last_update = ctx->at_version;
9435 }
9436 }
9437
9438 ctx->obc->ondisk_write_lock();
9439
9440 bool unlock_snapset_obc = false;
9441 ctx->op_t->add_obc(ctx->obc);
9442 if (ctx->clone_obc) {
9443 ctx->clone_obc->ondisk_write_lock();
9444 ctx->op_t->add_obc(ctx->clone_obc);
9445 }
9446 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9447 ctx->obc->obs.oi.soid) {
9448 ctx->snapset_obc->ondisk_write_lock();
9449 unlock_snapset_obc = true;
9450 ctx->op_t->add_obc(ctx->snapset_obc);
9451 }
9452
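      // The ondisk_write_locks taken above are released by onapplied_sync
      // (C_OSD_OndiskWriteUnlock, constructed with the same obcs) once the
      // transaction has been applied.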
9453 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9454 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9455 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9456 ctx->obc,
9457 ctx->clone_obc,
9458 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9459 if (!(ctx->log.empty())) {
9460 assert(ctx->at_version >= projected_last_update);
9461 projected_last_update = ctx->at_version;
9462 }
9463 for (auto &&entry: ctx->log) {
9464 projected_log.add(entry);
9465 }
9466 pgbackend->submit_transaction(
9467 soid,
9468 ctx->delta_stats,
9469 ctx->at_version,
9470 std::move(ctx->op_t),
9471 pg_trim_to,
9472 min_last_complete_ondisk,
9473 ctx->log,
9474 ctx->updated_hset_history,
9475 onapplied_sync,
9476 on_all_applied,
9477 on_all_commit,
9478 repop->rep_tid,
9479 ctx->reqid,
9480 ctx->op);
9481}
9482
9483PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9484 OpContext *ctx, ObjectContextRef obc,
9485 ceph_tid_t rep_tid)
9486{
9487 if (ctx->op)
9488 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9489 else
9490 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9491
9492 RepGather *repop = new RepGather(
9493 ctx, rep_tid, info.last_complete, false);
9494
9495 repop->start = ceph_clock_now();
9496
9497 repop_queue.push_back(&repop->queue_item);
9498 repop->get();
9499
9500 osd->logger->inc(l_osd_op_wip);
9501
9502 dout(10) << __func__ << ": " << *repop << dendl;
9503 return repop;
9504}
9505
9506boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9507 eversion_t version,
9508 int r,
9509 ObcLockManager &&manager,
9510 OpRequestRef &&op,
9511 boost::optional<std::function<void(void)> > &&on_complete)
9512{
9513 RepGather *repop = new RepGather(
9514 std::move(manager),
9515 std::move(op),
9516 std::move(on_complete),
9517 osd->get_tid(),
9518 info.last_complete,
9519 true,
9520 r);
9521 repop->v = version;
9522
9523 repop->start = ceph_clock_now();
9524
9525 repop_queue.push_back(&repop->queue_item);
9526
9527 osd->logger->inc(l_osd_op_wip);
9528
9529 dout(10) << __func__ << ": " << *repop << dendl;
9530 return boost::intrusive_ptr<RepGather>(repop);
9531}
9532
9533void PrimaryLogPG::remove_repop(RepGather *repop)
9534{
9535 dout(20) << __func__ << " " << *repop << dendl;
9536
9537 for (auto p = repop->on_finish.begin();
9538 p != repop->on_finish.end();
9539 repop->on_finish.erase(p++)) {
9540 (*p)();
9541 }
9542
9543 release_object_locks(
9544 repop->lock_manager);
9545 repop->put();
9546
9547 osd->logger->dec(l_osd_op_wip);
9548}
9549
9550PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9551{
9552 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9553 ceph_tid_t rep_tid = osd->get_tid();
9554 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9555 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9556 ctx->op_t.reset(new PGTransaction());
9557 ctx->mtime = ceph_clock_now();
9558 return ctx;
9559}
9560
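// simple_opc_create()/simple_opc_submit() drive internally generated
// mutations (e.g. watch timeouts, marking a flushed object clean) through
// the normal repop machinery without a client op; the reqid is synthesized
// from the cluster messenger name and a fresh tid.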
9561void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9562{
9563 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9564 dout(20) << __func__ << " " << repop << dendl;
9565 issue_repop(repop, ctx.get());
9566 eval_repop(repop);
9567 calc_trim_to();
9568 repop->put();
9569}
9570
9571
9572void PrimaryLogPG::submit_log_entries(
9573 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9574 ObcLockManager &&manager,
9575 boost::optional<std::function<void(void)> > &&_on_complete,
9576 OpRequestRef op,
9577 int r)
9578{
9579 dout(10) << __func__ << " " << entries << dendl;
9580 assert(is_primary());
9581
9582 eversion_t version;
9583 if (!entries.empty()) {
9584 assert(entries.rbegin()->version >= projected_last_update);
9585 version = projected_last_update = entries.rbegin()->version;
9586 }
9587
9588 boost::intrusive_ptr<RepGather> repop;
9589 boost::optional<std::function<void(void)> > on_complete;
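      // Two wire formats: clusters requiring >= jewel get
      // MOSDPGUpdateLogMissing, with per-peer acks gathered via
      // log_entry_update_waiting_on; older clusters get a plain MOSDPGLog
      // append and only a local on_complete callback.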
9590 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9591 repop = new_repop(
9592 version,
9593 r,
9594 std::move(manager),
9595 std::move(op),
9596 std::move(_on_complete));
9597 } else {
9598 on_complete = std::move(_on_complete);
9599 }
9600
9601 pgbackend->call_write_ordered(
9602 [this, entries, repop, on_complete]() {
9603 ObjectStore::Transaction t;
9604 eversion_t old_last_update = info.last_update;
9605 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
9606
9607
9608 set<pg_shard_t> waiting_on;
9609 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9610 i != actingbackfill.end();
9611 ++i) {
9612 pg_shard_t peer(*i);
9613 if (peer == pg_whoami) continue;
9614 assert(peer_missing.count(peer));
9615 assert(peer_info.count(peer));
9616 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9617 assert(repop);
9618 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9619 entries,
9620 spg_t(info.pgid.pgid, i->shard),
9621 pg_whoami.shard,
9622 get_osdmap()->get_epoch(),
9623 last_peering_reset,
9624 repop->rep_tid,
9625 pg_trim_to,
9626 min_last_complete_ondisk);
9627 osd->send_message_osd_cluster(
9628 peer.osd, m, get_osdmap()->get_epoch());
9629 waiting_on.insert(peer);
9630 } else {
9631 MOSDPGLog *m = new MOSDPGLog(
9632 peer.shard, pg_whoami.shard,
9633 info.last_update.epoch,
9634 info);
9635 m->log.log = entries;
9636 m->log.tail = old_last_update;
9637 m->log.head = info.last_update;
9638 osd->send_message_osd_cluster(
9639 peer.osd, m, get_osdmap()->get_epoch());
9640 }
9641 }
9642 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9643 ceph_tid_t rep_tid = repop->rep_tid;
9644 waiting_on.insert(pg_whoami);
9645 log_entry_update_waiting_on.insert(
9646 make_pair(
9647 rep_tid,
9648 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9649 ));
9650 struct OnComplete : public Context {
9651 PrimaryLogPGRef pg;
9652 ceph_tid_t rep_tid;
9653 epoch_t epoch;
9654 OnComplete(
9655 PrimaryLogPGRef pg,
9656 ceph_tid_t rep_tid,
9657 epoch_t epoch)
9658 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9659 void finish(int) override {
9660 pg->lock();
9661 if (!pg->pg_has_reset_since(epoch)) {
9662 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9663 assert(it != pg->log_entry_update_waiting_on.end());
9664 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9665 assert(it2 != it->second.waiting_on.end());
9666 it->second.waiting_on.erase(it2);
9667 if (it->second.waiting_on.empty()) {
9668 pg->repop_all_committed(it->second.repop.get());
9669 pg->log_entry_update_waiting_on.erase(it);
9670 }
9671 }
9672 pg->unlock();
9673 }
9674 };
9675 t.register_on_commit(
9676 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9677 } else {
9678 if (on_complete) {
9679 struct OnComplete : public Context {
9680 PrimaryLogPGRef pg;
9681 std::function<void(void)> on_complete;
9682 epoch_t epoch;
9683 OnComplete(
9684 PrimaryLogPGRef pg,
9685 const std::function<void(void)> &on_complete,
9686 epoch_t epoch)
9687 : pg(pg),
9688 on_complete(std::move(on_complete)),
9689 epoch(epoch) {}
9690 void finish(int) override {
9691 pg->lock();
9692 if (!pg->pg_has_reset_since(epoch))
9693 on_complete();
9694 pg->unlock();
9695 }
9696 };
9697 t.register_on_complete(
9698 new OnComplete{
9699 this, *on_complete, get_osdmap()->get_epoch()
9700 });
9701 }
9702 }
9703 t.register_on_applied(
9704 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9705 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9706 assert(r == 0);
9707 });
9708
9709 calc_trim_to();
9710}
9711
9712void PrimaryLogPG::cancel_log_updates()
9713{
9714 // get rid of all the LogUpdateCtx so their references to repops are
9715 // dropped
9716 log_entry_update_waiting_on.clear();
9717}
9718
9719// -------------------------------------------------------
9720
9721void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9722{
9723 pair<hobject_t, ObjectContextRef> i;
9724 while (object_contexts.get_next(i.first, &i)) {
9725 ObjectContextRef obc(i.second);
9726 get_obc_watchers(obc, pg_watchers);
9727 }
9728}
9729
9730void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9731{
9732 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9733 obc->watchers.begin();
9734 j != obc->watchers.end();
9735 ++j) {
9736 obj_watch_item_t owi;
9737
9738 owi.obj = obc->obs.oi.soid;
9739 owi.wi.addr = j->second->get_peer_addr();
9740 owi.wi.name = j->second->get_entity();
9741 owi.wi.cookie = j->second->get_cookie();
9742 owi.wi.timeout_seconds = j->second->get_timeout();
9743
9744 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9745 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9746
9747 pg_watchers.push_back(owi);
9748 }
9749}
9750
9751void PrimaryLogPG::check_blacklisted_watchers()
9752{
9753 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9754 pair<hobject_t, ObjectContextRef> i;
9755 while (object_contexts.get_next(i.first, &i))
9756 check_blacklisted_obc_watchers(i.second);
9757}
9758
9759void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9760{
9761 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9762 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9763 obc->watchers.begin();
9764 k != obc->watchers.end();
9765 ) {
9766 // Advance the iterator now so handle_watch_timeout() can erase the element
9767 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9768 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9769 entity_addr_t ea = j->second->get_peer_addr();
9770 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9771 if (get_osdmap()->is_blacklisted(ea)) {
9772 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9773 assert(j->second->get_pg() == this);
9774 j->second->unregister_cb();
9775 handle_watch_timeout(j->second);
9776 }
9777 }
9778}
9779
9780void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9781{
9782 assert(is_active());
9783 assert((recovering.count(obc->obs.oi.soid) ||
9784 !is_missing_object(obc->obs.oi.soid)) ||
9785 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9786 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9787 pg_log_entry_t::LOST_REVERT &&
9788 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9789 obc->obs.oi.version));
9790
9791 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9792 assert(obc->watchers.empty());
9793 // populate unconnected_watchers
9794 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9795 obc->obs.oi.watchers.begin();
9796 p != obc->obs.oi.watchers.end();
9797 ++p) {
9798 utime_t expire = info.stats.last_became_active;
9799 expire += p->second.timeout_seconds;
9800 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9801 WatchRef watch(
9802 Watch::makeWatchRef(
9803 this, osd, obc, p->second.timeout_seconds, p->first.first,
9804 p->first.second, p->second.addr));
9805 watch->disconnect();
9806 obc->watchers.insert(
9807 make_pair(
9808 make_pair(p->first.first, p->first.second),
9809 watch));
9810 }
9811 // Look for watchers from blacklisted clients and drop
9812 check_blacklisted_obc_watchers(obc);
9813}
9814
9815void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9816{
9817 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9818 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9819
9820 if (!is_active()) {
9821 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9822 return;
9823 }
9824 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9825 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9826 watch->get_delayed_cb()
9827 );
9828 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9829 << obc->obs.oi.soid
9830 << dendl;
9831 return;
9832 }
9833
9834 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
9835 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9836 << obc->obs.oi.soid
9837 << dendl;
9838 scrubber.add_callback(
9839 watch->get_delayed_cb() // This callback!
9840 );
9841 return;
9842 }
9843
9844 OpContextUPtr ctx = simple_opc_create(obc);
9845 ctx->at_version = get_next_version();
9846
9847 object_info_t& oi = ctx->new_obs.oi;
9848 oi.watchers.erase(make_pair(watch->get_cookie(),
9849 watch->get_entity()));
9850
9851 list<watch_disconnect_t> watch_disconnects = {
9852 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9853 };
9854 ctx->register_on_success(
9855 [this, obc, watch_disconnects]() {
9856 complete_disconnect_watches(obc, watch_disconnects);
9857 });
9858
9859
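      // Persist the watcher removal: the MODIFY log entry and the
      // re-encoded object_info below make the disconnect durable and
      // replicated rather than a purely in-memory change.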
9860 PGTransaction *t = ctx->op_t.get();
9861 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9862 ctx->at_version,
9863 oi.version,
9864 0,
9865 osd_reqid_t(), ctx->mtime, 0));
9866
9867 oi.prior_version = obc->obs.oi.version;
9868 oi.version = ctx->at_version;
9869 bufferlist bl;
9870 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9871 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9872
9873 // apply new object state.
9874 ctx->obc->obs = ctx->new_obs;
9875
9876 // no ctx->delta_stats
9877 simple_opc_submit(std::move(ctx));
9878}
9879
9880ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9881 SnapSetContext *ssc)
9882{
9883 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9884 assert(obc->destructor_callback == NULL);
9885 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9886 obc->obs.oi = oi;
9887 obc->obs.exists = false;
9888 obc->ssc = ssc;
9889 if (ssc)
9890 register_snapset_context(ssc);
9891 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9892 if (is_active())
9893 populate_obc_watchers(obc);
9894 return obc;
9895}
9896
9897ObjectContextRef PrimaryLogPG::get_object_context(
9898 const hobject_t& soid,
9899 bool can_create,
9900 const map<string, bufferlist> *attrs)
9901{
9902 assert(
9903 attrs || !pg_log.get_missing().is_missing(soid) ||
9904 // or this is a revert... see recover_primary()
9905 (pg_log.get_log().objects.count(soid) &&
9906 pg_log.get_log().objects.find(soid)->second->op ==
9907 pg_log_entry_t::LOST_REVERT));
9908 ObjectContextRef obc = object_contexts.lookup(soid);
9909 osd->logger->inc(l_osd_object_ctx_cache_total);
9910 if (obc) {
9911 osd->logger->inc(l_osd_object_ctx_cache_hit);
9912 dout(10) << __func__ << ": found obc in cache: " << obc
9913 << dendl;
9914 } else {
9915 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9916 // check disk
9917 bufferlist bv;
9918 if (attrs) {
9919 assert(attrs->count(OI_ATTR));
9920 bv = attrs->find(OI_ATTR)->second;
9921 } else {
9922 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9923 if (r < 0) {
9924 if (!can_create) {
9925 dout(10) << __func__ << ": no obc for soid "
9926 << soid << " and !can_create"
9927 << dendl;
9928 return ObjectContextRef(); // -ENOENT!
9929 }
9930
9931 dout(10) << __func__ << ": no obc for soid "
9932 << soid << " but can_create"
9933 << dendl;
9934 // new object.
9935 object_info_t oi(soid);
9936 SnapSetContext *ssc = get_snapset_context(
9937 soid, true, 0, false);
9938 assert(ssc);
9939 obc = create_object_context(oi, ssc);
9940 dout(10) << __func__ << ": " << obc << " " << soid
9941 << " " << obc->rwstate
9942 << " oi: " << obc->obs.oi
9943 << " ssc: " << obc->ssc
9944 << " snapset: " << obc->ssc->snapset << dendl;
9945 return obc;
9946 }
9947 }
9948
9949 object_info_t oi;
9950 try {
9951 bufferlist::iterator bliter = bv.begin();
9952 ::decode(oi, bliter);
9953 } catch (...) {
9954 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9955 return ObjectContextRef(); // -ENOENT!
9956 }
9957
9958 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9959
9960 obc = object_contexts.lookup_or_create(oi.soid);
9961 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9962 obc->obs.oi = oi;
9963 obc->obs.exists = true;
9964
9965 obc->ssc = get_snapset_context(
9966 soid, true,
9967 soid.has_snapset() ? attrs : 0);
9968
9969 if (is_active())
9970 populate_obc_watchers(obc);
9971
9972 if (pool.info.require_rollback()) {
9973 if (attrs) {
9974 obc->attr_cache = *attrs;
9975 } else {
9976 int r = pgbackend->objects_get_attrs(
9977 soid,
9978 &obc->attr_cache);
9979 assert(r == 0);
9980 }
9981 }
9982
9983 dout(10) << __func__ << ": creating obc from disk: " << obc
9984 << dendl;
9985 }
9986
9987 // XXX: callers don't expect an obc without an ssc, so fail the lookup instead
9988 if (obc->ssc == NULL) {
9989 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9990 return ObjectContextRef(); // -ENOENT!
9991 }
9992
9993 dout(10) << __func__ << ": " << obc << " " << soid
9994 << " " << obc->rwstate
9995 << " oi: " << obc->obs.oi
9996 << " exists: " << (int)obc->obs.exists
9997 << " ssc: " << obc->ssc
9998 << " snapset: " << obc->ssc->snapset << dendl;
9999 return obc;
10000}
10001
10002void PrimaryLogPG::context_registry_on_change()
10003{
10004 pair<hobject_t, ObjectContextRef> i;
10005 while (object_contexts.get_next(i.first, &i)) {
10006 ObjectContextRef obc(i.second);
10007 if (obc) {
10008 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10009 obc->watchers.begin();
10010 j != obc->watchers.end();
10011 obc->watchers.erase(j++)) {
10012 j->second->discard();
10013 }
10014 }
10015 }
10016}
10017
10018
10019/*
10020 * If we return an error, and set *pmissing, then promoting that
10021 * object may help.
10022 *
10023 * If we return -EAGAIN, we will always set *pmissing to the missing
10024 * object to wait for.
10025 *
10026 * If we return an error but do not set *pmissing, then we know the
10027 * object does not exist.
10028 */
10029int PrimaryLogPG::find_object_context(const hobject_t& oid,
10030 ObjectContextRef *pobc,
10031 bool can_create,
10032 bool map_snapid_to_clone,
10033 hobject_t *pmissing)
10034{
10035 FUNCTRACE();
10036 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
10037 // want the head?
10038 if (oid.snap == CEPH_NOSNAP) {
10039 ObjectContextRef obc = get_object_context(oid, can_create);
10040 if (!obc) {
10041 if (pmissing)
10042 *pmissing = oid;
10043 return -ENOENT;
10044 }
10045 dout(10) << "find_object_context " << oid
10046 << " @" << oid.snap
10047 << " oi=" << obc->obs.oi
10048 << dendl;
10049 *pobc = obc;
10050
10051 return 0;
10052 }
10053
10054 hobject_t head = oid.get_head();
10055
10056 // want the snapdir?
10057 if (oid.snap == CEPH_SNAPDIR) {
10058 // return head or snapdir, whichever exists.
10059 ObjectContextRef headobc = get_object_context(head, can_create);
10060 ObjectContextRef obc = headobc;
10061 if (!obc || !obc->obs.exists)
10062 obc = get_object_context(oid, can_create);
10063 if (!obc || !obc->obs.exists) {
10064 // if we have neither, we would want to promote the head.
10065 if (pmissing)
10066 *pmissing = head;
10067 if (pobc)
10068 *pobc = headobc; // may be null
10069 return -ENOENT;
10070 }
10071 dout(10) << "find_object_context " << oid
10072 << " @" << oid.snap
10073 << " oi=" << obc->obs.oi
10074 << dendl;
10075 *pobc = obc;
10076
10077 // always populate ssc for SNAPDIR...
10078 if (!obc->ssc)
10079 obc->ssc = get_snapset_context(
10080 oid, true);
10081 return 0;
10082 }
10083
10084 // we want a snap
10085 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
10086 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
10087 return -ENOENT;
10088 }
10089
10090 SnapSetContext *ssc = get_snapset_context(oid, can_create);
10091 if (!ssc || !(ssc->exists || can_create)) {
10092 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
10093 if (pmissing)
10094 *pmissing = head; // start by getting the head
10095 if (ssc)
10096 put_snapset_context(ssc);
10097 return -ENOENT;
10098 }
10099
10100 if (map_snapid_to_clone) {
10101 dout(10) << "find_object_context " << oid << " @" << oid.snap
10102 << " snapset " << ssc->snapset
10103 << " map_snapid_to_clone=true" << dendl;
10104 if (oid.snap > ssc->snapset.seq) {
10105 // already must be readable
10106 ObjectContextRef obc = get_object_context(head, false);
10107 dout(10) << "find_object_context " << oid << " @" << oid.snap
10108 << " snapset " << ssc->snapset
10109 << " maps to head" << dendl;
10110 *pobc = obc;
10111 put_snapset_context(ssc);
10112 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10113 } else {
10114 vector<snapid_t>::const_iterator citer = std::find(
10115 ssc->snapset.clones.begin(),
10116 ssc->snapset.clones.end(),
10117 oid.snap);
10118 if (citer == ssc->snapset.clones.end()) {
10119 dout(10) << "find_object_context " << oid << " @" << oid.snap
10120 << " snapset " << ssc->snapset
10121 << " maps to nothing" << dendl;
10122 put_snapset_context(ssc);
10123 return -ENOENT;
10124 }
10125
10126 dout(10) << "find_object_context " << oid << " @" << oid.snap
10127 << " snapset " << ssc->snapset
10128 << " maps to " << oid << dendl;
10129
10130 if (pg_log.get_missing().is_missing(oid)) {
10131 dout(10) << "find_object_context " << oid << " @" << oid.snap
10132 << " snapset " << ssc->snapset
10133 << " " << oid << " is missing" << dendl;
10134 if (pmissing)
10135 *pmissing = oid;
10136 put_snapset_context(ssc);
10137 return -EAGAIN;
10138 }
10139
10140 ObjectContextRef obc = get_object_context(oid, false);
10141 if (!obc || !obc->obs.exists) {
10142 dout(10) << "find_object_context " << oid << " @" << oid.snap
10143 << " snapset " << ssc->snapset
10144 << " " << oid << " is not present" << dendl;
10145 if (pmissing)
10146 *pmissing = oid;
10147 put_snapset_context(ssc);
10148 return -ENOENT;
10149 }
10150 dout(10) << "find_object_context " << oid << " @" << oid.snap
10151 << " snapset " << ssc->snapset
10152 << " " << oid << " HIT" << dendl;
10153 *pobc = obc;
10154 put_snapset_context(ssc);
10155 return 0;
10156 }
10157 ceph_abort(); //unreachable
10158 }
10159
10160 dout(10) << "find_object_context " << oid << " @" << oid.snap
10161 << " snapset " << ssc->snapset << dendl;
10162
10163 // head?
10164 if (oid.snap > ssc->snapset.seq) {
10165 if (ssc->snapset.head_exists) {
10166 ObjectContextRef obc = get_object_context(head, false);
10167 dout(10) << "find_object_context " << head
10168 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10169 << " -- HIT " << obc->obs
10170 << dendl;
10171 if (!obc->ssc)
10172 obc->ssc = ssc;
10173 else {
10174 assert(ssc == obc->ssc);
10175 put_snapset_context(ssc);
10176 }
10177 *pobc = obc;
10178 return 0;
10179 }
10180 dout(10) << "find_object_context " << head
10181 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10182 << " but head dne -- DNE"
10183 << dendl;
10184 put_snapset_context(ssc);
10185 return -ENOENT;
10186 }
10187
10188 // which clone would it be?
10189 unsigned k = 0;
10190 while (k < ssc->snapset.clones.size() &&
10191 ssc->snapset.clones[k] < oid.snap)
10192 k++;
10193 if (k == ssc->snapset.clones.size()) {
10194 dout(10) << "find_object_context no clones with last >= oid.snap "
10195 << oid.snap << " -- DNE" << dendl;
10196 put_snapset_context(ssc);
10197 return -ENOENT;
10198 }
10199 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10200 info.pgid.pool(), oid.get_namespace());
10201
10202 if (pg_log.get_missing().is_missing(soid)) {
10203 dout(20) << "find_object_context " << soid << " missing, try again later"
10204 << dendl;
10205 if (pmissing)
10206 *pmissing = soid;
10207 put_snapset_context(ssc);
10208 return -EAGAIN;
10209 }
10210
10211 ObjectContextRef obc = get_object_context(soid, false);
10212 if (!obc || !obc->obs.exists) {
7c673cae
FG
10213 if (pmissing)
10214 *pmissing = soid;
10215 put_snapset_context(ssc);
10216 if (is_degraded_or_backfilling_object(soid)) {
10217 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10218 return -EAGAIN;
10219 } else {
10220 dout(20) << __func__ << " missing clone " << soid << dendl;
10221 return -ENOENT;
10222 }
10223 }
10224
10225 if (!obc->ssc) {
10226 obc->ssc = ssc;
10227 } else {
10228 assert(obc->ssc == ssc);
10229 put_snapset_context(ssc);
10230 }
10231 ssc = 0;
10232
10233 // clone
10234 dout(20) << "find_object_context " << soid
10235 << " snapset " << obc->ssc->snapset
10236 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10237 << dendl;
10238 snapid_t first, last;
10239 if (obc->ssc->snapset.is_legacy()) {
10240 first = obc->obs.oi.legacy_snaps.back();
10241 last = obc->obs.oi.legacy_snaps.front();
10242 } else {
10243 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10244 assert(p != obc->ssc->snapset.clone_snaps.end());
10245 if (p->second.empty()) {
10246 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
10247 assert(!cct->_conf->osd_debug_verify_snaps);
10248 return -ENOENT;
10249 }
10250 first = p->second.back();
10251 last = p->second.front();
10252 }
10253 if (first <= oid.snap) {
10254 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10255 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10256 *pobc = obc;
10257 return 0;
10258 } else {
10259 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10260 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10261 return -ENOENT;
10262 }
10263}
10264
10265void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10266{
10267 if (obc->ssc)
10268 put_snapset_context(obc->ssc);
10269}
10270
10271void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10272{
10273 object_info_t& oi = obc->obs.oi;
10274
10275 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10276 object_stat_sum_t stat;
10277
10278 stat.num_bytes += oi.size;
10279
10280 if (oi.soid.snap != CEPH_SNAPDIR)
10281 stat.num_objects++;
10282 if (oi.is_dirty())
10283 stat.num_objects_dirty++;
10284 if (oi.is_whiteout())
10285 stat.num_whiteouts++;
10286 if (oi.is_omap())
10287 stat.num_objects_omap++;
10288 if (oi.is_cache_pinned())
10289 stat.num_objects_pinned++;
10290
10291 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10292 stat.num_object_clones++;
10293
10294 if (!obc->ssc)
10295 obc->ssc = get_snapset_context(oi.soid, false);
10296 assert(obc->ssc);
10297
10298 // subtract off clone overlap
10299 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10300 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10301 for (interval_set<uint64_t>::const_iterator r = o.begin();
10302 r != o.end();
10303 ++r) {
10304 stat.num_bytes -= r.get_len();
10305 }
10306 }
10307 }
10308
10309 // add it in
10310 pgstat->stats.sum.add(stat);
10311}
10312
10313void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10314{
10315 const hobject_t& soid = obc->obs.oi.soid;
10316 if (obc->is_blocked()) {
10317 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10318 return;
10319 }
10320
10321 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10322 if (p != waiting_for_blocked_object.end()) {
10323 list<OpRequestRef>& ls = p->second;
10324 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10325 requeue_ops(ls);
10326 waiting_for_blocked_object.erase(p);
10327 }
10328
10329 map<hobject_t, ObjectContextRef>::iterator i =
10330 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10331 if (i != objects_blocked_on_snap_promotion.end()) {
10332 assert(i->second == obc);
10333 objects_blocked_on_snap_promotion.erase(i);
10334 }
10335
10336 if (obc->requeue_scrub_on_unblock) {
10337 obc->requeue_scrub_on_unblock = false;
10338 requeue_scrub();
10339 }
10340}
10341
10342SnapSetContext *PrimaryLogPG::get_snapset_context(
10343 const hobject_t& oid,
10344 bool can_create,
10345 const map<string, bufferlist> *attrs,
10346 bool oid_existed)
10347{
10348 Mutex::Locker l(snapset_contexts_lock);
10349 SnapSetContext *ssc;
10350 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10351 oid.get_snapdir());
10352 if (p != snapset_contexts.end()) {
10353 if (can_create || p->second->exists) {
10354 ssc = p->second;
10355 } else {
10356 return NULL;
10357 }
10358 } else {
10359 bufferlist bv;
10360 if (!attrs) {
10361 int r = -ENOENT;
10362 if (!(oid.is_head() && !oid_existed))
10363 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10364 if (r < 0) {
10365 // try _snapset
10366 if (!(oid.is_snapdir() && !oid_existed))
10367 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10368 if (r < 0 && !can_create)
10369 return NULL;
10370 }
10371 } else {
10372 assert(attrs->count(SS_ATTR));
10373 bv = attrs->find(SS_ATTR)->second;
10374 }
10375 ssc = new SnapSetContext(oid.get_snapdir());
10376 _register_snapset_context(ssc);
10377 if (bv.length()) {
10378 bufferlist::iterator bvp = bv.begin();
10379 try {
10380 ssc->snapset.decode(bvp);
10381 } catch (buffer::error& e) {
10382 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10383 return NULL;
10384 }
10385 ssc->exists = true;
10386 } else {
10387 ssc->exists = false;
10388 }
10389 }
10390 assert(ssc);
10391 ssc->ref++;
10392 return ssc;
10393}
10394
10395void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10396{
10397 Mutex::Locker l(snapset_contexts_lock);
10398 --ssc->ref;
10399 if (ssc->ref == 0) {
10400 if (ssc->registered)
10401 snapset_contexts.erase(ssc->oid);
10402 delete ssc;
10403 }
10404}
10405
10406/** pull - request object from a peer
10407 */
10408
10409/*
10410 * Return values:
10411 * NONE - didn't pull anything
10412 * YES - pulled what the caller wanted
10413 * OTHER - needed to pull something else first (_head or _snapdir)
10414 */
10415enum { PULL_NONE, PULL_OTHER, PULL_YES };
10416
10417int PrimaryLogPG::recover_missing(
10418 const hobject_t &soid, eversion_t v,
10419 int priority,
10420 PGBackend::RecoveryHandle *h)
10421{
10422 if (missing_loc.is_unfound(soid)) {
10423 dout(7) << "pull " << soid
10424 << " v " << v
10425 << " but it is unfound" << dendl;
10426 return PULL_NONE;
10427 }
10428
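      // The object is recorded as deleted in the authoritative log, so
      // recovery means replicating the delete: once no replica lists the
      // object as missing it is counted as globally recovered; otherwise the
      // delete is pushed to the remaining peers via recover_delete_object().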
10429 if (missing_loc.is_deleted(soid)) {
10430 start_recovery_op(soid);
10431 assert(!recovering.count(soid));
10432 recovering.insert(make_pair(soid, ObjectContextRef()));
10433 epoch_t cur_epoch = get_osdmap()->get_epoch();
10434 remove_missing_object(soid, v, new FunctionContext(
10435 [=](int) {
10436 lock();
10437 if (!pg_has_reset_since(cur_epoch)) {
10438 bool object_missing = false;
10439 for (const auto& shard : actingbackfill) {
10440 if (shard == pg_whoami)
10441 continue;
10442 if (peer_missing[shard].is_missing(soid)) {
10443 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10444 object_missing = true;
10445 break;
10446 }
10447 }
10448 if (!object_missing) {
10449 object_stat_sum_t stat_diff;
10450 stat_diff.num_objects_recovered = 1;
10451 on_global_recover(soid, stat_diff, true);
10452 } else {
10453 auto recovery_handle = pgbackend->open_recovery_op();
10454 pgbackend->recover_delete_object(soid, v, recovery_handle);
10455 pgbackend->run_recovery_op(recovery_handle, priority);
10456 }
10457 }
10458 unlock();
10459 }));
10460 return PULL_YES;
10461 }
10462
10463 // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10464 ObjectContextRef obc;
10465 ObjectContextRef head_obc;
10466 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10467 // do we have the head and/or snapdir?
10468 hobject_t head = soid.get_head();
10469 if (pg_log.get_missing().is_missing(head)) {
10470 if (recovering.count(head)) {
10471 dout(10) << " missing but already recovering head " << head << dendl;
10472 return PULL_NONE;
10473 } else {
10474 int r = recover_missing(
10475 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10476 h);
10477 if (r != PULL_NONE)
10478 return PULL_OTHER;
10479 return PULL_NONE;
10480 }
10481 }
10482 head = soid.get_snapdir();
10483 if (pg_log.get_missing().is_missing(head)) {
10484 if (recovering.count(head)) {
10485 dout(10) << " missing but already recovering snapdir " << head << dendl;
10486 return PULL_NONE;
10487 } else {
10488 int r = recover_missing(
10489 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10490 h);
10491 if (r != PULL_NONE)
10492 return PULL_OTHER;
10493 return PULL_NONE;
10494 }
10495 }
10496
10497 // we must have one or the other
10498 head_obc = get_object_context(
10499 soid.get_head(),
10500 false,
10501 0);
10502 if (!head_obc)
10503 head_obc = get_object_context(
10504 soid.get_snapdir(),
10505 false,
10506 0);
10507 assert(head_obc);
10508 }
10509 start_recovery_op(soid);
10510 assert(!recovering.count(soid));
10511 recovering.insert(make_pair(soid, obc));
10512 int r = pgbackend->recover_object(
10513 soid,
10514 v,
10515 head_obc,
10516 obc,
10517 h);
10518 // This is only a pull which shouldn't return an error
10519 assert(r >= 0);
10520 return PULL_YES;
10521}
10522
10523void PrimaryLogPG::send_remove_op(
10524 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10525{
10526 ceph_tid_t tid = osd->get_tid();
10527 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10528
10529 dout(10) << "send_remove_op " << oid << " from osd." << peer
10530 << " tid " << tid << dendl;
10531
10532 MOSDSubOp *subop = new MOSDSubOp(
10533 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10534 oid, CEPH_OSD_FLAG_ACK,
10535 get_osdmap()->get_epoch(), tid, v);
10536 subop->ops = vector<OSDOp>(1);
10537 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10538
10539 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10540}
10541
10542void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10543 eversion_t v, Context *on_complete)
10544{
10545 dout(20) << __func__ << " " << soid << " " << v << dendl;
10546 assert(on_complete != nullptr);
10547 // delete locally
10548 ObjectStore::Transaction t;
10549 remove_snap_mapped_object(t, soid);
10550
10551 ObjectRecoveryInfo recovery_info;
10552 recovery_info.soid = soid;
10553 recovery_info.version = v;
10554
10555 epoch_t cur_epoch = get_osdmap()->get_epoch();
10556 t.register_on_complete(new FunctionContext(
10557 [=](int) {
10558 lock();
10559 if (!pg_has_reset_since(cur_epoch)) {
10560 ObjectStore::Transaction t2;
10561 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10562 t2.register_on_complete(on_complete);
10563 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10564 assert(r == 0);
10565 unlock();
10566 } else {
10567 unlock();
10568 on_complete->complete(-EAGAIN);
10569 }
10570 }));
10571 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10572 assert(r == 0);
10573}
10574
10575void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10576{
10577 dout(10) << "finish_degraded_object " << oid << dendl;
10578 if (callbacks_for_degraded_object.count(oid)) {
10579 list<Context*> contexts;
10580 contexts.swap(callbacks_for_degraded_object[oid]);
10581 callbacks_for_degraded_object.erase(oid);
10582 for (list<Context*>::iterator i = contexts.begin();
10583 i != contexts.end();
10584 ++i) {
10585 (*i)->complete(0);
10586 }
10587 }
10588 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10589 oid.get_head());
10590 if (i != objects_blocked_on_degraded_snap.end() &&
10591 i->second == oid.snap)
10592 objects_blocked_on_degraded_snap.erase(i);
10593}
10594
10595void PrimaryLogPG::_committed_pushed_object(
10596 epoch_t epoch, eversion_t last_complete)
10597{
10598 lock();
10599 if (!pg_has_reset_since(epoch)) {
10600 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10601 last_complete_ondisk = last_complete;
10602
10603 if (last_complete_ondisk == info.last_update) {
10604 if (!is_primary()) {
10605 // Either we are a replica or backfill target.
10606 // we are fully up to date. tell the primary!
10607 osd->send_message_osd_cluster(
10608 get_primary().osd,
10609 new MOSDPGTrim(
10610 get_osdmap()->get_epoch(),
10611 spg_t(info.pgid.pgid, get_primary().shard),
10612 last_complete_ondisk),
10613 get_osdmap()->get_epoch());
10614 } else {
10615 calc_min_last_complete_ondisk();
10616 }
10617 }
10618
10619 } else {
10620 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10621 }
10622
10623 unlock();
10624}
10625
10626void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10627{
10628 lock();
10629 dout(20) << __func__ << dendl;
10630 if (obc) {
10631 dout(20) << "obc = " << *obc << dendl;
10632 }
10633 assert(active_pushes >= 1);
10634 --active_pushes;
10635
10636 // requeue an active chunky scrub waiting on recovery ops
10637 if (!deleting && active_pushes == 0
10638 && scrubber.is_chunky_scrub_active()) {
10639 if (ops_blocked_by_scrub()) {
10640 requeue_scrub(true);
10641 } else {
10642 requeue_scrub(false);
10643 }
10644 }
10645 unlock();
10646}
10647
10648void PrimaryLogPG::_applied_recovered_object_replica()
10649{
10650 lock();
10651 dout(20) << __func__ << dendl;
10652 assert(active_pushes >= 1);
10653 --active_pushes;
10654
10655 // requeue an active chunky scrub waiting on recovery ops
10656 if (!deleting && active_pushes == 0 &&
10657 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10658 scrubber.active_rep_scrub->get_req())->chunky) {
10659 osd->enqueue_back(
10660 info.pgid,
10661 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10662 scrubber.active_rep_scrub = OpRequestRef();
10663 }
10664 unlock();
10665}
10666
10667void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10668{
10669 dout(10) << "got missing " << oid << " v " << v << dendl;
10670 pg_log.recover_got(oid, v, info);
10671 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10672 dout(10) << "last_complete now " << info.last_complete
10673 << " log.complete_to " << pg_log.get_log().complete_to->version
10674 << dendl;
10675 } else {
10676 dout(10) << "last_complete now " << info.last_complete
10677 << " log.complete_to at end" << dendl;
10678 //below is not true in the repair case.
10679 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10680 assert(info.last_complete == info.last_update);
10681 }
10682}
10683
10684void PrimaryLogPG::primary_failed(const hobject_t &soid)
10685{
10686 list<pg_shard_t> fl = { pg_whoami };
10687 failed_push(fl, soid);
10688}
10689
10690void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10691{
10692 dout(20) << __func__ << ": " << soid << dendl;
10693 assert(recovering.count(soid));
10694 auto obc = recovering[soid];
10695 if (obc) {
10696 list<OpRequestRef> blocked_ops;
10697 obc->drop_recovery_read(&blocked_ops);
10698 requeue_ops(blocked_ops);
10699 }
10700 recovering.erase(soid);
10701 for (auto&& i : from)
10702 missing_loc.remove_location(soid, i);
10703 dout(0) << __func__ << " " << soid << " from shard " << from
10704 << ", reps on " << missing_loc.get_locations(soid)
10705 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10706 finish_recovery_op(soid); // close out this attempt
10707}
10708
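// Legacy removal path: this handles the MOSDSubOp-based removes that
// send_remove_op() still issues to pre-luminous (jewel) backfill targets;
// see the jewel branch near the end of recover_backfill() below.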
10709void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10710{
10711 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10712 assert(m->get_type() == MSG_OSD_SUBOP);
10713 dout(7) << "sub_op_remove " << m->poid << dendl;
10714
10715 op->mark_started();
10716
10717 ObjectStore::Transaction t;
10718 remove_snap_mapped_object(t, m->poid);
10719 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10720 assert(r == 0);
10721}
10722
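// Scan our own missing set plus every peer's to find the newest version of
// oid that some shard in actingbackfill still has; LOST_REVERT in
// mark_all_unfound_lost() below reverts the object to this version.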
10723eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10724{
10725 eversion_t v;
10726 pg_missing_item pmi;
10727 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10728 assert(is_missing);
10729 v = pmi.have;
10730 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10731
10732 assert(!actingbackfill.empty());
10733 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10734 i != actingbackfill.end();
10735 ++i) {
10736 if (*i == get_primary()) continue;
10737 pg_shard_t peer = *i;
10738 if (!peer_missing[peer].is_missing(oid)) {
10739 continue;
10740 }
10741 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10742 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10743 if (h > v)
10744 v = h;
10745 }
10746
10747 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10748 return v;
10749}
10750
10751void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10752{
10753 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10754 op->get_req());
10755 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10756 ObjectStore::Transaction t;
10757 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
10758 if (m->pg_trim_to != eversion_t())
10759 op_trim_to = m->pg_trim_to;
10760 if (m->pg_roll_forward_to != eversion_t())
10761 op_roll_forward_to = m->pg_roll_forward_to;
10762
10763 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
10764
10765 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
10766 eversion_t new_lcod = info.last_complete;
10767
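  // The completion callback re-takes the PG lock, verifies that no new
  // interval has started since the message's epoch, records the new
  // last_complete_ondisk, and only then acks the primary so this shard can
  // be counted as committed.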
10768 Context *complete = new FunctionContext(
10769 [=](int) {
10770 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10771 op->get_req());
10772 lock();
10773 if (!pg_has_reset_since(msg->get_epoch())) {
10774 update_last_complete_ondisk(new_lcod);
10775 MOSDPGUpdateLogMissingReply *reply =
10776 new MOSDPGUpdateLogMissingReply(
10777 spg_t(info.pgid.pgid, primary_shard().shard),
10778 pg_whoami.shard,
10779 msg->get_epoch(),
10780 msg->min_epoch,
10781 msg->get_tid(),
10782 new_lcod);
10783 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10784 msg->get_connection()->send_message(reply);
10785 }
10786 unlock();
10787 });
10788
10789 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10790 t.register_on_commit(complete);
10791 } else {
10792 /* Hack to work around the fact that ReplicatedBackend sends
10793 * ack+commit if commit happens first
10794 *
10795 * This behavior is no longer necessary, but we preserve it so old
10796 * primaries can keep their repops in order */
10797 if (pool.info.ec_pool()) {
10798 t.register_on_complete(complete);
10799 } else {
10800 t.register_on_commit(complete);
10801 }
10802 }
10803 t.register_on_applied(
10804 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10805 int tr = osd->store->queue_transaction(
10806 osr.get(),
10807 std::move(t),
10808 nullptr);
10809 assert(tr == 0);
10810}
10811
10812void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10813{
10814 const MOSDPGUpdateLogMissingReply *m =
10815 static_cast<const MOSDPGUpdateLogMissingReply*>(
10816 op->get_req());
10817 dout(20) << __func__ << " got reply from "
10818 << m->get_from() << dendl;
10819
10820 auto it = log_entry_update_waiting_on.find(m->get_tid());
10821 if (it != log_entry_update_waiting_on.end()) {
10822 if (it->second.waiting_on.count(m->get_from())) {
10823 it->second.waiting_on.erase(m->get_from());
10824 if (m->last_complete_ondisk != eversion_t()) {
10825 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
10826 }
10827 } else {
10828 osd->clog->error()
10829 << info.pgid << " got reply "
10830 << *m << " from shard we are not waiting for "
10831 << m->get_from();
10832 }
10833
10834 if (it->second.waiting_on.empty()) {
10835 repop_all_committed(it->second.repop.get());
10836 log_entry_update_waiting_on.erase(it);
10837 }
10838 } else {
10839 osd->clog->error()
10840 << info.pgid << " got reply "
10841 << *m << " on unknown tid " << m->get_tid();
10842 }
10843}
10844
10845/* Mark all unfound objects as lost.
10846 */
10847void PrimaryLogPG::mark_all_unfound_lost(
10848 int what,
10849 ConnectionRef con,
10850 ceph_tid_t tid)
10851{
10852 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10853 list<hobject_t> oids;
10854
10855 dout(30) << __func__ << ": log before:\n";
10856 pg_log.get_log().print(*_dout);
10857 *_dout << dendl;
10858
10859 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10860
10861 utime_t mtime = ceph_clock_now();
10862 map<hobject_t, pg_missing_item>::const_iterator m =
10863 missing_loc.get_needs_recovery().begin();
10864 map<hobject_t, pg_missing_item>::const_iterator mend =
10865 missing_loc.get_needs_recovery().end();
10866
10867 ObcLockManager manager;
10868 eversion_t v = get_next_version();
10869 v.epoch = get_osdmap()->get_epoch();
10870 uint64_t num_unfound = missing_loc.num_unfound();
10871 while (m != mend) {
10872 const hobject_t &oid(m->first);
10873 if (!missing_loc.is_unfound(oid)) {
10874 // We only care about unfound objects
10875 ++m;
10876 continue;
10877 }
10878
10879 ObjectContextRef obc;
10880 eversion_t prev;
10881
10882 switch (what) {
10883 case pg_log_entry_t::LOST_MARK:
10884 assert(0 == "actually, not implemented yet!");
10885 break;
10886
10887 case pg_log_entry_t::LOST_REVERT:
10888 prev = pick_newest_available(oid);
10889 if (prev > eversion_t()) {
10890 // log it
10891 pg_log_entry_t e(
10892 pg_log_entry_t::LOST_REVERT, oid, v,
10893 m->second.need, 0, osd_reqid_t(), mtime, 0);
10894 e.reverting_to = prev;
10895 e.mark_unrollbackable();
10896 log_entries.push_back(e);
10897 dout(10) << e << dendl;
10898
10899 // we are now missing the new version; recovery code will sort it out.
10900 ++v.version;
10901 ++m;
10902 break;
10903 }
10904
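      // deliberate fall-through: with no previous version to revert to,
      // a LOST_REVERT degenerates into a LOST_DELETE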
10905 case pg_log_entry_t::LOST_DELETE:
10906 {
10907 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10908 0, osd_reqid_t(), mtime, 0);
10909 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10910 if (pool.info.require_rollback()) {
10911 e.mod_desc.try_rmobject(v.version);
10912 } else {
10913 e.mark_unrollbackable();
10914 }
10915 } // otherwise, just do what we used to do
10916 dout(10) << e << dendl;
10917 log_entries.push_back(e);
10918 oids.push_back(oid);
10919
10920 // If a context is found, mark the object as deleted to guard
10921 // against racing with a new creation. This can happen if the
10922 // object is lost and the primary hit an EIO.
10923 obc = object_contexts.lookup(oid);
10924 if (obc)
10925 obc->obs.exists = false;
10926
10927 ++v.version;
10928 ++m;
10929 }
10930 break;
10931
10932 default:
10933 ceph_abort();
10934 }
10935 }
10936
10937 info.stats.stats_invalid = true;
10938
10939 submit_log_entries(
10940 log_entries,
10941 std::move(manager),
10942 boost::optional<std::function<void(void)> >(
10943 [this, oids, con, num_unfound, tid]() {
10944 if (perform_deletes_during_peering()) {
10945 for (auto oid : oids) {
10946 // clear old locations - merge_new_log_entries will have
10947 // handled rebuilding missing_loc for each of these
10948 // objects if we have the RECOVERY_DELETES flag
10949 missing_loc.recovered(oid);
10950 }
10951 }
10952
10953 if (is_recovery_unfound()) {
10954 queue_peering_event(
10955 CephPeeringEvtRef(
10956 std::make_shared<CephPeeringEvt>(
10957 get_osdmap()->get_epoch(),
10958 get_osdmap()->get_epoch(),
10959 DoRecovery())));
10960 } else if (is_backfill_unfound()) {
10961 queue_peering_event(
10962 CephPeeringEvtRef(
10963 std::make_shared<CephPeeringEvt>(
10964 get_osdmap()->get_epoch(),
10965 get_osdmap()->get_epoch(),
10966 RequestBackfill())));
10967 } else {
10968 queue_recovery();
10969 }
10970
10971 stringstream ss;
10972 ss << "pg has " << num_unfound
10973 << " objects unfound and apparently lost marking";
10974 string rs = ss.str();
10975 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10976 osd->clog->info() << rs;
10977 if (con) {
10978 MCommandReply *reply = new MCommandReply(0, rs);
10979 reply->set_tid(tid);
10980 con->send_message(reply);
10981 }
10982 }),
10983 OpRequestRef());
10984}
10985
10986void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10987{
10988 assert(repop_queue.empty());
10989}
10990
10991/*
10992 * pg status change notification
10993 */
10994
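// Abort every in-flight RepGather. With requeue set, the original client op
// and any dup ops parked in waiting_for_ondisk are put back on the queue in
// their original submission order.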
10995void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10996{
10997 list<OpRequestRef> rq;
10998
10999 // apply all repops
11000 while (!repop_queue.empty()) {
11001 RepGather *repop = repop_queue.front();
11002 repop_queue.pop_front();
11003 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11004 repop->rep_aborted = true;
11005 repop->on_applied.clear();
11006 repop->on_committed.clear();
11007 repop->on_success.clear();
11008
11009 if (requeue) {
11010 if (repop->op) {
11011 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11012 rq.push_back(repop->op);
11013 repop->op = OpRequestRef();
11014 }
11015
11016 // also requeue any dups, interleaved into position
11017 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
11018 waiting_for_ondisk.find(repop->v);
11019 if (p != waiting_for_ondisk.end()) {
11020 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11021 for (list<pair<OpRequestRef, version_t> >::iterator i =
11022 p->second.begin();
11023 i != p->second.end();
11024 ++i) {
11025 rq.push_back(i->first);
11026 }
11027 waiting_for_ondisk.erase(p);
11028 }
11029 }
11030
11031 remove_repop(repop);
11032 }
11033
11034 assert(repop_queue.empty());
11035
11036 if (requeue) {
11037 requeue_ops(rq);
11038 if (!waiting_for_ondisk.empty()) {
11039 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
11040 waiting_for_ondisk.begin();
11041 i != waiting_for_ondisk.end();
11042 ++i) {
11043 for (list<pair<OpRequestRef, version_t> >::iterator j =
11044 i->second.begin();
11045 j != i->second.end();
11046 ++j) {
11047 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
11048 << i->first << dendl;
11049 }
11050 }
11051 assert(waiting_for_ondisk.empty());
11052 }
11053 }
11054
11055 waiting_for_ondisk.clear();
11056}
11057
11058void PrimaryLogPG::on_flushed()
11059{
11060 assert(flushes_in_progress > 0);
11061 flushes_in_progress--;
11062 if (flushes_in_progress == 0) {
11063 requeue_ops(waiting_for_flush);
11064 }
11065 if (!is_peered() || !is_primary()) {
11066 pair<hobject_t, ObjectContextRef> i;
11067 while (object_contexts.get_next(i.first, &i)) {
11068 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
11069 }
11070 assert(object_contexts.empty());
11071 }
11072 pgbackend->on_flushed();
11073}
11074
11075void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
11076{
11077 dout(10) << "on_removal" << dendl;
11078
11079 // adjust info to backfill
11080 info.set_last_backfill(hobject_t());
11081 pg_log.reset_backfill();
11082 dirty_info = true;
11083
11084
11085 // clear log
11086 PGLogEntryHandler rollbacker{this, t};
11087 pg_log.roll_forward(&rollbacker);
11088
11089 write_if_dirty(*t);
11090
11091 if (!deleting)
11092 on_shutdown();
11093}
11094
11095void PrimaryLogPG::clear_async_reads()
11096{
11097 dout(10) << __func__ << dendl;
11098 for (auto& i : in_progress_async_reads) {
11099 dout(10) << "clear ctx: "
11100 << "OpRequestRef " << i.first
11101 << " OpContext " << i.second
11102 << dendl;
11103 close_op_ctx(i.second);
11104 }
11105}
11106
11107void PrimaryLogPG::on_shutdown()
11108{
11109 dout(10) << "on_shutdown" << dendl;
11110
11111 // remove from queues
11112 osd->pg_stat_queue_dequeue(this);
11113 osd->peering_wq.dequeue(this);
11114
11115 // handles queue races
11116 deleting = true;
11117
11118 if (recovery_queued) {
11119 recovery_queued = false;
11120 osd->clear_queued_recovery(this);
11121 }
11122
11123 clear_scrub_reserved();
11124 scrub_clear_state();
11125
11126 unreg_next_scrub();
11127
11128 vector<ceph_tid_t> tids;
11129 cancel_copy_ops(false, &tids);
11130 cancel_flush_ops(false, &tids);
11131 cancel_proxy_ops(false, &tids);
11132 osd->objecter->op_cancel(tids, -ECANCELED);
11133
11134 apply_and_flush_repops(false);
11135 cancel_log_updates();
11136 // we must remove PGRefs, so do this prior to the release_backoffs() callers
11137 clear_backoffs();
11138 // clean up snap trim references
11139 snap_trimmer_machine.process_event(Reset());
11140
11141 pgbackend->on_change();
11142
11143 context_registry_on_change();
11144 object_contexts.clear();
11145
11146 clear_async_reads();
11147
11148 osd->remote_reserver.cancel_reservation(info.pgid);
11149 osd->local_reserver.cancel_reservation(info.pgid);
11150
11151 clear_primary_state();
11152 cancel_recovery();
11153}
11154
11155void PrimaryLogPG::on_activate()
11156{
11157 // all clean?
11158 if (needs_recovery()) {
11159 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11160 queue_peering_event(
11161 CephPeeringEvtRef(
11162 std::make_shared<CephPeeringEvt>(
11163 get_osdmap()->get_epoch(),
11164 get_osdmap()->get_epoch(),
11165 DoRecovery())));
11166 } else if (needs_backfill()) {
11167 dout(10) << "activate queueing backfill" << dendl;
11168 queue_peering_event(
11169 CephPeeringEvtRef(
11170 std::make_shared<CephPeeringEvt>(
11171 get_osdmap()->get_epoch(),
11172 get_osdmap()->get_epoch(),
11173 RequestBackfill())));
11174 } else {
11175 dout(10) << "activate all replicas clean, no recovery" << dendl;
11176 eio_errors_to_process = false;
11177 queue_peering_event(
11178 CephPeeringEvtRef(
11179 std::make_shared<CephPeeringEvt>(
11180 get_osdmap()->get_epoch(),
11181 get_osdmap()->get_epoch(),
11182 AllReplicasRecovered())));
11183 }
11184
11185 publish_stats_to_osd();
11186
11187 if (!backfill_targets.empty()) {
11188 last_backfill_started = earliest_backfill();
11189 new_backfill = true;
11190 assert(!last_backfill_started.is_max());
11191 dout(5) << "on activate: bft=" << backfill_targets
11192 << " from " << last_backfill_started << dendl;
11193 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11194 i != backfill_targets.end();
11195 ++i) {
11196 dout(5) << "target shard " << *i
11197 << " from " << peer_info[*i].last_backfill
11198 << dendl;
11199 }
11200 }
11201
11202 hit_set_setup();
11203 agent_setup();
11204}
11205
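// On each new interval, rebuild the missing set so it tracks deletes once
// the OSDMap carries the RECOVERY_DELETES flag; the assert below pins the
// invariant that the two agree from then on.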
11206void PrimaryLogPG::_on_new_interval()
11207{
11208 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11209 if (!pg_log.get_missing().may_include_deletes &&
11210 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11211 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11212 }
11213 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11214}
11215
11216void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11217{
11218 dout(10) << "on_change" << dendl;
11219
11220 if (hit_set && hit_set->insert_count() == 0) {
11221 dout(20) << " discarding empty hit_set" << dendl;
11222 hit_set_clear();
11223 }
11224
11225 if (recovery_queued) {
11226 recovery_queued = false;
11227 osd->clear_queued_recovery(this);
11228 }
11229
11230 // requeue everything in the reverse order they should be
11231 // reexamined.
11232 requeue_ops(waiting_for_peered);
11233 requeue_ops(waiting_for_flush);
11234 requeue_ops(waiting_for_active);
11235
11236 clear_scrub_reserved();
11237
11238 vector<ceph_tid_t> tids;
11239 cancel_copy_ops(is_primary(), &tids);
11240 cancel_flush_ops(is_primary(), &tids);
11241 cancel_proxy_ops(is_primary(), &tids);
11242 osd->objecter->op_cancel(tids, -ECANCELED);
11243
11244 // requeue object waiters
11245 for (auto& p : waiting_for_unreadable_object) {
11246 release_backoffs(p.first);
11247 }
11248 if (is_primary()) {
11249 requeue_object_waiters(waiting_for_unreadable_object);
11250 } else {
11251 waiting_for_unreadable_object.clear();
11252 }
11253 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11254 p != waiting_for_degraded_object.end();
11255 waiting_for_degraded_object.erase(p++)) {
11256 release_backoffs(p->first);
11257 if (is_primary())
11258 requeue_ops(p->second);
11259 else
11260 p->second.clear();
11261 finish_degraded_object(p->first);
11262 }
11263
11264 // requeues waiting_for_scrub
11265 scrub_clear_state();
11266
11267 for (auto p = waiting_for_blocked_object.begin();
11268 p != waiting_for_blocked_object.end();
11269 waiting_for_blocked_object.erase(p++)) {
11270 if (is_primary())
11271 requeue_ops(p->second);
11272 else
11273 p->second.clear();
11274 }
11275 for (auto i = callbacks_for_degraded_object.begin();
11276 i != callbacks_for_degraded_object.end();
11277 ) {
11278 finish_degraded_object((i++)->first);
11279 }
11280 assert(callbacks_for_degraded_object.empty());
11281
11282 if (is_primary()) {
11283 requeue_ops(waiting_for_cache_not_full);
11284 } else {
11285 waiting_for_cache_not_full.clear();
11286 }
11287 objects_blocked_on_cache_full.clear();
11288
11289 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11290 in_progress_async_reads.begin();
11291 i != in_progress_async_reads.end();
11292 in_progress_async_reads.erase(i++)) {
11293 close_op_ctx(i->second);
11294 if (is_primary())
11295 requeue_op(i->first);
11296 }
11297
11298 // this will requeue ops we were working on but didn't finish, and
11299 // any dups
11300 apply_and_flush_repops(is_primary());
11301 cancel_log_updates();
11302
11303 // do this *after* apply_and_flush_repops so that we catch any newly
11304 // registered watches.
11305 context_registry_on_change();
11306
11307 pgbackend->on_change_cleanup(t);
11308 scrubber.cleanup_store(t);
11309 pgbackend->on_change();
11310
11311 // clear snap_trimmer state
11312 snap_trimmer_machine.process_event(Reset());
11313
11314 debug_op_order.clear();
11315 unstable_stats.clear();
11316
11317 // we don't want to cache object_contexts through the interval change
11318 // NOTE: we actually assert that all currently live references are dead
11319 // by the time the flush for the next interval completes.
11320 object_contexts.clear();
11321
11322 // should have been cleared above by finishing all of the degraded objects
11323 assert(objects_blocked_on_degraded_snap.empty());
11324}
11325
11326void PrimaryLogPG::on_role_change()
11327{
11328 dout(10) << "on_role_change" << dendl;
11329 if (get_role() != 0 && hit_set) {
11330 dout(10) << " clearing hit set" << dendl;
11331 hit_set_clear();
11332 }
11333}
11334
11335void PrimaryLogPG::on_pool_change()
11336{
11337 dout(10) << __func__ << dendl;
11338 // requeue cache full waiters just in case the cache_mode is
11339 // changing away from writeback mode. note that if we are not
11340 // active the normal requeuing machinery is sufficient (and properly
11341 // ordered).
11342 if (is_active() &&
11343 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11344 !waiting_for_cache_not_full.empty()) {
11345 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11346 << dendl;
11347 requeue_ops(waiting_for_cache_not_full);
11348 objects_blocked_on_cache_full.clear();
11349 }
11350 hit_set_setup();
11351 agent_setup();
11352}
11353
11354// clear state. called on recovery completion AND cancellation.
11355void PrimaryLogPG::_clear_recovery_state()
11356{
11357 missing_loc.clear();
11358#ifdef DEBUG_RECOVERY_OIDS
11359 recovering_oids.clear();
11360#endif
11361 last_backfill_started = hobject_t();
11362 set<hobject_t>::iterator i = backfills_in_flight.begin();
11363 while (i != backfills_in_flight.end()) {
11364 assert(recovering.count(*i));
11365 backfills_in_flight.erase(i++);
11366 }
11367
11368 list<OpRequestRef> blocked_ops;
11369 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11370 i != recovering.end();
11371 recovering.erase(i++)) {
11372 if (i->second) {
11373 i->second->drop_recovery_read(&blocked_ops);
11374 requeue_ops(blocked_ops);
11375 }
11376 }
11377 assert(backfills_in_flight.empty());
11378 pending_backfill_updates.clear();
11379 assert(recovering.empty());
11380 pgbackend->clear_recovery_state();
11381}
11382
11383void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11384{
11385 dout(20) << __func__ << ": " << soid << dendl;
11386 assert(recovering.count(soid));
11387 ObjectContextRef obc = recovering[soid];
11388 if (obc) {
11389 list<OpRequestRef> blocked_ops;
11390 obc->drop_recovery_read(&blocked_ops);
11391 requeue_ops(blocked_ops);
11392 }
11393 recovering.erase(soid);
11394 finish_recovery_op(soid);
11395 release_backoffs(soid);
11396 if (waiting_for_degraded_object.count(soid)) {
11397 dout(20) << " kicking degraded waiters on " << soid << dendl;
11398 requeue_ops(waiting_for_degraded_object[soid]);
11399 waiting_for_degraded_object.erase(soid);
11400 }
11401 if (waiting_for_unreadable_object.count(soid)) {
11402 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11403 requeue_ops(waiting_for_unreadable_object[soid]);
11404 waiting_for_unreadable_object.erase(soid);
11405 }
11406 if (is_missing_object(soid))
11407 pg_log.set_last_requested(0); // get recover_primary to start over
11408 finish_degraded_object(soid);
11409}
11410
11411void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11412{
11413 /*
11414 * check that any peers we are planning to pull (or are currently
11415 * pulling) objects from are dealt with.
11416 */
11417 missing_loc.check_recovery_sources(osdmap);
11418 pgbackend->check_recovery_sources(osdmap);
11419
11420 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11421 i != peer_log_requested.end();
11422 ) {
11423 if (!osdmap->is_up(i->osd)) {
11424 dout(10) << "peer_log_requested removing " << *i << dendl;
11425 peer_log_requested.erase(i++);
11426 } else {
11427 ++i;
11428 }
11429 }
11430
11431 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11432 i != peer_missing_requested.end();
11433 ) {
11434 if (!osdmap->is_up(i->osd)) {
11435 dout(10) << "peer_missing_requested removing " << *i << dendl;
11436 peer_missing_requested.erase(i++);
11437 } else {
11438 ++i;
11439 }
11440 }
11441}
11442
11443void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11444{
11445 set<pg_shard_t> now_down;
11446 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11447 p != missing_loc_sources.end();
11448 ) {
11449 if (osdmap->is_up(p->osd)) {
11450 ++p;
11451 continue;
11452 }
11453 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11454 now_down.insert(*p);
11455 missing_loc_sources.erase(p++);
11456 }
11457
11458 if (now_down.empty()) {
11459 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11460 } else {
11461 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11462 << missing_loc_sources << dendl;
11463
11464 // filter missing_loc
11465 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11466 while (p != missing_loc.end()) {
11467 set<pg_shard_t>::iterator q = p->second.begin();
11468 while (q != p->second.end())
11469 if (now_down.count(*q)) {
11470 p->second.erase(q++);
11471 } else {
11472 ++q;
11473 }
11474 if (p->second.empty())
11475 missing_loc.erase(p++);
11476 else
11477 ++p;
11478 }
11479 }
11480}
11481
11482
11483bool PrimaryLogPG::start_recovery_ops(
11484 uint64_t max,
11485 ThreadPool::TPHandle &handle,
11486 uint64_t *ops_started)
11487{
11488 uint64_t& started = *ops_started;
11489 started = 0;
11490 bool work_in_progress = false;
11491 assert(is_primary());
11492
11493 if (!state_test(PG_STATE_RECOVERING) &&
11494 !state_test(PG_STATE_BACKFILLING)) {
11495 /* TODO: I think this case is broken and will make do_recovery()
11496 * unhappy since we're returning false */
11497 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11498 return false;
11499 }
11500
11501 const auto &missing = pg_log.get_missing();
11502
11503 unsigned int num_missing = missing.num_missing();
11504 uint64_t num_unfound = get_num_unfound();
11505
11506 if (num_missing == 0) {
11507 info.last_complete = info.last_update;
11508 }
11509
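  // Recovery runs in up to three passes: if every missing object is also
  // unfound, work on the replicas first; otherwise pull objects to the
  // primary; and if that turned previously-unfound objects into found ones,
  // give the replicas a second chance.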
11510 if (num_missing == num_unfound) {
11511 // All of the missing objects we have are unfound.
11512 // Recover the replicas.
11513 started = recover_replicas(max, handle);
11514 }
11515 if (!started) {
11516 // We still have missing objects that we should grab from replicas.
11517 started += recover_primary(max, handle);
11518 }
11519 if (!started && num_unfound != get_num_unfound()) {
11520 // second chance to recover replicas
11521 started = recover_replicas(max, handle);
11522 }
11523
11524 if (started)
11525 work_in_progress = true;
11526
11527 bool deferred_backfill = false;
11528 if (recovering.empty() &&
11529 state_test(PG_STATE_BACKFILLING) &&
11530 !backfill_targets.empty() && started < max &&
11531 missing.num_missing() == 0 &&
11532 waiting_on_backfill.empty()) {
11533 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11534 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11535 deferred_backfill = true;
11536 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11537 !is_degraded()) {
11538 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11539 deferred_backfill = true;
11540 } else if (!backfill_reserved) {
11541 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11542 if (!backfill_reserving) {
11543 dout(10) << "queueing RequestBackfill" << dendl;
11544 backfill_reserving = true;
11545 queue_peering_event(
11546 CephPeeringEvtRef(
11547 std::make_shared<CephPeeringEvt>(
11548 get_osdmap()->get_epoch(),
11549 get_osdmap()->get_epoch(),
11550 RequestBackfill())));
11551 }
11552 deferred_backfill = true;
11553 } else {
11554 started += recover_backfill(max - started, handle, &work_in_progress);
11555 }
11556 }
11557
11558 dout(10) << " started " << started << dendl;
11559 osd->logger->inc(l_osd_rop, started);
11560
11561 if (!recovering.empty() ||
11562 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11563 return work_in_progress;
11564
11565 assert(recovering.empty());
11566 assert(recovery_ops_active == 0);
11567
11568 dout(10) << __func__ << " needs_recovery: "
11569 << missing_loc.get_needs_recovery()
11570 << dendl;
11571 dout(10) << __func__ << " missing_loc: "
11572 << missing_loc.get_missing_locs()
11573 << dendl;
11574 int unfound = get_num_unfound();
11575 if (unfound) {
11576 dout(10) << " still have " << unfound << " unfound" << dendl;
11577 return work_in_progress;
11578 }
11579
11580 if (missing.num_missing() > 0) {
11581 // this shouldn't happen!
11582 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11583 << missing.num_missing() << ": " << missing.get_items();
11584 return work_in_progress;
11585 }
11586
11587 if (needs_recovery()) {
11588 // this shouldn't happen!
11589 // We already checked num_missing() so we must have missing replicas
11590 osd->clog->error() << info.pgid
11591 << " Unexpected Error: recovery ending with missing replicas";
11592 return work_in_progress;
11593 }
11594
11595 if (state_test(PG_STATE_RECOVERING)) {
11596 state_clear(PG_STATE_RECOVERING);
11597 state_clear(PG_STATE_FORCED_RECOVERY);
11598 if (needs_backfill()) {
11599 dout(10) << "recovery done, queuing backfill" << dendl;
11600 queue_peering_event(
11601 CephPeeringEvtRef(
11602 std::make_shared<CephPeeringEvt>(
11603 get_osdmap()->get_epoch(),
11604 get_osdmap()->get_epoch(),
11605 RequestBackfill())));
11606 } else {
11607 dout(10) << "recovery done, no backfill" << dendl;
11608 eio_errors_to_process = false;
11609 state_clear(PG_STATE_FORCED_BACKFILL);
11610 queue_peering_event(
11611 CephPeeringEvtRef(
11612 std::make_shared<CephPeeringEvt>(
11613 get_osdmap()->get_epoch(),
11614 get_osdmap()->get_epoch(),
11615 AllReplicasRecovered())));
11616 }
11617 } else { // backfilling
11618 state_clear(PG_STATE_BACKFILLING);
11619 state_clear(PG_STATE_FORCED_BACKFILL);
11620 state_clear(PG_STATE_FORCED_RECOVERY);
11621 dout(10) << "recovery done, backfill done" << dendl;
11622 eio_errors_to_process = false;
11623 queue_peering_event(
11624 CephPeeringEvtRef(
11625 std::make_shared<CephPeeringEvt>(
11626 get_osdmap()->get_epoch(),
11627 get_osdmap()->get_epoch(),
11628 Backfilled())));
11629 }
11630
11631 return false;
11632}
11633
11634/**
11635 * start recovery ops on the primary.
11636 * return the number of ops started.
11637 */
11638uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11639{
11640 assert(is_primary());
11641
11642 const auto &missing = pg_log.get_missing();
11643
11644 dout(10) << "recover_primary recovering " << recovering.size()
11645 << " in pg" << dendl;
11646 dout(10) << "recover_primary " << missing << dendl;
11647 dout(25) << "recover_primary " << missing.get_items() << dendl;
11648
11649 // look at log!
11650 pg_log_entry_t *latest = 0;
11651 unsigned started = 0;
11652 int skipped = 0;
11653
11654 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11655 map<version_t, hobject_t>::const_iterator p =
11656 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11657 while (p != missing.get_rmissing().end()) {
11658 handle.reset_tp_timeout();
11659 hobject_t soid;
11660 version_t v = p->first;
11661
11662 if (pg_log.get_log().objects.count(p->second)) {
11663 latest = pg_log.get_log().objects.find(p->second)->second;
11664 assert(latest->is_update() || latest->is_delete());
11665 soid = latest->soid;
11666 } else {
11667 latest = 0;
11668 soid = p->second;
11669 }
11670 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11671 ++p;
11672
11673 hobject_t head = soid.get_head();
11674
11675 eversion_t need = item.need;
11676
11677 dout(10) << "recover_primary "
11678 << soid << " " << item.need
11679 << (missing.is_missing(soid) ? " (missing)":"")
11680 << (missing.is_missing(head) ? " (missing head)":"")
11681 << (recovering.count(soid) ? " (recovering)":"")
11682 << (recovering.count(head) ? " (recovering head)":"")
11683 << dendl;
11684
11685 if (latest) {
11686 switch (latest->op) {
11687 case pg_log_entry_t::CLONE:
11688 /*
11689 * Handling for this special case removed for now, until we
11690 * can correctly construct an accurate SnapSet from the old
11691 * one.
11692 */
11693 break;
11694
11695 case pg_log_entry_t::LOST_REVERT:
11696 {
11697 if (item.have == latest->reverting_to) {
11698 ObjectContextRef obc = get_object_context(soid, true);
11699
11700 if (obc->obs.oi.version == latest->version) {
11701 // I'm already reverting
11702 dout(10) << " already reverting " << soid << dendl;
11703 } else {
11704 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11705 obc->ondisk_write_lock();
11706 obc->obs.oi.version = latest->version;
11707
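	  // The on-disk data already matches reverting_to (item.have says so),
	  // so reverting locally is just rewriting the object_info xattr with
	  // the new version; no object data needs to move.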
11708 ObjectStore::Transaction t;
11709 bufferlist b2;
11710 obc->obs.oi.encode(
11711 b2,
11712 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11713 assert(!pool.info.require_rollback());
11714 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11715
11716 recover_got(soid, latest->version);
11717 missing_loc.add_location(soid, pg_whoami);
11718
11719 ++active_pushes;
11720
11721 osd->store->queue_transaction(osr.get(), std::move(t),
11722 new C_OSD_AppliedRecoveredObject(this, obc),
11723 new C_OSD_CommittedPushedObject(
11724 this,
11725 get_osdmap()->get_epoch(),
11726 info.last_complete),
11727 new C_OSD_OndiskWriteUnlock(obc));
11728 continue;
11729 }
11730 } else {
11731 /*
11732 * Pull the old version of the object. Update missing_loc here to have the location
11733 * of the version we want.
11734 *
11735 * This doesn't use the usual missing_loc paths, but that's okay:
11736 * - if we have it locally, we hit the case above, and go from there.
11737 * - if we don't, we always pass through this case during recovery and set up the location
11738 * properly.
11739 * - this way we don't need to mangle the missing code to be general about needing an old
11740 * version...
11741 */
11742 eversion_t alternate_need = latest->reverting_to;
11743 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11744
11745 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11746 p != peer_missing.end();
11747 ++p)
11748 if (p->second.is_missing(soid, need) &&
11749 p->second.get_items().at(soid).have == alternate_need) {
11750 missing_loc.add_location(soid, p->first);
11751 }
11752 dout(10) << " will pull " << alternate_need << " or " << need
11753 << " from one of " << missing_loc.get_locations(soid)
11754 << dendl;
11755 }
11756 }
11757 break;
11758 }
11759 }
11760
11761 if (!recovering.count(soid)) {
11762 if (recovering.count(head)) {
11763 ++skipped;
11764 } else {
11765 int r = recover_missing(
11766 soid, need, get_recovery_op_priority(), h);
11767 switch (r) {
11768 case PULL_YES:
11769 ++started;
11770 break;
11771 case PULL_OTHER:
11772 ++started;
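	  // fall through: PULL_OTHER means we started pulling a different
	  // object (such as the head) first, so this one also counts as skipped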
11773 case PULL_NONE:
11774 ++skipped;
11775 break;
11776 default:
11777 ceph_abort();
11778 }
11779 if (started >= max)
11780 break;
11781 }
11782 }
11783
11784 // only advance last_requested if we haven't skipped anything
11785 if (!skipped)
11786 pg_log.set_last_requested(v);
11787 }
11788
11789 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11790 return started;
11791}
11792
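// Called when the primary's own copy proves unreadable: record the object as
// missing locally, drop ourselves as a location, and probe the acting set for
// an intact copy. Returns true if the object is now unfound.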
11793bool PrimaryLogPG::primary_error(
11794 const hobject_t& soid, eversion_t v)
11795{
11796 pg_log.missing_add(soid, v, eversion_t());
11797 pg_log.set_last_requested(0);
11798 missing_loc.remove_location(soid, pg_whoami);
11799 bool uhoh = true;
11800 assert(!actingbackfill.empty());
11801 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11802 i != actingbackfill.end();
11803 ++i) {
11804 if (*i == get_primary()) continue;
11805 pg_shard_t peer = *i;
11806 if (!peer_missing[peer].is_missing(soid, v)) {
11807 missing_loc.add_location(soid, peer);
11808 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11809 << ", there should be a copy on shard " << peer << dendl;
11810 uhoh = false;
11811 }
11812 }
11813 if (uhoh)
11814 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11815 else
11816 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11817 << ", will try copies on " << missing_loc.get_locations(soid);
11818 return uhoh;
11819}
11820
11821int PrimaryLogPG::prep_object_replica_deletes(
11822 const hobject_t& soid, eversion_t v,
11823 PGBackend::RecoveryHandle *h)
11824{
11825 assert(is_primary());
11826 dout(10) << __func__ << ": on " << soid << dendl;
11827
11828 start_recovery_op(soid);
11829 assert(!recovering.count(soid));
11830 recovering.insert(make_pair(soid, ObjectContextRef()));
11831
11832 pgbackend->recover_delete_object(soid, v, h);
11833 return 1;
11834}
11835
11836int PrimaryLogPG::prep_object_replica_pushes(
11837 const hobject_t& soid, eversion_t v,
11838 PGBackend::RecoveryHandle *h)
11839{
11840 assert(is_primary());
11841 dout(10) << __func__ << ": on " << soid << dendl;
11842
11843 // NOTE: we know we will get a valid oloc off of disk here.
11844 ObjectContextRef obc = get_object_context(soid, false);
11845 if (!obc) {
11846 primary_error(soid, v);
11847 return 0;
11848 }
11849
11850 if (!obc->get_recovery_read()) {
11851 dout(20) << "recovery delayed on " << soid
11852 << "; could not get rw_manager lock" << dendl;
11853 return 0;
11854 } else {
11855 dout(20) << "recovery got recovery read lock on " << soid
11856 << dendl;
11857 }
11858
11859 start_recovery_op(soid);
11860 assert(!recovering.count(soid));
11861 recovering.insert(make_pair(soid, obc));
11862
11863 /* We need this in case there is an in progress write on the object. In fact,
11864 * the only possible write is an update to the xattr due to a lost_revert --
11865 * a client write would be blocked since the object is degraded.
11866 * In almost all cases, therefore, this lock should be uncontended.
11867 */
11868 obc->ondisk_read_lock();
11869 int r = pgbackend->recover_object(
11870 soid,
11871 v,
11872 ObjectContextRef(),
11873 obc, // has snapset context
11874 h);
11875 obc->ondisk_read_unlock();
11876 if (r < 0) {
11877 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11878 primary_failed(soid);
11879 primary_error(soid, v);
11880 return 0;
11881 }
11882 return 1;
11883}
11884
11885uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11886{
11887 dout(10) << __func__ << "(" << max << ")" << dendl;
11888 uint64_t started = 0;
11889
11890 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11891
11892 // this is FAR from an optimal recovery order. pretty lame, really.
11893 assert(!actingbackfill.empty());
11894 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11895 i != actingbackfill.end();
11896 ++i) {
11897 if (*i == get_primary()) continue;
11898 pg_shard_t peer = *i;
11899 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11900 assert(pm != peer_missing.end());
11901 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11902 assert(pi != peer_info.end());
11903 size_t m_sz = pm->second.num_missing();
11904
11905 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11906 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11907
11908 // oldest first!
11909 const pg_missing_t &m(pm->second);
11910 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11911 p != m.get_rmissing().end() && started < max;
11912 ++p) {
11913 handle.reset_tp_timeout();
11914 const hobject_t soid(p->second);
11915
11916 if (missing_loc.is_unfound(soid)) {
11917 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11918 continue;
11919 }
11920
11921 if (soid > pi->second.last_backfill) {
11922 if (!recovering.count(soid)) {
11923 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11924 derr << __func__ << ": object added to missing set for backfill, but "
11925 << "is not in recovering, error!" << dendl;
11926 ceph_abort();
11927 }
11928 continue;
11929 }
11930
11931 if (recovering.count(soid)) {
11932 dout(10) << __func__ << ": already recovering " << soid << dendl;
11933 continue;
11934 }
11935
11936 if (missing_loc.is_deleted(soid)) {
11937 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11938 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11939 started += prep_object_replica_deletes(soid, r->second.need, h);
11940 continue;
11941 }
11942
11943 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11944 dout(10) << __func__ << ": " << soid.get_head()
11945 << " still missing on primary" << dendl;
11946 continue;
11947 }
11948
11949 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11950 dout(10) << __func__ << ": " << soid.get_snapdir()
11951 << " still missing on primary" << dendl;
11952 continue;
11953 }
11954
11955 if (pg_log.get_missing().is_missing(soid)) {
11956 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11957 continue;
11958 }
11959
11960 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11961 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11962 started += prep_object_replica_pushes(soid, r->second.need,
11963 h);
11964 }
11965 }
11966
11967 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11968 return started;
11969}
11970
11971hobject_t PrimaryLogPG::earliest_peer_backfill() const
11972{
11973 hobject_t e = hobject_t::get_max();
11974 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11975 i != backfill_targets.end();
11976 ++i) {
11977 pg_shard_t peer = *i;
11978 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11979 peer_backfill_info.find(peer);
11980 assert(iter != peer_backfill_info.end());
11981 if (iter->second.begin < e)
11982 e = iter->second.begin;
11983 }
11984 return e;
11985}
11986
11987bool PrimaryLogPG::all_peer_done() const
11988{
11989 // Primary hasn't got any more objects
11990 assert(backfill_info.empty());
11991
11992 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11993 i != backfill_targets.end();
11994 ++i) {
11995 pg_shard_t bt = *i;
11996 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11997 peer_backfill_info.find(bt);
11998 assert(piter != peer_backfill_info.end());
11999 const BackfillInterval& pbi = piter->second;
12000 // See if peer has more to process
12001 if (!pbi.extends_to_end() || !pbi.empty())
12002 return false;
12003 }
12004 return true;
12005}
12006
12007/**
12008 * recover_backfill
12009 *
12010 * Invariants:
12011 *
12012 * backfilled: fully pushed to replica or present in replica's missing set (both
12013 * our copy and theirs).
12014 *
12015 * All objects on a backfill_target in
12016 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12017 * objects have been actually deleted and all logically-valid objects are replicated.
12018 * There may be PG objects in this interval yet to be backfilled.
12019 *
12020 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12021 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12022 *
12023 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
12024 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12025 * interval remain on the backfill target.
12026 *
12027 * For a backfill target, all objects <= peer_info[target].last_backfill
12028 * have been backfilled to target
12029 *
12030 * There *MAY* be missing/outdated objects between last_backfill_started and
12031 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12032 * io created objects since the last scan. For this reason, we call
12033 * update_range() again before continuing backfill.
12034 */
12035uint64_t PrimaryLogPG::recover_backfill(
12036 uint64_t max,
12037 ThreadPool::TPHandle &handle, bool *work_started)
12038{
12039 dout(10) << "recover_backfill (" << max << ")"
12040 << " bft=" << backfill_targets
12041 << " last_backfill_started " << last_backfill_started
12042 << (new_backfill ? " new_backfill":"")
12043 << dendl;
12044 assert(!backfill_targets.empty());
12045
12046 // Initialize from prior backfill state
12047 if (new_backfill) {
12048 // on_activate() was called prior to getting here
12049 assert(last_backfill_started == earliest_backfill());
12050 new_backfill = false;
12051
12052 // initialize BackfillIntervals
12053 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12054 i != backfill_targets.end();
12055 ++i) {
12056 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
12057 }
12058 backfill_info.reset(last_backfill_started);
12059
12060 backfills_in_flight.clear();
12061 pending_backfill_updates.clear();
12062 }
12063
12064 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12065 i != backfill_targets.end();
12066 ++i) {
12067 dout(10) << "peer osd." << *i
12068 << " info " << peer_info[*i]
12069 << " interval " << peer_backfill_info[*i].begin
12070 << "-" << peer_backfill_info[*i].end
12071 << " " << peer_backfill_info[*i].objects.size() << " objects"
12072 << dendl;
12073 }
12074
12075 // update our local interval to cope with recent changes
12076 backfill_info.begin = last_backfill_started;
12077 update_range(&backfill_info, handle);
12078
12079 unsigned ops = 0;
12080 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
12081 set<hobject_t> add_to_stat;
12082
12083 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12084 i != backfill_targets.end();
12085 ++i) {
12086 peer_backfill_info[*i].trim_to(
12087 std::max(peer_info[*i].last_backfill, last_backfill_started));
12088 }
12089 backfill_info.trim_to(last_backfill_started);
12090
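  // Main loop: each pass either extends a scan interval (ours via
  // update_range(), a peer's via MOSDPGScan), queues removals for objects a
  // peer has that we do not, or pushes the object at the lowest peer
  // position; `check` below is that lowest position and selects the case.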
12091 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12092 while (ops < max) {
12093 if (backfill_info.begin <= earliest_peer_backfill() &&
12094 !backfill_info.extends_to_end() && backfill_info.empty()) {
12095 hobject_t next = backfill_info.end;
12096 backfill_info.reset(next);
12097 backfill_info.end = hobject_t::get_max();
12098 update_range(&backfill_info, handle);
12099 backfill_info.trim();
12100 }
12101
12102 dout(20) << " my backfill interval " << backfill_info << dendl;
12103
12104 bool sent_scan = false;
12105 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12106 i != backfill_targets.end();
12107 ++i) {
12108 pg_shard_t bt = *i;
12109 BackfillInterval& pbi = peer_backfill_info[bt];
12110
12111 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
12112 if (pbi.begin <= backfill_info.begin &&
12113 !pbi.extends_to_end() && pbi.empty()) {
12114 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
12115 epoch_t e = get_osdmap()->get_epoch();
12116 MOSDPGScan *m = new MOSDPGScan(
12117 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
12118 spg_t(info.pgid.pgid, bt.shard),
12119 pbi.end, hobject_t());
12120 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12121 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
12122 waiting_on_backfill.insert(bt);
12123 sent_scan = true;
12124 }
12125 }
12126
12127 // Count simultaneous scans as a single op and let those complete
12128 if (sent_scan) {
12129 ops++;
12130 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12131 break;
12132 }
12133
12134 if (backfill_info.empty() && all_peer_done()) {
12135 dout(10) << " reached end for both local and all peers" << dendl;
12136 break;
12137 }
12138
12139 // Get the object within the set of peers to operate on and
12140 // the set of targets to which that object applies.
12141 hobject_t check = earliest_peer_backfill();
12142
12143 if (check < backfill_info.begin) {
12144
12145 set<pg_shard_t> check_targets;
12146 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12147 i != backfill_targets.end();
12148 ++i) {
12149 pg_shard_t bt = *i;
12150 BackfillInterval& pbi = peer_backfill_info[bt];
12151 if (pbi.begin == check)
12152 check_targets.insert(bt);
12153 }
12154 assert(!check_targets.empty());
12155
12156 dout(20) << " BACKFILL removing " << check
12157 << " from peers " << check_targets << dendl;
12158 for (set<pg_shard_t>::iterator i = check_targets.begin();
12159 i != check_targets.end();
12160 ++i) {
12161 pg_shard_t bt = *i;
12162 BackfillInterval& pbi = peer_backfill_info[bt];
12163 assert(pbi.begin == check);
12164
12165 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12166 pbi.pop_front();
12167 }
12168
12169 /* This requires a bit of explanation. We compare head against
12170 * last_backfill to determine whether to send an operation
12171 * to the replica. A single write operation can touch up to three
12172 * objects: head, the snapdir, and a new clone which sorts closer to
12173 * head than any existing clone. If last_backfill points at a clone,
12174 * the transaction won't be sent and all 3 must lie on the right side
12175 * of the line (i.e., we'll backfill them later). If last_backfill
12176 * points at snapdir, it sorts greater than head, so we send the
12177 * transaction which is correct because all three must lie to the left
12178 * of the line.
12179 *
12180 * If it points at head, we have a bit of an issue. If head actually
12181 * exists, no problem, because any transaction which touches snapdir
12182 * must end up creating it (and deleting head), so sending the
12183 * operation won't pose a problem -- we'll end up having to scan it,
12184 * but it'll end up being the right version so we won't bother to
12185 * rebackfill it. However, if head doesn't exist, any write on head
12186 * will remove snapdir. For a replicated pool, this isn't a problem,
12187 * ENOENT on remove isn't an issue and it's in the backfill future anyway.
12188 * It only poses a problem for EC pools, because we never just delete
12189 * an object, we rename it into a rollback object. That operation
12190 * will end up crashing the osd with ENOENT. Tolerating the failure
12191 * wouldn't work either, even if snapdir exists, we'd be creating a
12192 * rollback object past the last_backfill line which wouldn't get
12193 * cleaned up (no rollback objects past the last_backfill line is an
12194 * existing important invariant). Thus, let's avoid the whole issue
12195 * by just not updating last_backfill_started here if head doesn't
12196 * exist and snapdir does. We aren't using up a recovery count here,
12197 * so we're going to recover snapdir immediately anyway. We'll only
12198 * fail "backward" if we fail to get the rw lock and that just means
12199 * we'll re-process this section of the hash space again.
12200 *
12201 * I'm choosing this hack here because the really "correct" answer is
12202 * going to be to unify snapdir and head into a single object (a
12203 * snapdir is really just a confusing way to talk about head existing
12204 * as a whiteout), but doing that is going to be a somewhat larger
12205 * undertaking.
12206 *
12207 * @see http://tracker.ceph.com/issues/17668
12208 */
12209 if (!(check.is_head() &&
12210 backfill_info.begin.is_snapdir() &&
12211 check == backfill_info.begin.get_head()))
12212 last_backfill_started = check;
12213
12214 // Don't increment ops here because deletions
12215 // are cheap and, unlike real recovery_ops, not replied to,
12216 // and we can't increment ops without requeueing ourselves
12217 // for recovery.
12218 } else {
12219 eversion_t& obj_v = backfill_info.objects.begin()->second;
12220
12221 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12222 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12223 i != backfill_targets.end();
12224 ++i) {
12225 pg_shard_t bt = *i;
12226 BackfillInterval& pbi = peer_backfill_info[bt];
12227 // Find all check peers that have the wrong version
12228 if (check == backfill_info.begin && check == pbi.begin) {
12229 if (pbi.objects.begin()->second != obj_v) {
12230 need_ver_targs.push_back(bt);
12231 } else {
12232 keep_ver_targs.push_back(bt);
12233 }
12234 } else {
12235 pg_info_t& pinfo = peer_info[bt];
12236
12237 // Only include peers that we've caught up to their backfill line
12238 // otherwise, they only appear to be missing this object
12239 // because their pbi.begin > backfill_info.begin.
12240 if (backfill_info.begin > pinfo.last_backfill)
12241 missing_targs.push_back(bt);
12242 else
12243 skip_targs.push_back(bt);
12244 }
12245 }
12246
12247 if (!keep_ver_targs.empty()) {
12248 // These peers have version obj_v
12249 dout(20) << " BACKFILL keeping " << check
12250 << " with ver " << obj_v
12251 << " on peers " << keep_ver_targs << dendl;
12252 //assert(!waiting_for_degraded_object.count(check));
12253 }
12254 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12255 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12256 assert(obc);
12257 if (obc->get_recovery_read()) {
12258 if (!need_ver_targs.empty()) {
12259 dout(20) << " BACKFILL replacing " << check
12260 << " with ver " << obj_v
12261 << " to peers " << need_ver_targs << dendl;
12262 }
12263 if (!missing_targs.empty()) {
12264 dout(20) << " BACKFILL pushing " << backfill_info.begin
12265 << " with ver " << obj_v
12266 << " to peers " << missing_targs << dendl;
12267 }
12268 vector<pg_shard_t> all_push = need_ver_targs;
12269 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12270
12271 handle.reset_tp_timeout();
12272 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12273 if (r < 0) {
12274 *work_started = true;
12275 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12276 break;
12277 }
7c673cae
FG
12278 ops++;
12279 } else {
12280 *work_started = true;
12281 dout(20) << "backfill blocking on " << backfill_info.begin
12282 << "; could not get rw_manager lock" << dendl;
12283 break;
12284 }
12285 }
12286 dout(20) << "need_ver_targs=" << need_ver_targs
12287 << " keep_ver_targs=" << keep_ver_targs << dendl;
12288 dout(20) << "backfill_targets=" << backfill_targets
12289 << " missing_targs=" << missing_targs
12290 << " skip_targs=" << skip_targs << dendl;
12291
12292 last_backfill_started = backfill_info.begin;
12293 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12294 backfill_info.pop_front();
12295 vector<pg_shard_t> check_targets = need_ver_targs;
12296 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12297 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12298 i != check_targets.end();
12299 ++i) {
12300 pg_shard_t bt = *i;
12301 BackfillInterval& pbi = peer_backfill_info[bt];
12302 pbi.pop_front();
12303 }
12304 }
12305 }
12306
12307 hobject_t backfill_pos =
12308 std::min(backfill_info.begin, earliest_peer_backfill());
12309
12310 for (set<hobject_t>::iterator i = add_to_stat.begin();
12311 i != add_to_stat.end();
12312 ++i) {
12313 ObjectContextRef obc = get_object_context(*i, false);
12314 assert(obc);
12315 pg_stat_t stat;
12316 add_object_context_to_pg_stat(obc, &stat);
12317 pending_backfill_updates[*i] = stat;
12318 }
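  // Flush the queued deletions: luminous-or-newer targets get one batched
  // MOSDPGBackfillRemove per peer listing every (oid, version) pair, while
  // older (jewel) targets still receive one legacy remove sub-op per object.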
12319 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12320 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12321 for (unsigned i = 0; i < to_remove.size(); ++i) {
12322 handle.reset_tp_timeout();
12323 const hobject_t& oid = to_remove[i].get<0>();
12324 eversion_t v = to_remove[i].get<1>();
12325 pg_shard_t peer = to_remove[i].get<2>();
12326 MOSDPGBackfillRemove *m;
12327 auto it = reqs.find(peer);
12328 if (it != reqs.end()) {
12329 m = it->second;
12330 } else {
12331 m = reqs[peer] = new MOSDPGBackfillRemove(
12332 spg_t(info.pgid.pgid, peer.shard),
12333 get_osdmap()->get_epoch());
12334 }
12335 m->ls.push_back(make_pair(oid, v));
12336
12337 if (oid <= last_backfill_started)
12338 pending_backfill_updates[oid]; // add empty stat!
12339 }
12340 for (auto p : reqs) {
12341 osd->send_message_osd_cluster(p.first.osd, p.second,
12342 get_osdmap()->get_epoch());
12343 }
12344 } else {
12345 // for jewel targets
12346 for (unsigned i = 0; i < to_remove.size(); ++i) {
12347 handle.reset_tp_timeout();
12348
12349 // ordered before any subsequent updates
12350 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12351 to_remove[i].get<2>());
12352
12353 if (to_remove[i].get<0>() <= last_backfill_started)
12354 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12355 }
12356 }
12357
12358 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12359
12360 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12361 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12362 i != backfills_in_flight.end();
12363 ++i) {
12364 dout(20) << *i << " is still in flight" << dendl;
12365 }
12366
12367 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12368 backfill_pos : *(backfills_in_flight.begin());
12369 hobject_t new_last_backfill = earliest_backfill();
12370 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12371 for (map<hobject_t, pg_stat_t>::iterator i =
12372 pending_backfill_updates.begin();
12373 i != pending_backfill_updates.end() &&
12374 i->first < next_backfill_to_complete;
12375 pending_backfill_updates.erase(i++)) {
12376 dout(20) << " pending_backfill_update " << i->first << dendl;
12377 assert(i->first > new_last_backfill);
12378 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12379 j != backfill_targets.end();
12380 ++j) {
12381 pg_shard_t bt = *j;
12382 pg_info_t& pinfo = peer_info[bt];
12383 // Add stats to all peers that were missing the object
12384 if (i->first > pinfo.last_backfill)
12385 pinfo.stats.add(i->second);
12386 }
12387 new_last_backfill = i->first;
12388 }
12389 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12390
12391 assert(!pending_backfill_updates.empty() ||
12392 new_last_backfill == last_backfill_started);
12393 if (pending_backfill_updates.empty() &&
12394 backfill_pos.is_max()) {
12395 assert(backfills_in_flight.empty());
12396 new_last_backfill = backfill_pos;
12397 last_backfill_started = backfill_pos;
12398 }
12399 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12400
12401 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12402 // all the backfill targets. Otherwise, we will move last_backfill up on
12403 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12404 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12405 i != backfill_targets.end();
12406 ++i) {
12407 pg_shard_t bt = *i;
12408 pg_info_t& pinfo = peer_info[bt];
12409
12410 if (new_last_backfill > pinfo.last_backfill) {
12411 pinfo.set_last_backfill(new_last_backfill);
12412 epoch_t e = get_osdmap()->get_epoch();
12413 MOSDPGBackfill *m = NULL;
12414 if (pinfo.last_backfill.is_max()) {
12415 m = new MOSDPGBackfill(
12416 MOSDPGBackfill::OP_BACKFILL_FINISH,
12417 e,
12418 last_peering_reset,
12419 spg_t(info.pgid.pgid, bt.shard));
12420 // Use default priority here, must match sub_op priority
12421 /* pinfo.stats might be wrong if we did log-based recovery on the
12422 * backfilled portion in addition to continuing backfill.
12423 */
12424 pinfo.stats = info.stats;
12425 start_recovery_op(hobject_t::get_max());
12426 } else {
12427 m = new MOSDPGBackfill(
12428 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12429 e,
12430 last_peering_reset,
12431 spg_t(info.pgid.pgid, bt.shard));
12432 // Use default priority here, must match sub_op priority
12433 }
12434 m->last_backfill = pinfo.last_backfill;
12435 m->stats = pinfo.stats;
12436 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12437 dout(10) << " peer " << bt
12438 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12439 << " / " << info.stats.stats.sum.num_objects << dendl;
12440 }
12441 }
12442
12443 if (ops)
12444 *work_started = true;
12445 return ops;
12446}
12447
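/** prep_backfill_object_push
 *
 * Queue a single backfill push: mark the object in flight, record it
 * as missing on every target shard, take the ondisk read lock to flush
 * in-progress writes, and hand the object to the backend via
 * recover_object(). On error the object is dropped from the in-flight
 * set and recorded with primary_error().
 */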
224ce89b 12448int PrimaryLogPG::prep_backfill_object_push(
7c673cae
FG
12449 hobject_t oid, eversion_t v,
12450 ObjectContextRef obc,
12451 vector<pg_shard_t> peers,
12452 PGBackend::RecoveryHandle *h)
12453{
224ce89b 12454 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
7c673cae
FG
12455 assert(!peers.empty());
12456
12457 backfills_in_flight.insert(oid);
12458 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12459 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12460 assert(bpm != peer_missing.end());
c07f9fc5 12461 bpm->second.add(oid, eversion_t(), eversion_t(), false);
7c673cae
FG
12462 }
12463
12464 assert(!recovering.count(oid));
12465
12466 start_recovery_op(oid);
12467 recovering.insert(make_pair(oid, obc));
12468
12469 // We need to take the read_lock here in order to flush in-progress writes
12470 obc->ondisk_read_lock();
224ce89b 12471 int r = pgbackend->recover_object(
7c673cae
FG
12472 oid,
12473 v,
12474 ObjectContextRef(),
12475 obc,
12476 h);
12477 obc->ondisk_read_unlock();
224ce89b
WB
12478 if (r < 0) {
12479 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12480 primary_failed(oid);
12481 primary_error(oid, v);
12482 backfills_in_flight.erase(oid);
12483 missing_loc.add_missing(oid, v, eversion_t());
12484 }
12485 return r;
7c673cae
FG
12486}
12487
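/** update_range
 *
 * Bring a BackfillInterval up to date with the local PG log: rescan
 * from disk if the interval predates log_tail, then apply any newer
 * update/delete log entries (both durable and projected) that fall
 * inside [begin, end) to the interval's object map, finally bumping
 * its version to projected_last_update.
 */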
12488void PrimaryLogPG::update_range(
12489 BackfillInterval *bi,
12490 ThreadPool::TPHandle &handle)
12491{
12492 int local_min = cct->_conf->osd_backfill_scan_min;
12493 int local_max = cct->_conf->osd_backfill_scan_max;
12494
12495 if (bi->version < info.log_tail) {
12496 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12497 << dendl;
28e407b8 12498 osr->flush();
7c673cae
FG
12499 if (last_update_applied >= info.log_tail) {
12500 bi->version = last_update_applied;
12501 } else {
7c673cae
FG
12502 bi->version = info.last_update;
12503 }
12504 scan_range(local_min, local_max, bi, handle);
12505 }
12506
12507 if (bi->version >= projected_last_update) {
12508 dout(10) << __func__<< ": bi is current " << dendl;
12509 assert(bi->version == projected_last_update);
12510 } else if (bi->version >= info.log_tail) {
12511 if (pg_log.get_log().empty() && projected_log.empty()) {
12512 /* Because we don't move log_tail on split, the log might be
12513 * empty even if log_tail != last_update. However, the only
12514 * way to get here with an empty log is if log_tail is actually
12515 * eversion_t(), because otherwise the entry which changed
12516 * last_update since the last scan would have to be present.
12517 */
12518 assert(bi->version == eversion_t());
12519 return;
12520 }
12521
12522 dout(10) << __func__<< ": bi is old, (" << bi->version
12523 << ") can be updated with log to projected_last_update "
12524 << projected_last_update << dendl;
12525
12526 auto func = [&](const pg_log_entry_t &e) {
12527 dout(10) << __func__ << ": updating from version " << e.version
12528 << dendl;
12529 const hobject_t &soid = e.soid;
12530 if (soid >= bi->begin &&
12531 soid < bi->end) {
12532 if (e.is_update()) {
12533 dout(10) << __func__ << ": " << e.soid << " updated to version "
12534 << e.version << dendl;
12535 bi->objects.erase(e.soid);
12536 bi->objects.insert(
12537 make_pair(
12538 e.soid,
12539 e.version));
12540 } else if (e.is_delete()) {
12541 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12542 bi->objects.erase(e.soid);
12543 }
12544 }
12545 };
12546 dout(10) << "scanning pg log first" << dendl;
12547 pg_log.get_log().scan_log_after(bi->version, func);
12548 dout(10) << "scanning projected log" << dendl;
12549 projected_log.scan_log_after(bi->version, func);
12550 bi->version = projected_last_update;
12551 } else {
12552 assert(0 == "scan_range should have raised bi->version past log_tail");
12553 }
12554}
12555
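/** scan_range
 *
 * List between min and max objects starting at bi->begin and fill in
 * bi->objects with (object, version) pairs, preferring a cached object
 * context over reading the OI_ATTR xattr from disk. Objects that
 * vanish between the listing and the attr read (-ENOENT) are skipped.
 */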
12556void PrimaryLogPG::scan_range(
12557 int min, int max, BackfillInterval *bi,
12558 ThreadPool::TPHandle &handle)
12559{
12560 assert(is_locked());
12561 dout(10) << "scan_range from " << bi->begin << dendl;
12562 bi->clear_objects();
12563
12564 vector<hobject_t> ls;
12565 ls.reserve(max);
12566 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12567 assert(r >= 0);
12568 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12569 dout(20) << ls << dendl;
12570
12571 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12572 handle.reset_tp_timeout();
12573 ObjectContextRef obc;
12574 if (is_primary())
12575 obc = object_contexts.lookup(*p);
12576 if (obc) {
12577 bi->objects[*p] = obc->obs.oi.version;
12578 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12579 } else {
12580 bufferlist bl;
12581 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12582
12583 /* If the object does not exist here, it must have been removed
12584 * between the objects_list_partial call and here. This can happen
12585 * for the first item in the range, which is usually last_backfill.
12586 */
12587 if (r == -ENOENT)
12588 continue;
12589
12590 assert(r >= 0);
12591 object_info_t oi(bl);
12592 bi->objects[*p] = oi.version;
12593 dout(20) << " " << *p << " " << oi.version << dendl;
12594 }
12595 }
12596}
12597
12598
12599/** check_local
12600 *
12601 * verifies that stray objects have been deleted
12602 */
12603void PrimaryLogPG::check_local()
12604{
12605 dout(10) << __func__ << dendl;
12606
12607 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12608
12609 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12610 return;
12611
12612 // just scan the log.
12613 set<hobject_t> did;
12614 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12615 p != pg_log.get_log().log.rend();
12616 ++p) {
12617 if (did.count(p->soid))
12618 continue;
12619 did.insert(p->soid);
12620
c07f9fc5 12621 if (p->is_delete() && !is_missing_object(p->soid)) {
7c673cae
FG
12622 dout(10) << " checking " << p->soid
12623 << " at " << p->version << dendl;
12624 struct stat st;
12625 int r = osd->store->stat(
12626 ch,
12627 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12628 &st);
12629 if (r != -ENOENT) {
12630 derr << __func__ << " " << p->soid << " exists, but should have been "
12631 << "deleted" << dendl;
12632 assert(0 == "erroneously present object");
12633 }
12634 } else {
12635 // ignore old(+missing) objects
12636 }
12637 }
12638}
12639
12640
12641
12642// ===========================
12643// hit sets
12644
12645hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12646{
12647 ostringstream ss;
12648 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12649 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12650 info.pgid.ps(), info.pgid.pool(),
12651 cct->_conf->osd_hit_set_namespace);
12652 dout(20) << __func__ << " " << hoid << dendl;
12653 return hoid;
12654}
12655
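/** get_hit_set_archive_object
 *
 * Build the name of the internal object that archives hits observed
 * between start and end. The name is, roughly (with the stamps
 * rendered in GMT or local time per using_gmt):
 *
 *   hit_set_<pgid>_archive_<start>_<end>
 *
 * placed in the configured hit set namespace and hashed to this PG
 * via info.pgid.ps().
 */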
12656hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12657 utime_t end,
12658 bool using_gmt)
12659{
12660 ostringstream ss;
12661 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12662 if (using_gmt) {
12663 start.gmtime(ss) << "_";
12664 end.gmtime(ss);
12665 } else {
12666 start.localtime(ss) << "_";
12667 end.localtime(ss);
12668 }
12669 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12670 info.pgid.ps(), info.pgid.pool(),
12671 cct->_conf->osd_hit_set_namespace);
12672 dout(20) << __func__ << " " << hoid << dendl;
12673 return hoid;
12674}
12675
12676void PrimaryLogPG::hit_set_clear()
12677{
12678 dout(20) << __func__ << dendl;
12679 hit_set.reset();
12680 hit_set_start_stamp = utime_t();
12681}
12682
12683void PrimaryLogPG::hit_set_setup()
12684{
12685 if (!is_active() ||
12686 !is_primary()) {
12687 hit_set_clear();
12688 return;
12689 }
12690
12691 if (is_active() && is_primary() &&
12692 (!pool.info.hit_set_count ||
12693 !pool.info.hit_set_period ||
12694 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12695 hit_set_clear();
12696
12697 // only primary is allowed to remove all the hit set objects
12698 hit_set_remove_all();
12699 return;
12700 }
12701
12702 // FIXME: discard any previous data for now
12703 hit_set_create();
12704
12705 // include any writes we know about from the pg log. this doesn't
12706 // capture reads, but it is better than nothing!
12707 hit_set_apply_log();
12708}
12709
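/** hit_set_remove_all
 *
 * Delete every archived hit set object for this PG by trimming the
 * history to zero via hit_set_trim(), then reset the in-memory hit
 * set history. Bails out early if any archive object is degraded,
 * backfilling, or blocked by scrub.
 */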
12710void PrimaryLogPG::hit_set_remove_all()
12711{
12712 // If any archives are degraded we skip this
12713 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12714 p != info.hit_set.history.end();
12715 ++p) {
12716 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12717
12718 // Once we hit a degraded object just skip
12719 if (is_degraded_or_backfilling_object(aoid))
12720 return;
28e407b8 12721 if (write_blocked_by_scrub(aoid))
7c673cae
FG
12722 return;
12723 }
12724
12725 if (!info.hit_set.history.empty()) {
12726 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12727 assert(p != info.hit_set.history.rend());
12728 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12729 assert(!is_degraded_or_backfilling_object(oid));
12730 ObjectContextRef obc = get_object_context(oid, false);
12731 assert(obc);
12732
12733 OpContextUPtr ctx = simple_opc_create(obc);
12734 ctx->at_version = get_next_version();
12735 ctx->updated_hset_history = info.hit_set;
12736 utime_t now = ceph_clock_now();
12737 ctx->mtime = now;
12738 hit_set_trim(ctx, 0);
12739 simple_opc_submit(std::move(ctx));
12740 }
12741
12742 info.hit_set = pg_hit_set_history_t();
12743 if (agent_state) {
12744 agent_state->discard_hit_sets();
12745 }
12746}
12747
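/** hit_set_create
 *
 * Start a fresh in-memory HitSet from the pool's parameters. For
 * bloom hit sets the per-set false positive rate is divided by
 * hit_set_count so the compounded rate over a whole period stays near
 * the configured value (e.g. fpp 0.04 across 4 sets becomes 0.01 per
 * set), and target_size is estimated from the previous set's insert
 * rate when not configured explicitly, clamped to
 * [osd_hit_set_min_size, osd_hit_set_max_size].
 */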
12748void PrimaryLogPG::hit_set_create()
12749{
12750 utime_t now = ceph_clock_now();
12751 // make a copy of the params to modify
12752 HitSet::Params params(pool.info.hit_set_params);
12753
12754 dout(20) << __func__ << " " << params << dendl;
12755 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12756 BloomHitSet::Params *p =
12757 static_cast<BloomHitSet::Params*>(params.impl.get());
12758
12759 // convert false positive rate so it holds up across the full period
12760 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12761 if (p->get_fpp() <= 0.0)
12762 p->set_fpp(.01); // fpp cannot be zero!
12763
12764 // if we don't have a specified size, estimate the target size based on the
12765 // previous bin!
12766 if (p->target_size == 0 && hit_set) {
12767 utime_t dur = now - hit_set_start_stamp;
12768 unsigned unique = hit_set->approx_unique_insert_count();
12769 dout(20) << __func__ << " previous set had approx " << unique
12770 << " unique items over " << dur << " seconds" << dendl;
12771 p->target_size = (double)unique * (double)pool.info.hit_set_period
12772 / (double)dur;
12773 }
12774 if (p->target_size <
12775 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12776 p->target_size = cct->_conf->osd_hit_set_min_size;
12777
12778 if (p->target_size
12779 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12780 p->target_size = cct->_conf->osd_hit_set_max_size;
12781
12782 p->seed = now.sec();
12783
12784 dout(10) << __func__ << " target_size " << p->target_size
12785 << " fpp " << p->get_fpp() << dendl;
12786 }
12787 hit_set.reset(new HitSet(params));
12788 hit_set_start_stamp = now;
12789}
12790
12791/**
12792 * apply log entries to set
12793 *
12794 * this would only happen after peering, to at least capture writes
12795 * during an interval that was potentially lost.
12796 */
12797bool PrimaryLogPG::hit_set_apply_log()
12798{
12799 if (!hit_set)
12800 return false;
12801
12802 eversion_t to = info.last_update;
12803 eversion_t from = info.hit_set.current_last_update;
12804 if (to <= from) {
12805 dout(20) << __func__ << " no update" << dendl;
12806 return false;
12807 }
12808
12809 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12810 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12811 while (p != pg_log.get_log().log.rend() && p->version > to)
12812 ++p;
12813 while (p != pg_log.get_log().log.rend() && p->version > from) {
12814 hit_set->insert(p->soid);
12815 ++p;
12816 }
12817
12818 return true;
12819}
12820
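/** hit_set_persist
 *
 * Seal the current in-memory hit set and write it out as an archive
 * object through a locally generated repop: fabricate the
 * object_info_t and SnapSet, log a MODIFY entry, append the new
 * interval to the hit set history, and trim old archives down to
 * hit_set_count. Skipped entirely if an archive object is degraded or
 * scrub-blocked, or if the write might straddle a backfill boundary.
 */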
12821void PrimaryLogPG::hit_set_persist()
12822{
12823 dout(10) << __func__ << dendl;
12824 bufferlist bl;
12825 unsigned max = pool.info.hit_set_count;
12826
12827 utime_t now = ceph_clock_now();
12828 hobject_t oid;
12829
12830 // If any archives are degraded we skip this persist request
12831 // account for the additional entry being added below
12832 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12833 p != info.hit_set.history.end();
12834 ++p) {
12835 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12836
12837 // Once we hit a degraded object just skip further trim
12838 if (is_degraded_or_backfilling_object(aoid))
12839 return;
28e407b8 12840 if (write_blocked_by_scrub(aoid))
7c673cae
FG
12841 return;
12842 }
12843
12844 // If backfill is in progress and we could possibly overlap with the
12845 // hit_set_* objects, back off. Since these all have
12846 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12847 // look just at that. This is necessary because our transactions
12848 // may include a modify of the new hit_set *and* a delete of the
12849 // old one, and this may span the backfill boundary.
12850 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12851 p != backfill_targets.end();
12852 ++p) {
12853 assert(peer_info.count(*p));
12854 const pg_info_t& pi = peer_info[*p];
12855 if (pi.last_backfill == hobject_t() ||
12856 pi.last_backfill.get_hash() == info.pgid.ps()) {
12857 dout(10) << __func__ << " backfill target osd." << *p
12858 << " last_backfill has not progressed past pgid ps"
12859 << dendl;
12860 return;
12861 }
12862 }
12863
12864
12865 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12866 new_hset.begin = hit_set_start_stamp;
12867 new_hset.end = now;
12868 oid = get_hit_set_archive_object(
12869 new_hset.begin,
12870 new_hset.end,
12871 new_hset.using_gmt);
12872
12873 // If the current object is degraded we skip this persist request
28e407b8 12874 if (write_blocked_by_scrub(oid))
7c673cae
FG
12875 return;
12876
12877 hit_set->seal();
12878 ::encode(*hit_set, bl);
12879 dout(20) << __func__ << " archive " << oid << dendl;
12880
12881 if (agent_state) {
12882 agent_state->add_hit_set(new_hset.begin, hit_set);
12883 uint32_t size = agent_state->hit_set_map.size();
12884 if (size >= pool.info.hit_set_count) {
12885 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12886 }
12887 hit_set_in_memory_trim(size);
12888 }
12889
12890 ObjectContextRef obc = get_object_context(oid, true);
12891 OpContextUPtr ctx = simple_opc_create(obc);
12892
12893 ctx->at_version = get_next_version();
12894 ctx->updated_hset_history = info.hit_set;
12895 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12896
12897 updated_hit_set_hist.current_last_update = info.last_update;
12898 new_hset.version = ctx->at_version;
12899
12900 updated_hit_set_hist.history.push_back(new_hset);
12901 hit_set_create();
12902
12903 // fabricate an object_info_t and SnapSet
12904 obc->obs.oi.version = ctx->at_version;
12905 obc->obs.oi.mtime = now;
12906 obc->obs.oi.size = bl.length();
12907 obc->obs.exists = true;
12908 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12909
12910 ctx->new_obs = obc->obs;
12911
12912 obc->ssc->snapset.head_exists = true;
12913 ctx->new_snapset = obc->ssc->snapset;
12914
12915 ctx->delta_stats.num_objects++;
12916 ctx->delta_stats.num_objects_hit_set_archive++;
12917 ctx->delta_stats.num_bytes += bl.length();
12918 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12919
12920 bufferlist bss;
12921 ::encode(ctx->new_snapset, bss);
12922 bufferlist boi(sizeof(ctx->new_obs.oi));
12923 ::encode(ctx->new_obs.oi, boi,
12924 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12925
12926 ctx->op_t->create(oid);
12927 if (bl.length()) {
12928 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12929 }
12930 map <string, bufferlist> attrs;
12931 attrs[OI_ATTR].claim(boi);
12932 attrs[SS_ATTR].claim(bss);
12933 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12934 ctx->log.push_back(
12935 pg_log_entry_t(
12936 pg_log_entry_t::MODIFY,
12937 oid,
12938 ctx->at_version,
12939 eversion_t(),
12940 0,
12941 osd_reqid_t(),
12942 ctx->mtime,
12943 0)
12944 );
12945
12946 hit_set_trim(ctx, max);
12947
12948 simple_opc_submit(std::move(ctx));
12949}
12950
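/** hit_set_trim
 *
 * Remove the oldest archived hit set objects until at most max
 * remain, logging a DELETE entry for each and deducting the freed
 * objects and bytes from the context's delta stats.
 */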
12951void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12952{
12953 assert(ctx->updated_hset_history);
12954 pg_hit_set_history_t &updated_hit_set_hist =
12955 *(ctx->updated_hset_history);
12956 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12957 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12958 assert(p != updated_hit_set_hist.history.end());
12959 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12960
12961 assert(!is_degraded_or_backfilling_object(oid));
12962
12963 dout(20) << __func__ << " removing " << oid << dendl;
12964 ++ctx->at_version.version;
12965 ctx->log.push_back(
12966 pg_log_entry_t(pg_log_entry_t::DELETE,
12967 oid,
12968 ctx->at_version,
12969 p->version,
12970 0,
12971 osd_reqid_t(),
12972 ctx->mtime,
12973 0));
12974
12975 ctx->op_t->remove(oid);
12976 updated_hit_set_hist.history.pop_front();
12977
12978 ObjectContextRef obc = get_object_context(oid, false);
12979 assert(obc);
12980 --ctx->delta_stats.num_objects;
12981 --ctx->delta_stats.num_objects_hit_set_archive;
12982 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12983 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12984 }
12985}
12986
12987void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12988{
12989 while (agent_state->hit_set_map.size() > max_in_memory) {
12990 agent_state->remove_oldest_hit_set();
12991 }
12992}
12993
12994
12995// =======================================
12996// cache agent
12997
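/** agent_setup
 *
 * Instantiate TierAgentState for this PG when cache tiering is
 * configured (and tear it down otherwise). The scan position is
 * seeded at a random point in the PG's hash space, and an initial
 * flush/evict mode is chosen via agent_choose_mode().
 */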
12998void PrimaryLogPG::agent_setup()
12999{
13000 assert(is_locked());
13001 if (!is_active() ||
13002 !is_primary() ||
13003 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13004 pool.info.tier_of < 0 ||
13005 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13006 agent_clear();
13007 return;
13008 }
13009 if (!agent_state) {
13010 agent_state.reset(new TierAgentState);
13011
13012 // choose random starting position
13013 agent_state->position = hobject_t();
13014 agent_state->position.pool = info.pgid.pool();
13015 agent_state->position.set_hash(pool.info.get_random_pg_position(
13016 info.pgid.pgid,
13017 rand()));
13018 agent_state->start = agent_state->position;
13019
13020 dout(10) << __func__ << " allocated new state, position "
13021 << agent_state->position << dendl;
13022 } else {
13023 dout(10) << __func__ << " keeping existing state" << dendl;
13024 }
13025
13026 if (info.stats.stats_invalid) {
13027 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13028 }
13029
13030 agent_choose_mode();
13031}
13032
13033void PrimaryLogPG::agent_clear()
13034{
13035 agent_stop();
13036 agent_state.reset(NULL);
13037}
13038
13039// Return false if no objects were operated on since the start of the object hash space
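// Each invocation resumes listing at agent_state->position, skips
// objects that cannot be flushed or evicted (hit set objects, degraded
// or missing objects, blocked contexts, omap objects over an EC base
// pool, ...), and flushes and/or evicts up to start_max objects before
// recording where the next pass should resume.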
13040bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13041{
13042 lock();
13043 if (!agent_state) {
13044 dout(10) << __func__ << " no agent state, stopping" << dendl;
13045 unlock();
13046 return true;
13047 }
13048
13049 assert(!deleting);
13050
13051 if (agent_state->is_idle()) {
13052 dout(10) << __func__ << " idle, stopping" << dendl;
13053 unlock();
13054 return true;
13055 }
13056
13057 osd->logger->inc(l_osd_agent_wake);
13058
13059 dout(10) << __func__
13060 << " max " << start_max
13061 << ", flush " << agent_state->get_flush_mode_name()
13062 << ", evict " << agent_state->get_evict_mode_name()
13063 << ", pos " << agent_state->position
13064 << dendl;
13065 assert(is_primary());
13066 assert(is_active());
13067
13068 agent_load_hit_sets();
13069
13070 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13071 assert(base_pool);
13072
13073 int ls_min = 1;
13074 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13075
13076 // list some objects. this conveniently lists clones (oldest to
13077 // newest) before heads... the same order we want to flush in.
13078 //
13079 // NOTE: do not flush the Sequencer. we will assume that the
13080 // listing we get back is imprecise.
13081 vector<hobject_t> ls;
13082 hobject_t next;
13083 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13084 &ls, &next);
13085 assert(r >= 0);
13086 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13087 int started = 0;
13088 for (vector<hobject_t>::iterator p = ls.begin();
13089 p != ls.end();
13090 ++p) {
13091 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13092 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13093 osd->logger->inc(l_osd_agent_skip);
13094 continue;
13095 }
13096 if (is_degraded_or_backfilling_object(*p)) {
13097 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13098 osd->logger->inc(l_osd_agent_skip);
13099 continue;
13100 }
13101 if (is_missing_object(p->get_head())) {
13102 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13103 osd->logger->inc(l_osd_agent_skip);
13104 continue;
13105 }
13106 ObjectContextRef obc = get_object_context(*p, false, NULL);
13107 if (!obc) {
13108 // we didn't flush; we may miss something here.
13109 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13110 osd->logger->inc(l_osd_agent_skip);
13111 continue;
13112 }
13113 if (!obc->obs.exists) {
13114 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13115 osd->logger->inc(l_osd_agent_skip);
13116 continue;
13117 }
28e407b8
AA
13118 if (range_intersects_scrub(obc->obs.oi.soid,
13119 obc->obs.oi.soid.get_head())) {
7c673cae
FG
13120 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13121 osd->logger->inc(l_osd_agent_skip);
13122 continue;
13123 }
13124 if (obc->is_blocked()) {
13125 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13126 osd->logger->inc(l_osd_agent_skip);
13127 continue;
13128 }
13129 if (obc->is_request_pending()) {
13130 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13131 osd->logger->inc(l_osd_agent_skip);
13132 continue;
13133 }
13134
13135 // be careful flushing omap to an EC pool.
13136 if (!base_pool->supports_omap() &&
13137 obc->obs.oi.is_omap()) {
13138 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13139 osd->logger->inc(l_osd_agent_skip);
13140 continue;
13141 }
13142
13143 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13144 agent_maybe_evict(obc, false))
13145 ++started;
13146 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13147 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13148 ++started;
13149 --agent_flush_quota;
13150 }
13151 if (started >= start_max) {
13152 // If finishing early, set "next" to the next object
13153 if (++p != ls.end())
13154 next = *p;
13155 break;
13156 }
13157 }
13158
13159 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13160 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13161 agent_state->hist_age = 0;
13162 agent_state->temp_hist.decay();
13163 }
13164
13165 // Total objects operated on so far
13166 int total_started = agent_state->started + started;
13167 bool need_delay = false;
13168
13169 dout(20) << __func__ << " start pos " << agent_state->position
13170 << " next start pos " << next
13171 << " started " << total_started << dendl;
13172
13173 // See if we've made a full pass over the object hash space
13174 // This might check at most ls_max objects a second time to notice that
13175 // we've checked every object at least once.
13176 if (agent_state->position < agent_state->start &&
13177 next >= agent_state->start) {
13178 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13179 if (total_started == 0)
13180 need_delay = true;
13181 else
13182 total_started = 0;
13183 agent_state->start = next;
13184 }
13185 agent_state->started = total_started;
13186
13187 // See if we are starting from the beginning
13188 if (next.is_max())
13189 agent_state->position = hobject_t();
13190 else
13191 agent_state->position = next;
13192
13193 // Discard old in-memory HitSets
13194 hit_set_in_memory_trim(pool.info.hit_set_count);
13195
13196 if (need_delay) {
13197 assert(agent_state->delaying == false);
13198 agent_delay();
13199 unlock();
13200 return false;
13201 }
13202 agent_choose_mode();
13203 unlock();
13204 return true;
13205}
13206
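/** agent_load_hit_sets
 *
 * Make sure every interval in info.hit_set.history has its archive
 * object decoded into agent_state->hit_set_map; stops early if an
 * archive is unreadable or its object context cannot be loaded.
 */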
13207void PrimaryLogPG::agent_load_hit_sets()
13208{
13209 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13210 return;
13211 }
13212
13213 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13214 dout(10) << __func__ << dendl;
13215 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13216 p != info.hit_set.history.end(); ++p) {
13217 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13218 dout(10) << __func__ << " loading " << p->begin << "-"
13219 << p->end << dendl;
13220 if (!pool.info.is_replicated()) {
13221 // FIXME: EC not supported here yet
13222 derr << __func__ << " on non-replicated pool" << dendl;
13223 break;
13224 }
13225
13226 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13227 if (is_unreadable_object(oid)) {
13228 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13229 break;
13230 }
13231
13232 ObjectContextRef obc = get_object_context(oid, false);
13233 if (!obc) {
13234 derr << __func__ << ": could not load hitset " << oid << dendl;
13235 break;
13236 }
13237
13238 bufferlist bl;
13239 {
13240 obc->ondisk_read_lock();
13241 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13242 assert(r >= 0);
13243 obc->ondisk_read_unlock();
13244 }
13245 HitSetRef hs(new HitSet);
13246 bufferlist::iterator pbl = bl.begin();
13247 ::decode(*hs, pbl);
13248 agent_state->add_hit_set(p->begin.sec(), hs);
13249 }
13250 }
13251 }
13252}
13253
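/** agent_maybe_flush
 *
 * Flush one dirty, unpinned object to the base tier if it is old
 * enough (local_mtime past cache_min_flush_age, unless evict mode is
 * FULL) and not already being flushed. Returns true if a flush was
 * started.
 */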
13254bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13255{
13256 if (!obc->obs.oi.is_dirty()) {
13257 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13258 osd->logger->inc(l_osd_agent_skip);
13259 return false;
13260 }
13261 if (obc->obs.oi.is_cache_pinned()) {
13262 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13263 osd->logger->inc(l_osd_agent_skip);
13264 return false;
13265 }
13266
13267 utime_t now = ceph_clock_now();
13268 utime_t ob_local_mtime;
13269 if (obc->obs.oi.local_mtime != utime_t()) {
13270 ob_local_mtime = obc->obs.oi.local_mtime;
13271 } else {
13272 ob_local_mtime = obc->obs.oi.mtime;
13273 }
13274 bool evict_mode_full =
13275 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13276 if (!evict_mode_full &&
13277 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13278 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13279 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13280 osd->logger->inc(l_osd_agent_skip);
13281 return false;
13282 }
13283
13284 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13285 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13286 osd->logger->inc(l_osd_agent_skip);
13287 return false;
13288 }
13289
13290 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13291
13292 // FIXME: flush anything dirty, regardless of what distribution of
13293 // ages we expect.
13294
13295 hobject_t oid = obc->obs.oi.soid;
13296 osd->agent_start_op(oid);
13297 // no need to capture a pg ref, can't outlive fop or ctx
13298 std::function<void()> on_flush = [this, oid]() {
13299 osd->agent_finish_op(oid);
13300 };
13301
13302 int result = start_flush(
13303 OpRequestRef(), obc, false, NULL,
13304 on_flush);
13305 if (result != -EINPROGRESS) {
13306 on_flush();
13307 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13308 << " with " << result << dendl;
13309 osd->logger->inc(l_osd_agent_skip);
13310 return false;
13311 }
13312
13313 osd->logger->inc(l_osd_agent_flush);
13314 return true;
13315}
13316
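/** agent_maybe_evict
 *
 * Evict one clean object with no watchers, clones, or cache pin.
 * Outside of EVICT_MODE_FULL the object must also be past
 * cache_min_evict_age and must pass the temperature test: its position
 * in the temperature histogram is compared against evict_effort, so a
 * larger effort evicts a larger share of objects. Returns true if an
 * eviction was submitted.
 */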
13317bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13318{
13319 const hobject_t& soid = obc->obs.oi.soid;
13320 if (!after_flush && obc->obs.oi.is_dirty()) {
13321 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13322 return false;
13323 }
13324 if (!obc->obs.oi.watchers.empty()) {
13325 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13326 return false;
13327 }
13328 if (obc->is_blocked()) {
13329 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13330 return false;
13331 }
13332 if (obc->obs.oi.is_cache_pinned()) {
13333 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13334 return false;
13335 }
13336
13337 if (soid.snap == CEPH_NOSNAP) {
13338 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13339 if (result < 0) {
13340 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13341 return false;
13342 }
13343 }
13344
13345 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13346 // is this object older than cache_min_evict_age?
13347 utime_t now = ceph_clock_now();
13348 utime_t ob_local_mtime;
13349 if (obc->obs.oi.local_mtime != utime_t()) {
13350 ob_local_mtime = obc->obs.oi.local_mtime;
13351 } else {
13352 ob_local_mtime = obc->obs.oi.mtime;
13353 }
13354 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13355 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13356 osd->logger->inc(l_osd_agent_skip);
13357 return false;
13358 }
13359 // is this object old and/or cold enough?
13360 int temp = 0;
13361 uint64_t temp_upper = 0, temp_lower = 0;
13362 if (hit_set)
13363 agent_estimate_temp(soid, &temp);
13364 agent_state->temp_hist.add(temp);
13365 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13366
13367 dout(20) << __func__
13368 << " temp " << temp
13369 << " pos " << temp_lower << "-" << temp_upper
13370 << ", evict_effort " << agent_state->evict_effort
13371 << dendl;
13372 dout(30) << "agent_state:\n";
13373 Formatter *f = Formatter::create("");
13374 f->open_object_section("agent_state");
13375 agent_state->dump(f);
13376 f->close_section();
13377 f->flush(*_dout);
13378 delete f;
13379 *_dout << dendl;
13380
13381 if (1000000 - temp_upper >= agent_state->evict_effort)
13382 return false;
13383 }
13384
13385 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13386 OpContextUPtr ctx = simple_opc_create(obc);
13387
13388 if (!ctx->lock_manager.get_lock_type(
13389 ObjectContext::RWState::RWWRITE,
13390 obc->obs.oi.soid,
13391 obc,
13392 OpRequestRef())) {
13393 close_op_ctx(ctx.release());
13394 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13395 return false;
13396 }
13397
13398 osd->agent_start_evict_op();
13399 ctx->register_on_finish(
13400 [this]() {
13401 osd->agent_finish_evict_op();
13402 });
13403
13404 ctx->at_version = get_next_version();
13405 assert(ctx->new_obs.exists);
13406 int r = _delete_oid(ctx.get(), true, false);
13407 if (obc->obs.oi.is_omap())
13408 ctx->delta_stats.num_objects_omap--;
13409 ctx->delta_stats.num_evict++;
13410 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13411 if (obc->obs.oi.is_dirty())
13412 --ctx->delta_stats.num_objects_dirty;
13413 assert(r == 0);
13414 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13415 simple_opc_submit(std::move(ctx));
13416 osd->logger->inc(l_osd_tier_evict);
13417 osd->logger->inc(l_osd_agent_evict);
13418 return true;
13419}
13420
13421void PrimaryLogPG::agent_stop()
13422{
13423 dout(20) << __func__ << dendl;
13424 if (agent_state && !agent_state->is_idle()) {
13425 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13426 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13427 osd->agent_disable_pg(this, agent_state->evict_effort);
13428 }
13429}
13430
13431void PrimaryLogPG::agent_delay()
13432{
13433 dout(20) << __func__ << dendl;
13434 if (agent_state && !agent_state->is_idle()) {
13435 assert(agent_state->delaying == false);
13436 agent_state->delaying = true;
13437 osd->agent_disable_pg(this, agent_state->evict_effort);
13438 }
13439}
13440
13441void PrimaryLogPG::agent_choose_mode_restart()
13442{
13443 dout(20) << __func__ << dendl;
13444 lock();
13445 if (agent_state && agent_state->delaying) {
13446 agent_state->delaying = false;
13447 agent_choose_mode(true);
13448 }
13449 unlock();
13450}
13451
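/** agent_choose_mode
 *
 * Recompute flush/evict mode from this PG's share of the pool-wide
 * targets. Roughly, with divisor the PG's share of the pool (pg_num
 * for an unsplit pool):
 *
 *   full_micro  = num_user_objects * avg_size * 1000000
 *                   / (target_max_bytes / divisor)
 *   dirty_micro = num_dirty * avg_size * 1000000
 *                   / (target_max_bytes / divisor)
 *
 * with the object-count based ratios taking over when larger. The
 * ratios are then compared, with osd_agent_slop hysteresis, against
 * the pool's cache_target_* thresholds to pick
 * FLUSH_MODE_{IDLE,LOW,HIGH} and EVICT_MODE_{IDLE,SOME,FULL}; in
 * EVICT_MODE_SOME, evict_effort scales linearly with how far
 * full_micro sits above the evict target, quantized by
 * osd_agent_quantize_effort.
 */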
13452bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13453{
13454 bool requeued = false;
13455 // Let delay play out
13456 if (agent_state->delaying) {
13457 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13458 return requeued;
13459 }
13460
13461 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13462 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13463 unsigned evict_effort = 0;
13464
13465 if (info.stats.stats_invalid) {
13466 // idle; stats can't be trusted until we scrub.
13467 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13468 goto skip_calc;
13469 }
13470
13471 {
13472 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13473 assert(divisor > 0);
13474
13475 // adjust (effective) user objects down based on the number
13476 // of HitSet objects, which should not count toward our total since
13477 // they cannot be flushed.
13478 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13479
13480 // also exclude omap objects if ec backing pool
13481 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13482 assert(base_pool);
13483 if (!base_pool->supports_omap())
13484 unflushable += info.stats.stats.sum.num_objects_omap;
13485
13486 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13487 if (num_user_objects > unflushable)
13488 num_user_objects -= unflushable;
13489 else
13490 num_user_objects = 0;
13491
13492 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13493 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13494 num_user_bytes -= unflushable_bytes;
13495 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13496 num_user_bytes += num_overhead_bytes;
13497
13498 // also reduce the num_dirty by num_objects_omap
13499 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13500 if (!base_pool->supports_omap()) {
13501 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13502 num_dirty -= info.stats.stats.sum.num_objects_omap;
13503 else
13504 num_dirty = 0;
13505 }
13506
13507 dout(10) << __func__
13508 << " flush_mode: "
13509 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13510 << " evict_mode: "
13511 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13512 << " num_objects: " << info.stats.stats.sum.num_objects
13513 << " num_bytes: " << info.stats.stats.sum.num_bytes
13514 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13515 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13516 << " num_dirty: " << num_dirty
13517 << " num_user_objects: " << num_user_objects
13518 << " num_user_bytes: " << num_user_bytes
13519 << " num_overhead_bytes: " << num_overhead_bytes
13520 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13521 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13522 << dendl;
13523
13524 // get dirty, full ratios
13525 uint64_t dirty_micro = 0;
13526 uint64_t full_micro = 0;
13527 if (pool.info.target_max_bytes && num_user_objects > 0) {
13528 uint64_t avg_size = num_user_bytes / num_user_objects;
13529 dirty_micro =
13530 num_dirty * avg_size * 1000000 /
13531 MAX(pool.info.target_max_bytes / divisor, 1);
13532 full_micro =
13533 num_user_objects * avg_size * 1000000 /
13534 MAX(pool.info.target_max_bytes / divisor, 1);
13535 }
13536 if (pool.info.target_max_objects > 0) {
13537 uint64_t dirty_objects_micro =
13538 num_dirty * 1000000 /
13539 MAX(pool.info.target_max_objects / divisor, 1);
13540 if (dirty_objects_micro > dirty_micro)
13541 dirty_micro = dirty_objects_micro;
13542 uint64_t full_objects_micro =
13543 num_user_objects * 1000000 /
13544 MAX(pool.info.target_max_objects / divisor, 1);
13545 if (full_objects_micro > full_micro)
13546 full_micro = full_objects_micro;
13547 }
13548 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13549 << " full " << ((float)full_micro / 1000000.0)
13550 << dendl;
13551
13552 // flush mode
13553 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13554 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13555 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13556 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13557 flush_target += flush_slop;
13558 flush_high_target += flush_slop;
13559 } else {
13560 flush_target -= MIN(flush_target, flush_slop);
13561 flush_high_target -= MIN(flush_high_target, flush_slop);
13562 }
13563
13564 if (dirty_micro > flush_high_target) {
13565 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13566 } else if (dirty_micro > flush_target) {
13567 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13568 }
13569
13570 // evict mode
13571 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13572 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13573 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13574 evict_target += evict_slop;
13575 else
13576 evict_target -= MIN(evict_target, evict_slop);
13577
13578 if (full_micro > 1000000) {
13579 // evict anything clean
13580 evict_mode = TierAgentState::EVICT_MODE_FULL;
13581 evict_effort = 1000000;
13582 } else if (full_micro > evict_target) {
13583 // set effort in [0..1] range based on where we are between the evict target and completely full
13584 evict_mode = TierAgentState::EVICT_MODE_SOME;
13585 uint64_t over = full_micro - evict_target;
13586 uint64_t span = 1000000 - evict_target;
13587 evict_effort = MAX(over * 1000000 / span,
13588 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13589
13590 // quantize effort to avoid too much reordering in the agent_queue.
13591 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13592 assert(inc > 0);
13593 uint64_t was = evict_effort;
13594 evict_effort -= evict_effort % inc;
13595 if (evict_effort < inc)
13596 evict_effort = inc;
13597 assert(evict_effort >= inc && evict_effort <= 1000000);
13598 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13599 }
13600 }
13601
13602 skip_calc:
13603 bool old_idle = agent_state->is_idle();
13604 if (flush_mode != agent_state->flush_mode) {
13605 dout(5) << __func__ << " flush_mode "
13606 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13607 << " -> "
13608 << TierAgentState::get_flush_mode_name(flush_mode)
13609 << dendl;
13610 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13611 osd->agent_inc_high_count();
13612 info.stats.stats.sum.num_flush_mode_high = 1;
13613 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13614 info.stats.stats.sum.num_flush_mode_low = 1;
13615 }
13616 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13617 osd->agent_dec_high_count();
13618 info.stats.stats.sum.num_flush_mode_high = 0;
13619 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13620 info.stats.stats.sum.num_flush_mode_low = 0;
13621 }
13622 agent_state->flush_mode = flush_mode;
13623 }
13624 if (evict_mode != agent_state->evict_mode) {
13625 dout(5) << __func__ << " evict_mode "
13626 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13627 << " -> "
13628 << TierAgentState::get_evict_mode_name(evict_mode)
13629 << dendl;
13630 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13631 is_active()) {
13632 if (op)
13633 requeue_op(op);
b32b8144 13634 requeue_ops(waiting_for_flush);
7c673cae
FG
13635 requeue_ops(waiting_for_active);
13636 requeue_ops(waiting_for_scrub);
13637 requeue_ops(waiting_for_cache_not_full);
13638 objects_blocked_on_cache_full.clear();
13639 requeued = true;
13640 }
13641 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13642 info.stats.stats.sum.num_evict_mode_some = 1;
13643 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13644 info.stats.stats.sum.num_evict_mode_full = 1;
13645 }
13646 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13647 info.stats.stats.sum.num_evict_mode_some = 0;
13648 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13649 info.stats.stats.sum.num_evict_mode_full = 0;
13650 }
13651 agent_state->evict_mode = evict_mode;
13652 }
13653 uint64_t old_effort = agent_state->evict_effort;
13654 if (evict_effort != agent_state->evict_effort) {
13655 dout(5) << __func__ << " evict_effort "
13656 << ((float)agent_state->evict_effort / 1000000.0)
13657 << " -> "
13658 << ((float)evict_effort / 1000000.0)
13659 << dendl;
13660 agent_state->evict_effort = evict_effort;
13661 }
13662
13663 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13664 // (including flush). This is probably fine (they should be
13665 // correlated) but it is not precisely correct.
13666 if (agent_state->is_idle()) {
13667 if (!restart && !old_idle) {
13668 osd->agent_disable_pg(this, old_effort);
13669 }
13670 } else {
13671 if (restart || old_idle) {
13672 osd->agent_enable_pg(this, agent_state->evict_effort);
13673 } else if (old_effort != agent_state->evict_effort) {
13674 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13675 }
13676 }
13677 return requeued;
13678}
13679
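/** agent_estimate_temp
 *
 * Score an object's temperature: 1000000 if it is in the current hit
 * set, plus pool.info.get_grade(i) for each of the newest
 * hit_set_search_last_n archived sets containing it, where i is the
 * archive's age rank (0 = newest). Recently and repeatedly accessed
 * objects therefore score highest.
 */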
13680void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13681{
13682 assert(hit_set);
13683 assert(temp);
13684 *temp = 0;
13685 if (hit_set->contains(oid))
13686 *temp = 1000000;
13687 unsigned i = 0;
13688 int last_n = pool.info.hit_set_search_last_n;
13689 for (map<time_t,HitSetRef>::reverse_iterator p =
13690 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13691 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13692 if (p->second->contains(oid)) {
13693 *temp += pool.info.get_grade(i);
13694 --last_n;
13695 }
13696 }
13697}
13698
13699// Dup op detection
13700
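/** already_complete
 *
 * Return true if every in-flight repop at or below version v has been
 * committed by all shards, i.e. a dup of the client op that produced
 * v can be acked as committed without re-executing it.
 */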
13701bool PrimaryLogPG::already_complete(eversion_t v)
13702{
13703 dout(20) << __func__ << ": " << v << dendl;
13704 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13705 !i.end();
13706 ++i) {
13707 dout(20) << __func__ << ": " << **i << dendl;
13708 // skip copy from temp object ops
13709 if ((*i)->v == eversion_t()) {
13710 dout(20) << __func__ << ": " << **i
13711 << " version is empty" << dendl;
13712 continue;
13713 }
13714 if ((*i)->v > v) {
13715 dout(20) << __func__ << ": " << **i
13716 << " (*i)->v past v" << dendl;
13717 break;
13718 }
13719 if (!(*i)->all_committed) {
13720 dout(20) << __func__ << ": " << **i
13721 << " not committed, returning false"
13722 << dendl;
13723 return false;
13724 }
13725 }
13726 dout(20) << __func__ << ": returning true" << dendl;
13727 return true;
13728}
13729
13730bool PrimaryLogPG::already_ack(eversion_t v)
13731{
13732 dout(20) << __func__ << ": " << v << dendl;
13733 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13734 !i.end();
13735 ++i) {
13736 // skip copy from temp object ops
13737 if ((*i)->v == eversion_t()) {
13738 dout(20) << __func__ << ": " << **i
13739 << " version is empty" << dendl;
13740 continue;
13741 }
13742 if ((*i)->v > v) {
13743 dout(20) << __func__ << ": " << **i
13744 << " (*i)->v past v" << dendl;
13745 break;
13746 }
13747 if (!(*i)->all_applied) {
13748 dout(20) << __func__ << ": " << **i
13749 << " not applied, returning false"
13750 << dendl;
13751 return false;
13752 }
13753 }
13754 dout(20) << __func__ << ": returning true" << dendl;
13755 return true;
13756}
13757
13758
13759// ==========================================================================================
13760// SCRUB
13761
13762
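/** _range_available_for_scrub
 *
 * Walk the cached object contexts in [begin, end) and refuse the
 * range if any object is blocked (e.g. by an in-flight promote); the
 * blocking object is flagged so the scrub is requeued when it
 * unblocks.
 */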
13763bool PrimaryLogPG::_range_available_for_scrub(
13764 const hobject_t &begin, const hobject_t &end)
13765{
13766 pair<hobject_t, ObjectContextRef> next;
13767 next.second = object_contexts.lookup(begin);
13768 next.first = begin;
13769 bool more = true;
13770 while (more && next.first < end) {
13771 if (next.second && next.second->is_blocked()) {
13772 next.second->requeue_scrub_on_unblock = true;
13773 dout(10) << __func__ << ": scrub delayed, "
13774 << next.first << " is blocked"
13775 << dendl;
13776 return false;
13777 }
13778 more = object_contexts.get_next(next.first, &next);
13779 }
13780 return true;
13781}
13782
13783static bool doing_clones(const boost::optional<SnapSet> &snapset,
13784 const vector<snapid_t>::reverse_iterator &curclone) {
13785 return snapset && curclone != snapset.get().clones.rend();
13786}
13787
13788void PrimaryLogPG::log_missing(unsigned missing,
13789 const boost::optional<hobject_t> &head,
13790 LogChannelRef clog,
13791 const spg_t &pgid,
13792 const char *func,
13793 const char *mode,
13794 bool allow_incomplete_clones)
13795{
13796 assert(head);
13797 if (allow_incomplete_clones) {
13798 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13799 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13800 } else {
13801 clog->info() << mode << " " << pgid << " " << head.get()
13802 << " " << missing << " missing clone(s)";
13803 }
13804}
13805
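/** process_clones_to
 *
 * Advance *curclone past every clone id greater than target (or all
 * remaining clones if target is unset), counting each skipped clone
 * as missing. Unless the pool allows incomplete clones (cache tiers),
 * each miss is logged as a shallow error and recorded in the
 * inconsistent-snapset wrapper. Returns the number of clones skipped.
 */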
13806unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13807 const boost::optional<SnapSet> &snapset,
13808 LogChannelRef clog,
13809 const spg_t &pgid,
13810 const char *mode,
13811 bool allow_incomplete_clones,
13812 boost::optional<snapid_t> target,
13813 vector<snapid_t>::reverse_iterator *curclone,
13814 inconsistent_snapset_wrapper &e)
13815{
13816 assert(head);
13817 assert(snapset);
13818 unsigned missing = 0;
13819
13820 // NOTE: clones are in descending order, hence the **curclone > target test here
13821 hobject_t next_clone(head.get());
13822 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13823 ++missing;
13824 // it is okay to be missing one or more clones in a cache tier.
13825 // skip higher-numbered clones in the list.
13826 if (!allow_incomplete_clones) {
13827 next_clone.snap = **curclone;
13828 clog->error() << mode << " " << pgid << " " << head.get()
c07f9fc5
FG
13829 << " expected clone " << next_clone << " " << missing
13830 << " missing";
7c673cae
FG
13831 ++scrubber.shallow_errors;
13832 e.set_clone_missing(next_clone.snap);
13833 }
13834 // Clones are descending
13835 ++(*curclone);
13836 }
13837 return missing;
13838}
13839
13840/*
13841 * Validate consistency of the object info and snap sets.
13842 *
13843 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13844 * the comparison of the objects is against multiple snapset.clones. There are
13845 * multiple clone lists and in between lists we expect head or snapdir.
13846 *
13847 * Example
13848 *
13849 * objects expected
13850 * ======= =======
13851 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13852 * obj2 head head/snapdir, head ok
13853 * [SnapSet clones 6 4 2 1]
13854 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13855 * obj2 snap 6 obj2 snap 6, match
13856 * obj2 snap 4 obj2 snap 4, match
13857 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13858 * [Snapset clones 3 1]
13859 * obj3 snap 3 obj3 snap 3 match
13860 * obj3 snap 1 obj3 snap 1 match
13861 * obj4 snapdir head/snapdir, snapdir ok
13862 * [Snapset clones 4]
13863 * EOL obj4 snap 4, (expected)
13864 */
13865void PrimaryLogPG::scrub_snapshot_metadata(
13866 ScrubMap &scrubmap,
28e407b8
AA
13867 const map<hobject_t,
13868 pair<boost::optional<uint32_t>,
13869 boost::optional<uint32_t>>> &missing_digest)
7c673cae
FG
13870{
13871 dout(10) << __func__ << dendl;
13872
13873 coll_t c(info.pgid);
13874 bool repair = state_test(PG_STATE_REPAIR);
13875 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13876 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13877 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13878
13879 /// snapsets to repair
13880 map<hobject_t,SnapSet> snapset_to_repair;
13881
13882 // traverse in reverse order.
13883 boost::optional<hobject_t> head;
13884 boost::optional<SnapSet> snapset; // If initialized so will head (above)
13885 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13886 unsigned missing = 0;
13887 inconsistent_snapset_wrapper soid_error, head_error;
94b18763 13888 unsigned soid_error_count = 0;
7c673cae
FG
13889
13890 bufferlist last_data;
13891
13892 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13893 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13894 const hobject_t& soid = p->first;
13895 soid_error = inconsistent_snapset_wrapper{soid};
13896 object_stat_sum_t stat;
13897 boost::optional<object_info_t> oi;
13898
13899 if (!soid.is_snapdir())
13900 stat.num_objects++;
13901
13902 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13903 stat.num_objects_hit_set_archive++;
13904
13905 if (soid.is_snap()) {
13906 // it's a clone
13907 stat.num_object_clones++;
13908 }
13909
13910 // basic checks.
13911 if (p->second.attrs.count(OI_ATTR) == 0) {
13912 oi = boost::none;
13913 osd->clog->error() << mode << " " << info.pgid << " " << soid
13914 << " no '" << OI_ATTR << "' attr";
13915 ++scrubber.shallow_errors;
94b18763 13916 soid_error.set_info_missing();
7c673cae
FG
13917 } else {
13918 bufferlist bv;
13919 bv.push_back(p->second.attrs[OI_ATTR]);
13920 try {
13921 oi = object_info_t(); // Initialize optional<> before decode into it
13922 oi.get().decode(bv);
13923 } catch (buffer::error& e) {
13924 oi = boost::none;
13925 osd->clog->error() << mode << " " << info.pgid << " " << soid
13926 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13927 ++scrubber.shallow_errors;
94b18763
FG
13928 soid_error.set_info_corrupted();
13929 soid_error.set_info_missing(); // Not available either
7c673cae
FG
13930 }
13931 }
13932
13933 if (oi) {
13934 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13935 osd->clog->error() << mode << " " << info.pgid << " " << soid
13936 << " on disk size (" << p->second.size
13937 << ") does not match object info size ("
13938 << oi->size << ") adjusted for ondisk to ("
13939 << pgbackend->be_get_ondisk_size(oi->size)
13940 << ")";
13941 soid_error.set_size_mismatch();
13942 ++scrubber.shallow_errors;
13943 }
13944
13945 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13946
13947 // A clone num_bytes will be added later when we have snapset
13948 if (!soid.is_snap()) {
13949 stat.num_bytes += oi->size;
13950 }
13951 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13952 stat.num_bytes_hit_set_archive += oi->size;
13953
13954 if (!soid.is_snapdir()) {
13955 if (oi->is_dirty())
13956 ++stat.num_objects_dirty;
13957 if (oi->is_whiteout())
13958 ++stat.num_whiteouts;
13959 if (oi->is_omap())
13960 ++stat.num_objects_omap;
13961 if (oi->is_cache_pinned())
13962 ++stat.num_objects_pinned;
13963 }
13964 } else {
13965 // pessimistic assumption that this object might contain a
13966 // legacy SnapSet
13967 stat.num_legacy_snapsets++;
13968 }
13969
13970 // Check for any problems while processing clones
13971 if (doing_clones(snapset, curclone)) {
13972 boost::optional<snapid_t> target;
13973 // Expecting an object with snap for current head
13974 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13975
13976 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13977 << soid << " while processing " << head.get() << dendl;
13978
13979 target = all_clones;
13980 } else {
13981 assert(soid.is_snap());
13982 target = soid.snap;
13983 }
13984
13985 // Log any clones we were expecting to be there up to target
13986 // This will set missing, but will be a no-op if soid.snap == *curclone.
13987 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13988 pool.info.allow_incomplete_clones(), target, &curclone,
13989 head_error);
13990 }
13991 bool expected;
13992 // Check doing_clones() again in case we ran process_clones_to()
13993 if (doing_clones(snapset, curclone)) {
13994 // A head/snapdir would have processed all clones above
13995 // or all greater than *curclone.
13996 assert(soid.is_snap() && *curclone <= soid.snap);
13997
13998 // After the processing above, the clone's snap should match the expected curclone
13999 expected = (*curclone == soid.snap);
14000 } else {
14001 // If we aren't doing clones any longer, then expecting head/snapdir
14002 expected = soid.has_snapset();
14003 }
14004 if (!expected) {
14005 // If we couldn't read the head's snapset, just ignore clones
14006 if (head && !snapset) {
14007 osd->clog->error() << mode << " " << info.pgid << " " << soid
14008 << " clone ignored due to missing snapset";
14009 } else {
14010 osd->clog->error() << mode << " " << info.pgid << " " << soid
14011 << " is an unexpected clone";
14012 }
14013 ++scrubber.shallow_errors;
14014 soid_error.set_headless();
14015 scrubber.store->add_snap_error(pool.id, soid_error);
94b18763 14016 ++soid_error_count;
14017 if (head && soid.get_head() == head->get_head())
14018 head_error.set_clone(soid.snap);
14019 continue;
14020 }
14021
14022 // new snapset?
14023 if (soid.has_snapset()) {
14024
14025 if (missing) {
14026 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14027 pool.info.allow_incomplete_clones());
14028 }
14029
14030 // Save previous head error information
94b18763 14031 if (head && (head_error.errors || soid_error_count))
14032 scrubber.store->add_snap_error(pool.id, head_error);
14033 // Set this as a new head object
14034 head = soid;
14035 missing = 0;
14036 head_error = soid_error;
94b18763 14037 soid_error_count = 0;
14038
14039 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14040
14041 if (p->second.attrs.count(SS_ATTR) == 0) {
14042 osd->clog->error() << mode << " " << info.pgid << " " << soid
14043 << " no '" << SS_ATTR << "' attr";
14044 ++scrubber.shallow_errors;
14045 snapset = boost::none;
94b18763 14046 head_error.set_snapset_missing();
14047 } else {
14048 bufferlist bl;
14049 bl.push_back(p->second.attrs[SS_ATTR]);
14050 bufferlist::iterator blp = bl.begin();
14051 try {
14052 snapset = SnapSet(); // Initialize optional<> before decoding into it
14053 ::decode(snapset.get(), blp);
94b18763 14054 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14055 } catch (buffer::error& e) {
14056 snapset = boost::none;
14057 osd->clog->error() << mode << " " << info.pgid << " " << soid
14058 << " can't decode '" << SS_ATTR << "' attr " << e.what();
14059 ++scrubber.shallow_errors;
94b18763 14060 head_error.set_snapset_corrupted();
14061 }
14062 }
14063
14064 if (snapset) {
14065 // what will be next?
14066 curclone = snapset->clones.rbegin();
14067
14068 if (!snapset->clones.empty()) {
14069 dout(20) << " snapset " << snapset.get() << dendl;
14070 if (snapset->seq == 0) {
14071 osd->clog->error() << mode << " " << info.pgid << " " << soid
14072 << " snaps.seq not set";
14073 ++scrubber.shallow_errors;
94b18763 14074 head_error.set_snapset_error();
14075 }
14076 }
14077
14078 if (soid.is_head() && !snapset->head_exists) {
14079 osd->clog->error() << mode << " " << info.pgid << " " << soid
14080 << " snapset.head_exists=false, but head exists";
14081 ++scrubber.shallow_errors;
14082 head_error.set_head_mismatch();
14083 // Fix head_exists locally so is_legacy() returns correctly
14084 snapset->head_exists = true;
14085 }
14086 if (soid.is_snapdir() && snapset->head_exists) {
14087 osd->clog->error() << mode << " " << info.pgid << " " << soid
14088 << " snapset.head_exists=true, but snapdir exists";
14089 ++scrubber.shallow_errors;
14090 head_error.set_head_mismatch();
14091 // For symmetry fix this too, but probably doesn't matter
14092 snapset->head_exists = false;
14093 }
14094
31f18b77 14095 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
14096 if (soid.is_snapdir()) {
14097 dout(10) << " will move snapset to head from " << soid << dendl;
14098 snapset_to_repair[soid.get_head()] = *snapset;
14099 } else if (snapset->is_legacy()) {
14100 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
14101 << dendl;
14102 snapset_to_repair[soid.get_head()] = *snapset;
14103 }
14104 } else {
14105 stat.num_legacy_snapsets++;
14106 }
14107 } else {
14108 // pessimistic assumption that this object might contain a
14109 // legacy SnapSet
14110 stat.num_legacy_snapsets++;
14111 }
14112 } else {
14113 assert(soid.is_snap());
14114 assert(head);
14115 assert(snapset);
14116 assert(soid.snap == *curclone);
14117
14118 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14119
14120 if (snapset->clone_size.count(soid.snap) == 0) {
14121 osd->clog->error() << mode << " " << info.pgid << " " << soid
14122 << " is missing in clone_size";
14123 ++scrubber.shallow_errors;
14124 soid_error.set_size_mismatch();
14125 } else {
14126 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14127 osd->clog->error() << mode << " " << info.pgid << " " << soid
14128 << " size " << oi->size << " != clone_size "
14129 << snapset->clone_size[*curclone];
14130 ++scrubber.shallow_errors;
14131 soid_error.set_size_mismatch();
14132 }
14133
14134 if (snapset->clone_overlap.count(soid.snap) == 0) {
14135 osd->clog->error() << mode << " " << info.pgid << " " << soid
14136 << " is missing in clone_overlap";
14137 ++scrubber.shallow_errors;
14138 soid_error.set_size_mismatch();
14139 } else {
14140 // This checking is based on get_clone_bytes(). The first 2 asserts
14141 // can't happen because we know we have a clone_size and
14142 // a clone_overlap. Now we check that the interval_set won't
14143 // cause the last assert.
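	  // Worked example (hypothetical numbers, for illustration only):
	  // clone_size = 100 with clone_overlap = {[0,30), [50,20)} leaves
	  // 100 - 30 - 20 = 50 bytes charged to the clone; if the overlap
	  // lengths summed to more than clone_size, the subtraction in
	  // get_clone_bytes() would underflow and trip its final assert.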
14144 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14145 const interval_set<uint64_t> &overlap =
14146 snapset->clone_overlap.find(soid.snap)->second;
14147 bool bad_interval_set = false;
14148 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14149 i != overlap.end(); ++i) {
14150 if (size < i.get_len()) {
14151 bad_interval_set = true;
14152 break;
14153 }
14154 size -= i.get_len();
14155 }
14156
14157 if (bad_interval_set) {
14158 osd->clog->error() << mode << " " << info.pgid << " " << soid
14159 << " bad interval_set in clone_overlap";
14160 ++scrubber.shallow_errors;
14161 soid_error.set_size_mismatch();
14162 } else {
14163 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14164 }
14165 }
14166 }
14167
14168 // migrate legacy_snaps to snapset?
14169 auto p = snapset_to_repair.find(soid.get_head());
14170 if (p != snapset_to_repair.end()) {
14171 if (!oi || oi->legacy_snaps.empty()) {
14172 osd->clog->error() << mode << " " << info.pgid << " " << soid
14173 << " has no oi or legacy_snaps; cannot convert "
14174 << *snapset;
14175 ++scrubber.shallow_errors;
14176 } else {
14177 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14178 << " to snapset " << p->second << dendl;
14179 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14180 }
14181 }
14182
14183 // what's next?
14184 ++curclone;
94b18763 14185 if (soid_error.errors) {
7c673cae 14186 scrubber.store->add_snap_error(pool.id, soid_error);
14187 ++soid_error_count;
14188 }
14189 }
14190
14191 scrub_cstat.add(stat);
14192 }
14193
14194 if (doing_clones(snapset, curclone)) {
14195 dout(10) << __func__ << " " << mode << " " << info.pgid
14196 << " No more objects while processing " << head.get() << dendl;
14197
14198 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14199 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14200 head_error);
14201 }
14202  // There could be missing clones found by the check above, or ones found
14203  // earlier, before dropping out of the loop for the last head.
14204 if (missing) {
14205 log_missing(missing, head, osd->clog, info.pgid, __func__,
14206 mode, pool.info.allow_incomplete_clones());
14207 }
94b18763 14208 if (head && (head_error.errors || soid_error_count))
14209 scrubber.store->add_snap_error(pool.id, head_error);
14210
28e407b8 14211 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
14212 if (p->first.is_snapdir())
14213 continue;
14214 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14215 ObjectContextRef obc = get_object_context(p->first, false);
14216 if (!obc) {
14217 osd->clog->error() << info.pgid << " " << mode
c07f9fc5 14218 << " cannot get object context for object "
14219 << p->first;
14220 continue;
14221 } else if (obc->obs.oi.soid != p->first) {
14222 osd->clog->error() << info.pgid << " " << mode
14223 << " object " << p->first
14224 << " has a valid oi attr with a mismatched name, "
14225 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14226 continue;
14227 }
14228 OpContextUPtr ctx = simple_opc_create(obc);
14229 ctx->at_version = get_next_version();
14230 ctx->mtime = utime_t(); // do not update mtime
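	// p->second holds the (data, omap) digest pair established by the scrub;
	// an empty optional means no authoritative digest exists, so any stored
	// digest is cleared instead.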
14231 if (p->second.first) {
14232 ctx->new_obs.oi.set_data_digest(*p->second.first);
14233 } else {
14234 ctx->new_obs.oi.clear_data_digest();
14235 }
14236 if (p->second.second) {
14237 ctx->new_obs.oi.set_omap_digest(*p->second.second);
14238 } else {
14239 ctx->new_obs.oi.clear_omap_digest();
14240 }
14241 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14242
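	// When the last pending digest update commits, requeue the scrub so its
	// state machine can finish.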
14243 ctx->register_on_success(
14244 [this]() {
14245 dout(20) << "updating scrub digest" << dendl;
14246 if (--scrubber.num_digest_updates_pending == 0) {
14247 requeue_scrub();
14248 }
14249 });
14250
14251 simple_opc_submit(std::move(ctx));
14252 ++scrubber.num_digest_updates_pending;
14253 }
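	// Second pass: write the repaired/converted SnapSets collected in
	// snapset_to_repair back onto their head objects.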
14254 for (auto& p : snapset_to_repair) {
14255 // cache pools may not have the clones, which means we won't know
14256 // what snaps they have. fake out the clone_snaps entries anyway (with
14257 // blank snap lists).
14258 p.second.head_exists = true;
14259 if (pool.info.allow_incomplete_clones()) {
14260 for (auto s : p.second.clones) {
14261 if (p.second.clone_snaps.count(s) == 0) {
14262 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14263 << s << dendl;
14264 p.second.clone_snaps[s];
14265 }
14266 }
14267 }
14268 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14269 p.second.is_legacy()) {
14270 // this happens if we encounter other errors above, like a missing
14271 // or extra clone.
14272 dout(10) << __func__ << " not writing snapset to " << p.first
14273 << " snapset " << p.second << " clones " << p.second.clones
14274 << "; didn't convert fully" << dendl;
14275 scrub_cstat.sum.num_legacy_snapsets++;
14276 continue;
14277 }
14278 dout(10) << __func__ << " writing snapset to " << p.first
14279 << " " << p.second << dendl;
14280 ObjectContextRef obc = get_object_context(p.first, true);
14281 if (!obc) {
14282 osd->clog->error() << info.pgid << " " << mode
c07f9fc5 14283 << " cannot get object context for object "
14284 << p.first;
14285 continue;
14286 } else if (obc->obs.oi.soid != p.first) {
14287 osd->clog->error() << info.pgid << " " << mode
14288 << " object " << p.first
14289 << " has a valid oi attr with a mismatched name, "
14290 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14291 continue;
14292 }
14293 ObjectContextRef snapset_obc;
14294 if (!obc->obs.exists) {
14295 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14296 if (!snapset_obc) {
14297 osd->clog->error() << info.pgid << " " << mode
14298 << " cannot get object context for "
14299 << p.first.get_snapdir();
14300 continue;
14301 }
14302 }
14303 OpContextUPtr ctx = simple_opc_create(obc);
14304 PGTransaction *t = ctx->op_t.get();
14305 ctx->snapset_obc = snapset_obc;
14306 ctx->at_version = get_next_version();
14307 ctx->mtime = utime_t(); // do not update mtime
14308 ctx->new_snapset = p.second;
14309 if (!ctx->new_obs.exists) {
14310 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14311 ctx->new_obs.exists = true;
14312 ctx->new_snapset.head_exists = true;
14313 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14314 ++ctx->delta_stats.num_whiteouts;
14315 ++ctx->delta_stats.num_objects;
14316 t->create(p.first);
14317 if (p.first < scrubber.start) {
14318 dout(20) << __func__ << " kludging around update outside of scrub range"
14319 << dendl;
14320 } else {
14321 scrub_cstat.add(ctx->delta_stats);
14322 }
14323 }
14324 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14325 assert(!ctx->new_snapset.is_legacy());
14326 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14327 ctx->register_on_success(
14328 [this]() {
14329 dout(20) << "updating snapset" << dendl;
14330 if (--scrubber.num_digest_updates_pending == 0) {
14331 requeue_scrub();
14332 }
14333 });
14334
14335 simple_opc_submit(std::move(ctx));
14336 ++scrubber.num_digest_updates_pending;
14337 }
14338
14339 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14340}
14341
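// Drop the stats accumulated by the scrub when scrub state is cleared.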
14342void PrimaryLogPG::_scrub_clear_state()
14343{
14344 scrub_cstat = object_stat_collection_t();
14345}
14346
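// Compare the totals accumulated by this scrub (scrub_cstat) against the
// PG's persisted stats; log any mismatch and, when repairing, adopt the
// scrubbed values.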
14347void PrimaryLogPG::_scrub_finish()
14348{
14349 bool repair = state_test(PG_STATE_REPAIR);
14350 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14351 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14352
14353 if (info.stats.stats_invalid) {
14354 info.stats.stats = scrub_cstat;
14355 info.stats.stats_invalid = false;
14356
14357 if (agent_state)
14358 agent_choose_mode();
14359 }
14360
14361 dout(10) << mode << " got "
14362 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14363 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14364 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14365 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14366 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14367 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14368 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14369 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14370 << dendl;
14371
14372 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14373 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14374 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14375 !info.stats.dirty_stats_invalid) ||
14376 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14377 !info.stats.omap_stats_invalid) ||
14378 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14379 !info.stats.pin_stats_invalid) ||
14380 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14381 !info.stats.hitset_stats_invalid) ||
14382 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14383 !info.stats.hitset_bytes_stats_invalid) ||
14384 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14385 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14386 osd->clog->error() << info.pgid << " " << mode
14387 << " stat mismatch, got "
14388 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14389 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14390 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14391 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14392 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14393 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14394 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14395 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14396 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14397 ++scrubber.shallow_errors;
14398
14399 if (repair) {
14400 ++scrubber.fixed;
14401 info.stats.stats = scrub_cstat;
14402 info.stats.dirty_stats_invalid = false;
14403 info.stats.omap_stats_invalid = false;
14404 info.stats.hitset_stats_invalid = false;
14405 info.stats.hitset_bytes_stats_invalid = false;
14406 publish_stats_to_osd();
14407 share_pg_info();
14408 }
14409 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14410 info.stats.stats.sum.num_legacy_snapsets) {
14411 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14412 << " from " << info.stats.stats.sum.num_legacy_snapsets
14413 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14414 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14415 publish_stats_to_osd();
14416 share_pg_info();
14417 }
14418 // Clear object context cache to get repair information
14419 if (repair)
14420 object_contexts.clear();
14421}
14422
14423bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14424{
14425 return osd->check_osdmap_full(missing_on);
14426}
14427
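// Called when the primary hits a local read error (e.g. EIO) on soid: mark
// the object missing so recovery can re-replicate it, park the op until the
// object is readable again, and return -EAGAIN to signal the op was deferred.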
14428int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14429{
14430 // Only supports replicated pools
14431 assert(!pool.info.require_rollback());
14432 assert(is_primary());
14433
14434 dout(10) << __func__ << " " << soid
14435 << " peers osd.{" << actingbackfill << "}" << dendl;
14436
14437 if (!is_clean()) {
14438 block_for_clean(soid, op);
14439 return -EAGAIN;
14440 }
14441
14442 assert(!pg_log.get_missing().is_missing(soid));
14443 bufferlist bv;
14444 object_info_t oi;
14445 eversion_t v;
14446 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14447 if (r < 0) {
14448    // Getting the attr failed; leave v default-constructed and try to repair without a version
14449 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14450 << soid << " error=" << r << dendl;
14451 } else try {
14452 bufferlist::iterator bliter = bv.begin();
14453 ::decode(oi, bliter);
14454 v = oi.version;
14455 } catch (...) {
14456 // Leave v as default constructed. This will fail when sent to older OSDs, but
14457 // not much worse than failing here.
14458 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14459 }
14460
14461 missing_loc.add_missing(soid, v, eversion_t());
14462 if (primary_error(soid, v)) {
14463 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14464 // XXX: If we knew that there is no down osd which could include this
14465 // object, it would be nice if we could return EIO here.
14466 // If a "never fail" flag was available, that could be used
14467 // for rbd to NOT return EIO until object marked lost.
14468
14469 // Drop through to save this op in case an osd comes up with the object.
14470 }
14471
14472 // Restart the op after object becomes readable again
14473 waiting_for_unreadable_object[soid].push_back(op);
14474 op->mark_delayed("waiting for missing object");
14475
14476 if (!eio_errors_to_process) {
14477 eio_errors_to_process = true;
14478 assert(is_clean());
14479 queue_peering_event(
14480 CephPeeringEvtRef(
14481 std::make_shared<CephPeeringEvt>(
14482 get_osdmap()->get_epoch(),
14483 get_osdmap()->get_epoch(),
14484 DoRecovery())));
14485 } else {
14486 // A prior error must have already cleared clean state and queued recovery
14487 // or a map change has triggered re-peering.
14488 // Not inlining the recovery by calling maybe_kick_recovery(soid);
14489 dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
14490 }
14491
14492 return -EAGAIN;
14493}
14494
14495/*---SnapTrimmer Logging---*/
14496#undef dout_prefix
14497#define dout_prefix *_dout << pg->gen_prefix()
14498
14499void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14500{
14501 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14502}
14503
14504void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14505{
14506 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14507}
14508
14509/*---SnapTrimmer states---*/
14510#undef dout_prefix
14511#define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14512 << "SnapTrimmer state<" << get_state_name() << ">: ")
14513
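// The states below form a boost::statechart machine: NotTrimming waits for
// KickTrim, WaitReservation waits for SnapTrimReserved, and AwaitAsyncWork
// performs the per-object trims once DoSnapWork fires.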
14514/* NotTrimming */
14515PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14516 : my_base(ctx),
14517 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14518{
14519 context< SnapTrimmer >().log_enter(state_name);
14520}
14521
14522void PrimaryLogPG::NotTrimming::exit()
14523{
14524 context< SnapTrimmer >().log_exit(state_name, enter_time);
14525}
14526
14527boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14528{
14529 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14530 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14531
14532 if (!(pg->is_primary() && pg->is_active())) {
14533 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14534 return discard_event();
14535 }
14536 if (!pg->is_clean() ||
14537 pg->snap_trimq.empty()) {
14538 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14539 return discard_event();
14540 }
14541 if (pg->scrubber.active) {
14542 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14543 return transit< WaitScrub >();
14544 } else {
14545 return transit< Trimming >();
14546 }
14547}
14548
14549boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14550{
14551 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14552 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14553
14554 pending = nullptr;
14555 if (!context< SnapTrimmer >().can_trim()) {
14556 post_event(KickTrim());
14557 return transit< NotTrimming >();
14558 }
14559
14560 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14561 ldout(pg->cct, 10) << "NotTrimming: trimming "
14562 << pg->snap_trimq.range_start()
14563 << dendl;
14564 return transit< AwaitAsyncWork >();
14565}
14566
14567/* AwaitAsyncWork */
14568PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14569 : my_base(ctx),
14570 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14571{
14572 auto *pg = context< SnapTrimmer >().pg;
14573 context< SnapTrimmer >().log_enter(state_name);
14574 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14575 pg->state_set(PG_STATE_SNAPTRIM);
224ce89b 14576 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14577 pg->publish_stats_to_osd();
14578}
14579
14580boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14581{
14582 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14583 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14584 auto &in_flight = context<Trimming>().in_flight;
14585 assert(in_flight.empty());
14586
14587 assert(pg->is_primary() && pg->is_active());
14588 if (!context< SnapTrimmer >().can_trim()) {
14589 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14590 post_event(KickTrim());
14591 return transit< NotTrimming >();
14592 }
14593
14594 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14595
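  // Ask the snap mapper for up to osd_pg_max_concurrent_snap_trims objects
  // still mapped to this snap; -ENOENT means nothing is left and the snap
  // can be moved to purged_snaps.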
14596 vector<hobject_t> to_trim;
14597 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14598 to_trim.reserve(max);
14599 int r = pg->snap_mapper.get_next_objects_to_trim(
14600 snap_to_trim,
14601 max,
14602 &to_trim);
14603 if (r != 0 && r != -ENOENT) {
14604 lderr(pg->cct) << "get_next_objects_to_trim returned "
14605 << cpp_strerror(r) << dendl;
14606 assert(0 == "get_next_objects_to_trim returned an invalid code");
14607 } else if (r == -ENOENT) {
14608 // Done!
14609 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14610
14611 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14612 << " to purged_snaps"
14613 << dendl;
14614 pg->info.purged_snaps.insert(snap_to_trim);
14615 pg->snap_trimq.erase(snap_to_trim);
14616 ldout(pg->cct, 10) << "purged_snaps now "
14617 << pg->info.purged_snaps << ", snap_trimq now "
14618 << pg->snap_trimq << dendl;
14619
14620 ObjectStore::Transaction t;
14621 pg->dirty_big_info = true;
14622 pg->write_if_dirty(t);
14623 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14624 assert(tr == 0);
14625
14626 pg->share_pg_info();
14627 post_event(KickTrim());
14628 return transit< NotTrimming >();
14629 }
14630 assert(!to_trim.empty());
14631
14632 for (auto &&object: to_trim) {
14633    // Trim the next object in the batch
14634 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14635 OpContextUPtr ctx;
14636 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14637 if (error) {
14638 if (error == -ENOLCK) {
14639 ldout(pg->cct, 10) << "could not get write lock on obj "
14640 << object << dendl;
14641 } else {
14642 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14643 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14644 }
14645 if (!in_flight.empty()) {
14646 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14647 return transit< WaitRepops >();
14648 }
14649 if (error == -ENOLCK) {
14650 ldout(pg->cct, 10) << "waiting for it to clear"
14651 << dendl;
14652 return transit< WaitRWLock >();
7c673cae 14653 } else {
224ce89b 14654 return transit< NotTrimming >();
14655 }
14656 }
14657
14658 in_flight.insert(object);
14659 ctx->register_on_success(
14660 [pg, object, &in_flight]() {
14661 assert(in_flight.find(object) != in_flight.end());
14662 in_flight.erase(object);
14663 if (in_flight.empty()) {
14664 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14665 pg->snap_trimmer_machine.process_event(Reset());
14666 } else {
14667 pg->snap_trimmer_machine.process_event(RepopsComplete());
14668 }
14669 }
14670 });
14671
14672 pg->simple_opc_submit(std::move(ctx));
14673 }
14674
14675 return transit< WaitRepops >();
14676}
14677
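// Despite the *_maybe_cache names, the set/rm variants below write straight
// through to the transaction; only the get variants consult obc->attr_cache,
// and only for pools that require rollback (erasure-coded pools).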
14678void PrimaryLogPG::setattr_maybe_cache(
14679 ObjectContextRef obc,
14680 OpContext *op,
14681 PGTransaction *t,
14682 const string &key,
14683 bufferlist &val)
14684{
14685 t->setattr(obc->obs.oi.soid, key, val);
14686}
14687
14688void PrimaryLogPG::setattrs_maybe_cache(
14689 ObjectContextRef obc,
14690 OpContext *op,
14691 PGTransaction *t,
14692 map<string, bufferlist> &attrs)
14693{
14694 t->setattrs(obc->obs.oi.soid, attrs);
14695}
14696
14697void PrimaryLogPG::rmattr_maybe_cache(
14698 ObjectContextRef obc,
14699 OpContext *op,
14700 PGTransaction *t,
14701 const string &key)
14702{
14703 t->rmattr(obc->obs.oi.soid, key);
14704}
14705
14706int PrimaryLogPG::getattr_maybe_cache(
14707 ObjectContextRef obc,
14708 const string &key,
14709 bufferlist *val)
14710{
14711 if (pool.info.require_rollback()) {
14712 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14713 if (i != obc->attr_cache.end()) {
14714 if (val)
14715 *val = i->second;
14716 return 0;
14717 } else {
14718 return -ENODATA;
14719 }
14720 }
14721 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14722}
14723
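// User attrs are stored with a leading '_' prefix on disk; the loop below
// keeps only those entries and strips the prefix before returning them.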
14724int PrimaryLogPG::getattrs_maybe_cache(
14725 ObjectContextRef obc,
b32b8144 14726 map<string, bufferlist> *out)
14727{
14728 int r = 0;
b32b8144 14729 assert(out);
7c673cae 14730 if (pool.info.require_rollback()) {
b32b8144 14731 *out = obc->attr_cache;
14732 } else {
14733 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14734 }
14735 map<string, bufferlist> tmp;
14736 for (map<string, bufferlist>::iterator i = out->begin();
14737 i != out->end();
14738 ++i) {
14739 if (i->first.size() > 1 && i->first[0] == '_')
14740 tmp[i->first.substr(1, i->first.size())].claim(i->second);
7c673cae 14741 }
b32b8144 14742 tmp.swap(*out);
14743 return r;
14744}
14745
14746bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14747 return osd->check_failsafe_full(ss);
14748}
14749
14750void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14751void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14752
14753#ifdef PG_DEBUG_REFS
14754uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14755void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14756#endif
14757
14758void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14759void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }