// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

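// Completion run once a repop has been applied locally: report the applied
// version back to the PG, unless the PG has been reset since the epoch at
// which the op was queued.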
struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  ~CopyCallback() override {}
};

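// "Blessing" a context wraps it so that it runs under the PG lock and is
// silently dropped if the PG has been reset (e.g. by an interval change)
// since the epoch at which the context was created.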
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    if (r < 0)
      opcontext->async_read_result = r;
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();
    pg->complete_read_ctx(async_read_result, this);
  }
}

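// CopyCallback used by the copy-from path: on success, re-run the waiting
// OpContext; on failure, reply with the error (or requeue the op if the copy
// was cancelled and should be retried) and close the context.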
class CopyFromCallback: public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results;
  int retval;
  PrimaryLogPG::OpContext *ctx;
  explicit CopyFromCallback(PrimaryLogPG::OpContext *ctx_)
    : results(NULL),
      retval(0),
      ctx(ctx_) {}
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();
    retval = r;

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    }
    ctx->copy_cb = NULL;
    if (r < 0) {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  int get_result() {
    return retval;
  }
};

// ======================
// PGBackend::Listener

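// Called when a pushed/pulled object has been persisted locally: fix up the
// snap mapping, handle any pending LOST_REVERT, and (on the primary) mark the
// object readable again and requeue waiters once the transaction applies.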
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    assert(obc);
    obc->obs.exists = true;
    obc->ondisk_write_lock();

    bool got = obc->get_recovery_read();
    assert(got);

    assert(recovering.count(obc->obs.oi.soid));
    recovering[obc->obs.oi.soid] = obc;
    obc->obs.oi = recovery_info.oi;  // may have been updated above


    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
    t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

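// Called once the object has been recovered on every shard that needs it:
// drop the recovery state and requeue any ops that were waiting on it.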
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  // recover missing won't have had an obc, but it gets filled in
  // during on_local_recover
  assert(i->second);
  list<OpRequestRef> requeue_list;
  i->second->drop_recovery_read(&requeue_list);
  requeue_ops(requeue_list);

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

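// Kick off recovery of soid at client-op priority if it still needs recovery
// and is neither already recovering nor unfound; a client op is waiting on it.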
void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

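// If the PG log is close to its maximum length while objects are still
// missing, proactively kick recovery of the oldest missing object (across
// the primary and all actingbackfill peers) so that the log can be trimmed.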
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILL |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

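// PGLS filters: "plain" matches objects whose named xattr equals a given
// value; "parent" (used by cephfs) matches objects whose "_parent" backtrace
// contains a given directory inode.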
class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

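  // The filter type is either a built-in ("parent", "plain") or
  // "<class>.<name>", where <name> is a filter registered by the objclass
  // <class> (e.g. a hypothetical "myclass.myfilter").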
849 if (type.compare("parent") == 0) {
850 filter = new PGLSParentFilter(cct);
851 } else if (type.compare("plain") == 0) {
852 filter = new PGLSPlainFilter();
853 } else {
854 std::size_t dot = type.find(".");
855 if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
856 return -EINVAL;
857 }
858
859 const std::string class_name = type.substr(0, dot);
860 const std::string filter_name = type.substr(dot + 1);
861 ClassHandler::ClassData *cls = NULL;
862 int r = osd->class_handler->open_class(class_name, &cls);
863 if (r != 0) {
864 derr << "Error opening class '" << class_name << "': "
865 << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const pg_missing_t &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}

// ==========================================================

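// do_pg_op handles PG-wide operations: PGNLS/PGLS object listing (with
// optional filters), hit set listing and retrieval, and scrub error listing.
// These act on the PG as a whole rather than on a single object.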
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

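// Compute pg_trim_to: trim the PG log down to the target length, but never
// past min_last_complete_ondisk or past the point the log can roll back to.
// The target is larger while the PG is degraded or recovering, so that more
// entries survive for log-based recovery.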
void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILL |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

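// Handle a client's ack of a backoff: clamp the acked range to this PG's
// portion of the keyspace and clear the matching backoff from the session.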
void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

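// Lowest last_backfill across all backfill targets: everything at or below
// this bound already exists on every target.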
hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE();
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();   // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  hobject_t head = m->get_hobj();
  head.snap = CEPH_NOSNAP;

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  SessionRef session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }
    session->put();  // get_priv() takes a ref, and so does the intrusive_ptr

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (op->rmw_flags == 0) {
    int r = osd->osd->init_op_flags(op);
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_replica())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

1939 // discard due to cluster full transition? (we discard any op that
1940 // originates before the cluster or pool is marked full; the client
1941 // will resend after the full flag is removed or if they expect the
1942 // op to succeed despite being full). The exceptions are FULL_FORCE and
1943 // FULL_TRY ops, which there is no reason to discard because they
1944 // bypass all full checks anyway. If this op isn't write-ordered,
1945 // we skip this check entirely.
1946 // FIXME: we exclude mds writes for now.
1947 if (write_ordered && !(m->get_source().is_mds() ||
1948 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1949 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1950 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1951 dout(10) << __func__ << " discarding op sent before full " << m << " "
1952 << *m << dendl;
1953 return;
1954 }
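// Worked example of the discard above: if the pool was marked full at map
// epoch 120 (info.history.last_epoch_marked_full == 120) and the client
// sent the op against map epoch 115, the op predates the full flag and is
// dropped without a reply; the client resends once it observes the flag
// change.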
1955 // mds should have stopped writing before this point.
1956 // We can't allow OSD to become non-startable even if mds
1957 // could be writing as part of file removals.
1958 ostringstream ss;
1959 if (write_ordered && osd->check_failsafe_full(ss)) {
1960 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1961 << ss.str()
1962 << dendl;
1963 return;
1964 }
1965 int64_t poolid = get_pgid().pool();
1966 if (op->may_write()) {
1967
1968 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1969 if (!pi) {
1970 return;
1971 }
1972
1973 // invalid?
1974 if (m->get_snapid() != CEPH_NOSNAP) {
1975 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1976 osd->reply_op_error(op, -EINVAL);
1977 return;
1978 }
1979
1980 // too big?
1981 if (cct->_conf->osd_max_write_size &&
1982 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1983 // journal can't hold commit!
1984 derr << "do_op msg data len " << m->get_data_len()
1985 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1986 << " on " << *m << dendl;
1987 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1988 return;
1989 }
1990 }
1991
1992 dout(10) << "do_op " << *m
1993 << (op->may_write() ? " may_write" : "")
1994 << (op->may_read() ? " may_read" : "")
1995 << (op->may_cache() ? " may_cache" : "")
1996 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
1997 << " flags " << ceph_osd_flag_string(m->get_flags())
1998 << dendl;
1999
2000 // missing object?
2001 if (is_unreadable_object(head)) {
2002 if (!is_primary()) {
2003 osd->reply_op_error(op, -EAGAIN);
2004 return;
2005 }
2006 if (can_backoff &&
2007 (g_conf->osd_backoff_on_degraded ||
2008 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2009 add_backoff(session, head, head);
2010 maybe_kick_recovery(head);
2011 } else {
2012 wait_for_unreadable_object(head, op);
2013 }
2014 return;
2015 }
2016
2017 // degraded object?
2018 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2019 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2020 add_backoff(session, head, head);
2021 } else {
2022 wait_for_degraded_object(head, op);
2023 }
2024 return;
2025 }
2026
2027 if (write_ordered &&
2028 scrubber.write_blocked_by_scrub(head)) {
2029 dout(20) << __func__ << ": waiting for scrub" << dendl;
2030 waiting_for_scrub.push_back(op);
2031 op->mark_delayed("waiting for scrub");
2032 return;
2033 }
2034
2035 // blocked on snap?
2036 map<hobject_t, snapid_t>::iterator blocked_iter =
2037 objects_blocked_on_degraded_snap.find(head);
2038 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2039 hobject_t to_wait_on(head);
2040 to_wait_on.snap = blocked_iter->second;
2041 wait_for_degraded_object(to_wait_on, op);
2042 return;
2043 }
2044 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2045 objects_blocked_on_snap_promotion.find(head);
2046 if (write_ordered &&
2047 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2048 wait_for_blocked_object(
2049 blocked_snap_promote_iter->second->obs.oi.soid,
2050 op);
2051 return;
2052 }
2053 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2054 block_write_on_full_cache(head, op);
2055 return;
2056 }
2057
2058 // missing snapdir?
2059 hobject_t snapdir = head.get_snapdir();
2060
2061 if (is_unreadable_object(snapdir)) {
2062 wait_for_unreadable_object(snapdir, op);
2063 return;
2064 }
2065
2066 // degraded object?
2067 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2068 wait_for_degraded_object(snapdir, op);
2069 return;
2070 }
2071
2072 // dup/resent?
2073 if (op->may_write() || op->may_cache()) {
2074 // warning: we will get back *a* request for this reqid, but not
2075 // necessarily the most recent. this happens with flush and
2076 // promote ops, but we can't possibly have both in our log where
2077 // the original request is still not stable on disk, so for our
2078 // purposes here it doesn't matter which one we get.
2079 eversion_t version;
2080 version_t user_version;
2081 int return_code = 0;
2082 bool got = check_in_progress_op(
2083 m->get_reqid(), &version, &user_version, &return_code);
2084 if (got) {
2085 dout(3) << __func__ << " dup " << m->get_reqid()
2086 << " version " << version << dendl;
2087 if (already_complete(version)) {
2088 osd->reply_op_error(op, return_code, version, user_version);
2089 } else {
2090 dout(10) << " waiting for " << version << " to commit" << dendl;
2091 // always queue ondisk waiters, so that we can requeue if needed
2092 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2093 op->mark_delayed("waiting for ondisk");
2094 }
2095 return;
2096 }
2097 }
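// Example of the dup path above: a client that loses its connection
// resends the in-flight write with the same osd_reqid_t (entity name,
// incarnation, tid). check_in_progress_op() matches that reqid against
// ops already logged or still in flight, so the resend is answered with
// the recorded version/return code instead of being executed twice.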
2098
2099 ObjectContextRef obc;
2100 bool can_create = op->may_write() || op->may_cache();
2101 hobject_t missing_oid;
2102 const hobject_t& oid = m->get_hobj();
2103
2104 // io blocked on obc?
2105 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2106 maybe_await_blocked_snapset(oid, op)) {
2107 return;
2108 }
2109
2110 int r = find_object_context(
2111 oid, &obc, can_create,
2112 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2113 &missing_oid);
2114
2115 if (r == -EAGAIN) {
2116 // If this OSD is not the primary for the PG, the -EAGAIN simply
2117 // propagates back to the client below. Otherwise, we have to wait
2118 // for the object.
2118 if (is_primary()) {
2119 // missing the specific snap we need; requeue and wait.
2120 assert(!op->may_write()); // only happens on a read/cache
2121 wait_for_unreadable_object(missing_oid, op);
2122 return;
2123 }
2124 } else if (r == 0) {
2125 if (is_unreadable_object(obc->obs.oi.soid)) {
2126 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2127 << " is unreadable, waiting" << dendl;
2128 wait_for_unreadable_object(obc->obs.oi.soid, op);
2129 return;
2130 }
2131
2132 // degraded object? (the check above was for head; this could be a clone)
2133 if (write_ordered &&
2134 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2135 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2136 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2137 << " is degraded, waiting" << dendl;
2138 wait_for_degraded_object(obc->obs.oi.soid, op);
2139 return;
2140 }
2141 }
2142
2143 bool in_hit_set = false;
2144 if (hit_set) {
2145 if (obc.get()) {
2146 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2147 in_hit_set = true;
2148 } else {
2149 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2150 in_hit_set = true;
2151 }
2152 if (!op->hitset_inserted) {
2153 hit_set->insert(oid);
2154 op->hitset_inserted = true;
2155 if (hit_set->is_full() ||
2156 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2157 hit_set_persist();
2158 }
2159 }
2160 }
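// Worked example of the persist condition above: with
// pool.info.hit_set_period == 3600s and hit_set_start_stamp == T, the
// first op received at or after T+3600 (or any op that fills the current
// HitSet to its configured target size) triggers hit_set_persist(),
// sealing the in-memory HitSet and starting a new one.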
2161
2162 if (agent_state) {
2163 if (agent_choose_mode(false, op))
2164 return;
2165 }
2166
2167 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2168 if (maybe_handle_manifest(op,
2169 write_ordered,
2170 obc))
2171 return;
2172 }
2173
2174 if (maybe_handle_cache(op,
2175 write_ordered,
2176 obc,
2177 r,
2178 missing_oid,
2179 false,
2180 in_hit_set))
2181 return;
2182
2183 if (r && (r != -ENOENT || !obc)) {
2184 // copy the reqids for copy get on ENOENT
2185 if (r == -ENOENT &&
2186 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2187 fill_in_copy_get_noent(op, oid, m->ops[0]);
2188 return;
2189 }
2190 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2191 if (op->may_write() &&
2192 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2193 record_write_error(op, oid, nullptr, r);
2194 } else {
2195 osd->reply_op_error(op, r);
2196 }
2197 return;
2198 }
2199
2200 // make sure locator is consistent
2201 object_locator_t oloc(obc->obs.oi.soid);
2202 if (m->get_object_locator() != oloc) {
2203 dout(10) << " provided locator " << m->get_object_locator()
2204 << " != object's " << obc->obs.oi.soid << dendl;
2205 osd->clog->warn() << "bad locator " << m->get_object_locator()
2206 << " on object " << oloc
2207 << " op " << *m;
2208 }
2209
2210 // io blocked on obc?
2211 if (obc->is_blocked() &&
2212 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2213 wait_for_blocked_object(obc->obs.oi.soid, op);
2214 return;
2215 }
2216
2217 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2218
2219 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2220 OSDOp& osd_op = *p;
2221
2222 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2223 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2224 m->get_snapid() != CEPH_SNAPDIR) {
2225 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2226 osd->reply_op_error(op, -EINVAL);
2227 return;
2228 }
2229 }
2230
2231 OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, obc, this);
2232
2233 if (!obc->obs.exists)
2234 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2235
2236 /* Due to obc caching, we might have a cached non-existent snapset_obc
2237 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2238 * do_op pipeline make decisions based on whether snapset_obc is
2239 * populated.
2240 */
2241 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2242 ctx->snapset_obc = ObjectContextRef();
2243
2244 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2245 dout(20) << __func__ << ": skipping rw locks" << dendl;
2246 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2247 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2248
2249 // verify there is in fact a flush in progress
2250 // FIXME: we could make this a stronger test.
2251 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2252 if (p == flush_ops.end()) {
2253 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2254 reply_ctx(ctx, -EINVAL);
2255 return;
2256 }
2257 } else if (!get_rw_locks(write_ordered, ctx)) {
2258 dout(20) << __func__ << " waiting for rw locks " << dendl;
2259 op->mark_delayed("waiting for rw locks");
2260 close_op_ctx(ctx);
2261 return;
2262 }
2263 dout(20) << __func__ << " obc " << *obc << dendl;
2264
2265 if (r) {
2266 dout(20) << __func__ << " returned an error: " << r << dendl;
2267 close_op_ctx(ctx);
2268 if (op->may_write() &&
2269 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2270 record_write_error(op, oid, nullptr, r);
2271 } else {
2272 osd->reply_op_error(op, r);
2273 }
2274 return;
2275 }
2276
2277 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2278 ctx->ignore_cache = true;
2279 }
2280
2281 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2282 // This object is lost. Reading from it returns an error.
2283 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2284 << " is lost" << dendl;
2285 reply_ctx(ctx, -ENFILE);
2286 return;
2287 }
2288 if (!op->may_write() &&
2289 !op->may_cache() &&
2290 (!obc->obs.exists ||
2291 ((m->get_snapid() != CEPH_SNAPDIR) &&
2292 obc->obs.oi.is_whiteout()))) {
2293 // copy the reqids for copy get on ENOENT
2294 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2295 fill_in_copy_get_noent(op, oid, m->ops[0]);
2296 close_op_ctx(ctx);
2297 return;
2298 }
2299 reply_ctx(ctx, -ENOENT);
2300 return;
2301 }
2302
2303 op->mark_started();
2304
2305 execute_ctx(ctx);
2306 utime_t prepare_latency = ceph_clock_now();
2307 prepare_latency -= op->get_dequeued_time();
2308 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2309 if (op->may_read() && op->may_write()) {
2310 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2311 } else if (op->may_read()) {
2312 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2313 } else if (op->may_write() || op->may_cache()) {
2314 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2315 }
2316
2317 // force recovery of the oldest missing object if too many logs
2318 maybe_force_recovery();
2319}
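// In summary, do_op() above gates every client op through, in order:
// message decode and PG-containment check, backoff, PARALLELEXEC and
// op-flag initialization, primary/balanced-read routing, cap checks,
// name/locator/namespace length limits, blacklist, full-cluster and
// failsafe-full checks, missing/degraded/scrub/snap blocking, dup (reqid)
// detection, object-context lookup, hit-set tracking, manifest and
// cache-tier handling, rw-lock acquisition, and finally execute_ctx().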
2320PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2321 OpRequestRef op,
2322 bool write_ordered,
2323 ObjectContextRef obc)
2324{
2325 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2326 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2327 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2328 return cache_result_t::NOOP;
2329 }
2330
2331 if (obc)
2332 dout(10) << __func__ << " " << obc->obs.oi << " "
2333 << (obc->obs.exists ? "exists" : "DNE")
2334 << dendl;
2335
2336 // if it is write-ordered and blocked, stop now
2337 if (obc.get() && obc->is_blocked() && write_ordered) {
2338 // we're already doing something with this object
2339 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2340 return cache_result_t::NOOP;
2341 }
2342
2343 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2344 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2345 OSDOp& osd_op = *p;
2346 ceph_osd_op& op = osd_op.op;
2347 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2348 return cache_result_t::NOOP;
2349 }
2350 }
2351
2352 switch (obc->obs.oi.manifest.type) {
2353 case object_manifest_t::TYPE_REDIRECT:
2354 if (op->may_write() || write_ordered) {
2355 do_proxy_write(op, obc->obs.oi.soid, obc);
2356 } else {
2357 do_proxy_read(op, obc);
2358 }
2359 return cache_result_t::HANDLED_PROXY;
2360 case object_manifest_t::TYPE_CHUNKED:
2361 default:
2362 assert(0 == "unrecognized manifest type");
2363 }
2364
2365 return cache_result_t::NOOP;
2366}
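// Concretely, for the TYPE_REDIRECT case above: an object whose
// object_manifest_t names a redirect_target (say, the same object name in
// a different pool) behaves like a symlink. Reads are proxied with
// do_proxy_read() and writes with do_proxy_write() to the target, unless
// the client set CEPH_OSD_FLAG_IGNORE_REDIRECT or the op itself is a
// SET_REDIRECT.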
2367
2368void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2369 MOSDOpReply *orig_reply, int r)
2370{
2371 dout(20) << __func__ << " r=" << r << dendl;
2372 assert(op->may_write());
2373 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2374 ObjectContextRef obc;
2375 mempool::osd_pglog::list<pg_log_entry_t> entries;
2376 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2377 get_next_version(), eversion_t(), 0,
2378 reqid, utime_t(), r));
2379
2380 struct OnComplete {
2381 PrimaryLogPG *pg;
2382 OpRequestRef op;
2383 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2384 int r;
2385 OnComplete(
2386 PrimaryLogPG *pg,
2387 OpRequestRef op,
2388 MOSDOpReply *orig_reply,
2389 int r)
2390 : pg(pg), op(op),
2391 orig_reply(orig_reply, false /* take over ref */), r(r)
2392 {}
2393 void operator()() {
2394 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2395 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2396 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2397 MOSDOpReply *reply = orig_reply.detach();
2398 if (reply == nullptr) {
2399 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2400 flags, true);
2401 }
2402 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2403 pg->osd->send_message_osd_client(reply, m->get_connection());
2404 }
2405 };
2406
2407 ObcLockManager lock_manager;
2408 submit_log_entries(
2409 entries,
2410 std::move(lock_manager),
2411 boost::optional<std::function<void(void)> >(
2412 OnComplete(this, op, orig_reply, r)),
2413 op,
2414 r);
2415}
2416
2417PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2418 OpRequestRef op,
2419 bool write_ordered,
2420 ObjectContextRef obc,
2421 int r, hobject_t missing_oid,
2422 bool must_promote,
2423 bool in_hit_set,
2424 ObjectContextRef *promote_obc)
2425{
2426 if (op &&
2427 op->get_req() &&
2428 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2429 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2430 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2431 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2432 return cache_result_t::NOOP;
2433 }
2434 // return quickly if caching is not enabled
2435 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2436 return cache_result_t::NOOP;
2437
2438 must_promote = must_promote || op->need_promote();
2439
2440 if (obc)
2441 dout(25) << __func__ << " " << obc->obs.oi << " "
2442 << (obc->obs.exists ? "exists" : "DNE")
2443 << " missing_oid " << missing_oid
2444 << " must_promote " << (int)must_promote
2445 << " in_hit_set " << (int)in_hit_set
2446 << dendl;
2447 else
2448 dout(25) << __func__ << " (no obc)"
2449 << " missing_oid " << missing_oid
2450 << " must_promote " << (int)must_promote
2451 << " in_hit_set " << (int)in_hit_set
2452 << dendl;
2453
2454 // if it is write-ordered and blocked, stop now
2455 if (obc.get() && obc->is_blocked() && write_ordered) {
2456 // we're already doing something with this object
2457 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2458 return cache_result_t::NOOP;
2459 }
2460
2461 if (r == -ENOENT && missing_oid == hobject_t()) {
2462 // we know this object is logically absent (e.g., an undefined clone)
2463 return cache_result_t::NOOP;
2464 }
2465
2466 if (obc.get() && obc->obs.exists) {
2467 osd->logger->inc(l_osd_op_cache_hit);
2468 return cache_result_t::NOOP;
2469 }
2470
2471 if (missing_oid == hobject_t() && obc.get()) {
2472 missing_oid = obc->obs.oi.soid;
2473 }
2474
2475 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2476 const object_locator_t oloc = m->get_object_locator();
2477
2478 if (op->need_skip_handle_cache()) {
2479 return cache_result_t::NOOP;
2480 }
2481
2482 // older versions do not proxy the feature bits.
2483 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2484 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2485 OpRequestRef promote_op;
2486
2487 switch (pool.info.cache_mode) {
2488 case pg_pool_t::CACHEMODE_WRITEBACK:
2489 if (agent_state &&
2490 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2491 if (!op->may_write() && !op->may_cache() &&
2492 !write_ordered && !must_promote) {
2493 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2494 do_proxy_read(op);
2495 return cache_result_t::HANDLED_PROXY;
2496 }
2497 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2498 block_write_on_full_cache(missing_oid, op);
2499 return cache_result_t::BLOCKED_FULL;
2500 }
2501
2502 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2503 promote_object(obc, missing_oid, oloc, op, promote_obc);
2504 return cache_result_t::BLOCKED_PROMOTE;
2505 }
2506
2507 if (op->may_write() || op->may_cache()) {
2508 if (can_proxy_write) {
2509 do_proxy_write(op, missing_oid);
2510 } else {
2511 // promote if can't proxy the write
2512 promote_object(obc, missing_oid, oloc, op, promote_obc);
2513 return cache_result_t::BLOCKED_PROMOTE;
2514 }
2515
2516 // Promote too?
2517 if (!op->need_skip_promote() &&
2518 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2519 pool.info.min_write_recency_for_promote,
2520 OpRequestRef(),
2521 promote_obc)) {
2522 return cache_result_t::BLOCKED_PROMOTE;
2523 }
2524 return cache_result_t::HANDLED_PROXY;
2525 } else {
2526 do_proxy_read(op);
2527
2528 // Avoid duplicate promotion
2529 if (obc.get() && obc->is_blocked()) {
2530 if (promote_obc)
2531 *promote_obc = obc;
2532 return cache_result_t::BLOCKED_PROMOTE;
2533 }
2534
2535 // Promote too?
2536 if (!op->need_skip_promote()) {
2537 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2538 pool.info.min_read_recency_for_promote,
2539 promote_op, promote_obc);
2540 }
2541
2542 return cache_result_t::HANDLED_PROXY;
2543 }
2544 assert(0 == "unreachable");
2545 return cache_result_t::NOOP;
2546
2547 case pg_pool_t::CACHEMODE_FORWARD:
2548 // FIXME: this mode allows requests to be reordered.
2549 do_cache_redirect(op);
2550 return cache_result_t::HANDLED_REDIRECT;
2551
2552 case pg_pool_t::CACHEMODE_READONLY:
2553 // TODO: clean this case up
2554 if (!obc.get() && r == -ENOENT) {
2555 // we don't have the object and op's a read
2556 promote_object(obc, missing_oid, oloc, op, promote_obc);
2557 return cache_result_t::BLOCKED_PROMOTE;
2558 }
2559 if (!r) { // it must be a write
2560 do_cache_redirect(op);
2561 return cache_result_t::HANDLED_REDIRECT;
2562 }
2563 // crap, there was a failure of some kind
2564 return cache_result_t::NOOP;
2565
2566 case pg_pool_t::CACHEMODE_READFORWARD:
2567 // Do writeback to the cache tier for writes
2568 if (op->may_write() || write_ordered || must_promote) {
2569 if (agent_state &&
2570 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2571 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2572 block_write_on_full_cache(missing_oid, op);
2573 return cache_result_t::BLOCKED_FULL;
2574 }
2575 promote_object(obc, missing_oid, oloc, op, promote_obc);
2576 return cache_result_t::BLOCKED_PROMOTE;
2577 }
2578
2579 // If it is a read, forward it to the base tier
2580 do_cache_redirect(op);
2581 return cache_result_t::HANDLED_REDIRECT;
2582
2583 case pg_pool_t::CACHEMODE_PROXY:
2584 if (!must_promote) {
2585 if (op->may_write() || op->may_cache() || write_ordered) {
2586 if (can_proxy_write) {
2587 do_proxy_write(op, missing_oid);
2588 return cache_result_t::HANDLED_PROXY;
2589 }
2590 } else {
2591 do_proxy_read(op);
2592 return cache_result_t::HANDLED_PROXY;
2593 }
2594 }
2595 // ugh, we're forced to promote.
2596 if (agent_state &&
2597 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2598 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2599 block_write_on_full_cache(missing_oid, op);
2600 return cache_result_t::BLOCKED_FULL;
2601 }
2602 promote_object(obc, missing_oid, oloc, op, promote_obc);
2603 return cache_result_t::BLOCKED_PROMOTE;
2604
2605 case pg_pool_t::CACHEMODE_READPROXY:
2606 // Do writeback to the cache tier for writes
2607 if (op->may_write() || write_ordered || must_promote) {
2608 if (agent_state &&
2609 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2610 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2611 block_write_on_full_cache(missing_oid, op);
2612 return cache_result_t::BLOCKED_FULL;
2613 }
2614 promote_object(obc, missing_oid, oloc, op, promote_obc);
2615 return cache_result_t::BLOCKED_PROMOTE;
2616 }
2617
2618 // If it is a read, proxy it to the base tier
2619 do_proxy_read(op);
2620 return cache_result_t::HANDLED_PROXY;
2621
2622 default:
2623 assert(0 == "unrecognized cache_mode");
2624 }
2625 return cache_result_t::NOOP;
2626}
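// Dispatch summary for the cache-mode switch above, for an op whose
// object is not already present in the cache tier (when no hit set is
// configured, WRITEBACK promotes immediately rather than proxying):
//
//   cache_mode   | reads                 | writes
//   -------------+-----------------------+------------------------------
//   WRITEBACK    | proxy, maybe promote  | proxy if peers support
//                |                       | proxy-write, else promote
//   FORWARD      | redirect to base pool | redirect to base pool
//   READONLY     | promote               | redirect to base pool
//   READFORWARD  | redirect to base pool | promote
//   PROXY        | proxy                 | proxy (promote only if forced)
//   READPROXY    | proxy                 | promote
//
// A promotion that is required while the tier agent is in
// EVICT_MODE_FULL blocks as BLOCKED_FULL instead.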
2627
2628bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2629 const hobject_t& missing_oid,
2630 const object_locator_t& oloc,
2631 bool in_hit_set,
2632 uint32_t recency,
2633 OpRequestRef promote_op,
2634 ObjectContextRef *promote_obc)
2635{
2636 dout(20) << __func__ << " missing_oid " << missing_oid
2637 << " in_hit_set " << in_hit_set << dendl;
2638
2639 switch (recency) {
2640 case 0:
2641 break;
2642 case 1:
2643 // Check if in the current hit set
2644 if (in_hit_set) {
2645 break;
2646 } else {
2647 // not promoting
2648 return false;
2649 }
2650 break;
2651 default:
2652 {
2653 unsigned count = (int)in_hit_set;
2654 if (count) {
2655 // Check if in other hit sets
2656 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2657 for (map<time_t,HitSetRef>::reverse_iterator itor =
2658 agent_state->hit_set_map.rbegin();
2659 itor != agent_state->hit_set_map.rend();
2660 ++itor) {
2661 if (!itor->second->contains(oid)) {
2662 break;
2663 }
2664 ++count;
2665 if (count >= recency) {
2666 break;
2667 }
2668 }
2669 }
2670 if (count >= recency) {
2671 break;
2672 }
2673 return false; // not promoting
2674 }
2675 break;
2676 }
2677
2678 if (osd->promote_throttle()) {
2679 dout(10) << __func__ << " promote throttled" << dendl;
2680 return false;
2681 }
2682 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2683 return true;
2684}
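// Worked example of the recency logic above: with recency == 3, the
// object must be in the current (in-memory) hit set *and* in the two most
// recent archived hit sets, scanned newest to oldest with no gap, before
// it is promoted. recency 0 always promotes; recency 1 requires only the
// current hit set. Even then the promote may be dropped by
// osd->promote_throttle().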
2685
2686void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2687{
2688 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2689 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2690 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2691 get_osdmap()->get_epoch(), flags, false);
2692 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2693 reply->set_redirect(redir);
2694 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2695 << op << dendl;
2696 m->get_connection()->send_message(reply);
2697 return;
2698}
2699
2700struct C_ProxyRead : public Context {
2701 PrimaryLogPGRef pg;
2702 hobject_t oid;
2703 epoch_t last_peering_reset;
2704 ceph_tid_t tid;
2705 PrimaryLogPG::ProxyReadOpRef prdop;
2706 utime_t start;
2707 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2708 const PrimaryLogPG::ProxyReadOpRef& prd)
2709 : pg(p), oid(o), last_peering_reset(lpr),
2710 tid(0), prdop(prd), start(ceph_clock_now())
2711 {}
2712 void finish(int r) override {
2713 if (prdop->canceled)
2714 return;
2715 pg->lock();
2716 if (prdop->canceled) {
2717 pg->unlock();
2718 return;
2719 }
2720 if (last_peering_reset == pg->get_last_peering_reset()) {
2721 pg->finish_proxy_read(oid, tid, r);
2722 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2723 }
2724 pg->unlock();
2725 }
2726};
2727
31f18b77 2728void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
7c673cae
FG
2729{
2730 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2731 // stash the result in the request's OSDOp vector
2732 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2733 object_locator_t oloc;
2734 hobject_t soid;
2735 /* extensible tier */
2736 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2737 switch (obc->obs.oi.manifest.type) {
2738 case object_manifest_t::TYPE_REDIRECT:
2739 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2740 soid = obc->obs.oi.manifest.redirect_target;
2741 break;
2742 case object_manifest_t::TYPE_CHUNKED:
2743 default:
2744 assert(0 == "unrecognized manifest type");
2745 }
2746 } else {
2747 /* proxy */
2748 soid = m->get_hobj();
2749 oloc = object_locator_t(m->get_object_locator());
2750 oloc.pool = pool.info.tier_of;
2751 }
2752 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2753
2754 // pass through some original flags that make sense.
2755 // - leave out redirection and balancing flags since we are
2756 // already proxying through the primary
2757 // - leave off read/write/exec flags that are derived from the op
2758 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2759 CEPH_OSD_FLAG_ORDERSNAP |
2760 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2761 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2762
2763 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2764
2765 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2766
2767 ObjectOperation obj_op;
2768 obj_op.dup(prdop->ops);
2769
2770 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2771 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2772 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2773 ceph_osd_op& op = obj_op.ops[i].op; // by reference: the fadvise flag edits below must apply to the stashed op
2774 switch (op.op) {
2775 case CEPH_OSD_OP_READ:
2776 case CEPH_OSD_OP_SYNC_READ:
2777 case CEPH_OSD_OP_SPARSE_READ:
2778 case CEPH_OSD_OP_CHECKSUM:
2779 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2780 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2781 }
2782 }
2783 }
2784
2785 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2786 prdop);
2787 ceph_tid_t tid = osd->objecter->read(
2788 soid.oid, oloc, obj_op,
2789 m->get_snapid(), NULL,
2790 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2791 &prdop->user_version,
2792 &prdop->data_offset,
2793 m->get_features());
2794 fin->tid = tid;
2795 prdop->objecter_tid = tid;
2796 proxyread_ops[tid] = prdop;
2797 in_progress_proxy_ops[soid].push_back(op);
2798}
2799
2800void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2801{
2802 dout(10) << __func__ << " " << oid << " tid " << tid
2803 << " " << cpp_strerror(r) << dendl;
2804
2805 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2806 if (p == proxyread_ops.end()) {
2807 dout(10) << __func__ << " no proxyread_op found" << dendl;
2808 return;
2809 }
2810 ProxyReadOpRef prdop = p->second;
2811 if (tid != prdop->objecter_tid) {
2812 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2813 << " tid " << prdop->objecter_tid << dendl;
2814 return;
2815 }
2816 if (oid != prdop->soid) {
2817 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2818 << " soid " << prdop->soid << dendl;
2819 return;
2820 }
2821 proxyread_ops.erase(tid);
2822
2823 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2824 if (q == in_progress_proxy_ops.end()) {
2825 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2826 return;
2827 }
2828 assert(q->second.size());
2829 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2830 q->second.end(),
2831 prdop->op);
2832 assert(it != q->second.end());
2833 OpRequestRef op = *it;
2834 q->second.erase(it);
2835 if (q->second.size() == 0) {
2836 in_progress_proxy_ops.erase(oid);
2837 }
2838
2839 osd->logger->inc(l_osd_tier_proxy_read);
2840
2841 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2842 OpContext *ctx = new OpContext(op, m->get_reqid(), prdop->ops, this);
2843 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2844 ctx->user_at_version = prdop->user_version;
2845 ctx->data_off = prdop->data_offset;
2846 ctx->ignore_log_op_stats = true;
2847 complete_read_ctx(r, ctx);
2848}
2849
2850void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2851{
2852 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2853 if (p == in_progress_proxy_ops.end())
2854 return;
2855
2856 list<OpRequestRef>& ls = p->second;
2857 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2858 requeue_ops(ls);
2859 in_progress_proxy_ops.erase(p);
2860}
2861
2862void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2863{
2864 dout(10) << __func__ << " " << prdop->soid << dendl;
2865 prdop->canceled = true;
2866
2867 // cancel objecter op, if we can
2868 if (prdop->objecter_tid) {
2869 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2870 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2871 prdop->ops[i].outdata.clear();
2872 }
2873 proxyread_ops.erase(prdop->objecter_tid);
2874 prdop->objecter_tid = 0;
2875 }
2876}
2877
2878void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2879{
2880 dout(10) << __func__ << dendl;
2881
2882 // cancel proxy reads
2883 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2884 while (p != proxyread_ops.end()) {
2885 cancel_proxy_read((p++)->second);
2886 }
2887
2888 // cancel proxy writes
2889 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2890 while (q != proxywrite_ops.end()) {
2891 cancel_proxy_write((q++)->second);
2892 }
2893
2894 if (requeue) {
2895 map<hobject_t, list<OpRequestRef>>::iterator p =
2896 in_progress_proxy_ops.begin();
2897 while (p != in_progress_proxy_ops.end()) {
2898 list<OpRequestRef>& ls = p->second;
2899 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2900 << " requests" << dendl;
2901 requeue_ops(ls);
2902 in_progress_proxy_ops.erase(p++);
2903 }
2904 } else {
2905 in_progress_proxy_ops.clear();
2906 }
2907}
2908
2909struct C_ProxyWrite_Commit : public Context {
2910 PrimaryLogPGRef pg;
2911 hobject_t oid;
2912 epoch_t last_peering_reset;
2913 ceph_tid_t tid;
2914 PrimaryLogPG::ProxyWriteOpRef pwop;
2915 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2916 const PrimaryLogPG::ProxyWriteOpRef& pw)
2917 : pg(p), oid(o), last_peering_reset(lpr),
2918 tid(0), pwop(pw)
2919 {}
2920 void finish(int r) override {
2921 if (pwop->canceled)
2922 return;
2923 pg->lock();
2924 if (pwop->canceled) {
2925 pg->unlock();
2926 return;
2927 }
2928 if (last_peering_reset == pg->get_last_peering_reset()) {
2929 pg->finish_proxy_write(oid, tid, r);
2930 }
2931 pg->unlock();
2932 }
2933};
2934
31f18b77 2935void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
7c673cae
FG
2936{
2937 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2938 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2939 object_locator_t oloc;
2940 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2941 hobject_t soid;
2942 /* extensible tier */
2943 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2944 switch (obc->obs.oi.manifest.type) {
2945 case object_manifest_t::TYPE_REDIRECT:
2946 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2947 soid = obc->obs.oi.manifest.redirect_target;
2948 break;
2949 case object_manifest_t::TYPE_CHUNKED:
2950 default:
2951 assert(0 == "unrecognized manifest type");
2952 }
2953 } else {
2954 /* proxy */
2955 soid = m->get_hobj();
2956 oloc = object_locator_t(m->get_object_locator());
2957 oloc.pool = pool.info.tier_of;
2958 }
2959
2960 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2961 if (!(op->may_write() || op->may_cache())) {
2962 flags |= CEPH_OSD_FLAG_RWORDERED;
2963 }
2964 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2965
2966 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2967 pwop->ctx = new OpContext(op, m->get_reqid(), pwop->ops, this);
2968 pwop->mtime = m->get_mtime();
2969
2970 ObjectOperation obj_op;
2971 obj_op.dup(pwop->ops);
2972
2973 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2974 this, soid, get_last_peering_reset(), pwop);
2975 ceph_tid_t tid = osd->objecter->mutate(
2976 soid.oid, oloc, obj_op, snapc,
2977 ceph::real_clock::from_ceph_timespec(pwop->mtime),
2978 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2979 &pwop->user_version, pwop->reqid);
2980 fin->tid = tid;
2981 pwop->objecter_tid = tid;
2982 proxywrite_ops[tid] = pwop;
2983 in_progress_proxy_ops[soid].push_back(op);
2984}
2985
2986void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
2987{
2988 dout(10) << __func__ << " " << oid << " tid " << tid
2989 << " " << cpp_strerror(r) << dendl;
2990
2991 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
2992 if (p == proxywrite_ops.end()) {
2993 dout(10) << __func__ << " no proxywrite_op found" << dendl;
2994 return;
2995 }
2996 ProxyWriteOpRef pwop = p->second;
2997 assert(tid == pwop->objecter_tid);
2998 assert(oid == pwop->soid);
2999
3000 proxywrite_ops.erase(tid);
3001
3002 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3003 if (q == in_progress_proxy_ops.end()) {
3004 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3005 delete pwop->ctx;
3006 pwop->ctx = NULL;
3007 return;
3008 }
3009 list<OpRequestRef>& in_progress_op = q->second;
3010 assert(in_progress_op.size());
3011 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3012 in_progress_op.end(),
3013 pwop->op);
3014 assert(it != in_progress_op.end());
3015 in_progress_op.erase(it);
3016 if (in_progress_op.size() == 0) {
3017 in_progress_proxy_ops.erase(oid);
3018 }
3019
3020 osd->logger->inc(l_osd_tier_proxy_write);
3021
3022 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3023 assert(m != NULL);
3024
3025 if (!pwop->sent_reply) {
3026 // send commit.
3027 MOSDOpReply *reply = pwop->ctx->reply;
3028 if (reply)
3029 pwop->ctx->reply = NULL;
3030 else {
3031 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3032 reply->set_reply_versions(eversion_t(), pwop->user_version);
3033 }
3034 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3035 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3036 osd->send_message_osd_client(reply, m->get_connection());
3037 pwop->sent_reply = true;
3038 pwop->ctx->op->mark_commit_sent();
3039 }
3040
3041 delete pwop->ctx;
3042 pwop->ctx = NULL;
3043}
3044
3045void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3046{
3047 dout(10) << __func__ << " " << pwop->soid << dendl;
3048 pwop->canceled = true;
3049
3050 // cancel objecter op, if we can
3051 if (pwop->objecter_tid) {
3052 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3053 delete pwop->ctx;
3054 pwop->ctx = NULL;
3055 proxywrite_ops.erase(pwop->objecter_tid);
3056 pwop->objecter_tid = 0;
3057 }
3058}
3059
3060class PromoteCallback: public PrimaryLogPG::CopyCallback {
3061 ObjectContextRef obc;
3062 PrimaryLogPG *pg;
3063 utime_t start;
3064public:
3065 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3066 : obc(obc_),
3067 pg(pg_),
3068 start(ceph_clock_now()) {}
3069
3070 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3071 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3072 int r = results.get<0>();
3073 pg->finish_promote(r, results_data, obc);
3074 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3075 }
3076};
3077
3078void PrimaryLogPG::promote_object(ObjectContextRef obc,
3079 const hobject_t& missing_oid,
3080 const object_locator_t& oloc,
3081 OpRequestRef op,
3082 ObjectContextRef *promote_obc)
3083{
3084 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3085 assert(hoid != hobject_t());
3086 if (scrubber.write_blocked_by_scrub(hoid)) {
3087 dout(10) << __func__ << " " << hoid
3088 << " blocked by scrub" << dendl;
3089 if (op) {
3090 waiting_for_scrub.push_back(op);
3091 op->mark_delayed("waiting for scrub");
3092 dout(10) << __func__ << " " << hoid
3093 << " placing op in waiting_for_scrub" << dendl;
3094 } else {
3095 dout(10) << __func__ << " " << hoid
3096 << " no op, dropping on the floor" << dendl;
3097 }
3098 return;
3099 }
3100 if (!obc) { // we need to create an ObjectContext
3101 assert(missing_oid != hobject_t());
3102 obc = get_object_context(missing_oid, true);
3103 }
3104 if (promote_obc)
3105 *promote_obc = obc;
3106
3107 /*
3108 * If there are proxy-reads in flight for the object before the promote
3109 * completes, we don't use DONTNEED.
3110 */
3111 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3112 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3113 if (q == in_progress_proxy_ops.end()) {
3114 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3115 }
3116
3117 PromoteCallback *cb = new PromoteCallback(obc, this);
3118 object_locator_t my_oloc = oloc;
3119 my_oloc.pool = pool.info.tier_of;
3120
3121 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3122 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3123 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3124 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3125 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3126 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3127 src_fadvise_flags, 0);
3128
3129 assert(obc->is_blocked());
3130
3131 if (op)
3132 wait_for_blocked_object(obc->obs.oi.soid, op);
3133 info.stats.stats.sum.num_promote++;
3134}
3135
3136void PrimaryLogPG::execute_ctx(OpContext *ctx)
3137{
3138 FUNCTRACE();
3139 dout(10) << __func__ << " " << ctx << dendl;
3140 ctx->reset_obs(ctx->obc);
3141 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3142 OpRequestRef op = ctx->op;
3143 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3144 ObjectContextRef obc = ctx->obc;
3145 const hobject_t& soid = obc->obs.oi.soid;
3146
3147 // this method must be idempotent since we may call it several times
3148 // before we finally apply the resulting transaction.
3149 ctx->op_t.reset(new PGTransaction);
3150
3151 if (op->may_write() || op->may_cache()) {
3152 // snap
3153 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3154 pool.info.is_pool_snaps_mode()) {
3155 // use pool's snapc
3156 ctx->snapc = pool.snapc;
3157 } else {
3158 // client specified snapc
3159 ctx->snapc.seq = m->get_snap_seq();
3160 ctx->snapc.snaps = m->get_snaps();
3161 filter_snapc(ctx->snapc.snaps);
3162 }
3163 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3164 ctx->snapc.seq < obc->ssc->snapset.seq) {
3165 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3166 << " < snapset seq " << obc->ssc->snapset.seq
3167 << " on " << obc->obs.oi.soid << dendl;
3168 reply_ctx(ctx, -EOLDSNAPC);
3169 return;
3170 }
3171
3172 // version
3173 ctx->at_version = get_next_version();
3174 ctx->mtime = m->get_mtime();
3175
3176 dout(10) << __func__ << " " << soid << " " << ctx->ops
3177 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3178 << " snapc " << ctx->snapc
3179 << " snapset " << obc->ssc->snapset
3180 << dendl;
3181 } else {
3182 dout(10) << __func__ << " " << soid << " " << ctx->ops
3183 << " ov " << obc->obs.oi.version
3184 << dendl;
3185 }
3186
3187 if (!ctx->user_at_version)
3188 ctx->user_at_version = obc->obs.oi.user_version;
3189 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3190
3191 if (op->may_read()) {
3192 dout(10) << " taking ondisk_read_lock" << dendl;
3193 obc->ondisk_read_lock();
3194 }
3195
3196 {
3197#ifdef WITH_LTTNG
3198 osd_reqid_t reqid = ctx->op->get_reqid();
3199#endif
3200 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3201 reqid.name._num, reqid.tid, reqid.inc);
3202 }
3203
3204 int result = prepare_transaction(ctx);
3205
3206 {
3207#ifdef WITH_LTTNG
3208 osd_reqid_t reqid = ctx->op->get_reqid();
3209#endif
3210 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3211 reqid.name._num, reqid.tid, reqid.inc);
3212 }
3213
3214 if (op->may_read()) {
3215 dout(10) << " dropping ondisk_read_lock" << dendl;
3216 obc->ondisk_read_unlock();
3217 }
3218
3219 if (result == -EINPROGRESS) {
3220 // come back later.
3221 return;
3222 }
3223
3224 if (result == -EAGAIN) {
3225 // clean up after the ctx
3226 close_op_ctx(ctx);
3227 return;
3228 }
3229
3230 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3231 // prepare the reply
3232 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3233 successful_write);
3234
3235 // Write operations aren't allowed to return a data payload because
3236 // we can't do so reliably. If the client has to resend the request
3237 // and it has already been applied, we will return 0 with no
3238 // payload. Non-deterministic behavior is no good. However, it is
3239 // possible to construct an operation that does a read, does a guard
3240 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3241 // with the write, or return a CMPXATTR and the read value.
3242 if (successful_write) {
3243 // write. normalize the result code.
3244 dout(20) << " zeroing write result code " << result << dendl;
3245 result = 0;
3246 }
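// For illustration (hypothetical librados usage; assumes a connected
// IoCtx `ioctx` and hypothetical object/xattr names): the guarded
// compound op described above. Only the guard can surface a nonzero
// result; a successful write is always reported as 0 with no payload.
//
//   librados::ObjectWriteOperation wr;
//   bufferlist expected;
//   expected.append("v1");
//   wr.cmpxattr("version", LIBRADOS_CMPXATTR_OP_EQ, expected); // guard
//   bufferlist newdata;
//   newdata.append("hello");
//   wr.write_full(newdata);            // applied only if the guard passes
//   int r = ioctx.operate("myobject", &wr); // 0, or the guard's error code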
3247 ctx->reply->set_result(result);
3248
3249 // read or error?
3250 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3251 // finish side-effects
3252 if (result >= 0)
3253 do_osd_op_effects(ctx, m->get_connection());
3254
3255 if (ctx->pending_async_reads.empty()) {
3256 complete_read_ctx(result, ctx);
3257 } else {
3258 in_progress_async_reads.push_back(make_pair(op, ctx));
3259 ctx->start_async_reads(this);
3260 }
3261
3262 return;
3263 }
3264
3265 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3266
3267 assert(op->may_write() || op->may_cache());
3268
3269 // trim log?
3270 calc_trim_to();
3271
3272 // verify that we are doing this in order?
3273 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3274 !pool.info.is_tier() && !pool.info.has_tiers()) {
3275 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3276 ceph_tid_t t = m->get_tid();
3277 client_t n = m->get_source().num();
3278 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3279 if (p == cm.end()) {
3280 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3281 cm[n] = t;
3282 } else {
3283 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3284 if (p->second > t) {
3285 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3286 assert(0 == "out of order op");
3287 }
3288 p->second = t;
3289 }
3290 }
3291
3292 if (ctx->update_log_only) {
3293 if (result >= 0)
3294 do_osd_op_effects(ctx, m->get_connection());
3295
3296 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3297 // save just what we need from ctx
3298 MOSDOpReply *reply = ctx->reply;
3299 ctx->reply = nullptr;
3300 reply->claim_op_out_data(ctx->ops);
3301 reply->get_header().data_off = ctx->data_off;
3302 close_op_ctx(ctx);
3303
3304 if (result == -ENOENT) {
3305 reply->set_enoent_reply_versions(info.last_update,
3306 info.last_user_version);
3307 }
3308 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3309 // append to pg log for dup detection - don't save buffers for now
3310 record_write_error(op, soid, reply, result);
3311 return;
3312 }
3313
3314 // no need to capture PG ref, repop cancel will handle that
3315 // Can capture the ctx by pointer, it's owned by the repop
3316 ctx->register_on_commit(
3317 [m, ctx, this](){
3318 if (ctx->op)
3319 log_op_stats(
3320 ctx);
3321
3322 if (m && !ctx->sent_reply) {
3323 MOSDOpReply *reply = ctx->reply;
3324 if (reply)
3325 ctx->reply = nullptr;
3326 else {
3327 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3328 reply->set_reply_versions(ctx->at_version,
3329 ctx->user_at_version);
3330 }
3331 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3332 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3333 osd->send_message_osd_client(reply, m->get_connection());
3334 ctx->sent_reply = true;
3335 ctx->op->mark_commit_sent();
3336 }
3337 });
3338 ctx->register_on_success(
3339 [ctx, this]() {
3340 do_osd_op_effects(
3341 ctx,
3342 ctx->op ? ctx->op->get_req()->get_connection() :
3343 ConnectionRef());
3344 });
3345 ctx->register_on_finish(
3346 [ctx, this]() {
3347 delete ctx;
3348 });
3349
3350 // issue replica writes
3351 ceph_tid_t rep_tid = osd->get_tid();
3352
3353 RepGather *repop = new_repop(ctx, obc, rep_tid);
3354
3355 issue_repop(repop, ctx);
3356 eval_repop(repop);
3357 repop->put();
3358}
3359
3360void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3361{
3362 if (ctx->op)
3363 osd->reply_op_error(ctx->op, r);
3364 close_op_ctx(ctx);
3365}
3366
3367void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3368{
3369 if (ctx->op)
3370 osd->reply_op_error(ctx->op, r, v, uv);
3371 close_op_ctx(ctx);
3372}
3373
3374void PrimaryLogPG::log_op_stats(OpContext *ctx)
3375{
3376 OpRequestRef op = ctx->op;
3377 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3378
3379 utime_t now = ceph_clock_now();
3380 utime_t latency = now;
3381 latency -= ctx->op->get_req()->get_recv_stamp();
3382 utime_t process_latency = now;
3383 process_latency -= ctx->op->get_dequeued_time();
3384
3385 uint64_t inb = ctx->bytes_written;
3386 uint64_t outb = ctx->bytes_read;
3387
3388 osd->logger->inc(l_osd_op);
3389
3390 osd->logger->inc(l_osd_op_outb, outb);
3391 osd->logger->inc(l_osd_op_inb, inb);
3392 osd->logger->tinc(l_osd_op_lat, latency);
3393 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3394
3395 if (op->may_read() && op->may_write()) {
3396 osd->logger->inc(l_osd_op_rw);
3397 osd->logger->inc(l_osd_op_rw_inb, inb);
3398 osd->logger->inc(l_osd_op_rw_outb, outb);
3399 osd->logger->tinc(l_osd_op_rw_lat, latency);
3400 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3401 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3402 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3403 } else if (op->may_read()) {
3404 osd->logger->inc(l_osd_op_r);
3405 osd->logger->inc(l_osd_op_r_outb, outb);
3406 osd->logger->tinc(l_osd_op_r_lat, latency);
3407 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3408 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3409 } else if (op->may_write() || op->may_cache()) {
3410 osd->logger->inc(l_osd_op_w);
3411 osd->logger->inc(l_osd_op_w_inb, inb);
3412 osd->logger->tinc(l_osd_op_w_lat, latency);
3413 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3414 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3415 } else
3416 ceph_abort();
3417
3418 dout(15) << "log_op_stats " << *m
3419 << " inb " << inb
3420 << " outb " << outb
3421 << " lat " << latency << dendl;
3422}
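// Example of the two latencies above: for an op received at t=0.0s,
// dequeued at t=0.3s, and completing now at t=0.5s, l_osd_op_lat records
// 0.5s (queue wait included) while l_osd_op_process_lat records 0.2s
// (time after dequeue only).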
3423
3424void PrimaryLogPG::do_sub_op(OpRequestRef op)
3425{
3426 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3427 assert(have_same_or_newer_map(m->map_epoch));
3428 assert(m->get_type() == MSG_OSD_SUBOP);
3429 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3430
3431 if (!is_peered()) {
3432 waiting_for_peered.push_back(op);
3433 op->mark_delayed("waiting for active");
3434 return;
3435 }
3436
3437 const OSDOp *first = NULL;
3438 if (m->ops.size() >= 1) {
3439 first = &m->ops[0];
3440 }
3441
3442 if (first) {
3443 switch (first->op.op) {
3444 case CEPH_OSD_OP_DELETE:
3445 sub_op_remove(op);
3446 return;
3447 case CEPH_OSD_OP_SCRUB_RESERVE:
3448 handle_scrub_reserve_request(op);
3449 return;
3450 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3451 handle_scrub_reserve_release(op);
3452 return;
3453 case CEPH_OSD_OP_SCRUB_MAP:
3454 sub_op_scrub_map(op);
3455 return;
3456 }
3457 }
3458}
3459
3460void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3461{
3462 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3463 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3464 if (r->ops.size() >= 1) {
3465 const OSDOp& first = r->ops[0];
3466 switch (first.op.op) {
3467 case CEPH_OSD_OP_SCRUB_RESERVE:
3468 {
3469 pg_shard_t from = r->from;
3470 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3471 bool reserved;
3472 ::decode(reserved, p);
3473 if (reserved) {
3474 handle_scrub_reserve_grant(op, from);
3475 } else {
3476 handle_scrub_reserve_reject(op, from);
3477 }
3478 }
3479 return;
3480 }
3481 }
3482}
3483
3484void PrimaryLogPG::do_scan(
3485 OpRequestRef op,
3486 ThreadPool::TPHandle &handle)
3487{
3488 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3489 assert(m->get_type() == MSG_OSD_PG_SCAN);
3490 dout(10) << "do_scan " << *m << dendl;
3491
3492 op->mark_started();
3493
3494 switch (m->op) {
3495 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3496 {
3497 ostringstream ss;
3498 if (osd->check_backfill_full(ss)) {
3499 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3500 queue_peering_event(
3501 CephPeeringEvtRef(
3502 std::make_shared<CephPeeringEvt>(
3503 get_osdmap()->get_epoch(),
3504 get_osdmap()->get_epoch(),
3505 BackfillTooFull())));
3506 return;
3507 }
3508
3509 BackfillInterval bi;
3510 bi.begin = m->begin;
3511 // No need to flush; there won't be any in-progress writes occurring
3512 // past m->begin
3513 scan_range(
3514 cct->_conf->osd_backfill_scan_min,
3515 cct->_conf->osd_backfill_scan_max,
3516 &bi,
3517 handle);
3518 MOSDPGScan *reply = new MOSDPGScan(
3519 MOSDPGScan::OP_SCAN_DIGEST,
3520 pg_whoami,
3521 get_osdmap()->get_epoch(), m->query_epoch,
3522 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3523 ::encode(bi.objects, reply->get_data());
3524 osd->send_message_osd_cluster(reply, m->get_connection());
3525 }
3526 break;
3527
3528 case MOSDPGScan::OP_SCAN_DIGEST:
3529 {
3530 pg_shard_t from = m->from;
3531
3532 // Check that from is in backfill_targets vector
3533 assert(is_backfill_targets(from));
3534
3535 BackfillInterval& bi = peer_backfill_info[from];
3536 bi.begin = m->begin;
3537 bi.end = m->end;
3538 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3539
3540 // take care to preserve ordering!
3541 bi.clear_objects();
3542 ::decode_noclear(bi.objects, p);
3543
3544 if (waiting_on_backfill.erase(from)) {
3545 if (waiting_on_backfill.empty()) {
3546 assert(peer_backfill_info.size() == backfill_targets.size());
3547 finish_recovery_op(hobject_t::get_max());
3548 }
3549 } else {
3550 // we canceled backfill for a while because a target was too full, and
3551 // this is an extra response from a non-too-full peer
3552 }
3553 }
3554 break;
3555 }
3556}
3557
3558void PrimaryLogPG::do_backfill(OpRequestRef op)
3559{
3560 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3561 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3562 dout(10) << "do_backfill " << *m << dendl;
3563
3564 op->mark_started();
3565
3566 switch (m->op) {
3567 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3568 {
3569 assert(cct->_conf->osd_kill_backfill_at != 1);
3570
3571 MOSDPGBackfill *reply = new MOSDPGBackfill(
3572 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3573 get_osdmap()->get_epoch(),
3574 m->query_epoch,
3575 spg_t(info.pgid.pgid, get_primary().shard));
3576 reply->set_priority(get_recovery_op_priority());
3577 osd->send_message_osd_cluster(reply, m->get_connection());
3578 queue_peering_event(
3579 CephPeeringEvtRef(
3580 std::make_shared<CephPeeringEvt>(
3581 get_osdmap()->get_epoch(),
3582 get_osdmap()->get_epoch(),
3583 RecoveryDone())));
3584 }
3585 // fall-thru
3586
3587 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3588 {
3589 assert(cct->_conf->osd_kill_backfill_at != 2);
3590
3591 info.set_last_backfill(m->last_backfill);
3592 info.stats = m->stats;
3593
3594 ObjectStore::Transaction t;
3595 dirty_info = true;
3596 write_if_dirty(t);
3597 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3598 assert(tr == 0);
3599 }
3600 break;
3601
3602 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3603 {
3604 assert(is_primary());
3605 assert(cct->_conf->osd_kill_backfill_at != 3);
3606 finish_recovery_op(hobject_t::get_max());
3607 }
3608 break;
3609 }
3610}
3611
3612void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3613{
3614 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3615 op->get_req());
3616 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3617 dout(7) << __func__ << " " << m->ls << dendl;
3618
3619 op->mark_started();
3620
3621 ObjectStore::Transaction t;
3622 for (auto& p : m->ls) {
3623 remove_snap_mapped_object(t, p.first);
3624 }
3625 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3626 assert(r == 0);
3627}
3628
3629int PrimaryLogPG::trim_object(
3630 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
7c673cae 3631{
224ce89b 3632 *ctxp = NULL;
7c673cae
FG
3633 // load clone info
3634 bufferlist bl;
3635 ObjectContextRef obc = get_object_context(coid, false, NULL);
3636 if (!obc || !obc->ssc || !obc->ssc->exists) {
3637 osd->clog->error() << __func__ << ": Can not trim " << coid
3638 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3639 return -ENOENT;
3640 }
3641
3642 hobject_t snapoid(
3643 coid.oid, coid.get_key(),
3644 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3645 info.pgid.pool(), coid.get_namespace());
3646 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3647 if (!snapset_obc) {
3648 osd->clog->error() << __func__ << ": Can not trim " << coid
3649 << " repair needed, no snapset obc for " << snapoid;
3650 return -ENOENT;
3651 }
3652
3653 SnapSet& snapset = obc->ssc->snapset;
3654
3655 bool legacy = snapset.is_legacy() ||
3656 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3657
3658 object_info_t &coi = obc->obs.oi;
3659 set<snapid_t> old_snaps;
3660 if (legacy) {
3661 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3662 } else {
3663 auto p = snapset.clone_snaps.find(coid.snap);
3664 if (p == snapset.clone_snaps.end()) {
3665 osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset
3666 << " for " << coid << "\n";
3667 return -ENOENT;
3668 }
3669 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3670 snapset.clone_snaps[coid.snap].end());
3671 }
3672 if (old_snaps.empty()) {
3673 osd->clog->error() << __func__ << " No object info snaps for " << coid;
3674 return -ENOENT;
3675 }
3676
3677 dout(10) << coid << " old_snaps " << old_snaps
3678 << " old snapset " << snapset << dendl;
3679 if (snapset.seq == 0) {
3680 osd->clog->error() << __func__ << " No snapset.seq for " << coid;
3681 return -ENOENT;
3682 }
3683
3684 set<snapid_t> new_snaps;
3685 for (set<snapid_t>::iterator i = old_snaps.begin();
3686 i != old_snaps.end();
3687 ++i) {
3688 if (!pool.info.is_removed_snap(*i))
3689 new_snaps.insert(*i);
3690 }
3691
3692 vector<snapid_t>::iterator p = snapset.clones.end();
3693
3694 if (new_snaps.empty()) {
3695 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3696 if (p == snapset.clones.end()) {
3697 osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones";
3698 return -ENOENT;
3699 }
3700 }
3701
3702 OpContextUPtr ctx = simple_opc_create(obc);
3703 ctx->snapset_obc = snapset_obc;
3704
3705 if (!ctx->lock_manager.get_snaptrimmer_write(
3706 coid,
3707 obc,
3708 first)) {
3709 close_op_ctx(ctx.release());
3710 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3711 return -ENOLCK;
3712 }
3713
3714 if (!ctx->lock_manager.get_snaptrimmer_write(
3715 snapoid,
3716 snapset_obc,
3717 first)) {
3718 close_op_ctx(ctx.release());
3719 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3720 return -ENOLCK;
3721 }
3722
3723 ctx->at_version = get_next_version();
3724
3725 PGTransaction *t = ctx->op_t.get();
3726
3727 if (new_snaps.empty()) {
3728 // remove clone
3729 dout(10) << coid << " snaps " << old_snaps << " -> "
3730 << new_snaps << " ... deleting" << dendl;
3731
3732 // ...from snapset
3733 assert(p != snapset.clones.end());
3734
3735 snapid_t last = coid.snap;
3736 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3737
3738 if (p != snapset.clones.begin()) {
3739 // not the oldest... merge overlap into next older clone
3740 vector<snapid_t>::iterator n = p - 1;
3741 hobject_t prev_coid = coid;
3742 prev_coid.snap = *n;
3743 bool adjust_prev_bytes = is_present_clone(prev_coid);
3744
3745 if (adjust_prev_bytes)
3746 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3747
3748 snapset.clone_overlap[*n].intersection_of(
3749 snapset.clone_overlap[*p]);
3750
3751 if (adjust_prev_bytes)
3752 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3753 }
3754 ctx->delta_stats.num_objects--;
3755 if (coi.is_dirty())
3756 ctx->delta_stats.num_objects_dirty--;
3757 if (coi.is_omap())
3758 ctx->delta_stats.num_objects_omap--;
3759 if (coi.is_whiteout()) {
3760 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3761 ctx->delta_stats.num_whiteouts--;
3762 }
3763 ctx->delta_stats.num_object_clones--;
3764 if (coi.is_cache_pinned())
3765 ctx->delta_stats.num_objects_pinned--;
3766 obc->obs.exists = false;
3767
3768 snapset.clones.erase(p);
3769 snapset.clone_overlap.erase(last);
3770 snapset.clone_size.erase(last);
3771 snapset.clone_snaps.erase(last);
3772
3773 ctx->log.push_back(
3774 pg_log_entry_t(
3775 pg_log_entry_t::DELETE,
3776 coid,
3777 ctx->at_version,
3778 ctx->obs->oi.version,
3779 0,
3780 osd_reqid_t(),
3781 ctx->mtime,
3782 0)
3783 );
3784 t->remove(coid);
3785 t->update_snaps(
3786 coid,
3787 old_snaps,
3788 new_snaps);
3789
3790 coi = object_info_t(coid);
3791
3792 ctx->at_version.version++;
3793 } else {
3794 // save adjusted snaps for this object
3795 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3796 if (legacy) {
3797 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3798 } else {
3799 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3800 new_snaps.rend());
3801 // we still do a 'modify' event on this object just to trigger a
3802 // snapmapper.update ... :(
3803 }
3804
3805 coi.prior_version = coi.version;
3806 coi.version = ctx->at_version;
3807 bl.clear();
3808 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3809 t->setattr(coid, OI_ATTR, bl);
3810
3811 ctx->log.push_back(
3812 pg_log_entry_t(
3813 pg_log_entry_t::MODIFY,
3814 coid,
3815 coi.version,
3816 coi.prior_version,
3817 0,
3818 osd_reqid_t(),
3819 ctx->mtime,
3820 0)
3821 );
3822 ctx->at_version.version++;
3823
3824 t->update_snaps(
3825 coid,
3826 old_snaps,
3827 new_snaps);
3828 }
3829
3830 // save head snapset
3831 dout(10) << coid << " new snapset " << snapset << " on "
3832 << snapset_obc->obs.oi << dendl;
3833 if (snapset.clones.empty() &&
3834 (!snapset.head_exists ||
3835 (snapset_obc->obs.oi.is_whiteout() &&
3836 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3837 !snapset_obc->obs.oi.is_cache_pinned()))) {
3838 // NOTE: this arguably constitutes minor interference with the
3839 // tiering agent if this is a cache tier since a snap trim event
3840 // is effectively evicting a whiteout we might otherwise want to
3841 // keep around.
3842 dout(10) << coid << " removing " << snapoid << dendl;
3843 ctx->log.push_back(
3844 pg_log_entry_t(
3845 pg_log_entry_t::DELETE,
3846 snapoid,
3847 ctx->at_version,
3848 ctx->snapset_obc->obs.oi.version,
3849 0,
3850 osd_reqid_t(),
3851 ctx->mtime,
3852 0)
3853 );
3854 if (snapoid.is_head()) {
3855 derr << "removing snap head" << dendl;
3856 object_info_t& oi = ctx->snapset_obc->obs.oi;
3857 ctx->delta_stats.num_objects--;
3858 if (oi.is_dirty()) {
3859 ctx->delta_stats.num_objects_dirty--;
3860 }
3861 if (oi.is_omap())
3862 ctx->delta_stats.num_objects_omap--;
3863 if (oi.is_whiteout()) {
3864 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3865 ctx->delta_stats.num_whiteouts--;
3866 }
3867 if (oi.is_cache_pinned()) {
3868 ctx->delta_stats.num_objects_pinned--;
3869 }
3870 }
3871 ctx->snapset_obc->obs.exists = false;
3872 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3873 t->remove(snapoid);
3874 } else {
3875 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3876 snapset.filter(pool.info);
3877 dout(10) << coid << " writing updated snapset on " << snapoid
3878 << ", snapset is " << snapset << dendl;
3879 ctx->log.push_back(
3880 pg_log_entry_t(
3881 pg_log_entry_t::MODIFY,
3882 snapoid,
3883 ctx->at_version,
3884 ctx->snapset_obc->obs.oi.version,
3885 0,
3886 osd_reqid_t(),
3887 ctx->mtime,
3888 0)
3889 );
3890
3891 ctx->snapset_obc->obs.oi.prior_version =
3892 ctx->snapset_obc->obs.oi.version;
3893 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3894
3895 map <string, bufferlist> attrs;
3896 bl.clear();
3897 ::encode(snapset, bl);
3898 attrs[SS_ATTR].claim(bl);
3899
3900 bl.clear();
3901 ::encode(ctx->snapset_obc->obs.oi, bl,
3902 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3903 attrs[OI_ATTR].claim(bl);
3904 t->setattrs(snapoid, attrs);
3905 }
3906
3907 *ctxp = std::move(ctx);
3908 return 0;
3909}
3910
3911void PrimaryLogPG::kick_snap_trim()
3912{
3913 assert(is_active());
3914 assert(is_primary());
3915 if (is_clean() && !snap_trimq.empty()) {
3916 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3917 snap_trimmer_machine.process_event(KickTrim());
3918 }
3919}
3920
3921void PrimaryLogPG::snap_trimmer_scrub_complete()
3922{
3923 if (is_primary() && is_active() && is_clean()) {
3924 assert(!snap_trimq.empty());
3925 snap_trimmer_machine.process_event(ScrubComplete());
3926 }
3927}
3928
3929void PrimaryLogPG::snap_trimmer(epoch_t queued)
3930{
3931 if (deleting || pg_has_reset_since(queued)) {
3932 return;
3933 }
3934
3935 assert(is_primary());
3936
3937 dout(10) << "snap_trimmer posting" << dendl;
3938 snap_trimmer_machine.process_event(DoSnapWork());
3939 dout(10) << "snap_trimmer complete" << dendl;
3940 return;
3941}
3942
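// CMPXATTR helpers: the stored xattr value is parsed as a decimal
// string (empty parses as 0) and compared against the client operand.
// Both return 1 on match, 0 on mismatch, and -EINVAL for an unknown
// comparison op.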
3943int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3944{
3945 __u64 v2;
3946
3947 string v2s(xattr.c_str(), xattr.length());
3948 if (v2s.length())
3949 v2 = strtoull(v2s.c_str(), NULL, 10);
3950 else
3951 v2 = 0;
3952
3953 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3954
3955 switch (op) {
3956 case CEPH_OSD_CMPXATTR_OP_EQ:
3957 return (v1 == v2);
3958 case CEPH_OSD_CMPXATTR_OP_NE:
3959 return (v1 != v2);
3960 case CEPH_OSD_CMPXATTR_OP_GT:
3961 return (v1 > v2);
3962 case CEPH_OSD_CMPXATTR_OP_GTE:
3963 return (v1 >= v2);
3964 case CEPH_OSD_CMPXATTR_OP_LT:
3965 return (v1 < v2);
3966 case CEPH_OSD_CMPXATTR_OP_LTE:
3967 return (v1 <= v2);
3968 default:
3969 return -EINVAL;
3970 }
3971}
3972
3973int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
3974{
3975 string v2s(xattr.c_str(), xattr.length());
3976
3977 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
3978
3979 switch (op) {
3980 case CEPH_OSD_CMPXATTR_OP_EQ:
3981 return (v1s.compare(v2s) == 0);
3982 case CEPH_OSD_CMPXATTR_OP_NE:
3983 return (v1s.compare(v2s) != 0);
3984 case CEPH_OSD_CMPXATTR_OP_GT:
3985 return (v1s.compare(v2s) > 0);
3986 case CEPH_OSD_CMPXATTR_OP_GTE:
3987 return (v1s.compare(v2s) >= 0);
3988 case CEPH_OSD_CMPXATTR_OP_LT:
3989 return (v1s.compare(v2s) < 0);
3990 case CEPH_OSD_CMPXATTR_OP_LTE:
3991 return (v1s.compare(v2s) <= 0);
3992 default:
3993 return -EINVAL;
3994 }
3995}
3996
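// CMPEXT: synchronously read the target extent and byte-compare it
// against the client-supplied data. The first mismatching offset p is
// encoded in the return value as (-MAX_ERRNO - p).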
3997int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
3998{
3999 ceph_osd_op& op = osd_op.op;
4000 vector<OSDOp> read_ops(1);
4001 OSDOp& read_op = read_ops[0];
4002 int result = 0;
4003
4004 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4005 read_op.op.extent.offset = op.extent.offset;
4006 read_op.op.extent.length = op.extent.length;
4007 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4008 read_op.op.extent.truncate_size = op.extent.truncate_size;
4009
4010 result = do_osd_ops(ctx, read_ops);
4011 if (result < 0) {
4012 derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
4013 return result;
4014 }
4015
4016 if (read_op.outdata.length() != osd_op.indata.length())
4017 return -EINVAL;
4018
4019 for (uint64_t p = 0; p < osd_op.indata.length(); p++) {
4020 if (read_op.outdata[p] != osd_op.indata[p]) {
4021 return (-MAX_ERRNO - p);
4022 }
4023 }
4024
4025 return result;
4026}
4027
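// WRITESAME: replicate the client's pattern (data_length bytes) across
// the whole extent and submit it as an ordinary CEPH_OSD_OP_WRITE.
// A zero-length request is a no-op; otherwise the extent length must
// be a multiple of the pattern length.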
4028int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4029{
4030 ceph_osd_op& op = osd_op.op;
4031 vector<OSDOp> write_ops(1);
4032 OSDOp& write_op = write_ops[0];
4033 uint64_t write_length = op.writesame.length;
4034 int result = 0;
4035
4036 if (!write_length)
4037 return 0;
4038
4039 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4040 return -EINVAL;
4041
4042 if (op.writesame.data_length != osd_op.indata.length()) {
4043 derr << "invalid writesame data length " << op.writesame.data_length << " != actual indata length " << osd_op.indata.length() << dendl;
4044 return -EINVAL;
4045 }
4046
4047 while (write_length) {
4048 write_op.indata.append(osd_op.indata);
4049 write_length -= op.writesame.data_length;
4050 }
4051
4052 write_op.op.op = CEPH_OSD_OP_WRITE;
4053 write_op.op.extent.offset = op.writesame.offset;
4054 write_op.op.extent.length = op.writesame.length;
4055 result = do_osd_ops(ctx, write_ops);
4056 if (result < 0)
4057 derr << "do_writesame do_osd_ops failed " << result << dendl;
4058
4059 return result;
4060}
4061
4062// ========================================================================
4063// low level osd ops
4064
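// A TMAP object stores an encoded (header, sorted key/value map) blob
// in its object data. do_tmap2omap reads that blob and re-expresses it
// as real omap state: truncate the data, then set the omap header and
// values.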
4065int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4066{
4067 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4068 bufferlist header, vals;
4069 int r = _get_tmap(ctx, &header, &vals);
4070 if (r < 0) {
4071 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4072 r = 0;
4073 return r;
4074 }
4075
4076 vector<OSDOp> ops(3);
4077
4078 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4079 ops[0].op.extent.offset = 0;
4080 ops[0].op.extent.length = 0;
4081
4082 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4083 ops[1].indata.claim(header);
4084
4085 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4086 ops[2].indata.claim(vals);
4087
4088 return do_osd_ops(ctx, ops);
4089}
4090
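// Slow TMAPUP path: decode the whole tmap into a std::map, apply the
// updates, re-encode, and rewrite the object with WRITEFULL. Used when
// the update stream turns out not to be sorted (see do_tmapup below).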
4091int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4092 bufferlist& bl)
4093{
4094 // decode
4095 bufferlist header;
4096 map<string, bufferlist> m;
4097 if (bl.length()) {
4098 bufferlist::iterator p = bl.begin();
4099 ::decode(header, p);
4100 ::decode(m, p);
4101 assert(p.end());
4102 }
4103
4104 // do the update(s)
4105 while (!bp.end()) {
4106 __u8 op;
4107 string key;
4108 ::decode(op, bp);
4109
4110 switch (op) {
4111 case CEPH_OSD_TMAP_SET: // insert key
4112 {
4113 ::decode(key, bp);
4114 bufferlist data;
4115 ::decode(data, bp);
4116 m[key] = data;
4117 }
4118 break;
4119 case CEPH_OSD_TMAP_RM: // remove key
4120 ::decode(key, bp);
4121 if (!m.count(key)) {
4122 return -ENOENT;
4123 }
4124 m.erase(key);
4125 break;
4126 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4127 ::decode(key, bp);
4128 m.erase(key);
4129 break;
4130 case CEPH_OSD_TMAP_HDR: // update header
4131 {
4132 ::decode(header, bp);
4133 }
4134 break;
4135 default:
4136 return -EINVAL;
4137 }
4138 }
4139
4140 // reencode
4141 bufferlist obl;
4142 ::encode(header, obl);
4143 ::encode(m, obl);
4144
4145 // write it out
4146 vector<OSDOp> nops(1);
4147 OSDOp& newop = nops[0];
4148 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4149 newop.op.extent.offset = 0;
4150 newop.op.extent.length = obl.length();
4151 newop.indata = obl;
4152 do_osd_ops(ctx, nops);
4153 osd_op.outdata.claim(newop.outdata);
4154 return 0;
4155}
4156
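// Fast TMAPUP path: merge the sorted update stream against the sorted
// on-disk keys in a single pass, copying untouched entries and applying
// SET/CREATE/RM as they are encountered. If an update key arrives out
// of order, rewind and fall back to do_tmapup_slow().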
4157int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4158{
4159 bufferlist::iterator orig_bp = bp;
4160 int result = 0;
4161 if (bp.end()) {
4162 dout(10) << "tmapup is a no-op" << dendl;
4163 } else {
4164 // read the whole object
4165 vector<OSDOp> nops(1);
4166 OSDOp& newop = nops[0];
4167 newop.op.op = CEPH_OSD_OP_READ;
4168 newop.op.extent.offset = 0;
4169 newop.op.extent.length = 0;
4170 result = do_osd_ops(ctx, nops);
4171
4172 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4173
4174 dout(30) << " starting is \n";
4175 newop.outdata.hexdump(*_dout);
4176 *_dout << dendl;
4177
4178 bufferlist::iterator ip = newop.outdata.begin();
4179 bufferlist obl;
4180
4181 dout(30) << "the update command is: \n";
4182 osd_op.indata.hexdump(*_dout);
4183 *_dout << dendl;
4184
4185 // header
4186 bufferlist header;
4187 __u32 nkeys = 0;
4188 if (newop.outdata.length()) {
4189 ::decode(header, ip);
4190 ::decode(nkeys, ip);
4191 }
4192 dout(10) << "tmapup header " << header.length() << dendl;
4193
4194 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4195 ++bp;
4196 ::decode(header, bp);
4197 dout(10) << "tmapup new header " << header.length() << dendl;
4198 }
4199
4200 ::encode(header, obl);
4201
4202 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4203
4204 // update keys
4205 bufferlist newkeydata;
4206 string nextkey, last_in_key;
4207 bufferlist nextval;
4208 bool have_next = false;
4209 if (!ip.end()) {
4210 have_next = true;
4211 ::decode(nextkey, ip);
4212 ::decode(nextval, ip);
4213 }
4214 while (!bp.end() && !result) {
4215 __u8 op;
4216 string key;
4217 try {
4218 ::decode(op, bp);
4219 ::decode(key, bp);
4220 }
4221 catch (buffer::error& e) {
4222 return -EINVAL;
4223 }
4224 if (key < last_in_key) {
4225 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4226 << "', falling back to an inefficient (unsorted) update" << dendl;
4227 bp = orig_bp;
4228 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4229 }
4230 last_in_key = key;
4231
4232 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4233
4234 // skip existing intervening keys
4235 bool key_exists = false;
4236 while (have_next && !key_exists) {
4237 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4238 if (nextkey > key)
4239 break;
4240 if (nextkey < key) {
4241 // copy untouched.
4242 ::encode(nextkey, newkeydata);
4243 ::encode(nextval, newkeydata);
4244 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4245 } else {
4246 // don't copy; discard the old value and stop.
4247 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4248 key_exists = true;
4249 nkeys--;
4250 }
4251 if (!ip.end()) {
4252 ::decode(nextkey, ip);
4253 ::decode(nextval, ip);
4254 } else {
4255 have_next = false;
4256 }
4257 }
4258
4259 if (op == CEPH_OSD_TMAP_SET) {
4260 bufferlist val;
4261 try {
4262 ::decode(val, bp);
4263 }
4264 catch (buffer::error& e) {
4265 return -EINVAL;
4266 }
4267 ::encode(key, newkeydata);
4268 ::encode(val, newkeydata);
4269 dout(20) << " set " << key << " " << val.length() << dendl;
4270 nkeys++;
4271 } else if (op == CEPH_OSD_TMAP_CREATE) {
4272 if (key_exists) {
4273 return -EEXIST;
4274 }
4275 bufferlist val;
4276 try {
4277 ::decode(val, bp);
4278 }
4279 catch (buffer::error& e) {
4280 return -EINVAL;
4281 }
4282 ::encode(key, newkeydata);
4283 ::encode(val, newkeydata);
4284 dout(20) << " create " << key << " " << val.length() << dendl;
4285 nkeys++;
4286 } else if (op == CEPH_OSD_TMAP_RM) {
4287 // do nothing.
4288 if (!key_exists) {
4289 return -ENOENT;
4290 }
4291 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4292 // do nothing
4293 } else {
4294 dout(10) << " invalid tmap op " << (int)op << dendl;
4295 return -EINVAL;
4296 }
4297 }
4298
4299 // copy remaining
4300 if (have_next) {
4301 ::encode(nextkey, newkeydata);
4302 ::encode(nextval, newkeydata);
4303 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4304 }
4305 if (!ip.end()) {
4306 bufferlist rest;
4307 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4308 dout(20) << " keep trailing " << rest.length()
4309 << " at " << newkeydata.length() << dendl;
4310 newkeydata.claim_append(rest);
4311 }
4312
4313 // encode final key count + key data
4314 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4315 ::encode(nkeys, obl);
4316 obl.claim_append(newkeydata);
4317
4318 if (0) {
4319 dout(30) << " final is \n";
4320 obl.hexdump(*_dout);
4321 *_dout << dendl;
4322
4323 // sanity check
4324 bufferlist::iterator tp = obl.begin();
4325 bufferlist h;
4326 ::decode(h, tp);
4327 map<string,bufferlist> d;
4328 ::decode(d, tp);
4329 assert(tp.end());
4330 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4331 }
4332
4333 // write it out
4334 if (!result) {
4335 dout(20) << "tmapput write " << obl.length() << dendl;
4336 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4337 newop.op.extent.offset = 0;
4338 newop.op.extent.length = obl.length();
4339 newop.indata = obl;
4340 do_osd_ops(ctx, nops);
4341 osd_op.outdata.claim(newop.outdata);
4342 }
4343 }
4344 return result;
4345}
4346
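// Reject extents that start at or beyond, are longer than, or would
// extend past the configured maximum object size.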
4347static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4348{
4349 if (offset >= max ||
4350 length > max ||
4351 offset + length > max)
4352 return -EFBIG;
4353
4354 return 0;
4355}
4356
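// Completion for async extent reads: records the returned length and,
// when the read covered the whole object and a data digest is known,
// verifies the crc32c of the data (returning -EIO unless the op has
// CEPH_OSD_OP_FLAG_FAILOK set).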
4357struct FillInVerifyExtent : public Context {
4358 ceph_le64 *r;
4359 int32_t *rval;
4360 bufferlist *outdatap;
4361 boost::optional<uint32_t> maybe_crc;
4362 uint64_t size;
4363 OSDService *osd;
4364 hobject_t soid;
4365 __le32 flags;
4366 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4367 boost::optional<uint32_t> mc, uint64_t size,
4368 OSDService *osd, hobject_t soid, __le32 flags) :
4369 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4370 size(size), osd(osd), soid(soid), flags(flags) {}
4371 void finish(int len) override {
4372 *rval = len;
4373 *r = len;
4374 if (len < 0)
4375 return;
4376 // whole object? can we verify the checksum?
4377 if (maybe_crc && *r == size) {
4378 uint32_t crc = outdatap->crc32c(-1);
4379 if (maybe_crc != crc) {
4380 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4381 << " != expected 0x" << *maybe_crc
4382 << std::dec << " on " << soid;
4383 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4384 *rval = -EIO;
4385 *r = 0;
4386 }
4387 }
4388 }
4389 }
4390};
4391
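// Completion that repackages a plain contiguous read as a sparse-read
// reply: a single-extent {offset, length} map followed by the data.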
4392struct ToSparseReadResult : public Context {
4393 bufferlist& data_bl;
4394 uint64_t data_offset;
4395 ceph_le64& len;
4396 ToSparseReadResult(bufferlist& bl, uint64_t offset, ceph_le64& len):
4397 data_bl(bl), data_offset(offset),len(len) {}
4398 void finish(int r) override {
4399 if (r < 0) return;
4400 len = r;
4401 bufferlist outdata;
4402 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4403 ::encode(extents, outdata);
4404 ::encode_destructively(data_bl, outdata);
4405 data_bl.swap(outdata);
4406 }
4407};
4408
4409template<typename V>
4410static string list_keys(const map<string, V>& m) {
4411 string s;
4412 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4413 if (!s.empty()) {
4414 s.push_back(',');
4415 }
4416 s.append(itr->first);
4417 }
4418 return s;
4419}
4420
4421template<typename T>
4422static string list_entries(const T& m) {
4423 string s;
4424 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4425 if (!s.empty()) {
4426 s.push_back(',');
4427 }
4428 s.append(*itr);
4429 }
4430 return s;
4431}
4432
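// Ensure the target object exists after this op: either mark it created
// (counting it and, unless ignore_transaction, recording a create in
// the transaction) or clear a lingering whiteout flag.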
4433void PrimaryLogPG::maybe_create_new_object(
4434 OpContext *ctx,
4435 bool ignore_transaction)
4436{
4437 ObjectState& obs = ctx->new_obs;
4438 if (!obs.exists) {
4439 ctx->delta_stats.num_objects++;
4440 obs.exists = true;
4441 assert(!obs.oi.is_whiteout());
4442 obs.oi.new_object();
4443 if (!ignore_transaction)
4444 ctx->op_t->create(obs.oi.soid);
4445 } else if (obs.oi.is_whiteout()) {
4446 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4447 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4448 --ctx->delta_stats.num_whiteouts;
4449 }
4450}
4451
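// Async half of CEPH_OSD_OP_CHECKSUM: once the extent read completes
// (via the chained FillInVerifyExtent), compute the requested checksum
// over the returned data with finish_checksum().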
4452struct C_ChecksumRead : public Context {
4453 PrimaryLogPG *primary_log_pg;
4454 OSDOp &osd_op;
4455 Checksummer::CSumType csum_type;
4456 bufferlist init_value_bl;
4457 ceph_le64 read_length;
4458 bufferlist read_bl;
4459 Context *fill_extent_ctx;
4460
4461 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4462 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4463 boost::optional<uint32_t> maybe_crc, uint64_t size,
4464 OSDService *osd, hobject_t soid, __le32 flags)
4465 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4466 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4467 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4468 &read_bl, maybe_crc, size,
4469 osd, soid, flags)) {
4470 }
4471
4472 void finish(int r) override {
4473 fill_extent_ctx->complete(r);
4474
4475 if (osd_op.rval >= 0) {
4476 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4477 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4478 &init_value_bl_it,
4479 read_bl);
4480 }
4481 }
4482};
4483
4484int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4485 bufferlist::iterator *bl_it, bool *async_read)
4486{
4487 dout(20) << __func__ << dendl;
4488
4489 auto& op = osd_op.op;
4490 if (op.checksum.chunk_size > 0) {
4491 if (op.checksum.length == 0) {
4492 dout(10) << __func__ << ": length required when chunk size provided"
4493 << dendl;
4494 return -EINVAL;
4495 }
4496 if (op.checksum.length % op.checksum.chunk_size != 0) {
4497 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4498 return -EINVAL;
4499 }
4500 }
4501
4502 auto& oi = ctx->new_obs.oi;
4503 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4504 // zeroed offset+length implies checksum whole object
4505 op.checksum.length = oi.size;
4506 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4507 return -EOVERFLOW;
4508 }
4509
4510 Checksummer::CSumType csum_type;
4511 switch (op.checksum.type) {
4512 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4513 csum_type = Checksummer::CSUM_XXHASH32;
4514 break;
4515 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4516 csum_type = Checksummer::CSUM_XXHASH64;
4517 break;
4518 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4519 csum_type = Checksummer::CSUM_CRC32C;
4520 break;
4521 default:
4522 dout(10) << __func__ << ": unknown crc type ("
4523 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4524 return -EINVAL;
4525 }
4526
4527 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4528 if (bl_it->get_remaining() < csum_init_value_size) {
4529 dout(10) << __func__ << ": init value not provided" << dendl;
4530 return -EINVAL;
4531 }
4532
4533 bufferlist init_value_bl;
4534 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4535 csum_init_value_size);
4536 bl_it->advance(csum_init_value_size);
4537
4538 if (pool.info.require_rollback() && op.checksum.length > 0) {
4539 // If there is a data digest and it is possible we are reading
4540 // entire object, pass the digest.
4541 boost::optional<uint32_t> maybe_crc;
4542 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4543 op.checksum.length >= oi.size) {
4544 maybe_crc = oi.data_digest;
4545 }
4546
4547 // async read
4548 auto& soid = oi.soid;
4549 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4550 std::move(init_value_bl), maybe_crc,
4551 oi.size, osd, soid, op.flags);
4552 ctx->pending_async_reads.push_back({
4553 {op.checksum.offset, op.checksum.length, op.flags},
4554 {&checksum_ctx->read_bl, checksum_ctx}});
4555
4556 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4557 *async_read = true;
4558 return 0;
4559 }
4560
4561 // sync read
4562 *async_read = false;
4563 std::vector<OSDOp> read_ops(1);
4564 auto& read_op = read_ops[0];
4565 if (op.checksum.length > 0) {
4566 read_op.op.op = CEPH_OSD_OP_READ;
4567 read_op.op.flags = op.flags;
4568 read_op.op.extent.offset = op.checksum.offset;
4569 read_op.op.extent.length = op.checksum.length;
4570 read_op.op.extent.truncate_size = 0;
4571 read_op.op.extent.truncate_seq = 0;
4572
4573 int r = do_osd_ops(ctx, read_ops);
4574 if (r < 0) {
4575 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4576 return r;
4577 }
4578 }
4579
4580 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4581 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4582 read_op.outdata);
4583}
4584
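// Compute per-chunk checksums over read_bl and append the reply:
// a chunk count followed by the packed checksum values. A chunk_size
// of 0 means a single checksum over the whole buffer.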
4585int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4586 Checksummer::CSumType csum_type,
4587 bufferlist::iterator *init_value_bl_it,
4588 const bufferlist &read_bl) {
4589 dout(20) << __func__ << dendl;
4590
4591 auto& op = osd_op.op;
4592
4593 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4594 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4595 << op.checksum.length << dendl;
4596 return -EINVAL;
4597 }
4598
4599 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4600 op.checksum.chunk_size : read_bl.length());
4601 uint32_t csum_count = (csum_chunk_size > 0 ?
4602 read_bl.length() / csum_chunk_size : 0);
4603
4604 bufferlist csum;
4605 bufferptr csum_data;
4606 if (csum_count > 0) {
4607 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4608 csum_data = buffer::create(csum_value_size * csum_count);
4609 csum_data.zero();
4610 csum.append(csum_data);
4611
4612 switch (csum_type) {
4613 case Checksummer::CSUM_XXHASH32:
4614 {
4615 Checksummer::xxhash32::init_value_t init_value;
4616 ::decode(init_value, *init_value_bl_it);
4617 Checksummer::calculate<Checksummer::xxhash32>(
4618 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4619 &csum_data);
4620 }
4621 break;
4622 case Checksummer::CSUM_XXHASH64:
4623 {
4624 Checksummer::xxhash64::init_value_t init_value;
4625 ::decode(init_value, *init_value_bl_it);
4626 Checksummer::calculate<Checksummer::xxhash64>(
4627 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4628 &csum_data);
4629 }
4630 break;
4631 case Checksummer::CSUM_CRC32C:
4632 {
4633 Checksummer::crc32c::init_value_t init_value;
4634 ::decode(init_value, *init_value_bl_it);
4635 Checksummer::calculate<Checksummer::crc32c>(
4636 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4637 &csum_data);
4638 }
4639 break;
4640 default:
4641 break;
4642 }
4643 }
4644
4645 ::encode(csum_count, osd_op.outdata);
4646 osd_op.outdata.claim_append(csum);
4647 return 0;
4648}
4649
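// Execute each sub-op in `ops` against a single object, accumulating
// stat deltas, log entries, and transaction pieces in the OpContext.
// Reads may be satisfied synchronously or queued as async reads.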
4650int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
4651{
4652 int result = 0;
4653 SnapSetContext *ssc = ctx->obc->ssc;
4654 ObjectState& obs = ctx->new_obs;
4655 object_info_t& oi = obs.oi;
4656 const hobject_t& soid = oi.soid;
4657
4658 bool first_read = true;
4659
4660 PGTransaction* t = ctx->op_t.get();
4661
4662 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
4663
4664 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
4665 OSDOp& osd_op = *p;
4666 ceph_osd_op& op = osd_op.op;
4667
4668 // TODO: check endianness (__le32 vs uint32_t, etc.)
4669 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
4670 // but the code in this function seems to treat them as native-endian. What should the
4671 // tracepoints do?
4672 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
4673
4674 dout(10) << "do_osd_op " << osd_op << dendl;
4675
4676 bufferlist::iterator bp = osd_op.indata.begin();
4677
4678 // user-visible modification?
4679 switch (op.op) {
4680 // non user-visible modifications
4681 case CEPH_OSD_OP_WATCH:
4682 case CEPH_OSD_OP_CACHE_EVICT:
4683 case CEPH_OSD_OP_CACHE_FLUSH:
4684 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
4685 case CEPH_OSD_OP_UNDIRTY:
4686 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
4687 case CEPH_OSD_OP_CACHE_PIN:
4688 case CEPH_OSD_OP_CACHE_UNPIN:
4689 case CEPH_OSD_OP_SET_REDIRECT:
4690 break;
4691 default:
4692 if (op.op & CEPH_OSD_OP_MODE_WR)
4693 ctx->user_modify = true;
4694 }
4695
4696 // munge -1 truncate to 0 truncate
4697 if (ceph_osd_op_uses_extent(op.op) &&
4698 op.extent.truncate_seq == 1 &&
4699 op.extent.truncate_size == (-1ULL)) {
4700 op.extent.truncate_size = 0;
4701 op.extent.truncate_seq = 0;
4702 }
4703
4704 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
4705 if (op.op == CEPH_OSD_OP_ZERO &&
4706 obs.exists &&
4707 op.extent.offset < cct->_conf->osd_max_object_size &&
4708 op.extent.length >= 1 &&
4709 op.extent.length <= cct->_conf->osd_max_object_size &&
4710 op.extent.offset + op.extent.length >= oi.size) {
4711 if (op.extent.offset >= oi.size) {
4712 // no-op
4713 goto fail;
4714 }
4715 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
4716 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
4717 op.op = CEPH_OSD_OP_TRUNCATE;
4718 }
4719
4720 switch (op.op) {
4721
4722 // --- READS ---
4723
4724 case CEPH_OSD_OP_CMPEXT:
4725 ++ctx->num_read;
4726 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4727 result = do_extent_cmp(ctx, osd_op);
4728 break;
4729
4730 case CEPH_OSD_OP_SYNC_READ:
4731 if (pool.info.require_rollback()) {
4732 result = -EOPNOTSUPP;
4733 break;
4734 }
4735 // fall through
4736 case CEPH_OSD_OP_READ:
4737 ++ctx->num_read;
4738 {
4739 __u32 seq = oi.truncate_seq;
4740 uint64_t size = oi.size;
4741 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), soid.snap.val, size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4742 bool trimmed_read = false;
4743 // are we beyond truncate_size?
4744 if ( (seq < op.extent.truncate_seq) &&
4745 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4746 size = op.extent.truncate_size;
4747
4748 if (op.extent.length == 0) // a length of zero means read the whole object
4749 op.extent.length = size;
4750
4751 if (op.extent.offset >= size) {
4752 op.extent.length = 0;
4753 trimmed_read = true;
4754 } else if (op.extent.offset + op.extent.length > size) {
4755 op.extent.length = size - op.extent.offset;
4756 trimmed_read = true;
4757 }
4758
4759 // read into a buffer
4760 bool async = false;
4761 if (trimmed_read && op.extent.length == 0) {
4762 // the read was trimmed to zero bytes, so do nothing; an untrimmed
4763 // read of 0 bytes means "read the whole object", which is why the
4764 // separate trimmed_read flag is needed
4765 } else if (pool.info.require_rollback()) {
4766 async = true;
4767 boost::optional<uint32_t> maybe_crc;
4768 // If there is a data digest and it is possible we are reading the
4769 // entire object, pass the digest. FillInVerifyExtent will check
4770 // the oi.size again.
4771 if (oi.is_data_digest() && op.extent.offset == 0 &&
4772 op.extent.length >= oi.size)
4773 maybe_crc = oi.data_digest;
4774 ctx->pending_async_reads.push_back(
4775 make_pair(
4776 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4777 make_pair(&osd_op.outdata,
4778 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4779 &osd_op.outdata, maybe_crc, oi.size, osd,
4780 soid, op.flags))));
4781 dout(10) << " async_read noted for " << soid << dendl;
4782 } else {
4783 int r = pgbackend->objects_read_sync(
4784 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4785 if (r == -EIO) {
4786 r = rep_repair_primary_object(soid, ctx->op);
4787 }
4788 if (r >= 0)
4789 op.extent.length = r;
4790 else {
4791 result = r;
4792 op.extent.length = 0;
4793 }
4794 dout(10) << " read got " << r << " / " << op.extent.length
4795 << " bytes from obj " << soid << dendl;
4796
4797 // whole object? can we verify the checksum?
4798 if (op.extent.length == oi.size && oi.is_data_digest()) {
4799 uint32_t crc = osd_op.outdata.crc32c(-1);
4800 if (oi.data_digest != crc) {
4801 osd->clog->error() << info.pgid << std::hex
4802 << " full-object read crc 0x" << crc
4803 << " != expected 0x" << oi.data_digest
4804 << std::dec << " on " << soid;
4805 // FIXME fall back to replica or something?
4806 result = -EIO;
4807 }
4808 }
4809 }
4810 if (first_read) {
4811 first_read = false;
4812 ctx->data_off = op.extent.offset;
4813 }
4814 // XXX the op.extent.length is the requested length for async read
4815 // On error this length is changed to 0 after the error comes back.
4816 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4817 ctx->delta_stats.num_rd++;
4818
4819 // Skip checking the result and just proceed to the next operation
4820 if (async)
4821 continue;
4822
4823 }
4824 break;
4825
4826 case CEPH_OSD_OP_CHECKSUM:
4827 ++ctx->num_read;
4828 {
4829 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
4830 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
4831 op.checksum.offset, op.checksum.length,
4832 op.checksum.chunk_size);
4833
4834 bool async_read;
4835 result = do_checksum(ctx, osd_op, &bp, &async_read);
4836 if (result == 0 && async_read) {
4837 continue;
4838 }
4839 }
4840 break;
4841
4842 /* map extents */
4843 case CEPH_OSD_OP_MAPEXT:
4844 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
4845 if (pool.info.require_rollback()) {
4846 result = -EOPNOTSUPP;
4847 break;
4848 }
4849 ++ctx->num_read;
4850 {
4851 // read into a buffer
4852 bufferlist bl;
4853 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4854 info.pgid.shard),
4855 op.extent.offset, op.extent.length, bl);
4856 osd_op.outdata.claim(bl);
4857 if (r < 0)
4858 result = r;
4859 else
4860 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
4861 ctx->delta_stats.num_rd++;
4862 dout(10) << " map_extents done on object " << soid << dendl;
4863 }
4864 break;
4865
4866 /* map extents */
4867 case CEPH_OSD_OP_SPARSE_READ:
4868 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4869 if (op.extent.truncate_seq) {
4870 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4871 result = -EINVAL;
4872 break;
4873 }
4874 ++ctx->num_read;
4875 if (pool.info.ec_pool()) {
4876 // translate sparse read to a normal one if not supported
4877 uint64_t offset = op.extent.offset;
4878 uint64_t length = op.extent.length;
4879 if (offset > oi.size) {
4880 length = 0;
4881 } else if (offset + length > oi.size) {
4882 length = oi.size - offset;
4883 }
4884 if (length > 0) {
4885 ctx->pending_async_reads.push_back(
4886 make_pair(
4887 boost::make_tuple(offset, length, op.flags),
4888 make_pair(
4889 &osd_op.outdata,
4890 new ToSparseReadResult(
4891 osd_op.outdata, offset,
4892 op.extent.length /* updated by the callback */))));
4893 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4894 } else {
4895 dout(10) << " sparse read ended up empty for " << soid << dendl;
4896 map<uint64_t, uint64_t> extents;
4897 ::encode(extents, osd_op.outdata);
4898 }
4899 } else {
4900 // read into a buffer
4901 map<uint64_t, uint64_t> m;
4902 uint32_t total_read = 0;
4903 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4904 info.pgid.shard),
4905 op.extent.offset, op.extent.length, m);
4906 if (r < 0) {
4907 result = r;
4908 break;
4909 }
4910 map<uint64_t, uint64_t>::iterator miter;
4911 bufferlist data_bl;
4912 uint64_t last = op.extent.offset;
4913 for (miter = m.begin(); miter != m.end(); ++miter) {
4914 // verify hole?
4915 if (cct->_conf->osd_verify_sparse_read_holes &&
4916 last < miter->first) {
4917 bufferlist t;
4918 uint64_t len = miter->first - last;
4919 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4920 if (r == -EIO) {
4921 r = rep_repair_primary_object(soid, ctx->op);
4922 }
4923 if (r < 0) {
4924 osd->clog->error() << coll << " " << soid
4925 << " sparse-read failed to read: "
4926 << r;
4927 } else if (!t.is_zero()) {
4928 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4929 << last << "~" << len;
4930 }
4931 }
4932
4933 bufferlist tmpbl;
4934 r = pgbackend->objects_read_sync(soid, miter->first, miter->second, op.flags, &tmpbl);
4935 if (r < 0) {
4936 result = r;
4937 break;
4938 }
4939
4940 if (r < (int)miter->second) /* this usually happens when the extent exceeds the actual file size */
4941 miter->second = r;
4942 total_read += r;
4943 dout(10) << "sparse-read " << miter->first << "@" << miter->second << dendl;
4944 data_bl.claim_append(tmpbl);
4945 last = miter->first + r;
4946 }
4947
4948 if (r < 0) {
4949 result = r;
4950 break;
4951 }
4952
4953 // verify trailing hole?
4954 if (cct->_conf->osd_verify_sparse_read_holes) {
4955 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4956 if (last < end) {
4957 bufferlist t;
4958 uint64_t len = end - last;
4959 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4960 if (r < 0) {
4961 osd->clog->error() << coll << " " << soid
4962 << " sparse-read failed to read: "
4963 << r;
4964 } else if (!t.is_zero()) {
4965 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4966 << last << "~" << len;
4967 }
4968 }
4969 }
4970
4971 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
4972 // and while there may be few whole objects at first, more and more whole
4973 // objects appear with continued use, so verifying checksums here makes sense.
4974 if (total_read == oi.size && oi.is_data_digest()) {
4975 uint32_t crc = data_bl.crc32c(-1);
4976 if (oi.data_digest != crc) {
4977 osd->clog->error() << info.pgid << std::hex
4978 << " full-object read crc 0x" << crc
4979 << " != expected 0x" << oi.data_digest
4980 << std::dec << " on " << soid;
4981 // FIXME fall back to replica or something?
4982 result = -EIO;
4983 break;
4984 }
4985 }
4986
4987 op.extent.length = total_read;
4988
4989 ::encode(m, osd_op.outdata); // re-encode since it might be modified
4990 ::encode_destructively(data_bl, osd_op.outdata);
4991
4992 dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl;
4993 }
4994 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4995 ctx->delta_stats.num_rd++;
4996 break;
4997
4998 case CEPH_OSD_OP_CALL:
4999 {
5000 string cname, mname;
5001 bufferlist indata;
5002 try {
5003 bp.copy(op.cls.class_len, cname);
5004 bp.copy(op.cls.method_len, mname);
5005 bp.copy(op.cls.indata_len, indata);
5006 } catch (buffer::error& e) {
5007 dout(10) << "call unable to decode class + method + indata" << dendl;
5008 dout(30) << "in dump: ";
5009 osd_op.indata.hexdump(*_dout);
5010 *_dout << dendl;
5011 result = -EINVAL;
5012 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5013 break;
5014 }
5015 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5016
5017 ClassHandler::ClassData *cls;
5018 result = osd->class_handler->open_class(cname, &cls);
5019 assert(result == 0); // init_op_flags() already verified this works.
5020
5021 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5022 if (!method) {
5023 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5024 result = -EOPNOTSUPP;
5025 break;
5026 }
5027
5028 int flags = method->get_flags();
5029 if (flags & CLS_METHOD_WR)
5030 ctx->user_modify = true;
5031
5032 bufferlist outdata;
5033 dout(10) << "call method " << cname << "." << mname << dendl;
5034 int prev_rd = ctx->num_read;
5035 int prev_wr = ctx->num_write;
5036 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5037
5038 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5039 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5040 result = -EIO;
5041 break;
5042 }
5043 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5044 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5045 result = -EIO;
5046 break;
5047 }
5048
5049 dout(10) << "method called response length=" << outdata.length() << dendl;
5050 op.extent.length = outdata.length();
5051 osd_op.outdata.claim_append(outdata);
5052 dout(30) << "out dump: ";
5053 osd_op.outdata.hexdump(*_dout);
5054 *_dout << dendl;
5055 }
5056 break;
5057
5058 case CEPH_OSD_OP_STAT:
5059 // note: stat does not require RD
5060 {
5061 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5062
5063 if (obs.exists && !oi.is_whiteout()) {
5064 ::encode(oi.size, osd_op.outdata);
5065 ::encode(oi.mtime, osd_op.outdata);
5066 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5067 } else {
5068 result = -ENOENT;
5069 dout(10) << "stat oi object does not exist" << dendl;
5070 }
5071
5072 ctx->delta_stats.num_rd++;
5073 }
5074 break;
5075
5076 case CEPH_OSD_OP_ISDIRTY:
5077 ++ctx->num_read;
5078 {
5079 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5080 bool is_dirty = obs.oi.is_dirty();
5081 ::encode(is_dirty, osd_op.outdata);
5082 ctx->delta_stats.num_rd++;
5083 result = 0;
5084 }
5085 break;
5086
5087 case CEPH_OSD_OP_UNDIRTY:
5088 ++ctx->num_write;
5089 {
5090 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5091 if (oi.is_dirty()) {
5092 ctx->undirty = true; // see make_writeable()
5093 ctx->modify = true;
5094 ctx->delta_stats.num_wr++;
5095 }
5096 result = 0;
5097 }
5098 break;
5099
5100 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5101 ++ctx->num_write;
5102 {
5103 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5104 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5105 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5106 result = -EINVAL;
5107 break;
5108 }
5109 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5110 result = -EINVAL;
5111 break;
5112 }
5113 if (!obs.exists) {
5114 result = 0;
5115 break;
5116 }
5117 if (oi.is_cache_pinned()) {
5118 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5119 result = -EPERM;
5120 break;
5121 }
5122 if (oi.is_dirty()) {
5123 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5124 if (result == -EINPROGRESS)
5125 result = -EAGAIN;
5126 } else {
5127 result = 0;
5128 }
5129 }
5130 break;
5131
5132 case CEPH_OSD_OP_CACHE_FLUSH:
5133 ++ctx->num_write;
5134 {
5135 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5136 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5137 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5138 result = -EINVAL;
5139 break;
5140 }
5141 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5142 result = -EINVAL;
5143 break;
5144 }
5145 if (!obs.exists) {
5146 result = 0;
5147 break;
5148 }
5149 if (oi.is_cache_pinned()) {
5150 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5151 result = -EPERM;
5152 break;
5153 }
5154 hobject_t missing;
5155 if (oi.is_dirty()) {
5156 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5157 if (result == -EINPROGRESS)
5158 result = -EAGAIN;
5159 } else {
5160 result = 0;
5161 }
5162 // Check special return value which has set missing_return
5163 if (result == -ENOENT) {
5164 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5165 assert(!missing.is_min());
5166 wait_for_unreadable_object(missing, ctx->op);
5167 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5168 result = -EAGAIN;
5169 }
5170 }
5171 break;
5172
5173 case CEPH_OSD_OP_CACHE_EVICT:
5174 ++ctx->num_write;
5175 {
5176 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5177 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5178 result = -EINVAL;
5179 break;
5180 }
5181 if (!obs.exists) {
5182 result = 0;
5183 break;
5184 }
5185 if (oi.is_cache_pinned()) {
5186 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5187 result = -EPERM;
5188 break;
5189 }
5190 if (oi.is_dirty()) {
5191 result = -EBUSY;
5192 break;
5193 }
5194 if (!oi.watchers.empty()) {
5195 result = -EBUSY;
5196 break;
5197 }
5198 if (soid.snap == CEPH_NOSNAP) {
5199 result = _verify_no_head_clones(soid, ssc->snapset);
5200 if (result < 0)
5201 break;
5202 }
5203 result = _delete_oid(ctx, true, false);
5204 if (result >= 0) {
5205 // mark that this is a cache eviction to avoid triggering normal
5206 // make_writeable() clone or snapdir object creation in finish_ctx()
5207 ctx->cache_evict = true;
5208 }
5209 osd->logger->inc(l_osd_tier_evict);
5210 }
5211 break;
5212
5213 case CEPH_OSD_OP_GETXATTR:
5214 ++ctx->num_read;
5215 {
5216 string aname;
5217 bp.copy(op.xattr.name_len, aname);
5218 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5219 string name = "_" + aname;
5220 int r = getattr_maybe_cache(
5221 ctx->obc,
5222 name,
5223 &(osd_op.outdata));
5224 if (r >= 0) {
5225 op.xattr.value_len = osd_op.outdata.length();
5226 result = 0;
5227 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5228 } else
5229 result = r;
5230
5231 ctx->delta_stats.num_rd++;
5232 }
5233 break;
5234
5235 case CEPH_OSD_OP_GETXATTRS:
5236 ++ctx->num_read;
5237 {
5238 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5239 map<string, bufferlist> out;
5240 result = getattrs_maybe_cache(
5241 ctx->obc,
5242 &out,
5243 true);
5244
5245 bufferlist bl;
5246 ::encode(out, bl);
5247 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5248 ctx->delta_stats.num_rd++;
5249 osd_op.outdata.claim_append(bl);
5250 }
5251 break;
5252
5253 case CEPH_OSD_OP_CMPXATTR:
5254 ++ctx->num_read;
5255 {
5256 string aname;
5257 bp.copy(op.xattr.name_len, aname);
5258 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5259 string name = "_" + aname;
5260 name[op.xattr.name_len + 1] = 0;
5261
5262 bufferlist xattr;
5263 result = getattr_maybe_cache(
5264 ctx->obc,
5265 name,
5266 &xattr);
5267 if (result < 0 && result != -EEXIST && result != -ENODATA)
5268 break;
5269
5270 ctx->delta_stats.num_rd++;
5271 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5272
5273 switch (op.xattr.cmp_mode) {
5274 case CEPH_OSD_CMPXATTR_MODE_STRING:
5275 {
5276 string val;
5277 bp.copy(op.xattr.value_len, val);
5278 val[op.xattr.value_len] = 0;
5279 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5280 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5281 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5282 }
5283 break;
5284
5285 case CEPH_OSD_CMPXATTR_MODE_U64:
5286 {
5287 uint64_t u64val;
5288 try {
5289 ::decode(u64val, bp);
5290 }
5291 catch (buffer::error& e) {
5292 result = -EINVAL;
5293 goto fail;
5294 }
5295 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5296 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5297 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5298 }
5299 break;
5300
5301 default:
5302 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5303 result = -EINVAL;
5304 }
5305
5306 if (!result) {
5307 dout(10) << "comparison returned false" << dendl;
5308 result = -ECANCELED;
5309 break;
5310 }
5311 if (result < 0) {
5312 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5313 break;
5314 }
5315
5316 dout(10) << "comparison returned true" << dendl;
5317 }
5318 break;
5319
5320 case CEPH_OSD_OP_ASSERT_VER:
5321 ++ctx->num_read;
5322 {
5323 uint64_t ver = op.assert_ver.ver;
5324 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5325 if (!ver)
5326 result = -EINVAL;
5327 else if (ver < oi.user_version)
5328 result = -ERANGE;
5329 else if (ver > oi.user_version)
5330 result = -EOVERFLOW;
5331 }
5332 break;
5333
5334 case CEPH_OSD_OP_LIST_WATCHERS:
5335 ++ctx->num_read;
5336 {
5337 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5338 obj_list_watch_response_t resp;
5339
5340 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5341 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5342 ++oi_iter) {
5343 dout(20) << "key cookie=" << oi_iter->first.first
5344 << " entity=" << oi_iter->first.second << " "
5345 << oi_iter->second << dendl;
5346 assert(oi_iter->first.first == oi_iter->second.cookie);
5347 assert(oi_iter->first.second.is_client());
5348
5349 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5350 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5351 resp.entries.push_back(wi);
5352 }
5353
5354 resp.encode(osd_op.outdata, ctx->get_features());
5355 result = 0;
5356
5357 ctx->delta_stats.num_rd++;
5358 break;
5359 }
5360
5361 case CEPH_OSD_OP_LIST_SNAPS:
5362 ++ctx->num_read;
5363 {
5364 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5365 obj_list_snap_response_t resp;
5366
5367 if (!ssc) {
5368 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5369 }
5370 assert(ssc);
5371
5372 int clonecount = ssc->snapset.clones.size();
5373 if (ssc->snapset.head_exists)
5374 clonecount++;
5375 resp.clones.reserve(clonecount);
5376 for (auto clone_iter = ssc->snapset.clones.begin();
5377 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5378 clone_info ci;
5379 ci.cloneid = *clone_iter;
5380
5381 hobject_t clone_oid = soid;
5382 clone_oid.snap = *clone_iter;
5383
5384 if (!ssc->snapset.is_legacy()) {
5385 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5386 if (p == ssc->snapset.clone_snaps.end()) {
5387 osd->clog->error() << "osd." << osd->whoami
5388 << ": inconsistent clone_snaps found for oid "
5389 << soid << " clone " << *clone_iter
5390 << " snapset " << ssc->snapset;
5391 result = -EINVAL;
5392 break;
5393 }
5394 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5395 ci.snaps.push_back(*q);
5396 }
5397 } else {
5398 /* No need to take a lock here. We are only inspecting state cached
5399 * in the ObjectContext, so we aren't performing an actual read unless
5400 * the clone obc is not already loaded (in which case, it cannot have
5401 * an in progress write). We also do not risk exposing uncommitted
5402 * state since we do have a read lock on the head object or snapdir,
5403 * which we would have to write lock in order to make user visible
5404 * modifications to the snapshot state (snap trim related mutations
5405 * are not user visible).
5406 */
5407 if (is_missing_object(clone_oid)) {
5408 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5409 wait_for_unreadable_object(clone_oid, ctx->op);
5410 result = -EAGAIN;
5411 break;
5412 }
5413
5414 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5415 if (!clone_obc) {
5416 if (maybe_handle_cache(
5417 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5418 // promoting the clone
5419 result = -EAGAIN;
5420 } else {
5421 osd->clog->error() << "osd." << osd->whoami
5422 << ": missing clone " << clone_oid
5423 << " for oid "
5424 << soid;
5425 // should not happen
5426 result = -ENOENT;
5427 }
5428 break;
5429 }
5430 for (vector<snapid_t>::reverse_iterator p =
5431 clone_obc->obs.oi.legacy_snaps.rbegin();
5432 p != clone_obc->obs.oi.legacy_snaps.rend();
5433 ++p) {
5434 ci.snaps.push_back(*p);
5435 }
5436 }
5437
5438 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5439
5440 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5441 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5442 if (coi == ssc->snapset.clone_overlap.end()) {
5443 osd->clog->error() << "osd." << osd->whoami
5444 << ": inconsistent clone_overlap found for oid "
5445 << soid << " clone " << *clone_iter;
5446 result = -EINVAL;
5447 break;
5448 }
5449 const interval_set<uint64_t> &o = coi->second;
5450 ci.overlap.reserve(o.num_intervals());
5451 for (interval_set<uint64_t>::const_iterator r = o.begin();
5452 r != o.end(); ++r) {
5453 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5454 r.get_len()));
5455 }
5456
5457 map<snapid_t, uint64_t>::const_iterator si;
5458 si = ssc->snapset.clone_size.find(ci.cloneid);
5459 if (si == ssc->snapset.clone_size.end()) {
5460 osd->clog->error() << "osd." << osd->whoami
5461 << ": inconsistent clone_size found for oid "
5462 << soid << " clone " << *clone_iter;
5463 result = -EINVAL;
5464 break;
5465 }
5466 ci.size = si->second;
5467
5468 resp.clones.push_back(ci);
5469 }
5470 if (result < 0) {
5471 break;
5472 }
5473 if (ssc->snapset.head_exists &&
5474 !ctx->obc->obs.oi.is_whiteout()) {
5475 assert(obs.exists);
5476 clone_info ci;
5477 ci.cloneid = CEPH_NOSNAP;
5478
5479 // Size for HEAD is oi.size
5480 ci.size = oi.size;
5481
5482 resp.clones.push_back(ci);
5483 }
5484 resp.seq = ssc->snapset.seq;
5485
5486 resp.encode(osd_op.outdata);
5487 result = 0;
5488
5489 ctx->delta_stats.num_rd++;
5490 break;
5491 }
5492
5493 case CEPH_OSD_OP_NOTIFY:
5494 ++ctx->num_read;
5495 {
5496 uint32_t timeout;
5497 bufferlist bl;
5498
5499 try {
5500 uint32_t ver; // obsolete
5501 ::decode(ver, bp);
5502 ::decode(timeout, bp);
5503 ::decode(bl, bp);
5504 } catch (const buffer::error &e) {
5505 timeout = 0;
5506 }
5507 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5508 if (!timeout)
5509 timeout = cct->_conf->osd_default_notify_timeout;
5510
5511 notify_info_t n;
5512 n.timeout = timeout;
5513 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5514 n.cookie = op.watch.cookie;
5515 n.bl = bl;
5516 ctx->notifies.push_back(n);
5517
5518 // return our unique notify id to the client
5519 ::encode(n.notify_id, osd_op.outdata);
5520 }
5521 break;
5522
5523 case CEPH_OSD_OP_NOTIFY_ACK:
5524 ++ctx->num_read;
5525 {
5526 try {
5527 uint64_t notify_id = 0;
5528 uint64_t watch_cookie = 0;
5529 ::decode(notify_id, bp);
5530 ::decode(watch_cookie, bp);
5531 bufferlist reply_bl;
5532 if (!bp.end()) {
5533 ::decode(reply_bl, bp);
5534 }
5535 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5536 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5537 ctx->notify_acks.push_back(ack);
5538 } catch (const buffer::error &e) {
5539 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5540 OpContext::NotifyAck ack(
5541 // op.watch.cookie is actually the notify_id for historical reasons
5542 op.watch.cookie
5543 );
5544 ctx->notify_acks.push_back(ack);
5545 }
5546 }
5547 break;
5548
5549 case CEPH_OSD_OP_SETALLOCHINT:
5550 ++ctx->num_write;
5551 {
5552 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5553 maybe_create_new_object(ctx);
5554 oi.expected_object_size = op.alloc_hint.expected_object_size;
5555 oi.expected_write_size = op.alloc_hint.expected_write_size;
5556 oi.alloc_hint_flags = op.alloc_hint.flags;
5557 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5558 op.alloc_hint.expected_write_size,
5559 op.alloc_hint.flags);
5560 ctx->delta_stats.num_wr++;
5561 result = 0;
5562 }
5563 break;
5564
5565
5566 // --- WRITES ---
5567
5568 // -- object data --
5569
5570 case CEPH_OSD_OP_WRITE:
5571 ++ctx->num_write;
5572 { // write
5573 __u32 seq = oi.truncate_seq;
5574 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5575 if (op.extent.length != osd_op.indata.length()) {
5576 result = -EINVAL;
5577 break;
5578 }
5579
5580 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5581 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5582
5583 if (pool.info.requires_aligned_append() &&
5584 (op.extent.offset % pool.info.required_alignment() != 0)) {
5585 result = -EOPNOTSUPP;
5586 break;
5587 }
5588
5589 if (!obs.exists) {
5590 if (pool.info.requires_aligned_append() && op.extent.offset) {
5591 result = -EOPNOTSUPP;
5592 break;
5593 }
5594 } else if (op.extent.offset != oi.size &&
5595 pool.info.requires_aligned_append()) {
5596 result = -EOPNOTSUPP;
5597 break;
5598 }
5599
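	// truncate_seq reconciliation: the client stamps each write with
	// the truncate_seq it last observed.  If our recorded seq is newer,
	// the write raced with (and lost to) a trimtrunc and must be clipped
	// to the current size; if the op carries a newer seq, the matching
	// trimtrunc has not arrived yet, so apply the truncate here first.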
5600 if (seq && (seq > op.extent.truncate_seq) &&
5601 (op.extent.offset + op.extent.length > oi.size)) {
5602 // old write, arrived after trimtrunc
5603 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5604 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5605 << ", adjusting write length to " << op.extent.length << dendl;
5606 bufferlist t;
5607 t.substr_of(osd_op.indata, 0, op.extent.length);
5608 osd_op.indata.swap(t);
5609 }
5610 if (op.extent.truncate_seq > seq) {
5611 // write arrives before trimtrunc
5612 if (obs.exists && !oi.is_whiteout()) {
5613 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5614 << ", truncating to " << op.extent.truncate_size << dendl;
5615 t->truncate(soid, op.extent.truncate_size);
5616 oi.truncate_seq = op.extent.truncate_seq;
5617 oi.truncate_size = op.extent.truncate_size;
5618 if (op.extent.truncate_size != oi.size) {
5619 ctx->delta_stats.num_bytes -= oi.size;
5620 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5621 oi.size = op.extent.truncate_size;
5622 }
5623 } else {
5624 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5625 << ", but object is new" << dendl;
5626 oi.truncate_seq = op.extent.truncate_seq;
5627 oi.truncate_size = op.extent.truncate_size;
5628 }
5629 }
5630 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5631 if (result < 0)
5632 break;
5633
5634 maybe_create_new_object(ctx);
5635
5636 if (op.extent.length == 0) {
5637 if (op.extent.offset > oi.size) {
5638 t->truncate(
5639 soid, op.extent.offset);
5640 } else {
5641 t->nop(soid);
5642 }
5643 } else {
5644 t->write(
5645 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5646 }
5647
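	// maintain the whole-object data digest only when it stays cheap:
	// a full overwrite recomputes it from scratch, an exact append
	// extends the running crc32c, and any other partial write
	// invalidates it.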
5648 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5649 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5650 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5651 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5652 else
5653 obs.oi.clear_data_digest();
5654 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5655 op.extent.offset, op.extent.length);
5656
5657 }
5658 break;
5659
5660 case CEPH_OSD_OP_WRITEFULL:
5661 ++ctx->num_write;
5662 { // write full object
5663 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5664
5665 if (op.extent.length != osd_op.indata.length()) {
5666 result = -EINVAL;
5667 break;
5668 }
5669 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5670 if (result < 0)
5671 break;
5672
5673 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5674 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5675
5676 maybe_create_new_object(ctx);
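	// pools that require rollback support (e.g. erasure-coded pools)
	// cannot overwrite in place, so writefull is expressed as a
	// truncate to zero plus a fresh write of the new payload.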
5677 if (pool.info.require_rollback()) {
5678 t->truncate(soid, 0);
5679 } else if (obs.exists && op.extent.length < oi.size) {
5680 t->truncate(soid, op.extent.length);
5681 }
5682 if (op.extent.length) {
5683 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5684 }
5685 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5686
5687 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5688 0, op.extent.length, true);
5689 }
5690 break;
5691
5692 case CEPH_OSD_OP_WRITESAME:
5693 ++ctx->num_write;
5694 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5695 result = do_writesame(ctx, osd_op);
5696 break;
5697
5698 case CEPH_OSD_OP_ROLLBACK :
5699 ++ctx->num_write;
5700 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5701 result = _rollback_to(ctx, op);
5702 break;
5703
5704 case CEPH_OSD_OP_ZERO:
5705 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5706 if (pool.info.requires_aligned_append()) {
5707 result = -EOPNOTSUPP;
5708 break;
5709 }
5710 ++ctx->num_write;
5711 { // zero
5712 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5713 if (result < 0)
5714 break;
5715 assert(op.extent.length);
5716 if (obs.exists && !oi.is_whiteout()) {
5717 t->zero(soid, op.extent.offset, op.extent.length);
5718 interval_set<uint64_t> ch;
5719 ch.insert(op.extent.offset, op.extent.length);
5720 ctx->modified_ranges.union_of(ch);
5721 ctx->delta_stats.num_wr++;
5722 oi.clear_data_digest();
5723 } else {
5724 // no-op
5725 }
5726 }
5727 break;
5728 case CEPH_OSD_OP_CREATE:
5729 ++ctx->num_write;
5730 {
5731 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5732 int flags = le32_to_cpu(op.flags);
5733 if (obs.exists && !oi.is_whiteout() &&
5734 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5735 result = -EEXIST; /* this is an exclusive create */
5736 } else {
5737 if (osd_op.indata.length()) {
5738 bufferlist::iterator p = osd_op.indata.begin();
5739 string category;
5740 try {
5741 ::decode(category, p);
5742 }
5743 catch (buffer::error& e) {
5744 result = -EINVAL;
5745 goto fail;
5746 }
5747 // category is no longer implemented.
5748 }
5749 if (result >= 0) {
5750 maybe_create_new_object(ctx);
5751 t->nop(soid);
5752 }
5753 }
5754 }
5755 break;
5756
5757 case CEPH_OSD_OP_TRIMTRUNC:
5758 op.extent.offset = op.extent.truncate_size;
5759 // falling through
5760
5761 case CEPH_OSD_OP_TRUNCATE:
5762 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5763 if (pool.info.requires_aligned_append()) {
5764 result = -EOPNOTSUPP;
5765 break;
5766 }
5767 ++ctx->num_write;
5768 {
5769 // truncate
5770 if (!obs.exists || oi.is_whiteout()) {
5771 dout(10) << " object dne, truncate is a no-op" << dendl;
5772 break;
5773 }
5774
5775 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5776 result = -EFBIG;
5777 break;
5778 }
5779
5780 if (op.extent.truncate_seq) {
5781 assert(op.extent.offset == op.extent.truncate_size);
5782 if (op.extent.truncate_seq <= oi.truncate_seq) {
5783 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5784 << ", no-op" << dendl;
5785 break; // old
5786 }
5787 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5788 << ", truncating" << dendl;
5789 oi.truncate_seq = op.extent.truncate_seq;
5790 oi.truncate_size = op.extent.truncate_size;
5791 }
5792
5793 maybe_create_new_object(ctx);
5794 t->truncate(soid, op.extent.offset);
5795 if (oi.size > op.extent.offset) {
5796 interval_set<uint64_t> trim;
5797 trim.insert(op.extent.offset, oi.size-op.extent.offset);
5798 ctx->modified_ranges.union_of(trim);
5799 }
5800 if (op.extent.offset != oi.size) {
5801 ctx->delta_stats.num_bytes -= oi.size;
5802 ctx->delta_stats.num_bytes += op.extent.offset;
5803 oi.size = op.extent.offset;
5804 }
5805 ctx->delta_stats.num_wr++;
5806	      // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
5807
5808 oi.clear_data_digest();
5809 }
5810 break;
5811
5812 case CEPH_OSD_OP_DELETE:
5813 ++ctx->num_write;
5814 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5815 {
5816 result = _delete_oid(ctx, false, ctx->ignore_cache);
5817 }
5818 break;
5819
5820 case CEPH_OSD_OP_WATCH:
5821 ++ctx->num_write;
5822 {
5823 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
5824 op.watch.cookie, op.watch.op);
5825 if (!obs.exists) {
5826 result = -ENOENT;
5827 break;
5828 }
5829 uint64_t cookie = op.watch.cookie;
5830 entity_name_t entity = ctx->reqid.name;
5831 ObjectContextRef obc = ctx->obc;
5832
5833 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
5834 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
5835 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
5836 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
5837 dout(10) << "watch: peer_addr="
5838 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
5839
5840 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
5841 if (op.watch.timeout != 0) {
5842 timeout = op.watch.timeout;
5843 }
5844
5845 watch_info_t w(cookie, timeout,
5846 ctx->op->get_req()->get_connection()->get_peer_addr());
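	// dispatch on the watch sub-op: WATCH/LEGACY_WATCH registers (or
	// re-finds) a watcher, RECONNECT and PING require an existing
	// registration, and UNWATCH tears one down.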
5847 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
5848 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
5849 if (oi.watchers.count(make_pair(cookie, entity))) {
5850 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5851 } else {
5852 dout(10) << " registered new watch " << w << " by " << entity << dendl;
5853 oi.watchers[make_pair(cookie, entity)] = w;
5854	  t->nop(soid);  // make sure we update the object_info on disk!
5855 }
5856 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
5857 ctx->watch_connects.push_back(make_pair(w, will_ping));
5858 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
5859 if (!oi.watchers.count(make_pair(cookie, entity))) {
5860 result = -ENOTCONN;
5861 break;
5862 }
5863 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5864 ctx->watch_connects.push_back(make_pair(w, true));
5865 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
5866 /* Note: WATCH with PING doesn't cause may_write() to return true,
5867 * so if there is nothing else in the transaction, this is going
5868 * to run do_osd_op_effects, but not write out a log entry */
5869 if (!oi.watchers.count(make_pair(cookie, entity))) {
5870 result = -ENOTCONN;
5871 break;
5872 }
5873 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
5874 obc->watchers.find(make_pair(cookie, entity));
5875 if (p == obc->watchers.end() ||
5876 !p->second->is_connected()) {
5877 // client needs to reconnect
5878 result = -ETIMEDOUT;
5879 break;
5880 }
5881 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5882 p->second->got_ping(ceph_clock_now());
5883 result = 0;
5884 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
5885 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
5886 oi.watchers.find(make_pair(cookie, entity));
5887 if (oi_iter != oi.watchers.end()) {
5888 dout(10) << " removed watch " << oi_iter->second << " by "
5889 << entity << dendl;
5890 oi.watchers.erase(oi_iter);
5891 t->nop(soid); // update oi on disk
5892 ctx->watch_disconnects.push_back(
5893 watch_disconnect_t(cookie, entity, false));
5894 } else {
5895 dout(10) << " can't remove: no watch by " << entity << dendl;
5896 }
5897 }
5898 }
5899 break;
5900
5901 case CEPH_OSD_OP_CACHE_PIN:
5902 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
5903 if ((!pool.info.is_tier() ||
5904 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5905 result = -EINVAL;
5906      dout(10) << " cache pinning is only allowed on a cache tier " << dendl;
5907 break;
5908 }
5909 ++ctx->num_write;
5910 {
5911 if (!obs.exists || oi.is_whiteout()) {
5912 result = -ENOENT;
5913 break;
5914 }
5915
5916 if (!oi.is_cache_pinned()) {
5917 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
5918 ctx->modify = true;
5919 ctx->delta_stats.num_objects_pinned++;
5920 ctx->delta_stats.num_wr++;
5921 }
5922 result = 0;
5923 }
5924 break;
5925
5926 case CEPH_OSD_OP_CACHE_UNPIN:
5927 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
5928 if ((!pool.info.is_tier() ||
5929 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5930 result = -EINVAL;
5931      dout(10) << " cache unpinning is only allowed on a cache tier " << dendl;
5932 break;
5933 }
5934 ++ctx->num_write;
5935 {
5936 if (!obs.exists || oi.is_whiteout()) {
5937 result = -ENOENT;
5938 break;
5939 }
5940
5941 if (oi.is_cache_pinned()) {
5942 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
5943 ctx->modify = true;
5944 ctx->delta_stats.num_objects_pinned--;
5945 ctx->delta_stats.num_wr++;
5946 }
5947 result = 0;
5948 }
5949 break;
5950
5951 case CEPH_OSD_OP_SET_REDIRECT:
5952 ++ctx->num_write;
5953 {
5954 if (pool.info.is_tier()) {
5955 result = -EINVAL;
5956 break;
5957 }
5958 if (!obs.exists) {
5959 result = -ENOENT;
5960 break;
5961 }
5962 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5963 result = -EOPNOTSUPP;
5964 break;
5965 }
5966
5967 object_t target_name;
5968 object_locator_t target_oloc;
5969 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
5970 version_t target_version = op.copy_from.src_version;
5971 try {
5972 ::decode(target_name, bp);
5973 ::decode(target_oloc, bp);
5974 }
5975 catch (buffer::error& e) {
5976 result = -EINVAL;
5977 goto fail;
5978 }
5979 pg_t raw_pg;
5980 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
5981 hobject_t target(target_name, target_oloc.key, target_snapid,
5982 raw_pg.ps(), raw_pg.pool(),
5983 target_oloc.nspace);
5984 if (target == soid) {
5985 dout(20) << " set-redirect self is invalid" << dendl;
5986 result = -EINVAL;
5987 break;
5988 }
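	// record the redirect as a manifest: the object becomes a stub
	// whose object_info points at the target, so its local data,
	// omap, and xattrs are all cleared below.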
5989 oi.set_flag(object_info_t::FLAG_MANIFEST);
5990 oi.manifest.redirect_target = target;
5991 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
5992 t->truncate(soid, 0);
5993 if (oi.is_omap() && pool.info.supports_omap()) {
5994 t->omap_clear(soid);
5995 obs.oi.clear_omap_digest();
5996 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
5997 }
5998 ctx->delta_stats.num_bytes -= oi.size;
5999 oi.size = 0;
6000 oi.new_object();
6001 oi.user_version = target_version;
6002 ctx->user_at_version = target_version;
6003 /* rm_attrs */
6004 map<string,bufferlist> rmattrs;
6005 result = getattrs_maybe_cache(ctx->obc,
6006 &rmattrs,
6007 true);
6008 if (result < 0) {
6009 return result;
6010 }
6011 map<string, bufferlist>::iterator iter;
6012 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6013 const string& name = iter->first;
6014 t->rmattr(soid, name);
6015 }
6016 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6017 }
6018
6019 break;
6020
6021 // -- object attrs --
6022
6023 case CEPH_OSD_OP_SETXATTR:
6024 ++ctx->num_write;
6025 {
6026 if (cct->_conf->osd_max_attr_size > 0 &&
6027 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6028 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6029 result = -EFBIG;
6030 break;
6031 }
6032 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6033 cct->_conf->osd_max_attr_name_len);
6034 if (op.xattr.name_len > max_name_len) {
6035 result = -ENAMETOOLONG;
6036 break;
6037 }
6038 maybe_create_new_object(ctx);
6039 string aname;
6040 bp.copy(op.xattr.name_len, aname);
6041 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6042 string name = "_" + aname;
6043 bufferlist bl;
6044 bp.copy(op.xattr.value_len, bl);
6045 t->setattr(soid, name, bl);
6046 ctx->delta_stats.num_wr++;
6047 }
6048 break;
6049
6050 case CEPH_OSD_OP_RMXATTR:
6051 ++ctx->num_write;
6052 {
6053 string aname;
6054 bp.copy(op.xattr.name_len, aname);
6055 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6056 if (!obs.exists || oi.is_whiteout()) {
6057 result = -ENOENT;
6058 break;
6059 }
6060 string name = "_" + aname;
6061 t->rmattr(soid, name);
6062 ctx->delta_stats.num_wr++;
6063 }
6064 break;
6065
6066
6067 // -- fancy writers --
6068 case CEPH_OSD_OP_APPEND:
6069 {
6070 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6071      // just do it inline; this works because we are happy to execute a
6072      // fancy op on replicas as well.
6073 vector<OSDOp> nops(1);
6074 OSDOp& newop = nops[0];
6075 newop.op.op = CEPH_OSD_OP_WRITE;
6076 newop.op.extent.offset = oi.size;
6077 newop.op.extent.length = op.extent.length;
6078 newop.op.extent.truncate_seq = oi.truncate_seq;
6079 newop.indata = osd_op.indata;
6080 result = do_osd_ops(ctx, nops);
6081 osd_op.outdata.claim(newop.outdata);
6082 }
6083 break;
6084
6085 case CEPH_OSD_OP_STARTSYNC:
6086 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6087 t->nop(soid);
6088 break;
6089
6090
6091 // -- trivial map --
6092 case CEPH_OSD_OP_TMAPGET:
6093 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6094 if (pool.info.require_rollback()) {
6095 result = -EOPNOTSUPP;
6096 break;
6097 }
6098 {
6099 vector<OSDOp> nops(1);
6100 OSDOp& newop = nops[0];
6101 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6102 newop.op.extent.offset = 0;
6103 newop.op.extent.length = 0;
6104 do_osd_ops(ctx, nops);
6105 osd_op.outdata.claim(newop.outdata);
6106 }
6107 break;
6108
6109 case CEPH_OSD_OP_TMAPPUT:
6110 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6111 if (pool.info.require_rollback()) {
6112 result = -EOPNOTSUPP;
6113 break;
6114 }
6115 {
6116 //_dout_lock.Lock();
6117 //osd_op.data.hexdump(*_dout);
6118 //_dout_lock.Unlock();
6119
6120 // verify sort order
6121 bool unsorted = false;
6122 if (true) {
6123 bufferlist header;
6124 ::decode(header, bp);
6125 uint32_t n;
6126 ::decode(n, bp);
6127 string last_key;
6128 while (n--) {
6129 string key;
6130 ::decode(key, bp);
6131 dout(10) << "tmapput key " << key << dendl;
6132 bufferlist val;
6133 ::decode(val, bp);
6134 if (key < last_key) {
6135 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6136 unsorted = true;
6137 break;
6138 }
6139 last_key = key;
6140 }
6141 }
6142
6143 // write it
6144 vector<OSDOp> nops(1);
6145 OSDOp& newop = nops[0];
6146 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6147 newop.op.extent.offset = 0;
6148 newop.op.extent.length = osd_op.indata.length();
6149 newop.indata = osd_op.indata;
6150
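	// if the payload was unsorted, decode it into a std::map (which
	// orders by key) and re-encode so the stored tmap is sorted.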
6151 if (unsorted) {
6152 bp = osd_op.indata.begin();
6153 bufferlist header;
6154 map<string, bufferlist> m;
6155 ::decode(header, bp);
6156 ::decode(m, bp);
6157 assert(bp.end());
6158 bufferlist newbl;
6159 ::encode(header, newbl);
6160 ::encode(m, newbl);
6161 newop.indata = newbl;
6162 }
6163 result = do_osd_ops(ctx, nops);
6164 assert(result == 0);
6165 }
6166 break;
6167
6168 case CEPH_OSD_OP_TMAPUP:
6169 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6170 if (pool.info.require_rollback()) {
6171 result = -EOPNOTSUPP;
6172 break;
6173 }
6174 ++ctx->num_write;
6175 result = do_tmapup(ctx, bp, osd_op);
6176 break;
6177
6178 case CEPH_OSD_OP_TMAP2OMAP:
6179 ++ctx->num_write;
6180 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6181 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6182 break;
6183
6184 // OMAP Read ops
6185 case CEPH_OSD_OP_OMAPGETKEYS:
6186 ++ctx->num_read;
6187 {
6188 string start_after;
6189 uint64_t max_return;
6190 try {
6191 ::decode(start_after, bp);
6192 ::decode(max_return, bp);
6193 }
6194 catch (buffer::error& e) {
6195 result = -EINVAL;
6196 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6197 goto fail;
6198 }
6199 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6200 max_return = cct->_conf->osd_max_omap_entries_per_request;
6201 }
6202 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6203
6204 bufferlist bl;
6205 uint32_t num = 0;
6206 bool truncated = false;
6207 if (oi.is_omap()) {
6208 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6209 coll, ghobject_t(soid)
6210 );
6211 assert(iter);
6212 iter->upper_bound(start_after);
6213 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6214 if (num >= max_return ||
6215 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6216 truncated = true;
6217 break;
6218 }
6219 ::encode(iter->key(), bl);
6220 }
6221 } // else return empty out_set
6222 ::encode(num, osd_op.outdata);
6223 osd_op.outdata.claim_append(bl);
6224 ::encode(truncated, osd_op.outdata);
6225 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6226 ctx->delta_stats.num_rd++;
6227 }
6228 break;
6229
6230 case CEPH_OSD_OP_OMAPGETVALS:
6231 ++ctx->num_read;
6232 {
6233 string start_after;
6234 uint64_t max_return;
6235 string filter_prefix;
6236 try {
6237 ::decode(start_after, bp);
6238 ::decode(max_return, bp);
6239 ::decode(filter_prefix, bp);
6240 }
6241 catch (buffer::error& e) {
6242 result = -EINVAL;
6243 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6244 goto fail;
6245 }
6246 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6247 max_return = cct->_conf->osd_max_omap_entries_per_request;
6248 }
6249 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6250
6251 uint32_t num = 0;
6252 bool truncated = false;
6253 bufferlist bl;
6254 if (oi.is_omap()) {
6255 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6256 coll, ghobject_t(soid)
6257 );
6258 if (!iter) {
6259 result = -ENOENT;
6260 goto fail;
6261 }
6262 iter->upper_bound(start_after);
6263 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6264 for (num = 0;
6265 iter->valid() &&
6266 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6267 ++num, iter->next(false)) {
6268 dout(20) << "Found key " << iter->key() << dendl;
6269 if (num >= max_return ||
6270 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6271 truncated = true;
6272 break;
6273 }
6274 ::encode(iter->key(), bl);
6275 ::encode(iter->value(), bl);
6276 }
6277 } // else return empty out_set
6278 ::encode(num, osd_op.outdata);
6279 osd_op.outdata.claim_append(bl);
6280 ::encode(truncated, osd_op.outdata);
6281 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6282 ctx->delta_stats.num_rd++;
6283 }
6284 break;
6285
6286 case CEPH_OSD_OP_OMAPGETHEADER:
6287 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6288 if (!oi.is_omap()) {
6289 // return empty header
6290 break;
6291 }
6292 ++ctx->num_read;
6293 {
6294 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6295 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6296 ctx->delta_stats.num_rd++;
6297 }
6298 break;
6299
6300 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6301 ++ctx->num_read;
6302 {
6303 set<string> keys_to_get;
6304 try {
6305 ::decode(keys_to_get, bp);
6306 }
6307 catch (buffer::error& e) {
6308 result = -EINVAL;
6309 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6310 goto fail;
6311 }
6312 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6313 map<string, bufferlist> out;
6314 if (oi.is_omap()) {
6315 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6316 } // else return empty omap entries
6317 ::encode(out, osd_op.outdata);
6318 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6319 ctx->delta_stats.num_rd++;
6320 }
6321 break;
6322
6323 case CEPH_OSD_OP_OMAP_CMP:
6324 ++ctx->num_read;
6325 {
6326 if (!obs.exists || oi.is_whiteout()) {
6327 result = -ENOENT;
6328 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6329 break;
6330 }
6331 map<string, pair<bufferlist, int> > assertions;
6332 try {
6333 ::decode(assertions, bp);
6334 }
6335 catch (buffer::error& e) {
6336 result = -EINVAL;
6337 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6338 goto fail;
6339 }
6340 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6341
6342 map<string, bufferlist> out;
6343
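	// fetch the stored value for every asserted key (missing keys
	// compare as an empty bufferlist), then apply each EQ/LT/GT
	// assertion; the first failure cancels the op with -ECANCELED.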
6344 if (oi.is_omap()) {
6345 set<string> to_get;
6346 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6347 i != assertions.end();
6348 ++i)
6349 to_get.insert(i->first);
6350 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6351 to_get, &out);
6352 if (r < 0) {
6353 result = r;
6354 break;
6355 }
6356 } // else leave out empty
6357
6358	// Should set num_rd_kb based on the encoded length of the map
6359 ctx->delta_stats.num_rd++;
6360
6361 int r = 0;
6362 bufferlist empty;
6363 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6364 i != assertions.end();
6365 ++i) {
6366 auto out_entry = out.find(i->first);
6367 bufferlist &bl = (out_entry != out.end()) ?
6368 out_entry->second : empty;
6369 switch (i->second.second) {
6370 case CEPH_OSD_CMPXATTR_OP_EQ:
6371 if (!(bl == i->second.first)) {
6372 r = -ECANCELED;
6373 }
6374 break;
6375 case CEPH_OSD_CMPXATTR_OP_LT:
6376 if (!(bl < i->second.first)) {
6377 r = -ECANCELED;
6378 }
6379 break;
6380 case CEPH_OSD_CMPXATTR_OP_GT:
6381 if (!(bl > i->second.first)) {
6382 r = -ECANCELED;
6383 }
6384 break;
6385 default:
6386 r = -EINVAL;
6387 break;
6388 }
6389 if (r < 0)
6390 break;
6391 }
6392 if (r < 0) {
6393 result = r;
6394 }
6395 }
6396 break;
6397
6398 // OMAP Write ops
6399 case CEPH_OSD_OP_OMAPSETVALS:
6400 if (!pool.info.supports_omap()) {
6401 result = -EOPNOTSUPP;
6402 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6403 break;
6404 }
6405 ++ctx->num_write;
6406 {
6407 maybe_create_new_object(ctx);
6408 bufferlist to_set_bl;
6409 try {
6410 decode_str_str_map_to_bl(bp, &to_set_bl);
6411 }
6412 catch (buffer::error& e) {
6413 result = -EINVAL;
6414 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6415 goto fail;
6416 }
6417 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6418 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6419 dout(20) << "setting vals: " << dendl;
6420 map<string,bufferlist> to_set;
6421 bufferlist::iterator pt = to_set_bl.begin();
6422 ::decode(to_set, pt);
6423 for (map<string, bufferlist>::iterator i = to_set.begin();
6424 i != to_set.end();
6425 ++i) {
6426 dout(20) << "\t" << i->first << dendl;
6427 }
6428 }
6429 t->omap_setkeys(soid, to_set_bl);
6430 ctx->delta_stats.num_wr++;
6431 }
6432 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6433 obs.oi.clear_omap_digest();
6434 break;
6435
6436 case CEPH_OSD_OP_OMAPSETHEADER:
6437 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6438 if (!pool.info.supports_omap()) {
6439 result = -EOPNOTSUPP;
6440 break;
6441 }
6442 ++ctx->num_write;
6443 {
6444 maybe_create_new_object(ctx);
6445 t->omap_setheader(soid, osd_op.indata);
6446 ctx->delta_stats.num_wr++;
6447 }
6448 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6449 obs.oi.clear_omap_digest();
6450 break;
6451
6452 case CEPH_OSD_OP_OMAPCLEAR:
6453 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6454 if (!pool.info.supports_omap()) {
6455 result = -EOPNOTSUPP;
6456 break;
6457 }
6458 ++ctx->num_write;
6459 {
6460 if (!obs.exists || oi.is_whiteout()) {
6461 result = -ENOENT;
6462 break;
6463 }
6464 if (oi.is_omap()) {
6465 t->omap_clear(soid);
6466 ctx->delta_stats.num_wr++;
6467 obs.oi.clear_omap_digest();
6468 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6469 }
6470 }
6471 break;
6472
6473 case CEPH_OSD_OP_OMAPRMKEYS:
6474 if (!pool.info.supports_omap()) {
6475 result = -EOPNOTSUPP;
6476 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6477 break;
6478 }
6479 ++ctx->num_write;
6480 {
6481 if (!obs.exists || oi.is_whiteout()) {
6482 result = -ENOENT;
6483 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6484 break;
6485 }
6486 bufferlist to_rm_bl;
6487 try {
6488 decode_str_set_to_bl(bp, &to_rm_bl);
6489 }
6490 catch (buffer::error& e) {
6491 result = -EINVAL;
6492 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6493 goto fail;
6494 }
6495 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6496 t->omap_rmkeys(soid, to_rm_bl);
6497 ctx->delta_stats.num_wr++;
6498 }
6499 obs.oi.clear_omap_digest();
6500 break;
6501
6502 case CEPH_OSD_OP_COPY_GET:
6503 ++ctx->num_read;
6504 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), soid.snap.val);
6505 result = fill_in_copy_get(ctx, bp, osd_op, ctx->obc);
6506 break;
6507
6508 case CEPH_OSD_OP_COPY_FROM:
6509 ++ctx->num_write;
6510 {
6511 object_t src_name;
6512 object_locator_t src_oloc;
6513 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6514 version_t src_version = op.copy_from.src_version;
6515 try {
6516 ::decode(src_name, bp);
6517 ::decode(src_oloc, bp);
6518 }
6519 catch (buffer::error& e) {
6520 result = -EINVAL;
6521 tracepoint(osd,
6522 do_osd_op_pre_copy_from,
6523 soid.oid.name.c_str(),
6524 soid.snap.val,
6525 "???",
6526 0,
6527 "???",
6528 "???",
6529 0,
6530 src_snapid,
6531 src_version);
6532 goto fail;
6533 }
6534 tracepoint(osd,
6535 do_osd_op_pre_copy_from,
6536 soid.oid.name.c_str(),
6537 soid.snap.val,
6538 src_name.name.c_str(),
6539 src_oloc.pool,
6540 src_oloc.key.c_str(),
6541 src_oloc.nspace.c_str(),
6542 src_oloc.hash,
6543 src_snapid,
6544 src_version);
6545 if (!ctx->copy_cb) {
6546 // start
6547 pg_t raw_pg;
6548 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6549 hobject_t src(src_name, src_oloc.key, src_snapid,
6550 raw_pg.ps(), raw_pg.pool(),
6551 src_oloc.nspace);
6552 if (src == soid) {
6553 dout(20) << " copy from self is invalid" << dendl;
6554 result = -EINVAL;
6555 break;
6556 }
6557 CopyFromCallback *cb = new CopyFromCallback(ctx);
6558 ctx->copy_cb = cb;
6559 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6560 op.copy_from.flags,
6561 false,
6562 op.copy_from.src_fadvise_flags,
6563 op.flags);
6564 result = -EINPROGRESS;
6565 } else {
6566 // finish
6567 assert(ctx->copy_cb->get_result() >= 0);
6568 finish_copyfrom(ctx);
6569 result = 0;
6570 }
6571 }
6572 break;
6573
6574 default:
6575 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6576 dout(1) << "unrecognized osd op " << op.op
6577 << " " << ceph_osd_op_name(op.op)
6578 << dendl;
6579 result = -EOPNOTSUPP;
6580 }
6581
6582 fail:
6583 osd_op.rval = result;
6584 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6585 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6586 result = 0;
6587
6588 if (result < 0)
6589 break;
6590 }
6591 return result;
6592}
6593
6594int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6595{
6596 if (ctx->new_obs.oi.size == 0) {
6597 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6598 return -ENODATA;
6599 }
6600 vector<OSDOp> nops(1);
6601 OSDOp &newop = nops[0];
6602 newop.op.op = CEPH_OSD_OP_TMAPGET;
6603 do_osd_ops(ctx, nops);
6604 try {
6605 bufferlist::iterator i = newop.outdata.begin();
6606 ::decode(*header, i);
6607 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6608 } catch (...) {
6609    dout(20) << "failed to decode tmap for " << ctx->new_obs.oi.soid
6610	     << dendl;
6611 return -EINVAL;
6612 }
6613  dout(20) << "decoded tmap for " << ctx->new_obs.oi.soid
6614	   << dendl;
6615 return 0;
6616}
6617
6618int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6619 const SnapSet& ss)
6620{
6621 // verify that all clones have been evicted
6622 dout(20) << __func__ << " verifying clones are absent "
6623 << ss << dendl;
6624 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6625 p != ss.clones.end();
6626 ++p) {
6627 hobject_t clone_oid = soid;
6628 clone_oid.snap = *p;
6629 if (is_missing_object(clone_oid))
6630 return -EBUSY;
6631 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6632 if (clone_obc && clone_obc->obs.exists) {
6633 dout(10) << __func__ << " cannot evict head before clone "
6634 << clone_oid << dendl;
6635 return -EBUSY;
6636 }
6637 if (copy_ops.count(clone_oid)) {
6638 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6639 << clone_oid << dendl;
6640 return -EBUSY;
6641 }
6642 }
6643 return 0;
6644}
6645
6646inline int PrimaryLogPG::_delete_oid(
6647 OpContext *ctx,
6648 bool no_whiteout, // no whiteouts, no matter what.
6649 bool try_no_whiteout) // try not to whiteout
6650{
6651 SnapSet& snapset = ctx->new_snapset;
6652 ObjectState& obs = ctx->new_obs;
6653 object_info_t& oi = obs.oi;
6654 const hobject_t& soid = oi.soid;
6655 PGTransaction* t = ctx->op_t.get();
6656
6657  // cache: set whiteout on delete?
6658 bool whiteout = false;
6659 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6660 && !no_whiteout
6661 && !try_no_whiteout) {
6662 whiteout = true;
6663 }
6664 bool legacy;
6665  if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6666 legacy = false;
6667 // in luminous or later, we can't delete the head if there are
6668 // clones. we trust the caller passing no_whiteout has already
6669 // verified they don't exist.
6670 if (!snapset.clones.empty() ||
6671 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6672 if (no_whiteout) {
6673 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6674 << dendl;
6675 } else {
6676 dout(20) << __func__ << " has or will have clones; will whiteout"
6677 << dendl;
6678 whiteout = true;
6679 }
6680 }
6681 } else {
6682    legacy = true;
6683 }
6684 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6685 << " no_whiteout=" << (int)no_whiteout
6686 << " try_no_whiteout=" << (int)try_no_whiteout
6687 << dendl;
6688 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6689 return -ENOENT;
6690
6691 t->remove(soid);
6692
6693 if (oi.size > 0) {
6694 interval_set<uint64_t> ch;
6695 ch.insert(0, oi.size);
6696 ctx->modified_ranges.union_of(ch);
6697 }
6698
6699 ctx->delta_stats.num_wr++;
6700 if (soid.is_snap()) {
6701 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6702 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6703 } else {
6704 ctx->delta_stats.num_bytes -= oi.size;
6705 }
6706 oi.size = 0;
6707 oi.new_object();
6708
6709 // disconnect all watchers
6710 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6711 oi.watchers.begin();
6712 p != oi.watchers.end();
6713 ++p) {
6714 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6715 ctx->watch_disconnects.push_back(
6716 watch_disconnect_t(p->first.first, p->first.second, true));
6717 }
6718 oi.watchers.clear();
6719
6720 if (whiteout) {
6721 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6722 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6723 ctx->delta_stats.num_whiteouts++;
6724 t->create(soid);
6725 osd->logger->inc(l_osd_tier_whiteout);
6726 return 0;
6727 }
6728
6729 // delete the head
6730 ctx->delta_stats.num_objects--;
6731 if (soid.is_snap())
6732 ctx->delta_stats.num_object_clones--;
6733 if (oi.is_whiteout()) {
6734 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6735 ctx->delta_stats.num_whiteouts--;
6736 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6737 }
6738 if (oi.is_cache_pinned()) {
6739 ctx->delta_stats.num_objects_pinned--;
6740 }
6741 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6742 snapset.head_exists = false;
6743 }
6744 obs.exists = false;
6745 return 0;
6746}
6747
6748int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6749{
6750 SnapSet& snapset = ctx->new_snapset;
6751 ObjectState& obs = ctx->new_obs;
6752 object_info_t& oi = obs.oi;
6753 const hobject_t& soid = oi.soid;
6754 PGTransaction* t = ctx->op_t.get();
6755 snapid_t snapid = (uint64_t)op.snap.snapid;
6756 hobject_t missing_oid;
6757
6758 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6759
6760 ObjectContextRef rollback_to;
6761 int ret = find_object_context(
6762 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6763 soid.get_namespace()),
6764 &rollback_to, false, false, &missing_oid);
6765 if (ret == -EAGAIN) {
6766 /* clone must be missing */
6767 assert(is_missing_object(missing_oid));
6768 dout(20) << "_rollback_to attempted to roll back to a missing object "
6769	     << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6770 block_write_on_degraded_snap(missing_oid, ctx->op);
6771 return ret;
6772 }
6773 {
6774 ObjectContextRef promote_obc;
6775 cache_result_t tier_mode_result;
6776 if (obs.exists && obs.oi.has_manifest()) {
6777 tier_mode_result =
6778 maybe_handle_manifest_detail(
6779 ctx->op,
6780 true,
6781 rollback_to);
6782 } else {
6783 tier_mode_result =
6784 maybe_handle_cache_detail(
6785 ctx->op,
6786 true,
6787 rollback_to,
6788 ret,
6789 missing_oid,
6790 true,
6791 false,
6792 &promote_obc);
6793 }
6794 switch (tier_mode_result) {
6795 case cache_result_t::NOOP:
6796 break;
6797 case cache_result_t::BLOCKED_PROMOTE:
6798 assert(promote_obc);
6799 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6800 return -EAGAIN;
6801 case cache_result_t::BLOCKED_FULL:
6802 block_write_on_full_cache(soid, ctx->op);
6803 return -EAGAIN;
6804 default:
6805 assert(0 == "must promote was set, other values are not valid");
6806 return -EAGAIN;
6807 }
6808 }
6809
6810 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6811 // there's no snapshot here, or there's no object.
6812 // if there's no snapshot, we delete the object; otherwise, do nothing.
6813 dout(20) << "_rollback_to deleting head on " << soid.oid
6814 << " because got ENOENT|whiteout on find_object_context" << dendl;
6815 if (ctx->obc->obs.oi.watchers.size()) {
6816 // Cannot delete an object with watchers
6817 ret = -EBUSY;
6818 } else {
6819 _delete_oid(ctx, false, false);
6820 ret = 0;
6821 }
6822 } else if (ret) {
6823 // ummm....huh? It *can't* return anything else at time of writing.
6824 assert(0 == "unexpected error code in _rollback_to");
6825 } else { //we got our context, let's use it to do the rollback!
6826 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
6827 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
6828 dout(20) << "_rollback_to attempted to roll back to a degraded object "
6829	       << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
6830 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
6831 ret = -EAGAIN;
6832 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
6833 // rolling back to the head; we just need to clone it.
6834 ctx->modify = true;
6835 } else {
6836 /* 1) Delete current head
6837 * 2) Clone correct snapshot into head
6838 * 3) Calculate clone_overlaps by following overlaps
6839 * forward from rollback snapshot */
6840 dout(10) << "_rollback_to deleting " << soid.oid
6841 << " and rolling back to old snap" << dendl;
6842
6843 if (obs.exists) {
6844 t->remove(soid);
6845 }
6846 t->clone(soid, rollback_to_sobject);
6847 snapset.head_exists = true;
6848 t->add_obc(rollback_to);
6849
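	      // clone_overlap[n] records what clone n shares with the next
	      // newer object; intersecting the entries from the rollback
	      // target forward to head yields the regions left untouched,
	      // so their complement is what this rollback modifies.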
6850	      map<snapid_t, interval_set<uint64_t> >::iterator iter =
6851		snapset.clone_overlap.lower_bound(snapid);
6852	      assert(iter != snapset.clone_overlap.end());
6853	      interval_set<uint64_t> overlaps = iter->second;
6854 for ( ;
6855 iter != snapset.clone_overlap.end();
6856 ++iter)
6857 overlaps.intersection_of(iter->second);
6858
6859 if (obs.oi.size > 0) {
6860 interval_set<uint64_t> modified;
6861 modified.insert(0, obs.oi.size);
6862 overlaps.intersection_of(modified);
6863 modified.subtract(overlaps);
6864 ctx->modified_ranges.union_of(modified);
6865 }
6866
6867 // Adjust the cached objectcontext
6868 maybe_create_new_object(ctx, true);
6869 ctx->delta_stats.num_bytes -= obs.oi.size;
6870 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
6871 obs.oi.size = rollback_to->obs.oi.size;
6872 if (rollback_to->obs.oi.is_data_digest())
6873 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
6874 else
6875 obs.oi.clear_data_digest();
6876 if (rollback_to->obs.oi.is_omap_digest())
6877 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
6878 else
6879 obs.oi.clear_omap_digest();
6880
6881 if (rollback_to->obs.oi.is_omap()) {
6882 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
6883 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6884 } else {
6885 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
6886 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6887 }
6888
6889 snapset.head_exists = true;
6890 }
6891 }
6892 return ret;
6893}
6894
6895void PrimaryLogPG::_make_clone(
6896 OpContext *ctx,
6897 PGTransaction* t,
6898 ObjectContextRef obc,
6899 const hobject_t& head, const hobject_t& coid,
6900 object_info_t *poi)
6901{
6902 bufferlist bv;
6903 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
6904
6905 t->clone(coid, head);
6906 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
6907 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
6908}
6909
6910void PrimaryLogPG::make_writeable(OpContext *ctx)
6911{
6912 const hobject_t& soid = ctx->obs->oi.soid;
6913 SnapContext& snapc = ctx->snapc;
6914
6915 // clone?
6916 assert(soid.snap == CEPH_NOSNAP);
6917 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
6918 << " snapc=" << snapc << dendl;
6919
6920 bool was_dirty = ctx->obc->obs.oi.is_dirty();
6921 if (ctx->new_obs.exists) {
6922 // we will mark the object dirty
6923 if (ctx->undirty && was_dirty) {
6924 dout(20) << " clearing DIRTY flag" << dendl;
6925 assert(ctx->new_obs.oi.is_dirty());
6926 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6927 --ctx->delta_stats.num_objects_dirty;
6928 osd->logger->inc(l_osd_tier_clean);
6929 } else if (!was_dirty && !ctx->undirty) {
6930 dout(20) << " setting DIRTY flag" << dendl;
6931 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
6932 ++ctx->delta_stats.num_objects_dirty;
6933 osd->logger->inc(l_osd_tier_dirty);
6934 }
6935 } else {
6936 if (was_dirty) {
6937 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
6938 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6939 --ctx->delta_stats.num_objects_dirty;
6940 }
6941 }
6942
6943 if ((ctx->new_obs.exists &&
6944 ctx->new_obs.oi.is_omap()) &&
6945 (!ctx->obc->obs.exists ||
6946 !ctx->obc->obs.oi.is_omap())) {
6947 ++ctx->delta_stats.num_objects_omap;
6948 }
6949 if ((!ctx->new_obs.exists ||
6950 !ctx->new_obs.oi.is_omap()) &&
6951 (ctx->obc->obs.exists &&
6952 ctx->obc->obs.oi.is_omap())) {
6953 --ctx->delta_stats.num_objects_omap;
6954 }
6955
6956 // use newer snapc?
6957 if (ctx->new_snapset.seq > snapc.seq) {
6958 snapc.seq = ctx->new_snapset.seq;
6959 snapc.snaps = ctx->new_snapset.snaps;
6960 filter_snapc(snapc.snaps);
6961 dout(10) << " using newer snapc " << snapc << dendl;
6962 }
6963
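  // a clone is needed only when the head predates the newest snap:
  // snapc.snaps is ordered newest-first, so snapc.snaps[0] > snapset.seq
  // means at least one snapshot was taken since the head was last written.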
6964 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
6965 snapc.snaps.size() && // there are snaps
6966 !ctx->cache_evict &&
6967 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
6968 // clone
6969 hobject_t coid = soid;
6970 coid.snap = snapc.seq;
6971
6972 unsigned l;
6973 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
6974
6975 vector<snapid_t> snaps(l);
6976 for (unsigned i=0; i<l; i++)
6977 snaps[i] = snapc.snaps[i];
6978
6979 // prepare clone
6980 object_info_t static_snap_oi(coid);
6981 object_info_t *snap_oi;
6982 if (is_primary()) {
6983 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
6984 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
6985 ctx->clone_obc->obs.oi = static_snap_oi;
6986 ctx->clone_obc->obs.exists = true;
6987 ctx->clone_obc->ssc = ctx->obc->ssc;
6988 ctx->clone_obc->ssc->ref++;
6989 if (pool.info.require_rollback())
6990 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
6991 snap_oi = &ctx->clone_obc->obs.oi;
6992 bool got = ctx->lock_manager.get_write_greedy(
6993 coid,
6994 ctx->clone_obc,
6995 ctx->op);
6996 assert(got);
6997 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
6998 } else {
6999 snap_oi = &static_snap_oi;
7000 }
7001 snap_oi->version = ctx->at_version;
7002 snap_oi->prior_version = ctx->obs->oi.version;
7003 snap_oi->copy_user_bits(ctx->obs->oi);
7004
7005 bool legacy = ctx->new_snapset.is_legacy() ||
7006      get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7007 if (legacy) {
7008 snap_oi->legacy_snaps = snaps;
7009 }
7010
7011 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7012
7013 ctx->delta_stats.num_objects++;
7014 if (snap_oi->is_dirty()) {
7015 ctx->delta_stats.num_objects_dirty++;
7016 osd->logger->inc(l_osd_tier_dirty);
7017 }
7018 if (snap_oi->is_omap())
7019 ctx->delta_stats.num_objects_omap++;
7020 if (snap_oi->is_cache_pinned())
7021 ctx->delta_stats.num_objects_pinned++;
7022 ctx->delta_stats.num_object_clones++;
7023 ctx->new_snapset.clones.push_back(coid.snap);
7024 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7025 if (!legacy) {
7026 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7027 }
7028
7029 // clone_overlap should contain an entry for each clone
7030 // (an empty interval_set if there is no overlap)
7031 ctx->new_snapset.clone_overlap[coid.snap];
7032 if (ctx->obs->oi.size)
7033 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7034
7035 // log clone
7036 dout(10) << " cloning v " << ctx->obs->oi.version
7037 << " to " << coid << " v " << ctx->at_version
7038 << " snaps=" << snaps
7039 << " snapset=" << ctx->new_snapset << dendl;
7040 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7041 ctx->obs->oi.version,
7042 ctx->obs->oi.user_version,
7043 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7044 ::encode(snaps, ctx->log.back().snaps);
7045
7046 ctx->at_version.version++;
7047 }
7048
7049 // update most recent clone_overlap and usage stats
7050 if (ctx->new_snapset.clones.size() > 0) {
7051    /* we need to check whether the most recent clone exists; if it has
7052     * been evicted, it's not included in the stats */
7053 hobject_t last_clone_oid = soid;
7054 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7055 if (is_present_clone(last_clone_oid)) {
7056 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7057 ctx->modified_ranges.intersection_of(newest_overlap);
7058 // modified_ranges is still in use by the clone
7059 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7060 newest_overlap.subtract(ctx->modified_ranges);
7061 }
7062 }
7063
7064 // update snapset with latest snap context
7065 ctx->new_snapset.seq = snapc.seq;
7066 ctx->new_snapset.snaps = snapc.snaps;
7067  if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7068 // pessimistic assumption that this is a net-new legacy SnapSet
7069 ctx->delta_stats.num_legacy_snapsets++;
7070 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7071 } else if (ctx->new_snapset.is_legacy()) {
7072 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7073 }
7074 dout(20) << "make_writeable " << soid
7075 << " done, snapset=" << ctx->new_snapset << dendl;
7076}
7077
7078
7079void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7080 interval_set<uint64_t>& modified, uint64_t offset,
7081 uint64_t length, bool write_full)
7082{
7083 interval_set<uint64_t> ch;
7084 if (write_full) {
7085 if (oi.size)
7086 ch.insert(0, oi.size);
7087 } else if (length)
7088 ch.insert(offset, length);
7089 modified.union_of(ch);
7090 if (write_full || offset + length > oi.size) {
7091 uint64_t new_size = offset + length;
7092 delta_stats.num_bytes -= oi.size;
7093 delta_stats.num_bytes += new_size;
7094 oi.size = new_size;
7095 }
7096 delta_stats.num_wr++;
7097 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7098}
7099
7100void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7101{
7102 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7103 delta_stats.num_bytes += p.get_len();
7104 }
7105}
7106
7107void PrimaryLogPG::complete_disconnect_watches(
7108 ObjectContextRef obc,
7109 const list<watch_disconnect_t> &to_disconnect)
7110{
7111 for (list<watch_disconnect_t>::const_iterator i =
7112 to_disconnect.begin();
7113 i != to_disconnect.end();
7114 ++i) {
7115 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7116 auto watchers_entry = obc->watchers.find(watcher);
7117 if (watchers_entry != obc->watchers.end()) {
7118 WatchRef watch = watchers_entry->second;
7119 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7120 obc->watchers.erase(watcher);
7121 watch->remove(i->send_disconnect);
7122 } else {
7123 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7124 << watcher << dendl;
7125 }
7126 }
7127}
7128
7129void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7130{
7131 entity_name_t entity = ctx->reqid.name;
7132 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7133
7134 // disconnects first
7135 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7136
7137 assert(conn);
7138
7139 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7140 if (!session.get())
7141 return;
7142 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7143
7144 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7145 i != ctx->watch_connects.end();
7146 ++i) {
7147 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7148 dout(15) << "do_osd_op_effects applying watch connect on session "
7149 << session.get() << " watcher " << watcher << dendl;
7150 WatchRef watch;
7151 if (ctx->obc->watchers.count(watcher)) {
7152 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7153 << dendl;
7154 watch = ctx->obc->watchers[watcher];
7155 } else {
7156 dout(15) << "do_osd_op_effects new watcher " << watcher
7157 << dendl;
7158 watch = Watch::makeWatchRef(
7159 this, osd, ctx->obc, i->first.timeout_seconds,
7160 i->first.cookie, entity, conn->get_peer_addr());
7161 ctx->obc->watchers.insert(
7162 make_pair(
7163 watcher,
7164 watch));
7165 }
7166 watch->connect(conn, i->second);
7167 }
7168
7169 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7170 p != ctx->notifies.end();
7171 ++p) {
7172 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7173 ConnectionRef conn(ctx->op->get_req()->get_connection());
7174 NotifyRef notif(
7175 Notify::makeNotifyRef(
7176 conn,
7177 ctx->reqid.name.num(),
7178 p->bl,
7179 p->timeout,
7180 p->cookie,
7181 p->notify_id,
7182 ctx->obc->obs.oi.user_version,
7183 osd));
7184 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7185 ctx->obc->watchers.begin();
7186 i != ctx->obc->watchers.end();
7187 ++i) {
7188 dout(10) << "starting notify on watch " << i->first << dendl;
7189 i->second->start_notify(notif);
7190 }
7191 notif->init();
7192 }
7193
7194 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7195 p != ctx->notify_acks.end();
7196 ++p) {
7197 if (p->watch_cookie)
7198 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7199 else
7200 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7201 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7202 ctx->obc->watchers.begin();
7203 i != ctx->obc->watchers.end();
7204 ++i) {
7205 if (i->first.second != entity) continue;
7206 if (p->watch_cookie &&
7207 p->watch_cookie.get() != i->first.first) continue;
7208 dout(10) << "acking notify on watch " << i->first << dendl;
7209 i->second->notify_ack(p->notify_id, p->reply_bl);
7210 }
7211 }
7212}
7213
7214hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7215{
7216 ostringstream ss;
7217 ss << "temp_" << info.pgid << "_" << get_role()
7218 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7219 hobject_t hoid = target.make_temp_hobject(ss.str());
7220 dout(20) << __func__ << " " << hoid << dendl;
7221 return hoid;
7222}
7223
7224hobject_t PrimaryLogPG::get_temp_recovery_object(
7225 const hobject_t& target,
7226 eversion_t version)
7227{
7228 ostringstream ss;
7229 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7230 << "_" << version
7231 << "_" << info.history.same_interval_since
7232 << "_" << target.snap;
7233 // pgid + version + interval + snapid is unique, and short
7234 hobject_t hoid = target.make_temp_hobject(ss.str());
7235 dout(20) << __func__ << " " << hoid << dendl;
7236 return hoid;
7237}
7238
7239int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7240{
7241 assert(!ctx->ops.empty());
7242
7243 const hobject_t& soid = ctx->obs->oi.soid;
7244
7245 // valid snap context?
7246 if (!ctx->snapc.is_valid()) {
7247 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7248 return -EINVAL;
7249 }
7250
7251 // prepare the actual mutation
7252 int result = do_osd_ops(ctx, ctx->ops);
7253 if (result < 0) {
7254 if (ctx->op->may_write() &&
7255	get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7256 // need to save the error code in the pg log, to detect dup ops,
7257 // but do nothing else
7258 ctx->update_log_only = true;
7259 }
7260 return result;
7261 }
7262
7263 // read-op? write-op noop? done?
7264 if (ctx->op_t->empty() && !ctx->modify) {
7265 unstable_stats.add(ctx->delta_stats);
7266 if (ctx->op->may_write() &&
7267	get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7268 ctx->update_log_only = true;
7269 }
7270 return result;
7271 }
7272
7273 // check for full
7274 if ((ctx->delta_stats.num_bytes > 0 ||
7275 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7276 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7277 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7278 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7279 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7280 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7281 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7282 << dendl;
7283 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7284 // they tried, they failed.
7285 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7286 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7287 } else {
7288 // drop request
7289 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7290 return -EAGAIN;
7291 }
7292 }
7293
7294 // clone, if necessary
7295 if (soid.snap == CEPH_NOSNAP)
7296 make_writeable(ctx);
7297
7298 finish_ctx(ctx,
7299 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7300 pg_log_entry_t::DELETE);
7301
7302 return result;
7303}
7304
7305void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7306{
7307 const hobject_t& soid = ctx->obs->oi.soid;
7308 dout(20) << __func__ << " " << soid << " " << ctx
7309 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7310 << dendl;
7311 utime_t now = ceph_clock_now();
7312
7313 // snapset
7314 bufferlist bss;
7315
7316 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7317 ::encode(ctx->new_snapset, bss);
7318 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7319 !ctx->new_snapset.is_legacy());
7320
7321 if (ctx->new_obs.exists) {
7322 if (!ctx->obs->exists) {
7323 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7324 hobject_t snapoid = soid.get_snapdir();
7325 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7326 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7327 ctx->at_version,
7328 ctx->snapset_obc->obs.oi.version,
7329 0, osd_reqid_t(), ctx->mtime, 0));
7330 ctx->op_t->remove(snapoid);
7331
7332 ctx->at_version.version++;
7333
7334 ctx->snapset_obc->obs.exists = false;
7335 }
7336 }
7337 } else if (!ctx->new_snapset.clones.empty() &&
7338 !ctx->cache_evict &&
7339 !ctx->new_snapset.head_exists &&
7340 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7341 // save snapset on _snap
7342 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7343 info.pgid.pool(), soid.get_namespace());
7344 dout(10) << " final snapset " << ctx->new_snapset
7345 << " in " << snapoid << dendl;
7346 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7347 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7348 ctx->at_version,
7349 eversion_t(),
7350 0, osd_reqid_t(), ctx->mtime, 0));
7351
7352 if (!ctx->snapset_obc)
7353 ctx->snapset_obc = get_object_context(snapoid, true);
7354 bool got = false;
7355 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7356 got = ctx->lock_manager.get_write_greedy(
7357 snapoid,
7358 ctx->snapset_obc,
7359 ctx->op);
7360 } else {
7361 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7362 got = ctx->lock_manager.get_lock_type(
7363 ObjectContext::RWState::RWEXCL,
7364 snapoid,
7365 ctx->snapset_obc,
7366 ctx->op);
7367 }
7368 assert(got);
7369 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7370 ctx->snapset_obc->obs.exists = true;
7371 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7372 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7373 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7374 ctx->snapset_obc->obs.oi.local_mtime = now;
7375
7376 map<string, bufferlist> attrs;
7377 bufferlist bv(sizeof(ctx->new_obs.oi));
7378 ::encode(ctx->snapset_obc->obs.oi, bv,
7379 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7380 ctx->op_t->create(snapoid);
7381 attrs[OI_ATTR].claim(bv);
7382 attrs[SS_ATTR].claim(bss);
7383 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7384 ctx->at_version.version++;
7385 }
7386 }
7387
7388 // finish and log the op.
7389 if (ctx->user_modify) {
7390 // update the user_version for any modify ops, except for the watch op
7391 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7392 /* In order for new clients and old clients to interoperate properly
7393 * when exchanging versions, we need to lower bound the user_version
7394 * (which our new clients pay proper attention to)
7395 * by the at_version (which is all the old clients can ever see). */
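    // e.g. (hypothetical numbers) last_user_version=7, oi.user_version=5,
    // at_version.version=12: user_at_version becomes max(7,5)+1 = 8 and is
    // then raised to 12, so an old client can never observe a newer version
    // than a new client does.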
7396 if (ctx->at_version.version > ctx->user_at_version)
7397 ctx->user_at_version = ctx->at_version.version;
7398 ctx->new_obs.oi.user_version = ctx->user_at_version;
7399 }
7400 ctx->bytes_written = ctx->op_t->get_bytes_written();
7401
7402 if (ctx->new_obs.exists) {
7403 // on the head object
7404 ctx->new_obs.oi.version = ctx->at_version;
7405 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7406 ctx->new_obs.oi.last_reqid = ctx->reqid;
7407 if (ctx->mtime != utime_t()) {
7408 ctx->new_obs.oi.mtime = ctx->mtime;
7409 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7410 ctx->new_obs.oi.local_mtime = now;
7411 } else {
7412 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7413 }
7414
7415 map <string, bufferlist> attrs;
7416 bufferlist bv(sizeof(ctx->new_obs.oi));
7417 ::encode(ctx->new_obs.oi, bv,
7418 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7419 attrs[OI_ATTR].claim(bv);
7420
7421 if (soid.snap == CEPH_NOSNAP) {
7422 dout(10) << " final snapset " << ctx->new_snapset
7423 << " in " << soid << dendl;
7424 attrs[SS_ATTR].claim(bss);
7425 } else {
7426 dout(10) << " no snapset (this is a clone)" << dendl;
7427 }
7428 ctx->op_t->setattrs(soid, attrs);
7429 } else {
7430 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7431 }
7432
7433 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7434 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7435
7436 // append to log
7437 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7438 ctx->obs->oi.version,
7439 ctx->user_at_version, ctx->reqid,
7440 ctx->mtime, 0));
7441 if (soid.snap < CEPH_NOSNAP) {
7442 switch (log_op_type) {
7443 case pg_log_entry_t::MODIFY:
7444 case pg_log_entry_t::PROMOTE:
7445 case pg_log_entry_t::CLEAN:
7446 if (legacy_snapset) {
7447 dout(20) << __func__ << " encoding legacy_snaps "
7448 << ctx->new_obs.oi.legacy_snaps
7449 << dendl;
7450 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7451 } else {
7452 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7453 << dendl;
7454 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7455 }
7456 break;
7457 default:
7458 break;
7459 }
7460 }
7461
7462 if (!ctx->extra_reqids.empty()) {
7463 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7464 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7465 }
7466
7467 // apply new object state.
7468 ctx->obc->obs = ctx->new_obs;
7469
7470 if (soid.is_head() && !ctx->obc->obs.exists &&
7471 (!maintain_ssc || ctx->cache_evict)) {
7472 ctx->obc->ssc->exists = false;
7473 ctx->obc->ssc->snapset = SnapSet();
7474 } else {
7475 ctx->obc->ssc->exists = true;
7476 ctx->obc->ssc->snapset = ctx->new_snapset;
7477 }
7478}
7479
7480void PrimaryLogPG::apply_stats(
7481 const hobject_t &soid,
7482 const object_stat_sum_t &delta_stats) {
7483
7484 info.stats.stats.add(delta_stats);
7485
7486 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7487 i != backfill_targets.end();
7488 ++i) {
7489 pg_shard_t bt = *i;
7490 pg_info_t& pinfo = peer_info[bt];
7491 if (soid <= pinfo.last_backfill)
7492 pinfo.stats.stats.add(delta_stats);
7493 else if (soid <= last_backfill_started)
7494 pending_backfill_updates[soid].stats.add(delta_stats);
7495 }
7496
7497 if (is_primary() && scrubber.active) {
7498 if (soid < scrubber.start) {
7499 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7500 << "," << scrubber.end << ")" << dendl;
7501 scrub_cstat.add(delta_stats);
7502 } else {
7503 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7504 << "," << scrubber.end << ")" << dendl;
7505 }
7506 }
7507}
7508
7509void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7510{
7511 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7512 assert(ctx->async_reads_complete());
7513
7514 for (vector<OSDOp>::iterator p = ctx->ops.begin();
7515 p != ctx->ops.end() && result >= 0; ++p) {
7516 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7517 result = p->rval;
7518 break;
7519 }
7520 ctx->bytes_read += p->outdata.length();
7521 }
7522 ctx->reply->claim_op_out_data(ctx->ops);
7523 ctx->reply->get_header().data_off = ctx->data_off;
7524
7525 MOSDOpReply *reply = ctx->reply;
7526 ctx->reply = nullptr;
7527
7528 if (result >= 0) {
7529 if (!ctx->ignore_log_op_stats) {
7530 log_op_stats(ctx);
7531 publish_stats_to_osd();
7532 }
7533
7534 // on read, return the current object version
7535 if (ctx->obs) {
7536 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7537 } else {
7538 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7539 }
7540 } else if (result == -ENOENT) {
7541 // on ENOENT, set a floor for what the next user version will be.
7542 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7543 }
7544
7545 reply->set_result(result);
7546 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7547 osd->send_message_osd_client(reply, m->get_connection());
7548 close_op_ctx(ctx);
7549}
7550
7551// ========================================================================
7552// copyfrom
7553
7554struct C_Copyfrom : public Context {
7555 PrimaryLogPGRef pg;
7556 hobject_t oid;
7557 epoch_t last_peering_reset;
7558 ceph_tid_t tid;
7559 PrimaryLogPG::CopyOpRef cop;
7560 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7561 const PrimaryLogPG::CopyOpRef& c)
7562 : pg(p), oid(o), last_peering_reset(lpr),
7563 tid(0), cop(c)
7564 {}
7565 void finish(int r) override {
7566 if (r == -ECANCELED)
7567 return;
7568 pg->lock();
7569 if (last_peering_reset == pg->get_last_peering_reset()) {
7570 pg->process_copy_chunk(oid, tid, r);
7571 }
7572 pg->unlock();
7573 }
7574};
7575
7576struct C_CopyFrom_AsyncReadCb : public Context {
7577 OSDOp *osd_op;
7578 object_copy_data_t reply_obj;
7579 uint64_t features;
7580 size_t len;
7581 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7582 osd_op(osd_op), features(features), len(0) {}
7583 void finish(int r) override {
7584 assert(len > 0);
7585 assert(len <= reply_obj.data.length());
7586 bufferlist bl;
7587 bl.substr_of(reply_obj.data, 0, len);
7588 reply_obj.data.swap(bl);
7589 ::encode(reply_obj, osd_op->outdata, features);
7590 }
7591};
7592
7593int PrimaryLogPG::fill_in_copy_get(
7594 OpContext *ctx,
7595 bufferlist::iterator& bp,
7596 OSDOp& osd_op,
7597 ObjectContextRef &obc)
7598{
7599 object_info_t& oi = obc->obs.oi;
7600 hobject_t& soid = oi.soid;
7601 int result = 0;
7602 object_copy_cursor_t cursor;
7603 uint64_t out_max;
7604 try {
7605 ::decode(cursor, bp);
7606 ::decode(out_max, bp);
7607 }
7608 catch (buffer::error& e) {
7609 result = -EINVAL;
7610 return result;
7611 }
7612
7613 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7614 uint64_t features = op->get_features();
7615
7616 bool async_read_started = false;
7617 object_copy_data_t _reply_obj;
7618 C_CopyFrom_AsyncReadCb *cb = NULL;
7619 if (pool.info.require_rollback()) {
7620 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7621 }
7622 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7623 // size, mtime
7624 reply_obj.size = oi.size;
7625 reply_obj.mtime = oi.mtime;
7626 assert(obc->ssc);
7627 if (soid.snap < CEPH_NOSNAP) {
7628 if (obc->ssc->snapset.is_legacy()) {
7629 reply_obj.snaps = oi.legacy_snaps;
7630 } else {
7631 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7632 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7633 reply_obj.snaps = p->second;
7634 }
7635 } else {
7636 reply_obj.snap_seq = obc->ssc->snapset.seq;
7637 }
7638 if (oi.is_data_digest()) {
7639 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7640 reply_obj.data_digest = oi.data_digest;
7641 }
7642 if (oi.is_omap_digest()) {
7643 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7644 reply_obj.omap_digest = oi.omap_digest;
7645 }
7646 reply_obj.truncate_seq = oi.truncate_seq;
7647 reply_obj.truncate_size = oi.truncate_size;
7648
7649 // attrs
7650 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7651 if (!cursor.attr_complete) {
7652 result = getattrs_maybe_cache(
7653 ctx->obc,
7654 &out_attrs,
7655 true);
7656 if (result < 0) {
7657 if (cb) {
7658 delete cb;
7659 }
7660 return result;
7661 }
7662 cursor.attr_complete = true;
7663 dout(20) << " got attrs" << dendl;
7664 }
7665
7666 int64_t left = out_max - osd_op.outdata.length();
7667
7668 // data
7669 bufferlist& bl = reply_obj.data;
7670 if (left > 0 && !cursor.data_complete) {
7671 if (cursor.data_offset < oi.size) {
7672 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7673 if (cb) {
7674 async_read_started = true;
7675 ctx->pending_async_reads.push_back(
7676 make_pair(
7677 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7678 make_pair(&bl, cb)));
7679 result = max_read;
7680 cb->len = result;
7681 } else {
7682 result = pgbackend->objects_read_sync(
7683 oi.soid, cursor.data_offset, left, osd_op.op.flags, &bl);
7684 if (result < 0)
7685 return result;
7686 }
7687 assert(result <= left);
7688 left -= result;
7689 cursor.data_offset += result;
7690 }
7691 if (cursor.data_offset == oi.size) {
7692 cursor.data_complete = true;
7693 dout(20) << " got data" << dendl;
7694 }
7695 assert(cursor.data_offset <= oi.size);
7696 }
7697
7698 // omap
7699 uint32_t omap_keys = 0;
7700 if (!pool.info.supports_omap() || !oi.is_omap()) {
7701 cursor.omap_complete = true;
7702 } else {
7703 if (left > 0 && !cursor.omap_complete) {
7704 assert(cursor.data_complete);
7705 if (cursor.omap_offset.empty()) {
7706 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7707 &reply_obj.omap_header);
7708 }
7709 bufferlist omap_data;
7710 ObjectMap::ObjectMapIterator iter =
7711 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7712 assert(iter);
7713 iter->upper_bound(cursor.omap_offset);
7714 for (; iter->valid(); iter->next(false)) {
7715 ++omap_keys;
7716 ::encode(iter->key(), omap_data);
7717 ::encode(iter->value(), omap_data);
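	// each encoded key and value is preceded by a 4-byte length, hence the +4s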
7718 left -= iter->key().length() + 4 + iter->value().length() + 4;
7719 if (left <= 0)
7720 break;
7721 }
7722 if (omap_keys) {
7723 ::encode(omap_keys, reply_obj.omap_data);
7724 reply_obj.omap_data.claim_append(omap_data);
7725 }
7726 if (iter->valid()) {
7727 cursor.omap_offset = iter->key();
7728 } else {
7729 cursor.omap_complete = true;
7730 dout(20) << " got omap" << dendl;
7731 }
7732 }
7733 }
7734
7735 if (cursor.is_complete()) {
7736 // include reqids only in the final step. this is a bit fragile
7737 // but it works...
7738 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7739 dout(20) << " got reqids" << dendl;
7740 }
7741
7742 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7743 << " " << out_attrs.size() << " attrs"
7744 << " " << bl.length() << " bytes"
7745 << " " << reply_obj.omap_header.length() << " omap header bytes"
7746 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7747 << omap_keys << " keys"
7748 << " " << reply_obj.reqids.size() << " reqids"
7749 << dendl;
7750 reply_obj.cursor = cursor;
7751 if (!async_read_started) {
7752 ::encode(reply_obj, osd_op.outdata, features);
7753 }
7754 if (cb && !async_read_started) {
7755 delete cb;
7756 }
7757 result = 0;
7758 return result;
7759}
7760
7761void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7762 OSDOp& osd_op)
7763{
7764 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7765 // be careful not to modify anything else that will upset a racing
7766 // operator<<
7767 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7768 uint64_t features = m->get_features();
7769 object_copy_data_t reply_obj;
7770
7771 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7772 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7773 ::encode(reply_obj, osd_op.outdata, features);
7774 osd_op.rval = -ENOENT;
7775 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7776 reply->claim_op_out_data(m->ops);
7777 reply->set_result(-ENOENT);
7778 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7779 osd->send_message_osd_client(reply, m->get_connection());
7780}
7781
7782void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7783 hobject_t src, object_locator_t oloc,
7784 version_t version, unsigned flags,
7785 bool mirror_snapset,
7786 unsigned src_obj_fadvise_flags,
7787 unsigned dest_obj_fadvise_flags)
7788{
7789 const hobject_t& dest = obc->obs.oi.soid;
7790 dout(10) << __func__ << " " << dest
7791 << " from " << src << " " << oloc << " v" << version
7792 << " flags " << flags
7793 << (mirror_snapset ? " mirror_snapset" : "")
7794 << dendl;
7795
7796 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7797 src.snap == CEPH_SNAPDIR));
7798
7799 // cancel a previous in-progress copy?
7800 if (copy_ops.count(dest)) {
7801 // FIXME: if the src etc match, we could avoid restarting from the
7802 // beginning.
7803 CopyOpRef cop = copy_ops[dest];
7804 cancel_copy(cop, false);
7805 }
7806
7807 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
7808 mirror_snapset, src_obj_fadvise_flags,
7809 dest_obj_fadvise_flags));
7810 copy_ops[dest] = cop;
7811 obc->start_block();
7812
7813 _copy_some(obc, cop);
7814}
7815
7816void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
7817{
7818 dout(10) << __func__ << " " << obc << " " << cop << dendl;
7819
7820 unsigned flags = 0;
7821 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
7822 flags |= CEPH_OSD_FLAG_FLUSH;
7823 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
7824 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
7825 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
7826 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
7827 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
7828 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
7829 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
7830 flags |= CEPH_OSD_FLAG_RWORDERED;
7831
7832 C_GatherBuilder gather(cct);
7833
7834 if (cop->cursor.is_initial() && cop->mirror_snapset) {
7835 // list snaps too.
7836 assert(cop->src.snap == CEPH_NOSNAP);
7837 ObjectOperation op;
7838 op.list_snaps(&cop->results.snapset, NULL);
7839 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7840 CEPH_SNAPDIR, NULL,
7841 flags, gather.new_sub(), NULL);
7842 cop->objecter_tid2 = tid;
7843 }
7844
7845 ObjectOperation op;
7846 if (cop->results.user_version) {
7847 op.assert_version(cop->results.user_version);
7848 } else {
7849 // we should learn the version after the first chunk, if we didn't know
7850 // it already!
7851 assert(cop->cursor.is_initial());
7852 }
7853 op.copy_get(&cop->cursor, get_copy_chunk_size(),
7854 &cop->results.object_size, &cop->results.mtime,
7855 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
7856 &cop->results.snaps, &cop->results.snap_seq,
7857 &cop->results.flags,
7858 &cop->results.source_data_digest,
7859 &cop->results.source_omap_digest,
7860 &cop->results.reqids,
7861 &cop->results.truncate_seq,
7862 &cop->results.truncate_size,
7863 &cop->rval);
7864 op.set_last_op_flags(cop->src_obj_fadvise_flags);
7865
7866 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
7867 get_last_peering_reset(), cop);
7868 gather.set_finisher(new C_OnFinisher(fin,
7869 &osd->objecter_finisher));
7870
7871 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7872 cop->src.snap, NULL,
7873 flags,
7874 gather.new_sub(),
7875 // discover the object version if we don't know it yet
7876 cop->results.user_version ? NULL : &cop->results.user_version);
7877 fin->tid = tid;
7878 cop->objecter_tid = tid;
7879 gather.activate();
7880}
7881
7882void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
7883{
7884 dout(10) << __func__ << " " << oid << " tid " << tid
7885 << " " << cpp_strerror(r) << dendl;
7886 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
7887 if (p == copy_ops.end()) {
7888 dout(10) << __func__ << " no copy_op found" << dendl;
7889 return;
7890 }
7891 CopyOpRef cop = p->second;
7892 if (tid != cop->objecter_tid) {
7893 dout(10) << __func__ << " tid " << tid << " != cop " << cop
7894 << " tid " << cop->objecter_tid << dendl;
7895 return;
7896 }
7897
7898 if (cop->omap_data.length() || cop->omap_header.length())
7899 cop->results.has_omap = true;
7900
7901 if (r >= 0 && !pool.info.supports_omap() &&
7902 (cop->omap_data.length() || cop->omap_header.length())) {
7903 r = -EOPNOTSUPP;
7904 }
7905 cop->objecter_tid = 0;
7906 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened at all)
7907 ObjectContextRef& cobc = cop->obc;
7908
7909 if (r < 0)
7910 goto out;
7911
7912 assert(cop->rval >= 0);
7913
7914 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
7915 // verify snap hasn't been deleted
7916 vector<snapid_t>::iterator p = cop->results.snaps.begin();
7917 while (p != cop->results.snaps.end()) {
7918 if (pool.info.is_removed_snap(*p)) {
7919 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
7920 << dendl;
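	// compact the vector in place: shift the remaining snaps left one slot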
7921 for (vector<snapid_t>::iterator q = p + 1;
7922 q != cop->results.snaps.end();
7923 ++q)
7924 *(q - 1) = *q;
7925 cop->results.snaps.resize(cop->results.snaps.size() - 1);
7926 } else {
7927 ++p;
7928 }
7929 }
7930 if (cop->results.snaps.empty()) {
7931 dout(10) << __func__ << " no more snaps for " << oid << dendl;
7932 r = -ENOENT;
7933 goto out;
7934 }
7935 }
7936
7937 assert(cop->rval >= 0);
7938
7939 if (!cop->temp_cursor.data_complete) {
7940 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
7941 }
7942 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
7943 if (cop->omap_header.length()) {
7944 cop->results.omap_digest =
7945 cop->omap_header.crc32c(cop->results.omap_digest);
7946 }
7947 if (cop->omap_data.length()) {
7948 bufferlist keys;
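	// skip the leading 32-bit key count so the digest covers only the
	// encoded keys and values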
7949 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
7950 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
7951 }
7952 }
7953
7954 if (!cop->temp_cursor.attr_complete) {
7955 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
7956 p != cop->attrs.end();
7957 ++p) {
7958 cop->results.attrs[string("_") + p->first] = p->second;
7959 }
7960 cop->attrs.clear();
7961 }
7962
7963 if (!cop->cursor.is_complete()) {
7964 // write out what we have so far
7965 if (cop->temp_cursor.is_initial()) {
7966 assert(!cop->results.started_temp_obj);
7967 cop->results.started_temp_obj = true;
7968 cop->results.temp_oid = generate_temp_object(oid);
7969 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
7970 }
7971 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
7972 OpContextUPtr ctx = simple_opc_create(tempobc);
7973 if (cop->temp_cursor.is_initial()) {
7974 ctx->new_temp_oid = cop->results.temp_oid;
7975 }
7976 _write_copy_chunk(cop, ctx->op_t.get());
7977 simple_opc_submit(std::move(ctx));
7978 dout(10) << __func__ << " fetching more" << dendl;
7979 _copy_some(cobc, cop);
7980 return;
7981 }
7982
7983 // verify digests?
7984 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
7985 dout(20) << __func__ << std::hex
7986 << " got digest: rx data 0x" << cop->results.data_digest
7987 << " omap 0x" << cop->results.omap_digest
7988 << ", source: data 0x" << cop->results.source_data_digest
7989 << " omap 0x" << cop->results.source_omap_digest
7990 << std::dec
7991 << " flags " << cop->results.flags
7992 << dendl;
7993 }
7994 if (cop->results.is_data_digest() &&
7995 cop->results.data_digest != cop->results.source_data_digest) {
7996 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
7997 << " != source 0x" << cop->results.source_data_digest << std::dec
7998 << dendl;
7999 osd->clog->error() << info.pgid << " copy from " << cop->src
8000 << " to " << cop->obc->obs.oi.soid << std::hex
8001 << " data digest 0x" << cop->results.data_digest
8002 << " != source 0x" << cop->results.source_data_digest
8003 << std::dec;
8004 r = -EIO;
8005 goto out;
8006 }
8007 if (cop->results.is_omap_digest() &&
8008 cop->results.omap_digest != cop->results.source_omap_digest) {
8009 derr << __func__ << std::hex
8010 << " omap digest 0x" << cop->results.omap_digest
8011 << " != source 0x" << cop->results.source_omap_digest
8012 << std::dec << dendl;
8013 osd->clog->error() << info.pgid << " copy from " << cop->src
8014 << " to " << cop->obc->obs.oi.soid << std::hex
8015 << " omap digest 0x" << cop->results.omap_digest
8016 << " != source 0x" << cop->results.source_omap_digest
8017 << std::dec;
8018 r = -EIO;
8019 goto out;
8020 }
8021 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8022 derr << __func__ << " injecting copyfrom failure" << dendl;
8023 r = -EIO;
8024 goto out;
8025 }
8026
8027 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8028 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8029 ObjectState& obs = cop->obc->obs;
8030 if (cop->temp_cursor.is_initial()) {
8031 dout(20) << "fill_in_final_tx: writing "
8032 << "directly to final object" << dendl;
8033 // write directly to final object
8034 cop->results.temp_oid = obs.oi.soid;
8035 _write_copy_chunk(cop, t);
8036 } else {
8037 // finish writing to temp object, then move into place
8038 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8039 _write_copy_chunk(cop, t);
8040 t->rename(obs.oi.soid, cop->results.temp_oid);
8041 }
8042 t->setattrs(obs.oi.soid, cop->results.attrs);
8043 });
8044
8045 dout(20) << __func__ << " success; committing" << dendl;
8046
8047 out:
8048 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8049 CopyCallbackResults results(r, &cop->results);
8050 cop->cb->complete(results);
8051
8052 copy_ops.erase(cobc->obs.oi.soid);
8053 cobc->stop_block();
8054
8055 if (r < 0 && cop->results.started_temp_obj) {
8056 dout(10) << __func__ << " deleting partial temp object "
8057 << cop->results.temp_oid << dendl;
8058 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8059 OpContextUPtr ctx = simple_opc_create(tempobc);
8060 ctx->op_t->remove(cop->results.temp_oid);
8061 ctx->discard_temp_oid = cop->results.temp_oid;
8062 simple_opc_submit(std::move(ctx));
8063 }
8064
8065 // cancel and requeue proxy ops on this object
8066 if (!r) {
8067 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8068 it != proxyread_ops.end();) {
8069 if (it->second->soid == cobc->obs.oi.soid) {
8070 cancel_proxy_read((it++)->second);
8071 } else {
8072 ++it;
8073 }
8074 }
8075 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8076 it != proxywrite_ops.end();) {
8077 if (it->second->soid == cobc->obs.oi.soid) {
8078 cancel_proxy_write((it++)->second);
8079 } else {
8080 ++it;
8081 }
8082 }
8083 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8084 }
8085
8086 kick_object_context_blocked(cobc);
8087}
8088
8089void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8090{
8091 dout(20) << __func__ << " " << cop
8092 << " " << cop->attrs.size() << " attrs"
8093 << " " << cop->data.length() << " bytes"
8094 << " " << cop->omap_header.length() << " omap header bytes"
8095 << " " << cop->omap_data.length() << " omap data bytes"
8096 << dendl;
8097 if (!cop->temp_cursor.attr_complete) {
8098 t->create(cop->results.temp_oid);
8099 }
8100 if (!cop->temp_cursor.data_complete) {
8101 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8102 cop->cursor.data_offset);
8103 if (pool.info.requires_aligned_append() &&
8104 !cop->cursor.data_complete) {
8105 /**
8106 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8107 * to pick it up on the next pass.
8108 */
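      // e.g. (hypothetical) required_alignment()=4096, data.length()=10240:
      // trim 10240 % 4096 = 2048 bytes and back cursor.data_offset up by
      // 2048, so the trimmed tail is re-read with the next chunk.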
8109 assert(cop->temp_cursor.data_offset %
8110 pool.info.required_alignment() == 0);
8111 if (cop->data.length() % pool.info.required_alignment() != 0) {
8112 uint64_t to_trim =
8113 cop->data.length() % pool.info.required_alignment();
8114 bufferlist bl;
8115 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8116 cop->data.swap(bl);
8117 cop->cursor.data_offset -= to_trim;
8118 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8119 cop->cursor.data_offset);
8120 }
8121 }
8122 if (cop->data.length()) {
8123 t->write(
8124 cop->results.temp_oid,
8125 cop->temp_cursor.data_offset,
8126 cop->data.length(),
8127 cop->data,
8128 cop->dest_obj_fadvise_flags);
8129 }
8130 cop->data.clear();
8131 }
8132 if (pool.info.supports_omap()) {
8133 if (!cop->temp_cursor.omap_complete) {
8134 if (cop->omap_header.length()) {
8135 t->omap_setheader(
8136 cop->results.temp_oid,
8137 cop->omap_header);
8138 cop->omap_header.clear();
8139 }
8140 if (cop->omap_data.length()) {
8141 map<string,bufferlist> omap;
8142 bufferlist::iterator p = cop->omap_data.begin();
8143 ::decode(omap, p);
8144 t->omap_setkeys(cop->results.temp_oid, omap);
8145 cop->omap_data.clear();
8146 }
8147 }
8148 } else {
8149 assert(cop->omap_header.length() == 0);
8150 assert(cop->omap_data.length() == 0);
8151 }
8152 cop->temp_cursor = cop->cursor;
8153}
8154
8155void PrimaryLogPG::finish_copyfrom(OpContext *ctx)
8156{
8157 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8158 ObjectState& obs = ctx->new_obs;
8159 CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
8160
8161 if (obs.exists) {
8162 dout(20) << __func__ << ": exists, removing" << dendl;
8163 ctx->op_t->remove(obs.oi.soid);
8164 } else {
8165 ctx->delta_stats.num_objects++;
8166 obs.exists = true;
8167 }
8168 if (cb->is_temp_obj_used()) {
8169 ctx->discard_temp_oid = cb->results->temp_oid;
8170 }
8171 cb->results->fill_in_final_tx(ctx->op_t.get());
8172
8173 // CopyFromCallback fills this in for us
8174 obs.oi.user_version = ctx->user_at_version;
8175
8176 obs.oi.set_data_digest(cb->results->data_digest);
8177 obs.oi.set_omap_digest(cb->results->omap_digest);
8178
8179 obs.oi.truncate_seq = cb->results->truncate_seq;
8180 obs.oi.truncate_size = cb->results->truncate_size;
8181
8182 ctx->extra_reqids = cb->results->reqids;
8183
8184 // cache: clear whiteout?
8185 if (obs.oi.is_whiteout()) {
8186 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8187 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8188 --ctx->delta_stats.num_whiteouts;
8189 }
8190
8191 if (cb->results->has_omap) {
8192 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8193 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8194 } else {
8195 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8196 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8197 }
8198
8199 interval_set<uint64_t> ch;
8200 if (obs.oi.size > 0)
8201 ch.insert(0, obs.oi.size);
8202 ctx->modified_ranges.union_of(ch);
8203
8204 if (cb->get_data_size() != obs.oi.size) {
8205 ctx->delta_stats.num_bytes -= obs.oi.size;
8206 obs.oi.size = cb->get_data_size();
8207 ctx->delta_stats.num_bytes += obs.oi.size;
8208 }
8209 ctx->delta_stats.num_wr++;
8210 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8211
8212 osd->logger->inc(l_osd_copyfrom);
8213}
8214
8215void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8216 ObjectContextRef obc)
8217{
8218 const hobject_t& soid = obc->obs.oi.soid;
8219 dout(10) << __func__ << " " << soid << " r=" << r
8220 << " uv" << results->user_version << dendl;
8221
8222 if (r == -ECANCELED) {
8223 return;
8224 }
8225
8226 if (r != -ENOENT && soid.is_snap()) {
8227 if (results->snaps.empty()) {
8228 // we must have read "snap" content from the head object in
8229 // the base pool. use snap_seq to construct what snaps should
8230 // be for this clone (what it was before we evicted the clean
8231 // clone from this pool, and what it will be when we flush and
8232 // the clone eventually happens in the base pool).
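      // e.g. (hypothetical) snapset.snaps=[10,8,6,4], soid.snap=8,
      // results->snap_seq=4: we skip 10, then collect [8,6] for this clone.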
8233 SnapSet& snapset = obc->ssc->snapset;
8234 vector<snapid_t>::iterator p = snapset.snaps.begin();
8235 while (p != snapset.snaps.end() && *p > soid.snap)
8236 ++p;
8237 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8238 results->snaps.push_back(*p);
8239 ++p;
8240 }
8241 }
8242
8243 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8244 filter_snapc(results->snaps);
8245
8246 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8247 if (results->snaps.empty()) {
8248 dout(20) << __func__
8249 << " snaps are empty, clone is invalid,"
8250 << " setting r to ENOENT" << dendl;
8251 r = -ENOENT;
8252 }
8253 }
8254
8255 if (r < 0 && results->started_temp_obj) {
8256 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8257 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8258 assert(tempobc);
8259 OpContextUPtr ctx = simple_opc_create(tempobc);
8260 ctx->op_t->remove(results->temp_oid);
8261 simple_opc_submit(std::move(ctx));
8262 results->started_temp_obj = false;
8263 }
8264
8265 if (r == -ENOENT && soid.is_snap()) {
8266 dout(10) << __func__
8267 << ": enoent while trying to promote clone, " << soid
8268 << " must have been trimmed, removing from snapset"
8269 << dendl;
8270 hobject_t head(soid.get_head());
8271 ObjectContextRef obc = get_object_context(head, false);
8272 assert(obc);
8273
8274 OpContextUPtr tctx = simple_opc_create(obc);
8275 tctx->at_version = get_next_version();
8276 filter_snapc(tctx->new_snapset.snaps);
8277 vector<snapid_t> new_clones;
8278 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8279 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8280 i != tctx->new_snapset.clones.end();
8281 ++i) {
8282 if (*i != soid.snap) {
8283 new_clones.push_back(*i);
8284 auto p = tctx->new_snapset.clone_snaps.find(*i);
8285 if (p != tctx->new_snapset.clone_snaps.end()) {
8286 new_clone_snaps[*i] = p->second;
8287 }
8288 }
8289 }
8290 tctx->new_snapset.clones.swap(new_clones);
8291 tctx->new_snapset.clone_overlap.erase(soid.snap);
8292 tctx->new_snapset.clone_size.erase(soid.snap);
8293 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8294
8295 // take RWWRITE lock for duration of our local write. ignore starvation.
8296 if (!tctx->lock_manager.take_write_lock(
8297 head,
8298 obc)) {
8299 assert(0 == "problem!");
8300 }
8301 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8302
8303 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8304
8305 simple_opc_submit(std::move(tctx));
8306 return;
8307 }
8308
8309 bool whiteout = false;
8310 if (r == -ENOENT) {
8311 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8312 dout(10) << __func__ << " whiteout " << soid << dendl;
8313 whiteout = true;
8314 }
8315
8316 if (r < 0 && !whiteout) {
8317 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8318 // pass error to everyone blocked on this object
8319 // FIXME: this is pretty sloppy, but at this point we got
8320 // something unexpected and don't have many other options.
8321 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8322 waiting_for_blocked_object.find(soid);
8323 if (blocked_iter != waiting_for_blocked_object.end()) {
8324 while (!blocked_iter->second.empty()) {
8325 osd->reply_op_error(blocked_iter->second.front(), r);
8326 blocked_iter->second.pop_front();
8327 }
8328 waiting_for_blocked_object.erase(blocked_iter);
8329 }
8330 return;
8331 }
8332
8333 osd->promote_finish(results->object_size);
8334
8335 OpContextUPtr tctx = simple_opc_create(obc);
8336 tctx->at_version = get_next_version();
8337
8338 ++tctx->delta_stats.num_objects;
8339 if (soid.snap < CEPH_NOSNAP)
8340 ++tctx->delta_stats.num_object_clones;
8341 tctx->new_obs.exists = true;
8342
8343 tctx->extra_reqids = results->reqids;
8344
8345 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8346 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8347
8348 if (whiteout) {
8349 // create a whiteout
8350 tctx->op_t->create(soid);
8351 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8352 ++tctx->delta_stats.num_whiteouts;
8353 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8354 osd->logger->inc(l_osd_tier_whiteout);
8355 } else {
8356 if (results->has_omap) {
8357 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8358 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8359 ++tctx->delta_stats.num_objects_omap;
8360 }
8361
8362 results->fill_in_final_tx(tctx->op_t.get());
8363 if (results->started_temp_obj) {
8364 tctx->discard_temp_oid = results->temp_oid;
8365 }
8366 tctx->new_obs.oi.size = results->object_size;
8367 tctx->new_obs.oi.user_version = results->user_version;
8368 // we don't care whether the source object had data or omap digests;
8368 // set fresh ones for what we actually copied
8369 if (results->object_size)
8370 tctx->new_obs.oi.set_data_digest(results->data_digest);
8371 if (results->has_omap)
8372 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8373 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8374 tctx->new_obs.oi.truncate_size = results->truncate_size;
8375
8376 if (soid.snap != CEPH_NOSNAP) {
8377 if (legacy_snapset) {
8378 tctx->new_obs.oi.legacy_snaps = results->snaps;
8379 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8380 } else {
8381 // it's already in the snapset
8382 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8383 }
8384 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8385 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8386 results->object_size);
8387 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8388
8389 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8390 } else {
8391 tctx->delta_stats.num_bytes += results->object_size;
8392 }
8393 }
8394
8395 if (results->mirror_snapset) {
8396 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8397 tctx->new_snapset.from_snap_set(
8398 results->snapset,
8399 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8400 }
8401 tctx->new_snapset.head_exists = true;
8402 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8403
8404 // take RWWRITE lock for duration of our local write. ignore starvation.
8405 if (!tctx->lock_manager.take_write_lock(
8406 obc->obs.oi.soid,
8407 obc)) {
8408 assert(0 == "problem!");
8409 }
8410 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8411
8412 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8413
8414 simple_opc_submit(std::move(tctx));
8415
8416 osd->logger->inc(l_osd_tier_promote);
8417
8418 if (agent_state &&
8419 agent_state->is_idle())
8420 agent_choose_mode();
8421}
8422
8423void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8424{
8425 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8426 << " from " << cop->src << " " << cop->oloc
8427 << " v" << cop->results.user_version << dendl;
8428
8429 // cancel objecter op, if we can
8430 if (cop->objecter_tid) {
8431 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8432 cop->objecter_tid = 0;
8433 if (cop->objecter_tid2) {
8434 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8435 cop->objecter_tid2 = 0;
8436 }
8437 }
8438
8439 copy_ops.erase(cop->obc->obs.oi.soid);
8440 cop->obc->stop_block();
8441
8442 kick_object_context_blocked(cop->obc);
8443 cop->results.should_requeue = requeue;
8444 CopyCallbackResults result(-ECANCELED, &cop->results);
8445 cop->cb->complete(result);
8446
8447 // There may still be an objecter callback referencing this copy op.
8448 // That callback will not need the obc since it's been canceled, and
8449 // we need the obc reference to go away prior to flush.
8450 cop->obc = ObjectContextRef();
8451}
8452
8453void PrimaryLogPG::cancel_copy_ops(bool requeue)
8454{
8455 dout(10) << __func__ << dendl;
8456 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8457 while (p != copy_ops.end()) {
8458 // requeue this op? can I queue up all of them?
8459 cancel_copy((p++)->second, requeue);
8460 }
8461}
8462
8463
8464// ========================================================================
8465// flush
8466//
8467// Flush a dirty object in the cache tier by writing it back to the
8468// base tier. The sequence looks like:
8469//
8470// * send a copy-from operation to the base tier to copy the current
8471// version of the object
8472// * base tier will pull the object via (perhaps multiple) copy-get(s)
8473// * on completion, we check if the object has been modified. if so,
8474// just reply with -EAGAIN.
8475// * try to take a write lock so we can clear the dirty flag. if this
8476// fails, wait and retry
8477// * start a repop that clears the bit.
8478//
8479// If we have to wait, we will retry by coming back through the
8480// start_flush method. We check if a flush is already in progress
8481// and, if so, try to finish it by rechecking the version and trying
8482// to clear the dirty bit.
8483//
8484// In order for the cache-flush (a write op) to not block the copy-get
8485// from reading the object, the client *must* set the SKIPRWLOCKS
8486// flag.
8487//
8488// NOTE: normally writes are strictly ordered for the client, but
8489// flushes are special in that they can be reordered with respect to
8490// other writes. In particular, we can't have a flush request block
8491// an update to the cache pool object!
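//
// For reference, a minimal client-side flush sketch using librados
// (illustrative only; "rados", "io_ctx" and object "foo" are assumptions,
// and error handling is omitted):
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c = rados.aio_create_completion();
//   io_ctx.aio_operate("foo", c, &op,
//                      librados::OPERATION_IGNORE_CACHE |
//                      librados::OPERATION_IGNORE_OVERLAY |
//                      librados::OPERATION_SKIPRWLOCKS, NULL);
//   c->wait_for_complete();
//   int r = c->get_return_value(); // 0 on success; e.g. -EBUSY if dirty clones
//   c->release();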
8492
8493struct C_Flush : public Context {
8494 PrimaryLogPGRef pg;
8495 hobject_t oid;
8496 epoch_t last_peering_reset;
8497 ceph_tid_t tid;
8498 utime_t start;
8499 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8500 : pg(p), oid(o), last_peering_reset(lpr),
8501 tid(0), start(ceph_clock_now())
8502 {}
8503 void finish(int r) override {
8504 if (r == -ECANCELED)
8505 return;
8506 pg->lock();
8507 if (last_peering_reset == pg->get_last_peering_reset()) {
8508 pg->finish_flush(oid, tid, r);
8509 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8510 }
8511 pg->unlock();
8512 }
8513};
8514
8515int PrimaryLogPG::start_flush(
8516 OpRequestRef op, ObjectContextRef obc,
8517 bool blocking, hobject_t *pmissing,
8518 boost::optional<std::function<void()>> &&on_flush)
8519{
8520 const object_info_t& oi = obc->obs.oi;
8521 const hobject_t& soid = oi.soid;
8522 dout(10) << __func__ << " " << soid
8523 << " v" << oi.version
8524 << " uv" << oi.user_version
8525 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8526 << dendl;
8527
8528 // get a filtered snapset, need to remove removed snaps
8529 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8530
8531 // verify there are no older dirty clones
8532 {
8533 dout(20) << " snapset " << snapset << dendl;
8534 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8535 while (p != snapset.clones.rend() && *p >= soid.snap)
8536 ++p;
8537 if (p != snapset.clones.rend()) {
8538 hobject_t next = soid;
8539 next.snap = *p;
8540 assert(next.snap < soid.snap);
8541 if (pg_log.get_missing().is_missing(next)) {
8542 dout(10) << __func__ << " missing clone is " << next << dendl;
8543 if (pmissing)
8544 *pmissing = next;
8545 return -ENOENT;
8546 }
8547 ObjectContextRef older_obc = get_object_context(next, false);
8548 if (older_obc) {
8549 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8550 << dendl;
8551 if (older_obc->obs.oi.is_dirty()) {
8552 dout(10) << __func__ << " next oldest clone is dirty: "
8553 << older_obc->obs.oi << dendl;
8554 return -EBUSY;
8555 }
8556 } else {
8557 dout(20) << __func__ << " next oldest clone " << next
8558 << " is not present; implicitly clean" << dendl;
8559 }
8560 } else {
8561 dout(20) << __func__ << " no older clones" << dendl;
8562 }
8563 }
8564
8565 if (blocking)
8566 obc->start_block();
8567
8568 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8569 if (p != flush_ops.end()) {
8570 FlushOpRef fop = p->second;
8571 if (fop->op == op) {
8572 // we couldn't take the write lock on a cache-try-flush before;
8573 // now we are trying again for the lock.
8574 return try_flush_mark_clean(fop);
8575 }
8576 if (fop->flushed_version == obc->obs.oi.user_version &&
8577 (fop->blocking || !blocking)) {
8578 // nonblocking can join anything
8579 // blocking can only join a blocking flush
8580 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8581 if (op)
8582 fop->dup_ops.push_back(op);
8583 return -EAGAIN; // clean up this ctx; op will retry later
8584 }
8585
8586 // cancel current flush since it will fail anyway, or because we
8587 // are blocking and the existing flush is nonblocking.
8588 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8589 if (fop->op)
8590 osd->reply_op_error(fop->op, -EBUSY);
8591 while (!fop->dup_ops.empty()) {
8592 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8593 fop->dup_ops.pop_front();
8594 }
8595 cancel_flush(fop, false);
8596 }
8597
8598 /**
8599 * In general, we need to send a delete and a copyfrom.
8600 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8601 * where 4 is marked as clean. To flush 10, we have to:
8602 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8603 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8604 *
8605 * There is a complicating case. Suppose there had been a clone 7
8606 * for snaps [7, 6], which has since been trimmed because those snaps no longer exist.
8607 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8608 * the delete, the snap will be promoted to 5, and the head will become
8609 * a snapdir. When the copy-from goes through, we'll end up with
8610 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8611 *
8612 * Another complication is the case where there is an interval change
8613 * after doing the delete and the flush but before marking the object
8614 * clean. We'll happily delete head and then recreate it at the same
8615 * sequence number, which works out ok.
8616 */
8617
8618 SnapContext snapc, dsnapc;
8619 if (snapset.seq != 0) {
8620 if (soid.snap == CEPH_NOSNAP) {
8621 snapc.seq = snapset.seq;
8622 snapc.snaps = snapset.snaps;
8623 } else {
8624 snapid_t min_included_snap;
8625 if (snapset.is_legacy()) {
8626 min_included_snap = oi.legacy_snaps.back();
8627 } else {
8628 auto p = snapset.clone_snaps.find(soid.snap);
8629 assert(p != snapset.clone_snaps.end());
8630 min_included_snap = p->second.back();
8631 }
8632 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8633 }
8634
8635 snapid_t prev_snapc = 0;
8636 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8637 citer != snapset.clones.rend();
8638 ++citer) {
8639 if (*citer < soid.snap) {
8640 prev_snapc = *citer;
8641 break;
8642 }
8643 }
8644
8645 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8646 }
8647
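  // Tracing the example in the comment above (flushing clone 10 of
  // snapc 10:[10,9,8,4,3,2] with clones [10(10,9), 4(4,3,2)]):
  // min_included_snap=9, so snapc=8:[8,4,3,2]; prev_snapc=4, so
  // dsnapc=4:[4,3,2]. Since dsnapc.seq < snapc.seq, the delete below is
  // sent with dsnapc, followed by the copy-from with snapc.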
8648 object_locator_t base_oloc(soid);
8649 base_oloc.pool = pool.info.tier_of;
8650
8651 if (dsnapc.seq < snapc.seq) {
8652 ObjectOperation o;
8653 o.remove();
8654 osd->objecter->mutate(
8655 soid.oid,
8656 base_oloc,
8657 o,
8658 dsnapc,
8659 ceph::real_clock::from_ceph_timespec(oi.mtime),
8660 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8661 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8662 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8663 }
8664
8665 FlushOpRef fop(std::make_shared<FlushOp>());
8666 fop->obc = obc;
8667 fop->flushed_version = oi.user_version;
8668 fop->blocking = blocking;
8669 fop->on_flush = std::move(on_flush);
8670 fop->op = op;
8671
8672 ObjectOperation o;
8673 if (oi.is_whiteout()) {
8674 fop->removal = true;
8675 o.remove();
8676 } else {
8677 object_locator_t oloc(soid);
8678 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8679 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8680 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8681 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8682 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8683 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8684
8685 // hint that the base tier need not cache the data after this flush
8686 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8687 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8688 }
8689 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8690
8691 ceph_tid_t tid = osd->objecter->mutate(
8692 soid.oid, base_oloc, o, snapc,
8693 ceph::real_clock::from_ceph_timespec(oi.mtime),
8694 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8695 new C_OnFinisher(fin,
8696 &osd->objecter_finisher));
8697 /* we're under the pg lock and fin->finish() is grabbing that */
8698 fin->tid = tid;
8699 fop->objecter_tid = tid;
8700
8701 flush_ops[soid] = fop;
8702 info.stats.stats.sum.num_flush++;
8703 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8704 return -EINPROGRESS;
8705}
8706
8707void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8708{
8709 dout(10) << __func__ << " " << oid << " tid " << tid
8710 << " " << cpp_strerror(r) << dendl;
8711 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8712 if (p == flush_ops.end()) {
8713 dout(10) << __func__ << " no flush_op found" << dendl;
8714 return;
8715 }
8716 FlushOpRef fop = p->second;
8717 if (tid != fop->objecter_tid) {
8718 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8719 << " tid " << fop->objecter_tid << dendl;
8720 return;
8721 }
8722 ObjectContextRef obc = fop->obc;
8723 fop->objecter_tid = 0;
8724
8725 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8726 if (fop->op)
8727 osd->reply_op_error(fop->op, -EBUSY);
8728 if (fop->blocking) {
8729 obc->stop_block();
8730 kick_object_context_blocked(obc);
8731 }
8732
8733 if (!fop->dup_ops.empty()) {
8734 dout(20) << __func__ << " requeueing dups" << dendl;
8735 requeue_ops(fop->dup_ops);
8736 }
8737 if (fop->on_flush) {
8738 (*(fop->on_flush))();
8739 fop->on_flush = boost::none;
8740 }
8741 flush_ops.erase(oid);
8742 return;
8743 }
8744
8745 r = try_flush_mark_clean(fop);
8746 if (r == -EBUSY && fop->op) {
8747 osd->reply_op_error(fop->op, r);
8748 }
8749}
8750
8751int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8752{
8753 ObjectContextRef obc = fop->obc;
8754 const hobject_t& oid = obc->obs.oi.soid;
8755
8756 if (fop->blocking) {
8757 obc->stop_block();
8758 kick_object_context_blocked(obc);
8759 }
8760
8761 if (fop->flushed_version != obc->obs.oi.user_version ||
8762 !obc->obs.exists) {
8763 if (obc->obs.exists)
8764 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8765 << " != current " << obc->obs.oi.user_version
8766 << dendl;
8767 else
8768 dout(10) << __func__ << " object no longer exists" << dendl;
8769
8770 if (!fop->dup_ops.empty()) {
8771 dout(20) << __func__ << " requeueing dups" << dendl;
8772 requeue_ops(fop->dup_ops);
8773 }
8774 if (fop->on_flush) {
8775 (*(fop->on_flush))();
8776 fop->on_flush = boost::none;
8777 }
8778 flush_ops.erase(oid);
8779 if (fop->blocking)
8780 osd->logger->inc(l_osd_tier_flush_fail);
8781 else
8782 osd->logger->inc(l_osd_tier_try_flush_fail);
8783 return -EBUSY;
8784 }
8785
8786 if (!fop->blocking &&
8787 scrubber.write_blocked_by_scrub(oid)) {
8788 if (fop->op) {
8789 dout(10) << __func__ << " blocked by scrub" << dendl;
8790 requeue_op(fop->op);
8791 requeue_ops(fop->dup_ops);
8792 return -EAGAIN; // will retry
8793 } else {
8794 osd->logger->inc(l_osd_tier_try_flush_fail);
8795 cancel_flush(fop, false);
8796 return -ECANCELED;
8797 }
8798 }
8799
8800 // successfully flushed, can we evict this object?
8801 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8802 agent_maybe_evict(obc, true)) {
8803 osd->logger->inc(l_osd_tier_clean);
8804 if (fop->on_flush) {
8805 (*(fop->on_flush))();
8806 fop->on_flush = boost::none;
8807 }
8808 flush_ops.erase(oid);
8809 return 0;
8810 }
8811
8812 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
8813 OpContextUPtr ctx = simple_opc_create(fop->obc);
8814
8815 // successfully flushed; can we clear the dirty bit?
8816 // try to take the lock manually, since we don't
8817 // have a ctx yet.
8818 if (ctx->lock_manager.get_lock_type(
8819 ObjectContext::RWState::RWWRITE,
8820 oid,
8821 obc,
8822 fop->op)) {
8823 dout(20) << __func__ << " took write lock" << dendl;
8824 } else if (fop->op) {
8825 dout(10) << __func__ << " waiting on write lock" << dendl;
8826 close_op_ctx(ctx.release());
8827 requeue_op(fop->op);
8828 requeue_ops(fop->dup_ops);
8829 return -EAGAIN; // will retry
8830 } else {
8831 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
8832 close_op_ctx(ctx.release());
8833 osd->logger->inc(l_osd_tier_try_flush_fail);
8834 cancel_flush(fop, false);
8835 return -ECANCELED;
8836 }
8837
8838 if (fop->on_flush) {
8839 ctx->register_on_finish(*(fop->on_flush));
8840 fop->on_flush = boost::none;
8841 }
8842
8843 ctx->at_version = get_next_version();
8844
8845 ctx->new_obs = obc->obs;
8846 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8847 --ctx->delta_stats.num_objects_dirty;
8848
8849 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
8850
8851 osd->logger->inc(l_osd_tier_clean);
8852
8853 if (!fop->dup_ops.empty() || fop->op) {
8854 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
8855 list<OpRequestRef> ls;
8856 if (fop->op)
8857 ls.push_back(fop->op);
8858 ls.splice(ls.end(), fop->dup_ops);
8859 requeue_ops(ls);
8860 }
8861
8862 simple_opc_submit(std::move(ctx));
8863
8864 flush_ops.erase(oid);
8865
8866 if (fop->blocking)
8867 osd->logger->inc(l_osd_tier_flush);
8868 else
8869 osd->logger->inc(l_osd_tier_try_flush);
8870
8871 return -EINPROGRESS;
8872}
8873
8874void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
8875{
8876 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
8877 << fop->objecter_tid << dendl;
8878 if (fop->objecter_tid) {
8879 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
8880 fop->objecter_tid = 0;
8881 }
8882 if (fop->blocking) {
8883 fop->obc->stop_block();
8884 kick_object_context_blocked(fop->obc);
8885 }
8886 if (requeue) {
8887 if (fop->op)
8888 requeue_op(fop->op);
8889 requeue_ops(fop->dup_ops);
8890 }
8891 if (fop->on_flush) {
8892 (*(fop->on_flush))();
8893 fop->on_flush = boost::none;
8894 }
8895 flush_ops.erase(fop->obc->obs.oi.soid);
8896}
8897
8898void PrimaryLogPG::cancel_flush_ops(bool requeue)
8899{
8900 dout(10) << __func__ << dendl;
8901 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
8902 while (p != flush_ops.end()) {
8903 cancel_flush((p++)->second, requeue);
8904 }
8905}
8906
8907bool PrimaryLogPG::is_present_clone(hobject_t coid)
8908{
8909 if (!pool.info.allow_incomplete_clones())
8910 return true;
8911 if (is_missing_object(coid))
8912 return true;
8913 ObjectContextRef obc = get_object_context(coid, false);
8914 return obc && obc->obs.exists;
8915}
8916
8917// ========================================================================
8918// rep op gather
8919
8920class C_OSD_RepopApplied : public Context {
8921 PrimaryLogPGRef pg;
8922 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8923public:
8924 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8925 : pg(pg), repop(repop) {}
8926 void finish(int) override {
8927 pg->repop_all_applied(repop.get());
8928 }
8929};
8930
8931
8932void PrimaryLogPG::repop_all_applied(RepGather *repop)
8933{
8934 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
8935 << dendl;
8936 assert(!repop->applies_with_commit);
8937 repop->all_applied = true;
8938 if (!repop->rep_aborted) {
8939 eval_repop(repop);
8940 }
8941}
8942
8943class C_OSD_RepopCommit : public Context {
8944 PrimaryLogPGRef pg;
8945 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8946public:
8947 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8948 : pg(pg), repop(repop) {}
8949 void finish(int) override {
8950 pg->repop_all_committed(repop.get());
8951 }
8952};
8953
8954void PrimaryLogPG::repop_all_committed(RepGather *repop)
8955{
8956 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
8957 << dendl;
8958 repop->all_committed = true;
8959 if (repop->applies_with_commit) {
8960 assert(!repop->all_applied);
8961 repop->all_applied = true;
8962 }
8963
8964 if (!repop->rep_aborted) {
8965 if (repop->v != eversion_t()) {
8966 last_update_ondisk = repop->v;
8967 last_complete_ondisk = repop->pg_local_last_complete;
8968 }
8969 eval_repop(repop);
8970 }
8971}
8972
8973void PrimaryLogPG::op_applied(const eversion_t &applied_version)
8974{
8975 dout(10) << "op_applied version " << applied_version << dendl;
8976 if (applied_version == eversion_t())
8977 return;
8978 assert(applied_version > last_update_applied);
8979 assert(applied_version <= info.last_update);
8980 last_update_applied = applied_version;
8981 if (is_primary()) {
8982 if (scrubber.active) {
8983 if (last_update_applied == scrubber.subset_last_update) {
8984 if (ops_blocked_by_scrub()) {
8985 requeue_scrub(true);
8986 } else {
8987 requeue_scrub(false);
8988 }
8989
8990 }
8991 } else {
8992 assert(scrubber.start == scrubber.end);
8993 }
8994 } else {
8995 if (scrubber.active_rep_scrub) {
8996 if (last_update_applied == static_cast<const MOSDRepScrub*>(
8997 scrubber.active_rep_scrub->get_req())->scrub_to) {
8998 osd->enqueue_back(
8999 info.pgid,
9000 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9001 scrubber.active_rep_scrub = OpRequestRef();
9002 }
9003 }
9004 }
9005}
9006
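// eval_repop drives a RepGather through its lifecycle: fire the
// on_committed callbacks (and send any queued dup-op acks, in order)
// once all_committed, fire the on_applied callbacks once all_applied,
// and finally retire finished repops strictly from the front of
// repop_queue so clients observe completions in version order.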
9007void PrimaryLogPG::eval_repop(RepGather *repop)
9008{
9009 const MOSDOp *m = NULL;
9010 if (repop->op)
9011 m = static_cast<const MOSDOp *>(repop->op->get_req());
9012
9013 if (m)
9014 dout(10) << "eval_repop " << *repop
9015 << (repop->rep_done ? " DONE" : "")
9016 << dendl;
9017 else
9018 dout(10) << "eval_repop " << *repop << " (no op)"
9019 << (repop->rep_done ? " DONE" : "")
9020 << dendl;
9021
9022 if (repop->rep_done)
9023 return;
9024
9025 // ondisk?
9026 if (repop->all_committed) {
9027 dout(10) << " commit: " << *repop << dendl;
9028 for (auto p = repop->on_committed.begin();
9029 p != repop->on_committed.end();
9030 repop->on_committed.erase(p++)) {
9031 (*p)();
9032 }
9033 // send dup commits, in order
9034 if (waiting_for_ondisk.count(repop->v)) {
9035 assert(waiting_for_ondisk.begin()->first == repop->v);
9036 for (list<pair<OpRequestRef, version_t> >::iterator i =
9037 waiting_for_ondisk[repop->v].begin();
9038 i != waiting_for_ondisk[repop->v].end();
9039 ++i) {
9040 osd->reply_op_error(i->first, repop->r, repop->v,
9041 i->second);
9042 }
9043 waiting_for_ondisk.erase(repop->v);
9044 }
9045 }
9046
9047 // applied?
9048 if (repop->all_applied) {
9049 if (repop->applies_with_commit) {
9050 assert(repop->on_applied.empty());
9051 }
9052 dout(10) << " applied: " << *repop << " " << dendl;
9053 for (auto p = repop->on_applied.begin();
9054 p != repop->on_applied.end();
9055 repop->on_applied.erase(p++)) {
9056 (*p)();
9057 }
9058 }
9059
9060 // done.
9061 if (repop->all_applied && repop->all_committed) {
9062 repop->rep_done = true;
9063
9064 publish_stats_to_osd();
9065 calc_min_last_complete_ondisk();
9066
9067 dout(10) << " removing " << *repop << dendl;
9068 assert(!repop_queue.empty());
9069 dout(20) << " q front is " << *repop_queue.front() << dendl;
9070 if (repop_queue.front() != repop) {
9071 if (!repop->applies_with_commit) {
9072 dout(0) << " removing " << *repop << dendl;
9073 dout(0) << " q front is " << *repop_queue.front() << dendl;
9074 assert(repop_queue.front() == repop);
9075 }
9076 } else {
9077 RepGather *to_remove = nullptr;
9078 while (!repop_queue.empty() &&
9079 (to_remove = repop_queue.front())->rep_done) {
9080 repop_queue.pop_front();
9081 for (auto p = to_remove->on_success.begin();
9082 p != to_remove->on_success.end();
9083 to_remove->on_success.erase(p++)) {
9084 (*p)();
9085 }
9086 remove_repop(to_remove);
9087 }
9088 }
9089 }
9090}
9091
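// issue_repop hands the prepared transaction to the PGBackend.  Before
// submitting, it takes ondisk write locks on every obc the transaction
// touches (head, clone, snapset object) and advances each peer's
// last_update in peer_info; the commit/applied Contexts wire backend
// completion back into the RepGather, while onapplied_sync releases the
// ondisk write locks.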
9092void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9093{
9094 FUNCTRACE();
9095 const hobject_t& soid = ctx->obs->oi.soid;
9096 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9097 << " o " << soid
9098 << dendl;
9099
9100 repop->v = ctx->at_version;
9101 if (ctx->at_version > eversion_t()) {
9102 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9103 i != actingbackfill.end();
9104 ++i) {
9105 if (*i == get_primary()) continue;
9106 pg_info_t &pinfo = peer_info[*i];
9107 // keep peer_info up to date
9108 if (pinfo.last_complete == pinfo.last_update)
9109 pinfo.last_complete = ctx->at_version;
9110 pinfo.last_update = ctx->at_version;
9111 }
9112 }
9113
9114 ctx->obc->ondisk_write_lock();
9115
9116 bool unlock_snapset_obc = false;
9117 ctx->op_t->add_obc(ctx->obc);
9118 if (ctx->clone_obc) {
9119 ctx->clone_obc->ondisk_write_lock();
9120 ctx->op_t->add_obc(ctx->clone_obc);
9121 }
9122 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9123 ctx->obc->obs.oi.soid) {
9124 ctx->snapset_obc->ondisk_write_lock();
9125 unlock_snapset_obc = true;
9126 ctx->op_t->add_obc(ctx->snapset_obc);
9127 }
9128
9129 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9130 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9131 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9132 ctx->obc,
9133 ctx->clone_obc,
9134 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9135 if (!(ctx->log.empty())) {
9136 assert(ctx->at_version >= projected_last_update);
9137 projected_last_update = ctx->at_version;
9138 }
9139 for (auto &&entry: ctx->log) {
9140 projected_log.add(entry);
9141 }
9142 pgbackend->submit_transaction(
9143 soid,
9144 ctx->delta_stats,
9145 ctx->at_version,
9146 std::move(ctx->op_t),
9147 pg_trim_to,
9148 min_last_complete_ondisk,
9149 ctx->log,
9150 ctx->updated_hset_history,
9151 onapplied_sync,
9152 on_all_applied,
9153 on_all_commit,
9154 repop->rep_tid,
9155 ctx->reqid,
9156 ctx->op);
9157}
9158
9159PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9160 OpContext *ctx, ObjectContextRef obc,
9161 ceph_tid_t rep_tid)
9162{
9163 if (ctx->op)
9164 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9165 else
9166 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9167
9168 RepGather *repop = new RepGather(
9169 ctx, rep_tid, info.last_complete, false);
9170
9171 repop->start = ceph_clock_now();
9172
9173 repop_queue.push_back(&repop->queue_item);
9174 repop->get();
9175
9176 osd->logger->inc(l_osd_op_wip);
9177
9178 dout(10) << __func__ << ": " << *repop << dendl;
9179 return repop;
9180}
9181
9182boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9183 eversion_t version,
9184 int r,
9185 ObcLockManager &&manager,
9186 OpRequestRef &&op,
9187 boost::optional<std::function<void(void)> > &&on_complete)
9188{
9189 RepGather *repop = new RepGather(
9190 std::move(manager),
9191 std::move(op),
9192 std::move(on_complete),
9193 osd->get_tid(),
9194 info.last_complete,
9195 true,
9196 r);
9197 repop->v = version;
9198
9199 repop->start = ceph_clock_now();
9200
9201 repop_queue.push_back(&repop->queue_item);
9202
9203 osd->logger->inc(l_osd_op_wip);
9204
9205 dout(10) << __func__ << ": " << *repop << dendl;
9206 return boost::intrusive_ptr<RepGather>(repop);
9207}
9208
9209void PrimaryLogPG::remove_repop(RepGather *repop)
9210{
9211 dout(20) << __func__ << " " << *repop << dendl;
9212
9213 for (auto p = repop->on_finish.begin();
9214 p != repop->on_finish.end();
9215 repop->on_finish.erase(p++)) {
9216 (*p)();
9217 }
9218
9219 release_object_locks(
9220 repop->lock_manager);
9221 repop->put();
9222
9223 osd->logger->dec(l_osd_op_wip);
9224}
9225
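// simple_opc_create/simple_opc_submit let internal callers issue a
// self-generated write (no client MOSDOp) against a single object.
// A minimal usage sketch, mirroring handle_watch_timeout() below:
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   // ... mutate ctx->new_obs and append a pg_log_entry_t to ctx->log ...
//   simple_opc_submit(std::move(ctx));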
9226PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9227{
9228 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9229 vector<OSDOp> ops;
9230 ceph_tid_t rep_tid = osd->get_tid();
9231 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9232 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, ops, obc, this));
9233 ctx->op_t.reset(new PGTransaction());
9234 ctx->mtime = ceph_clock_now();
9235 return ctx;
9236}
9237
9238void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9239{
9240 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9241 dout(20) << __func__ << " " << repop << dendl;
9242 issue_repop(repop, ctx.get());
9243 eval_repop(repop);
9244 calc_trim_to();
9245 repop->put();
9246}
9247
9248
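// submit_log_entries appends externally generated log entries (e.g. the
// lost/unfound markers built by mark_all_unfound_lost below) on the
// primary and replicates them.  On maps requiring jewel or newer OSDs it
// sends MOSDPGUpdateLogMissing and gathers per-shard acks through a
// RepGather; otherwise it falls back to broadcasting an MOSDPGLog with
// no ack tracking.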
9249void PrimaryLogPG::submit_log_entries(
9250 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9251 ObcLockManager &&manager,
9252 boost::optional<std::function<void(void)> > &&_on_complete,
9253 OpRequestRef op,
9254 int r)
9255{
9256 dout(10) << __func__ << " " << entries << dendl;
9257 assert(is_primary());
9258
9259 eversion_t version;
9260 if (!entries.empty()) {
9261 assert(entries.rbegin()->version >= projected_last_update);
9262 version = projected_last_update = entries.rbegin()->version;
9263 }
9264
9265 boost::intrusive_ptr<RepGather> repop;
9266 boost::optional<std::function<void(void)> > on_complete;
9267 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9268 repop = new_repop(
9269 version,
9270 r,
9271 std::move(manager),
9272 std::move(op),
9273 std::move(_on_complete));
9274 } else {
9275 on_complete = std::move(_on_complete);
9276 }
9277
9278 pgbackend->call_write_ordered(
9279 [this, entries, repop, on_complete]() {
9280 ObjectStore::Transaction t;
9281 eversion_t old_last_update = info.last_update;
9282 merge_new_log_entries(entries, t);
9283
9284
9285 set<pg_shard_t> waiting_on;
9286 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9287 i != actingbackfill.end();
9288 ++i) {
9289 pg_shard_t peer(*i);
9290 if (peer == pg_whoami) continue;
9291 assert(peer_missing.count(peer));
9292 assert(peer_info.count(peer));
9293 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9294 assert(repop);
9295 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9296 entries,
9297 spg_t(info.pgid.pgid, i->shard),
9298 pg_whoami.shard,
9299 get_osdmap()->get_epoch(),
9300 last_peering_reset,
9301 repop->rep_tid);
9302 osd->send_message_osd_cluster(
9303 peer.osd, m, get_osdmap()->get_epoch());
9304 waiting_on.insert(peer);
9305 } else {
9306 MOSDPGLog *m = new MOSDPGLog(
9307 peer.shard, pg_whoami.shard,
9308 info.last_update.epoch,
9309 info);
9310 m->log.log = entries;
9311 m->log.tail = old_last_update;
9312 m->log.head = info.last_update;
9313 osd->send_message_osd_cluster(
9314 peer.osd, m, get_osdmap()->get_epoch());
9315 }
9316 }
9317 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9318 ceph_tid_t rep_tid = repop->rep_tid;
9319 waiting_on.insert(pg_whoami);
9320 log_entry_update_waiting_on.insert(
9321 make_pair(
9322 rep_tid,
9323 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9324 ));
9325 struct OnComplete : public Context {
9326 PrimaryLogPGRef pg;
9327 ceph_tid_t rep_tid;
9328 epoch_t epoch;
9329 OnComplete(
9330 PrimaryLogPGRef pg,
9331 ceph_tid_t rep_tid,
9332 epoch_t epoch)
9333 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9334 void finish(int) override {
9335 pg->lock();
9336 if (!pg->pg_has_reset_since(epoch)) {
9337 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9338 assert(it != pg->log_entry_update_waiting_on.end());
9339 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9340 assert(it2 != it->second.waiting_on.end());
9341 it->second.waiting_on.erase(it2);
9342 if (it->second.waiting_on.empty()) {
9343 pg->repop_all_committed(it->second.repop.get());
9344 pg->log_entry_update_waiting_on.erase(it);
9345 }
9346 }
9347 pg->unlock();
9348 }
9349 };
9350 t.register_on_commit(
9351 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9352 } else {
9353 if (on_complete) {
9354 struct OnComplete : public Context {
9355 PrimaryLogPGRef pg;
9356 std::function<void(void)> on_complete;
9357 epoch_t epoch;
9358 OnComplete(
9359 PrimaryLogPGRef pg,
9360 const std::function<void(void)> &on_complete,
9361 epoch_t epoch)
9362 : pg(pg),
9363 on_complete(std::move(on_complete)),
9364 epoch(epoch) {}
9365 void finish(int) override {
9366 pg->lock();
9367 if (!pg->pg_has_reset_since(epoch))
9368 on_complete();
9369 pg->unlock();
9370 }
9371 };
9372 t.register_on_complete(
9373 new OnComplete{
9374 this, *on_complete, get_osdmap()->get_epoch()
9375 });
9376 }
9377 }
9378 t.register_on_applied(
9379 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9380 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9381 assert(r == 0);
9382 });
9383}
9384
9385void PrimaryLogPG::cancel_log_updates()
9386{
9387 // get rid of all the LogUpdateCtx so their references to repops are
9388 // dropped
9389 log_entry_update_waiting_on.clear();
9390}
9391
9392// -------------------------------------------------------
9393
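// Watch/notify bookkeeping.  get_watchers() flattens the watchers of
// every cached object context into obj_watch_item_t records for
// introspection; check_blacklisted_watchers() force-expires watches
// whose client address is blacklisted in the current OSDMap.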
9394void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9395{
9396 pair<hobject_t, ObjectContextRef> i;
9397 while (object_contexts.get_next(i.first, &i)) {
9398 ObjectContextRef obc(i.second);
9399 get_obc_watchers(obc, pg_watchers);
9400 }
9401}
9402
9403void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9404{
9405 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9406 obc->watchers.begin();
9407 j != obc->watchers.end();
9408 ++j) {
9409 obj_watch_item_t owi;
9410
9411 owi.obj = obc->obs.oi.soid;
9412 owi.wi.addr = j->second->get_peer_addr();
9413 owi.wi.name = j->second->get_entity();
9414 owi.wi.cookie = j->second->get_cookie();
9415 owi.wi.timeout_seconds = j->second->get_timeout();
9416
9417 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9418 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9419
9420 pg_watchers.push_back(owi);
9421 }
9422}
9423
9424void PrimaryLogPG::check_blacklisted_watchers()
9425{
9426 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9427 pair<hobject_t, ObjectContextRef> i;
9428 while (object_contexts.get_next(i.first, &i))
9429 check_blacklisted_obc_watchers(i.second);
9430}
9431
9432void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9433{
9434 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9435 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9436 obc->watchers.begin();
9437 k != obc->watchers.end();
9438 ) {
9439 // Advance the iterator now so handle_watch_timeout() can erase the element
9440 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9441 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9442 entity_addr_t ea = j->second->get_peer_addr();
9443 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9444 if (get_osdmap()->is_blacklisted(ea)) {
9445 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9446 assert(j->second->get_pg() == this);
9447 j->second->unregister_cb();
9448 handle_watch_timeout(j->second);
9449 }
9450 }
9451}
9452
9453void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9454{
9455 assert(is_active());
9456 assert((recovering.count(obc->obs.oi.soid) ||
9457 !is_missing_object(obc->obs.oi.soid)) ||
9458 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9459 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9460 pg_log_entry_t::LOST_REVERT &&
9461 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9462 obc->obs.oi.version));
9463
9464 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9465 assert(obc->watchers.empty());
9466 // populate unconnected_watchers
9467 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9468 obc->obs.oi.watchers.begin();
9469 p != obc->obs.oi.watchers.end();
9470 ++p) {
9471 utime_t expire = info.stats.last_became_active;
9472 expire += p->second.timeout_seconds;
9473 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9474 WatchRef watch(
9475 Watch::makeWatchRef(
9476 this, osd, obc, p->second.timeout_seconds, p->first.first,
9477 p->first.second, p->second.addr));
9478 watch->disconnect();
9479 obc->watchers.insert(
9480 make_pair(
9481 make_pair(p->first.first, p->first.second),
9482 watch));
9483 }
9484 // Look for watchers from blacklisted clients and drop
9485 check_blacklisted_obc_watchers(obc);
9486}
9487
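// handle_watch_timeout fires when a watcher misses its renewal deadline.
// If the object is degraded, backfilling, or blocked by scrub, the
// timeout is parked as a callback and retried later; otherwise it commits
// a MODIFY log entry (via the simple_opc path above) that erases the
// watcher from the object_info_t watchers map.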
9488void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9489{
9490 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9491 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9492
9493 if (!is_active()) {
9494 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9495 return;
9496 }
9497 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9498 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9499 watch->get_delayed_cb()
9500 );
9501 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9502 << obc->obs.oi.soid
9503 << dendl;
9504 return;
9505 }
9506
9507 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9508 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9509 << obc->obs.oi.soid
9510 << dendl;
9511 scrubber.add_callback(
9512 watch->get_delayed_cb() // This callback!
9513 );
9514 return;
9515 }
9516
9517 OpContextUPtr ctx = simple_opc_create(obc);
9518 ctx->at_version = get_next_version();
9519
9520 object_info_t& oi = ctx->new_obs.oi;
9521 oi.watchers.erase(make_pair(watch->get_cookie(),
9522 watch->get_entity()));
9523
9524 list<watch_disconnect_t> watch_disconnects = {
9525 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9526 };
9527 ctx->register_on_success(
9528 [this, obc, watch_disconnects]() {
9529 complete_disconnect_watches(obc, watch_disconnects);
9530 });
9531
9532
9533 PGTransaction *t = ctx->op_t.get();
9534 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9535 ctx->at_version,
9536 oi.version,
9537 0,
9538 osd_reqid_t(), ctx->mtime, 0));
9539
9540 oi.prior_version = obc->obs.oi.version;
9541 oi.version = ctx->at_version;
9542 bufferlist bl;
9543 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9544 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9545
9546 // apply new object state.
9547 ctx->obc->obs = ctx->new_obs;
9548
9549 // no ctx->delta_stats
9550 simple_opc_submit(std::move(ctx));
9551}
9552
9553ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9554 SnapSetContext *ssc)
9555{
9556 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9557 assert(obc->destructor_callback == NULL);
9558 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9559 obc->obs.oi = oi;
9560 obc->obs.exists = false;
9561 obc->ssc = ssc;
9562 if (ssc)
9563 register_snapset_context(ssc);
9564 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9565 if (is_active())
9566 populate_obc_watchers(obc);
9567 return obc;
9568}
9569
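// get_object_context resolves an hobject_t to its in-memory ObjectContext:
// first the object_contexts LRU cache, then the OI_ATTR xattr on disk
// (or the caller-supplied attrs map), optionally creating a fresh context
// for a not-yet-existing object when can_create is set.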
9570ObjectContextRef PrimaryLogPG::get_object_context(
9571 const hobject_t& soid,
9572 bool can_create,
9573 const map<string, bufferlist> *attrs)
9574{
9575 assert(
9576 attrs || !pg_log.get_missing().is_missing(soid) ||
9577 // or this is a revert... see recover_primary()
9578 (pg_log.get_log().objects.count(soid) &&
9579 pg_log.get_log().objects.find(soid)->second->op ==
9580 pg_log_entry_t::LOST_REVERT));
9581 ObjectContextRef obc = object_contexts.lookup(soid);
9582 osd->logger->inc(l_osd_object_ctx_cache_total);
9583 if (obc) {
9584 osd->logger->inc(l_osd_object_ctx_cache_hit);
9585 dout(10) << __func__ << ": found obc in cache: " << obc
9586 << dendl;
9587 } else {
9588 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9589 // check disk
9590 bufferlist bv;
9591 if (attrs) {
9592 assert(attrs->count(OI_ATTR));
9593 bv = attrs->find(OI_ATTR)->second;
9594 } else {
9595 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9596 if (r < 0) {
9597 if (!can_create) {
9598 dout(10) << __func__ << ": no obc for soid "
9599 << soid << " and !can_create"
9600 << dendl;
9601 return ObjectContextRef(); // -ENOENT!
9602 }
9603
9604 dout(10) << __func__ << ": no obc for soid "
9605 << soid << " but can_create"
9606 << dendl;
9607 // new object.
9608 object_info_t oi(soid);
9609 SnapSetContext *ssc = get_snapset_context(
9610 soid, true, 0, false);
9611 assert(ssc);
9612 obc = create_object_context(oi, ssc);
9613 dout(10) << __func__ << ": " << obc << " " << soid
9614 << " " << obc->rwstate
9615 << " oi: " << obc->obs.oi
9616 << " ssc: " << obc->ssc
9617 << " snapset: " << obc->ssc->snapset << dendl;
9618 return obc;
9619 }
9620 }
9621
9622 object_info_t oi;
9623 try {
9624 bufferlist::iterator bliter = bv.begin();
9625 ::decode(oi, bliter);
9626 } catch (...) {
9627 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9628 return ObjectContextRef(); // -ENOENT!
9629 }
9630
9631 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9632
9633 obc = object_contexts.lookup_or_create(oi.soid);
9634 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9635 obc->obs.oi = oi;
9636 obc->obs.exists = true;
9637
9638 obc->ssc = get_snapset_context(
9639 soid, true,
9640 soid.has_snapset() ? attrs : 0);
9641
9642 if (is_active())
9643 populate_obc_watchers(obc);
9644
9645 if (pool.info.require_rollback()) {
9646 if (attrs) {
9647 obc->attr_cache = *attrs;
9648 } else {
9649 int r = pgbackend->objects_get_attrs(
9650 soid,
9651 &obc->attr_cache);
9652 assert(r == 0);
9653 }
9654 }
9655
9656 dout(10) << __func__ << ": creating obc from disk: " << obc
9657 << dendl;
9658 }
9659
9660 // XXX: Caller doesn't expect this
9661 if (obc->ssc == NULL) {
9662 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9663 return ObjectContextRef(); // -ENOENT!
9664 }
9665
9666 dout(10) << __func__ << ": " << obc << " " << soid
9667 << " " << obc->rwstate
9668 << " oi: " << obc->obs.oi
9669 << " exists: " << (int)obc->obs.exists
9670 << " ssc: " << obc->ssc
9671 << " snapset: " << obc->ssc->snapset << dendl;
9672 return obc;
9673}
9674
9675void PrimaryLogPG::context_registry_on_change()
9676{
9677 pair<hobject_t, ObjectContextRef> i;
9678 while (object_contexts.get_next(i.first, &i)) {
9679 ObjectContextRef obc(i.second);
9680 if (obc) {
9681 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9682 obc->watchers.begin();
9683 j != obc->watchers.end();
9684 obc->watchers.erase(j++)) {
9685 j->second->discard();
9686 }
9687 }
9688 }
9689}
9690
9691
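// Snapshot read resolution: find_object_context maps (oid, snap) onto the
// concrete object to read -- the head if snap is newer than the SnapSet's
// seq, otherwise the first clone whose id is >= the requested snap.  For
// example, with clones [2, 5] and seq 5, a read at snap 4 is served by
// clone 5, provided 4 falls within clone 5's recorded snap interval.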
9692/*
9693 * If we return an error, and set *pmissing, then promoting that
9694 * object may help.
9695 *
9696 * If we return -EAGAIN, we will always set *pmissing to the missing
9697 * object to wait for.
9698 *
9699 * If we return an error but do not set *pmissing, then we know the
9700 * object does not exist.
9701 */
9702int PrimaryLogPG::find_object_context(const hobject_t& oid,
9703 ObjectContextRef *pobc,
9704 bool can_create,
9705 bool map_snapid_to_clone,
9706 hobject_t *pmissing)
9707{
9708 FUNCTRACE();
9709 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9710 // want the head?
9711 if (oid.snap == CEPH_NOSNAP) {
9712 ObjectContextRef obc = get_object_context(oid, can_create);
9713 if (!obc) {
9714 if (pmissing)
9715 *pmissing = oid;
9716 return -ENOENT;
9717 }
9718 dout(10) << "find_object_context " << oid
9719 << " @" << oid.snap
9720 << " oi=" << obc->obs.oi
9721 << dendl;
9722 *pobc = obc;
9723
9724 return 0;
9725 }
9726
9727 hobject_t head = oid.get_head();
9728
9729 // want the snapdir?
9730 if (oid.snap == CEPH_SNAPDIR) {
9731 // return head or snapdir, whichever exists.
9732 ObjectContextRef headobc = get_object_context(head, can_create);
9733 ObjectContextRef obc = headobc;
9734 if (!obc || !obc->obs.exists)
9735 obc = get_object_context(oid, can_create);
9736 if (!obc || !obc->obs.exists) {
9737 // if we have neither, we would want to promote the head.
9738 if (pmissing)
9739 *pmissing = head;
9740 if (pobc)
9741 *pobc = headobc; // may be null
9742 return -ENOENT;
9743 }
9744 dout(10) << "find_object_context " << oid
9745 << " @" << oid.snap
9746 << " oi=" << obc->obs.oi
9747 << dendl;
9748 *pobc = obc;
9749
9750 // always populate ssc for SNAPDIR...
9751 if (!obc->ssc)
9752 obc->ssc = get_snapset_context(
9753 oid, true);
9754 return 0;
9755 }
9756
9757 // we want a snap
9758 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9759 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9760 return -ENOENT;
9761 }
9762
9763 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9764 if (!ssc || !(ssc->exists || can_create)) {
9765 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9766 if (pmissing)
9767 *pmissing = head; // start by getting the head
9768 if (ssc)
9769 put_snapset_context(ssc);
9770 return -ENOENT;
9771 }
9772
9773 if (map_snapid_to_clone) {
9774 dout(10) << "find_object_context " << oid << " @" << oid.snap
9775 << " snapset " << ssc->snapset
9776 << " map_snapid_to_clone=true" << dendl;
9777 if (oid.snap > ssc->snapset.seq) {
9778 // already must be readable
9779 ObjectContextRef obc = get_object_context(head, false);
9780 dout(10) << "find_object_context " << oid << " @" << oid.snap
9781 << " snapset " << ssc->snapset
9782 << " maps to head" << dendl;
9783 *pobc = obc;
9784 put_snapset_context(ssc);
9785 return (obc && obc->obs.exists) ? 0 : -ENOENT;
9786 } else {
9787 vector<snapid_t>::const_iterator citer = std::find(
9788 ssc->snapset.clones.begin(),
9789 ssc->snapset.clones.end(),
9790 oid.snap);
9791 if (citer == ssc->snapset.clones.end()) {
9792 dout(10) << "find_object_context " << oid << " @" << oid.snap
9793 << " snapset " << ssc->snapset
9794 << " maps to nothing" << dendl;
9795 put_snapset_context(ssc);
9796 return -ENOENT;
9797 }
9798
9799 dout(10) << "find_object_context " << oid << " @" << oid.snap
9800 << " snapset " << ssc->snapset
9801 << " maps to " << oid << dendl;
9802
9803 if (pg_log.get_missing().is_missing(oid)) {
9804 dout(10) << "find_object_context " << oid << " @" << oid.snap
9805 << " snapset " << ssc->snapset
9806 << " " << oid << " is missing" << dendl;
9807 if (pmissing)
9808 *pmissing = oid;
9809 put_snapset_context(ssc);
9810 return -EAGAIN;
9811 }
9812
9813 ObjectContextRef obc = get_object_context(oid, false);
9814 if (!obc || !obc->obs.exists) {
9815 dout(10) << "find_object_context " << oid << " @" << oid.snap
9816 << " snapset " << ssc->snapset
9817 << " " << oid << " is not present" << dendl;
9818 if (pmissing)
9819 *pmissing = oid;
9820 put_snapset_context(ssc);
9821 return -ENOENT;
9822 }
9823 dout(10) << "find_object_context " << oid << " @" << oid.snap
9824 << " snapset " << ssc->snapset
9825 << " " << oid << " HIT" << dendl;
9826 *pobc = obc;
9827 put_snapset_context(ssc);
9828 return 0;
9829 }
9830 ceph_abort(); //unreachable
9831 }
9832
9833 dout(10) << "find_object_context " << oid << " @" << oid.snap
9834 << " snapset " << ssc->snapset << dendl;
9835
9836 // head?
9837 if (oid.snap > ssc->snapset.seq) {
9838 if (ssc->snapset.head_exists) {
9839 ObjectContextRef obc = get_object_context(head, false);
9840 dout(10) << "find_object_context " << head
9841 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9842 << " -- HIT " << obc->obs
9843 << dendl;
9844 if (!obc->ssc)
9845 obc->ssc = ssc;
9846 else {
9847 assert(ssc == obc->ssc);
9848 put_snapset_context(ssc);
9849 }
9850 *pobc = obc;
9851 return 0;
9852 }
9853 dout(10) << "find_object_context " << head
9854 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9855 << " but head dne -- DNE"
9856 << dendl;
9857 put_snapset_context(ssc);
9858 return -ENOENT;
9859 }
9860
9861 // which clone would it be?
9862 unsigned k = 0;
9863 while (k < ssc->snapset.clones.size() &&
9864 ssc->snapset.clones[k] < oid.snap)
9865 k++;
9866 if (k == ssc->snapset.clones.size()) {
9867 dout(10) << "find_object_context no clones with last >= oid.snap "
9868 << oid.snap << " -- DNE" << dendl;
9869 put_snapset_context(ssc);
9870 return -ENOENT;
9871 }
9872 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
9873 info.pgid.pool(), oid.get_namespace());
9874
9875 if (pg_log.get_missing().is_missing(soid)) {
9876 dout(20) << "find_object_context " << soid << " missing, try again later"
9877 << dendl;
9878 if (pmissing)
9879 *pmissing = soid;
9880 put_snapset_context(ssc);
9881 return -EAGAIN;
9882 }
9883
9884 ObjectContextRef obc = get_object_context(soid, false);
9885 if (!obc || !obc->obs.exists) {
9886 dout(20) << __func__ << " missing clone " << soid << dendl;
9887 if (pmissing)
9888 *pmissing = soid;
9889 put_snapset_context(ssc);
9890 return -ENOENT;
9891 }
9892
9893 if (!obc->ssc) {
9894 obc->ssc = ssc;
9895 } else {
9896 assert(obc->ssc == ssc);
9897 put_snapset_context(ssc);
9898 }
9899 ssc = 0;
9900
9901 // clone
9902 dout(20) << "find_object_context " << soid
9903 << " snapset " << obc->ssc->snapset
9904 << " legacy_snaps " << obc->obs.oi.legacy_snaps
9905 << dendl;
9906 snapid_t first, last;
9907 if (obc->ssc->snapset.is_legacy()) {
9908 first = obc->obs.oi.legacy_snaps.back();
9909 last = obc->obs.oi.legacy_snaps.front();
9910 } else {
9911 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9912 assert(p != obc->ssc->snapset.clone_snaps.end());
9913 first = p->second.back();
9914 last = p->second.front();
9915 }
9916 if (first <= oid.snap) {
9917 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9918 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
9919 *pobc = obc;
9920 return 0;
9921 } else {
9922 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9923 << "] does not contain " << oid.snap << " -- DNE" << dendl;
9924 return -ENOENT;
9925 }
9926}
9927
9928void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
9929{
9930 if (obc->ssc)
9931 put_snapset_context(obc->ssc);
9932}
9933
9934void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
9935{
9936 object_info_t& oi = obc->obs.oi;
9937
9938 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
9939 object_stat_sum_t stat;
9940
9941 stat.num_bytes += oi.size;
9942
9943 if (oi.soid.snap != CEPH_SNAPDIR)
9944 stat.num_objects++;
9945 if (oi.is_dirty())
9946 stat.num_objects_dirty++;
9947 if (oi.is_whiteout())
9948 stat.num_whiteouts++;
9949 if (oi.is_omap())
9950 stat.num_objects_omap++;
9951 if (oi.is_cache_pinned())
9952 stat.num_objects_pinned++;
9953
9954 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
9955 stat.num_object_clones++;
9956
9957 if (!obc->ssc)
9958 obc->ssc = get_snapset_context(oi.soid, false);
9959 assert(obc->ssc);
9960
9961 // subtract off clone overlap
9962 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
9963 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
9964 for (interval_set<uint64_t>::const_iterator r = o.begin();
9965 r != o.end();
9966 ++r) {
9967 stat.num_bytes -= r.get_len();
9968 }
9969 }
9970 }
9971
9972 // add it in
9973 pgstat->stats.sum.add(stat);
9974}
9975
9976void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
9977{
9978 const hobject_t& soid = obc->obs.oi.soid;
9979 if (obc->is_blocked()) {
9980 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
9981 return;
9982 }
9983
9984 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
9985 if (p != waiting_for_blocked_object.end()) {
9986 list<OpRequestRef>& ls = p->second;
9987 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
9988 requeue_ops(ls);
9989 waiting_for_blocked_object.erase(p);
9990 }
9991
9992 map<hobject_t, ObjectContextRef>::iterator i =
9993 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
9994 if (i != objects_blocked_on_snap_promotion.end()) {
9995 assert(i->second == obc);
9996 objects_blocked_on_snap_promotion.erase(i);
9997 }
9998
9999 if (obc->requeue_scrub_on_unblock) {
10000 obc->requeue_scrub_on_unblock = false;
10001 requeue_scrub();
10002 }
10003}
10004
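// SnapSetContexts are refcounted handles on a per-object SnapSet, shared
// through the snapset_contexts map under snapset_contexts_lock.  Every
// successful get_snapset_context() must be balanced by a
// put_snapset_context() (or by handing the ref to an obc, as
// find_object_context does above).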
10005SnapSetContext *PrimaryLogPG::get_snapset_context(
10006 const hobject_t& oid,
10007 bool can_create,
10008 const map<string, bufferlist> *attrs,
10009 bool oid_existed)
10010{
10011 Mutex::Locker l(snapset_contexts_lock);
10012 SnapSetContext *ssc;
10013 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10014 oid.get_snapdir());
10015 if (p != snapset_contexts.end()) {
10016 if (can_create || p->second->exists) {
10017 ssc = p->second;
10018 } else {
10019 return NULL;
10020 }
10021 } else {
10022 bufferlist bv;
10023 if (!attrs) {
10024 int r = -ENOENT;
10025 if (!(oid.is_head() && !oid_existed))
10026 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10027 if (r < 0) {
10028 // try _snapset
10029 if (!(oid.is_snapdir() && !oid_existed))
10030 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10031 if (r < 0 && !can_create)
10032 return NULL;
10033 }
10034 } else {
10035 assert(attrs->count(SS_ATTR));
10036 bv = attrs->find(SS_ATTR)->second;
10037 }
10038 ssc = new SnapSetContext(oid.get_snapdir());
10039 _register_snapset_context(ssc);
10040 if (bv.length()) {
10041 bufferlist::iterator bvp = bv.begin();
10042 try {
10043 ssc->snapset.decode(bvp);
10044 } catch (buffer::error& e) {
10045 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10046 return NULL;
10047 }
10048 ssc->exists = true;
10049 } else {
10050 ssc->exists = false;
10051 }
10052 }
10053 assert(ssc);
10054 ssc->ref++;
10055 return ssc;
10056}
10057
10058void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10059{
10060 Mutex::Locker l(snapset_contexts_lock);
10061 --ssc->ref;
10062 if (ssc->ref == 0) {
10063 if (ssc->registered)
10064 snapset_contexts.erase(ssc->oid);
10065 delete ssc;
10066 }
10067}
10068
10069/** pull - request object from a peer
10070 */
10071
10072/*
10073 * Return values:
10074 * NONE - didn't pull anything
10075 * YES - pulled what the caller wanted
10076 * OTHER - needed to pull something else first (_head or _snapdir)
10077 */
10078enum { PULL_NONE, PULL_OTHER, PULL_YES };
10079
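// recover_missing recovers a single missing object.  For a clone it first
// recurses to pull the head (or snapdir) so the SnapSet is available to
// the backend -- returning PULL_OTHER to tell the caller to retry the
// clone later -- and only then starts the pull of the clone itself.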
10080int PrimaryLogPG::recover_missing(
10081 const hobject_t &soid, eversion_t v,
10082 int priority,
10083 PGBackend::RecoveryHandle *h)
10084{
10085 if (missing_loc.is_unfound(soid)) {
10086 dout(7) << "pull " << soid
10087 << " v " << v
10088 << " but it is unfound" << dendl;
10089 return PULL_NONE;
10090 }
10091
10092 // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10093 ObjectContextRef obc;
10094 ObjectContextRef head_obc;
10095 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10096 // do we have the head and/or snapdir?
10097 hobject_t head = soid.get_head();
10098 if (pg_log.get_missing().is_missing(head)) {
10099 if (recovering.count(head)) {
10100 dout(10) << " missing but already recovering head " << head << dendl;
10101 return PULL_NONE;
10102 } else {
10103 int r = recover_missing(
10104 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10105 h);
10106 if (r != PULL_NONE)
10107 return PULL_OTHER;
10108 return PULL_NONE;
10109 }
10110 }
10111 head = soid.get_snapdir();
10112 if (pg_log.get_missing().is_missing(head)) {
10113 if (recovering.count(head)) {
10114 dout(10) << " missing but already recovering snapdir " << head << dendl;
10115 return PULL_NONE;
10116 } else {
10117 int r = recover_missing(
10118 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10119 h);
10120 if (r != PULL_NONE)
10121 return PULL_OTHER;
10122 return PULL_NONE;
10123 }
10124 }
10125
10126 // we must have one or the other
10127 head_obc = get_object_context(
10128 soid.get_head(),
10129 false,
10130 0);
10131 if (!head_obc)
10132 head_obc = get_object_context(
10133 soid.get_snapdir(),
10134 false,
10135 0);
10136 assert(head_obc);
10137 }
10138 start_recovery_op(soid);
10139 assert(!recovering.count(soid));
10140 recovering.insert(make_pair(soid, obc));
10141 int r = pgbackend->recover_object(
10142 soid,
10143 v,
10144 head_obc,
10145 obc,
10146 h);
10147 // This is only a pull, which shouldn't return an error
10148 assert(r >= 0);
10149 return PULL_YES;
10150}
10151
10152void PrimaryLogPG::send_remove_op(
10153 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10154{
10155 ceph_tid_t tid = osd->get_tid();
10156 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10157
10158 dout(10) << "send_remove_op " << oid << " from osd." << peer
10159 << " tid " << tid << dendl;
10160
10161 MOSDSubOp *subop = new MOSDSubOp(
10162 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10163 oid, CEPH_OSD_FLAG_ACK,
10164 get_osdmap()->get_epoch(), tid, v);
10165 subop->ops = vector<OSDOp>(1);
10166 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10167
10168 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10169}
10170
10171
10172void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10173{
10174 dout(10) << "finish_degraded_object " << oid << dendl;
10175 ObjectContextRef obc(object_contexts.lookup(oid));
10176 if (callbacks_for_degraded_object.count(oid)) {
10177 list<Context*> contexts;
10178 contexts.swap(callbacks_for_degraded_object[oid]);
10179 callbacks_for_degraded_object.erase(oid);
10180 for (list<Context*>::iterator i = contexts.begin();
10181 i != contexts.end();
10182 ++i) {
10183 (*i)->complete(0);
10184 }
10185 }
10186 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10187 oid.get_head());
10188 if (i != objects_blocked_on_degraded_snap.end() &&
10189 i->second == oid.snap)
10190 objects_blocked_on_degraded_snap.erase(i);
10191}
10192
10193void PrimaryLogPG::_committed_pushed_object(
10194 epoch_t epoch, eversion_t last_complete)
10195{
10196 lock();
10197 if (!pg_has_reset_since(epoch)) {
10198 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10199 last_complete_ondisk = last_complete;
10200
10201 if (last_complete_ondisk == info.last_update) {
10202 if (!is_primary()) {
10203 // Either we are a replica or backfill target.
10204 // we are fully up to date. tell the primary!
10205 osd->send_message_osd_cluster(
10206 get_primary().osd,
10207 new MOSDPGTrim(
10208 get_osdmap()->get_epoch(),
10209 spg_t(info.pgid.pgid, get_primary().shard),
10210 last_complete_ondisk),
10211 get_osdmap()->get_epoch());
10212 } else {
10213 calc_min_last_complete_ondisk();
10214 }
10215 }
10216
10217 } else {
10218 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10219 }
10220
10221 unlock();
10222}
10223
10224void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10225{
10226 lock();
10227 dout(10) << "_applied_recovered_object " << *obc << dendl;
10228
10229 assert(active_pushes >= 1);
10230 --active_pushes;
10231
10232 // requeue an active chunky scrub waiting on recovery ops
10233 if (!deleting && active_pushes == 0
10234 && scrubber.is_chunky_scrub_active()) {
10235 if (ops_blocked_by_scrub()) {
10236 requeue_scrub(true);
10237 } else {
10238 requeue_scrub(false);
10239 }
10240 }
10241
10242 unlock();
10243}
10244
10245void PrimaryLogPG::_applied_recovered_object_replica()
10246{
10247 lock();
10248 dout(10) << "_applied_recovered_object_replica" << dendl;
10249
10250 assert(active_pushes >= 1);
10251 --active_pushes;
10252
10253 // requeue an active chunky scrub waiting on recovery ops
10254 if (!deleting && active_pushes == 0 &&
10255 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10256 scrubber.active_rep_scrub->get_req())->chunky) {
10257 osd->enqueue_back(
10258 info.pgid,
10259 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10260 scrubber.active_rep_scrub = OpRequestRef();
10261 }
10262
10263 unlock();
10264}
10265
10266void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10267{
10268 dout(10) << "got missing " << oid << " v " << v << dendl;
10269 pg_log.recover_got(oid, v, info);
10270 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10271 dout(10) << "last_complete now " << info.last_complete
10272 << " log.complete_to " << pg_log.get_log().complete_to->version
10273 << dendl;
10274 } else {
10275 dout(10) << "last_complete now " << info.last_complete
10276 << " log.complete_to at end" << dendl;
10277 //below is not true in the repair case.
10278 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10279 assert(info.last_complete == info.last_update);
10280 }
10281}
10282
10283void PrimaryLogPG::primary_failed(const hobject_t &soid)
10284{
10285 list<pg_shard_t> fl = { pg_whoami };
10286 failed_push(fl, soid);
10287}
10288
10289void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10290{
10291 dout(20) << __func__ << ": " << soid << dendl;
10292 assert(recovering.count(soid));
10293 auto obc = recovering[soid];
10294 if (obc) {
10295 list<OpRequestRef> blocked_ops;
10296 obc->drop_recovery_read(&blocked_ops);
10297 requeue_ops(blocked_ops);
10298 }
10299 recovering.erase(soid);
10300 for (auto&& i : from)
10301 missing_loc.remove_location(soid, i);
10302 dout(0) << __func__ << " " << soid << " from shard " << from
10303 << ", reps on " << missing_loc.get_locations(soid)
10304 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10305 finish_recovery_op(soid); // close out this attempt
10306}
10307
10308void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10309{
10310 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10311 assert(m->get_type() == MSG_OSD_SUBOP);
10312 dout(7) << "sub_op_remove " << m->poid << dendl;
10313
10314 op->mark_started();
10315
10316 ObjectStore::Transaction t;
10317 remove_snap_mapped_object(t, m->poid);
10318 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10319 assert(r == 0);
10320}
10321
10322eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10323{
10324 eversion_t v;
10325 pg_missing_item pmi;
10326 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10327 assert(is_missing);
10328 v = pmi.have;
10329 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10330
10331 assert(!actingbackfill.empty());
10332 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10333 i != actingbackfill.end();
10334 ++i) {
10335 if (*i == get_primary()) continue;
10336 pg_shard_t peer = *i;
10337 if (!peer_missing[peer].is_missing(oid)) {
10338 continue;
10339 }
10340 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10341 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10342 if (h > v)
10343 v = h;
10344 }
10345
10346 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10347 return v;
10348}
10349
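// Replica-side handler for the jewel+ log-update protocol used by
// submit_log_entries() above: apply the entries locally, then ack back
// to the primary with an MOSDPGUpdateLogMissingReply once the
// transaction is durable (or merely complete, for the pre-kraken EC
// ordering hack below).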
10350void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10351{
10352 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10353 op->get_req());
10354 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10355 ObjectStore::Transaction t;
10356 append_log_entries_update_missing(m->entries, t);
10357
10358 Context *complete = new FunctionContext(
10359 [=](int) {
10360 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10361 op->get_req());
10362 lock();
10363 if (!pg_has_reset_since(msg->get_epoch())) {
10364 MOSDPGUpdateLogMissingReply *reply =
10365 new MOSDPGUpdateLogMissingReply(
10366 spg_t(info.pgid.pgid, primary_shard().shard),
10367 pg_whoami.shard,
10368 msg->get_epoch(),
10369 msg->min_epoch,
10370 msg->get_tid());
10371 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10372 msg->get_connection()->send_message(reply);
10373 }
10374 unlock();
10375 });
10376
10377 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10378 t.register_on_commit(complete);
10379 } else {
10380 /* Hack to work around the fact that ReplicatedBackend sends
10381 * ack+commit if commit happens first
10382 *
10383 * This behavior is no longer necessary, but we preserve it so old
10384 * primaries can keep their repops in order */
10385 if (pool.info.ec_pool()) {
10386 t.register_on_complete(complete);
10387 } else {
10388 t.register_on_commit(complete);
10389 }
10390 }
10391 t.register_on_applied(
10392 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10393 int tr = osd->store->queue_transaction(
10394 osr.get(),
10395 std::move(t),
10396 nullptr);
10397 assert(tr == 0);
10398}
10399
10400void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10401{
10402 const MOSDPGUpdateLogMissingReply *m =
10403 static_cast<const MOSDPGUpdateLogMissingReply*>(
10404 op->get_req());
10405 dout(20) << __func__ << " got reply from "
10406 << m->get_from() << dendl;
10407
10408 auto it = log_entry_update_waiting_on.find(m->get_tid());
10409 if (it != log_entry_update_waiting_on.end()) {
10410 if (it->second.waiting_on.count(m->get_from())) {
10411 it->second.waiting_on.erase(m->get_from());
10412 } else {
10413 osd->clog->error()
10414 << info.pgid << " got reply "
10415 << *m << " from shard we are not waiting for "
10416 << m->get_from();
10417 }
10418
10419 if (it->second.waiting_on.empty()) {
10420 repop_all_committed(it->second.repop.get());
10421 log_entry_update_waiting_on.erase(it);
10422 }
10423 } else {
10424 osd->clog->error()
10425 << info.pgid << " got reply "
10426 << *m << " on unknown tid " << m->get_tid();
10427 }
10428}
10429
10430/* Mark all unfound objects as lost.
10431 */
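// LOST_REVERT rolls each unfound object back to the newest version some
// peer still has (see pick_newest_available above); LOST_DELETE logs a
// deletion, using try_rmobject() on rollback-capable (e.g. EC) pools on
// jewel+ maps so the delete itself remains rollbackable.  LOST_MARK is
// not implemented yet.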
10432void PrimaryLogPG::mark_all_unfound_lost(
10433 int what,
10434 ConnectionRef con,
10435 ceph_tid_t tid)
10436{
10437 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10438 list<hobject_t> oids;
10439
10440 dout(30) << __func__ << ": log before:\n";
10441 pg_log.get_log().print(*_dout);
10442 *_dout << dendl;
10443
10444 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10445
10446 utime_t mtime = ceph_clock_now();
10447 map<hobject_t, pg_missing_item>::const_iterator m =
10448 missing_loc.get_needs_recovery().begin();
10449 map<hobject_t, pg_missing_item>::const_iterator mend =
10450 missing_loc.get_needs_recovery().end();
10451
10452 ObcLockManager manager;
10453 eversion_t v = get_next_version();
10454 v.epoch = get_osdmap()->get_epoch();
10455 uint64_t num_unfound = missing_loc.num_unfound();
10456 while (m != mend) {
10457 const hobject_t &oid(m->first);
10458 if (!missing_loc.is_unfound(oid)) {
10459 // We only care about unfound objects
10460 ++m;
10461 continue;
10462 }
10463
10464 ObjectContextRef obc;
10465 eversion_t prev;
10466
10467 switch (what) {
10468 case pg_log_entry_t::LOST_MARK:
10469 assert(0 == "actually, not implemented yet!");
10470 break;
10471
10472 case pg_log_entry_t::LOST_REVERT:
10473 prev = pick_newest_available(oid);
10474 if (prev > eversion_t()) {
10475 // log it
10476 pg_log_entry_t e(
10477 pg_log_entry_t::LOST_REVERT, oid, v,
10478 m->second.need, 0, osd_reqid_t(), mtime, 0);
10479 e.reverting_to = prev;
10480 e.mark_unrollbackable();
10481 log_entries.push_back(e);
10482 dout(10) << e << dendl;
10483
10484 // we are now missing the new version; recovery code will sort it out.
10485 ++v.version;
10486 ++m;
10487 break;
10488 }
10489
10490 case pg_log_entry_t::LOST_DELETE:
10491 {
10492 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10493 0, osd_reqid_t(), mtime, 0);
10494 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10495 if (pool.info.require_rollback()) {
10496 e.mod_desc.try_rmobject(v.version);
10497 } else {
10498 e.mark_unrollbackable();
10499 }
10500 } // otherwise, just do what we used to do
10501 dout(10) << e << dendl;
10502 log_entries.push_back(e);
10503 oids.push_back(oid);
10504
10505 ++v.version;
10506 ++m;
10507 }
10508 break;
10509
10510 default:
10511 ceph_abort();
10512 }
10513 }
10514
10515 info.stats.stats_invalid = true;
10516
10517 submit_log_entries(
10518 log_entries,
10519 std::move(manager),
10520 boost::optional<std::function<void(void)> >(
10521 [this, oids, con, num_unfound, tid]() {
10522 for (auto oid: oids)
10523 missing_loc.recovered(oid);
10524 for (auto& p : waiting_for_unreadable_object) {
10525 release_backoffs(p.first);
10526 }
10527 requeue_object_waiters(waiting_for_unreadable_object);
10528 queue_recovery();
10529
10530 stringstream ss;
10531 ss << "pg has " << num_unfound
10532 << " objects unfound and apparently lost marking";
10533 string rs = ss.str();
10534 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10535 osd->clog->info() << rs;
10536 if (con) {
10537 MCommandReply *reply = new MCommandReply(0, rs);
10538 reply->set_tid(tid);
10539 con->send_message(reply);
10540 }
10541 }),
10542 OpRequestRef());
10543}
10544
10545void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10546{
10547 assert(repop_queue.empty());
10548}
10549
10550/*
10551 * pg status change notification
10552 */
10553
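// apply_and_flush_repops aborts every in-flight RepGather (e.g. on
// interval change or shutdown).  With requeue=true the original client
// ops, plus any dup ops parked in waiting_for_ondisk, are requeued in
// their original submission order so they can be re-examined.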
10554void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10555{
10556 list<OpRequestRef> rq;
10557
10558 // apply all repops
10559 while (!repop_queue.empty()) {
10560 RepGather *repop = repop_queue.front();
10561 repop_queue.pop_front();
10562 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10563 repop->rep_aborted = true;
10564 repop->on_applied.clear();
10565 repop->on_committed.clear();
10566 repop->on_success.clear();
10567
10568 if (requeue) {
10569 if (repop->op) {
10570 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10571 rq.push_back(repop->op);
10572 repop->op = OpRequestRef();
10573 }
10574
10575 // also requeue any dups, interleaved into position
10576 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10577 waiting_for_ondisk.find(repop->v);
10578 if (p != waiting_for_ondisk.end()) {
10579 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10580 for (list<pair<OpRequestRef, version_t> >::iterator i =
10581 p->second.begin();
10582 i != p->second.end();
10583 ++i) {
10584 rq.push_back(i->first);
10585 }
10586 waiting_for_ondisk.erase(p);
10587 }
10588 }
10589
10590 remove_repop(repop);
10591 }
10592
10593 assert(repop_queue.empty());
10594
10595 if (requeue) {
10596 requeue_ops(rq);
10597 if (!waiting_for_ondisk.empty()) {
10598 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10599 waiting_for_ondisk.begin();
10600 i != waiting_for_ondisk.end();
10601 ++i) {
10602 for (list<pair<OpRequestRef, version_t> >::iterator j =
10603 i->second.begin();
10604 j != i->second.end();
10605 ++j) {
10606 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10607 << i->first << dendl;
10608 }
10609 }
10610 assert(waiting_for_ondisk.empty());
10611 }
10612 }
10613
10614 waiting_for_ondisk.clear();
10615}
10616
10617void PrimaryLogPG::on_flushed()
10618{
10619 assert(flushes_in_progress > 0);
10620 flushes_in_progress--;
10621 if (flushes_in_progress == 0) {
10622 requeue_ops(waiting_for_peered);
10623 }
10624 if (!is_peered() || !is_primary()) {
10625 pair<hobject_t, ObjectContextRef> i;
10626 while (object_contexts.get_next(i.first, &i)) {
10627 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10628 }
10629 assert(object_contexts.empty());
10630 }
10631 pgbackend->on_flushed();
10632}
10633
10634void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10635{
10636 dout(10) << "on_removal" << dendl;
10637
10638 // adjust info to backfill
10639 info.set_last_backfill(hobject_t());
10640 pg_log.reset_backfill();
10641 dirty_info = true;
10642
10643
10644 // clear log
10645 PGLogEntryHandler rollbacker{this, t};
10646 pg_log.roll_forward(&rollbacker);
10647
10648 write_if_dirty(*t);
10649
10650 if (!deleting)
10651 on_shutdown();
10652}
10653
10654void PrimaryLogPG::on_shutdown()
10655{
10656 dout(10) << "on_shutdown" << dendl;
10657
10658 // remove from queues
10659 osd->pg_stat_queue_dequeue(this);
10660 osd->peering_wq.dequeue(this);
10661
10662 // handles queue races
10663 deleting = true;
10664
10665 if (recovery_queued) {
10666 recovery_queued = false;
10667 osd->clear_queued_recovery(this);
10668 }
10669
10670 clear_scrub_reserved();
10671 scrub_clear_state();
10672
10673 unreg_next_scrub();
10674 cancel_copy_ops(false);
10675 cancel_flush_ops(false);
10676 cancel_proxy_ops(false);
10677 apply_and_flush_repops(false);
10678 cancel_log_updates();
10679 // we must remove PGRefs, so do this prior to release_backoffs() callers
10680 clear_backoffs();
10681 // clean up snap trim references
10682 snap_trimmer_machine.process_event(Reset());
10683
10684 pgbackend->on_change();
10685
10686 context_registry_on_change();
10687 object_contexts.clear();
10688
10689 osd->remote_reserver.cancel_reservation(info.pgid);
10690 osd->local_reserver.cancel_reservation(info.pgid);
10691
10692 clear_primary_state();
10693 cancel_recovery();
10694}
10695
10696void PrimaryLogPG::on_activate()
10697{
10698 // all clean?
10699 if (needs_recovery()) {
10700 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10701 queue_peering_event(
10702 CephPeeringEvtRef(
10703 std::make_shared<CephPeeringEvt>(
10704 get_osdmap()->get_epoch(),
10705 get_osdmap()->get_epoch(),
10706 DoRecovery())));
10707 } else if (needs_backfill()) {
10708 dout(10) << "activate queueing backfill" << dendl;
10709 queue_peering_event(
10710 CephPeeringEvtRef(
10711 std::make_shared<CephPeeringEvt>(
10712 get_osdmap()->get_epoch(),
10713 get_osdmap()->get_epoch(),
10714 RequestBackfill())));
10715 } else {
10716 dout(10) << "activate all replicas clean, no recovery" << dendl;
10717 eio_errors_to_process = false;
10718 queue_peering_event(
10719 CephPeeringEvtRef(
10720 std::make_shared<CephPeeringEvt>(
10721 get_osdmap()->get_epoch(),
10722 get_osdmap()->get_epoch(),
10723 AllReplicasRecovered())));
10724 }
10725
10726 publish_stats_to_osd();
10727
10728 if (!backfill_targets.empty()) {
10729 last_backfill_started = earliest_backfill();
10730 new_backfill = true;
10731 assert(!last_backfill_started.is_max());
10732 dout(5) << "on activate: bft=" << backfill_targets
10733 << " from " << last_backfill_started << dendl;
10734 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
10735 i != backfill_targets.end();
10736 ++i) {
10737 dout(5) << "target shard " << *i
10738 << " from " << peer_info[*i].last_backfill
10739 << dendl;
10740 }
10741 }
10742
10743 hit_set_setup();
10744 agent_setup();
10745}
10746
10747void PrimaryLogPG::_on_new_interval()
10748{
10749}
10750
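// on_change runs on every interval change.  Note the deliberate ordering:
// waiters are requeued in the reverse order they should be re-examined,
// in-flight copy/flush/proxy ops and repops are cancelled (requeued only
// if we remain primary), and obc/watch state is torn down last so that
// watches newly registered by flushed repops are still caught.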
10751void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
10752{
10753 dout(10) << "on_change" << dendl;
10754
10755 if (hit_set && hit_set->insert_count() == 0) {
10756 dout(20) << " discarding empty hit_set" << dendl;
10757 hit_set_clear();
10758 }
10759
10760 if (recovery_queued) {
10761 recovery_queued = false;
10762 osd->clear_queued_recovery(this);
10763 }
10764
10765 // requeue everything in the reverse order they should be
10766 // reexamined.
10767 requeue_ops(waiting_for_peered);
10768 requeue_ops(waiting_for_active);
10769
10770 clear_scrub_reserved();
10771
10772 cancel_copy_ops(is_primary());
10773 cancel_flush_ops(is_primary());
10774 cancel_proxy_ops(is_primary());
10775
10776 // requeue object waiters
10777 for (auto& p : waiting_for_unreadable_object) {
10778 release_backoffs(p.first);
10779 }
10780 if (is_primary()) {
10781 requeue_object_waiters(waiting_for_unreadable_object);
10782 } else {
10783 waiting_for_unreadable_object.clear();
10784 }
10785 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
10786 p != waiting_for_degraded_object.end();
10787 waiting_for_degraded_object.erase(p++)) {
10788 release_backoffs(p->first);
10789 if (is_primary())
10790 requeue_ops(p->second);
10791 else
10792 p->second.clear();
10793 finish_degraded_object(p->first);
10794 }
10795
10796 // requeues waiting_for_scrub
10797 scrub_clear_state();
10798
10799 for (auto p = waiting_for_blocked_object.begin();
10800 p != waiting_for_blocked_object.end();
10801 waiting_for_blocked_object.erase(p++)) {
10802 if (is_primary())
10803 requeue_ops(p->second);
10804 else
10805 p->second.clear();
10806 }
10807 for (auto i = callbacks_for_degraded_object.begin();
10808 i != callbacks_for_degraded_object.end();
10809 ) {
10810 finish_degraded_object((i++)->first);
10811 }
10812 assert(callbacks_for_degraded_object.empty());
10813
10814 if (is_primary()) {
10815 requeue_ops(waiting_for_cache_not_full);
10816 } else {
10817 waiting_for_cache_not_full.clear();
10818 }
10819 objects_blocked_on_cache_full.clear();
10820
10821 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
10822 in_progress_async_reads.begin();
10823 i != in_progress_async_reads.end();
10824 in_progress_async_reads.erase(i++)) {
10825 close_op_ctx(i->second);
10826 if (is_primary())
10827 requeue_op(i->first);
10828 }
10829
10830 // this will requeue ops we were working on but didn't finish, and
10831 // any dups
10832 apply_and_flush_repops(is_primary());
10833 cancel_log_updates();
10834
10835 // do this *after* apply_and_flush_repops so that we catch any newly
10836 // registered watches.
10837 context_registry_on_change();
10838
10839 pgbackend->on_change_cleanup(t);
10840 scrubber.cleanup_store(t);
10841 pgbackend->on_change();
10842
10843 // clear snap_trimmer state
10844 snap_trimmer_machine.process_event(Reset());
10845
10846 debug_op_order.clear();
10847 unstable_stats.clear();
10848
10849 // we don't want to cache object_contexts through the interval change
10850 // NOTE: we actually assert that all currently live references are dead
10851 // by the time the flush for the next interval completes.
10852 object_contexts.clear();
10853
10854 // should have been cleared above by finishing all of the degraded objects
10855 assert(objects_blocked_on_degraded_snap.empty());
10856}
10857
10858void PrimaryLogPG::on_role_change()
10859{
10860 dout(10) << "on_role_change" << dendl;
10861 if (get_role() != 0 && hit_set) {
10862 dout(10) << " clearing hit set" << dendl;
10863 hit_set_clear();
10864 }
10865}
10866
10867void PrimaryLogPG::on_pool_change()
10868{
10869 dout(10) << __func__ << dendl;
10870 // requeue cache full waiters just in case the cache_mode is
10871 // changing away from writeback mode. note that if we are not
10872 // active the normal requeuing machinery is sufficient (and properly
10873 // ordered).
10874 if (is_active() &&
10875 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10876 !waiting_for_cache_not_full.empty()) {
10877 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
10878 << dendl;
10879 requeue_ops(waiting_for_cache_not_full);
10880 objects_blocked_on_cache_full.clear();
10881 }
10882 hit_set_setup();
10883 agent_setup();
10884}
10885
10886// clear state. called on recovery completion AND cancellation.
10887void PrimaryLogPG::_clear_recovery_state()
10888{
10889 missing_loc.clear();
10890#ifdef DEBUG_RECOVERY_OIDS
10891 recovering_oids.clear();
10892#endif
10893 last_backfill_started = hobject_t();
10894 set<hobject_t>::iterator i = backfills_in_flight.begin();
10895 while (i != backfills_in_flight.end()) {
10896 assert(recovering.count(*i));
10897 backfills_in_flight.erase(i++);
10898 }
10899
10900 list<OpRequestRef> blocked_ops;
10901 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
10902 i != recovering.end();
10903 recovering.erase(i++)) {
10904 if (i->second) {
10905 i->second->drop_recovery_read(&blocked_ops);
10906 requeue_ops(blocked_ops);
10907 }
10908 }
10909 assert(backfills_in_flight.empty());
10910 pending_backfill_updates.clear();
10911 assert(recovering.empty());
10912 pgbackend->clear_recovery_state();
10913}
10914
10915void PrimaryLogPG::cancel_pull(const hobject_t &soid)
10916{
10917 dout(20) << __func__ << ": " << soid << dendl;
10918 assert(recovering.count(soid));
10919 ObjectContextRef obc = recovering[soid];
10920 if (obc) {
10921 list<OpRequestRef> blocked_ops;
10922 obc->drop_recovery_read(&blocked_ops);
10923 requeue_ops(blocked_ops);
10924 }
10925 recovering.erase(soid);
10926 finish_recovery_op(soid);
10927 release_backoffs(soid);
10928 if (waiting_for_degraded_object.count(soid)) {
10929 dout(20) << " kicking degraded waiters on " << soid << dendl;
10930 requeue_ops(waiting_for_degraded_object[soid]);
10931 waiting_for_degraded_object.erase(soid);
10932 }
10933 if (waiting_for_unreadable_object.count(soid)) {
10934 dout(20) << " kicking unreadable waiters on " << soid << dendl;
10935 requeue_ops(waiting_for_unreadable_object[soid]);
10936 waiting_for_unreadable_object.erase(soid);
10937 }
10938 if (is_missing_object(soid))
10939 pg_log.set_last_requested(0); // get recover_primary to start over
10940 finish_degraded_object(soid);
10941}
10942
10943void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
10944{
10945 /*
10946 * check that any peers we are planning to pull (or are currently
10947 * pulling) objects from are dealt with.
10948 */
10949 missing_loc.check_recovery_sources(osdmap);
10950 pgbackend->check_recovery_sources(osdmap);
10951
10952 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
10953 i != peer_log_requested.end();
10954 ) {
10955 if (!osdmap->is_up(i->osd)) {
10956 dout(10) << "peer_log_requested removing " << *i << dendl;
10957 peer_log_requested.erase(i++);
10958 } else {
10959 ++i;
10960 }
10961 }
10962
10963 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
10964 i != peer_missing_requested.end();
10965 ) {
10966 if (!osdmap->is_up(i->osd)) {
10967 dout(10) << "peer_missing_requested removing " << *i << dendl;
10968 peer_missing_requested.erase(i++);
10969 } else {
10970 ++i;
10971 }
10972 }
10973}
10974
10975void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
10976{
10977 set<pg_shard_t> now_down;
10978 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
10979 p != missing_loc_sources.end();
10980 ) {
10981 if (osdmap->is_up(p->osd)) {
10982 ++p;
10983 continue;
10984 }
10985 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
10986 now_down.insert(*p);
10987 missing_loc_sources.erase(p++);
10988 }
10989
10990 if (now_down.empty()) {
10991 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
10992 } else {
10993 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
10994 << missing_loc_sources << dendl;
10995
10996 // filter missing_loc
10997 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
10998 while (p != missing_loc.end()) {
10999 set<pg_shard_t>::iterator q = p->second.begin();
11000 while (q != p->second.end())
11001 if (now_down.count(*q)) {
11002 p->second.erase(q++);
11003 } else {
11004 ++q;
11005 }
11006 if (p->second.empty())
11007 missing_loc.erase(p++);
11008 else
11009 ++p;
11010 }
11011 }
11012}
11013
11014
11015bool PrimaryLogPG::start_recovery_ops(
11016 uint64_t max,
11017 ThreadPool::TPHandle &handle,
11018 uint64_t *ops_started)
11019{
11020 uint64_t& started = *ops_started;
11021 started = 0;
11022 bool work_in_progress = false;
11023 assert(is_primary());
11024
11025 if (!state_test(PG_STATE_RECOVERING) &&
11026 !state_test(PG_STATE_BACKFILL)) {
11027 /* TODO: I think this case is broken and will make do_recovery()
11028 * unhappy since we're returning false */
11029 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11030 return false;
11031 }
11032
11033 const pg_missing_t &missing = pg_log.get_missing();
11034
11035 unsigned int num_missing = missing.num_missing();
11036 uint64_t num_unfound = get_num_unfound();
11037
11038 if (num_missing == 0) {
11039 info.last_complete = info.last_update;
11040 }
11041
11042 if (num_missing == num_unfound) {
11043 // All of the missing objects we have are unfound.
11044 // Recover the replicas.
11045 started = recover_replicas(max, handle);
11046 }
11047 if (!started) {
11048 // We still have missing objects that we should grab from replicas.
11049 started += recover_primary(max, handle);
11050 }
11051 if (!started && num_unfound != get_num_unfound()) {
11052 // second chance to recover replicas
11053 started = recover_replicas(max, handle);
11054 }
11055
11056 if (started)
11057 work_in_progress = true;
11058
11059 bool deferred_backfill = false;
11060 if (recovering.empty() &&
11061 state_test(PG_STATE_BACKFILL) &&
11062 !backfill_targets.empty() && started < max &&
11063 missing.num_missing() == 0 &&
11064 waiting_on_backfill.empty()) {
11065 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11066 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11067 deferred_backfill = true;
11068 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11069 !is_degraded()) {
11070 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11071 deferred_backfill = true;
11072 } else if (!backfill_reserved) {
11073 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11074 if (!backfill_reserving) {
11075 dout(10) << "queueing RequestBackfill" << dendl;
11076 backfill_reserving = true;
11077 queue_peering_event(
11078 CephPeeringEvtRef(
11079 std::make_shared<CephPeeringEvt>(
11080 get_osdmap()->get_epoch(),
11081 get_osdmap()->get_epoch(),
11082 RequestBackfill())));
11083 }
11084 deferred_backfill = true;
11085 } else {
11086 started += recover_backfill(max - started, handle, &work_in_progress);
11087 }
11088 }
11089
11090 dout(10) << " started " << started << dendl;
11091 osd->logger->inc(l_osd_rop, started);
11092
11093 if (!recovering.empty() ||
11094 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11095 return work_in_progress;
11096
11097 assert(recovering.empty());
11098 assert(recovery_ops_active == 0);
11099
11100 dout(10) << __func__ << " needs_recovery: "
11101 << missing_loc.get_needs_recovery()
11102 << dendl;
11103 dout(10) << __func__ << " missing_loc: "
11104 << missing_loc.get_missing_locs()
11105 << dendl;
11106 int unfound = get_num_unfound();
11107 if (unfound) {
11108 dout(10) << " still have " << unfound << " unfound" << dendl;
11109 return work_in_progress;
11110 }
11111
11112 if (missing.num_missing() > 0) {
11113 // this shouldn't happen!
11114 osd->clog->error() << info.pgid << " recovery ending with " << missing.num_missing()
11115 << ": " << missing.get_items();
11116 return work_in_progress;
11117 }
11118
11119 if (needs_recovery()) {
11120 // this shouldn't happen!
11121 // We already checked num_missing() so we must have missing replicas
11122 osd->clog->error() << info.pgid << " recovery ending with missing replicas";
11123 return work_in_progress;
11124 }
11125
11126 if (state_test(PG_STATE_RECOVERING)) {
11127 state_clear(PG_STATE_RECOVERING);
11128 if (needs_backfill()) {
11129 dout(10) << "recovery done, queuing backfill" << dendl;
11130 queue_peering_event(
11131 CephPeeringEvtRef(
11132 std::make_shared<CephPeeringEvt>(
11133 get_osdmap()->get_epoch(),
11134 get_osdmap()->get_epoch(),
11135 RequestBackfill())));
11136 } else {
11137 dout(10) << "recovery done, no backfill" << dendl;
11138 eio_errors_to_process = false;
11139 queue_peering_event(
11140 CephPeeringEvtRef(
11141 std::make_shared<CephPeeringEvt>(
11142 get_osdmap()->get_epoch(),
11143 get_osdmap()->get_epoch(),
11144 AllReplicasRecovered())));
11145 }
11146 } else { // backfilling
11147 state_clear(PG_STATE_BACKFILL);
11148 dout(10) << "recovery done, backfill done" << dendl;
11149 eio_errors_to_process = false;
11150 queue_peering_event(
11151 CephPeeringEvtRef(
11152 std::make_shared<CephPeeringEvt>(
11153 get_osdmap()->get_epoch(),
11154 get_osdmap()->get_epoch(),
11155 Backfilled())));
11156 }
11157
11158 return false;
11159}
11160
11161/**
11162 * recover objects missing from the primary, pulling from peers.
11163 * return the number of recovery ops started.
11164 */
11165uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11166{
11167 assert(is_primary());
11168
11169 const pg_missing_t &missing = pg_log.get_missing();
11170
11171 dout(10) << "recover_primary recovering " << recovering.size()
11172 << " in pg" << dendl;
11173 dout(10) << "recover_primary " << missing << dendl;
11174 dout(25) << "recover_primary " << missing.get_items() << dendl;
11175
11176 // look at log!
11177 pg_log_entry_t *latest = 0;
11178 unsigned started = 0;
11179 int skipped = 0;
11180
11181 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11182 map<version_t, hobject_t>::const_iterator p =
11183 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11184 while (p != missing.get_rmissing().end()) {
11185 handle.reset_tp_timeout();
11186 hobject_t soid;
11187 version_t v = p->first;
11188
11189 if (pg_log.get_log().objects.count(p->second)) {
11190 latest = pg_log.get_log().objects.find(p->second)->second;
11191 assert(latest->is_update());
11192 soid = latest->soid;
11193 } else {
11194 latest = 0;
11195 soid = p->second;
11196 }
11197 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11198 ++p;
11199
11200 hobject_t head = soid.get_head();
11201
11202 eversion_t need = item.need;
11203
11204 dout(10) << "recover_primary "
11205 << soid << " " << item.need
11206 << (missing.is_missing(soid) ? " (missing)":"")
11207 << (missing.is_missing(head) ? " (missing head)":"")
11208 << (recovering.count(soid) ? " (recovering)":"")
11209 << (recovering.count(head) ? " (recovering head)":"")
11210 << dendl;
11211
11212 if (latest) {
11213 switch (latest->op) {
11214 case pg_log_entry_t::CLONE:
11215 /*
11216 * Handling for this special case removed for now, until we
11217 * can correctly construct an accurate SnapSet from the old
11218 * one.
11219 */
11220 break;
11221
11222 case pg_log_entry_t::LOST_REVERT:
11223 {
11224 if (item.have == latest->reverting_to) {
11225 ObjectContextRef obc = get_object_context(soid, true);
11226
11227 if (obc->obs.oi.version == latest->version) {
11228 // I'm already reverting
11229 dout(10) << " already reverting " << soid << dendl;
11230 } else {
11231 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11232 obc->ondisk_write_lock();
11233 obc->obs.oi.version = latest->version;
11234
11235 ObjectStore::Transaction t;
11236 bufferlist b2;
11237 obc->obs.oi.encode(
11238 b2,
11239 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11240 assert(!pool.info.require_rollback());
11241 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11242
11243 recover_got(soid, latest->version);
11244 missing_loc.add_location(soid, pg_whoami);
11245
11246 ++active_pushes;
11247
11248 osd->store->queue_transaction(osr.get(), std::move(t),
11249 new C_OSD_AppliedRecoveredObject(this, obc),
11250 new C_OSD_CommittedPushedObject(
11251 this,
11252 get_osdmap()->get_epoch(),
11253 info.last_complete),
11254 new C_OSD_OndiskWriteUnlock(obc));
11255 continue;
11256 }
11257 } else {
11258 /*
11259 * Pull the old version of the object. Update missing_loc here to have the location
11260 * of the version we want.
11261 *
11262 * This doesn't use the usual missing_loc paths, but that's okay:
11263 * - if we have it locally, we hit the case above, and go from there.
11264 * - if we don't, we always pass through this case during recovery and set up the location
11265 * properly.
11266 * - this way we don't need to mangle the missing code to be general about needing an old
11267 * version...
11268 */
11269 eversion_t alternate_need = latest->reverting_to;
11270 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11271
11272 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11273 p != peer_missing.end();
11274 ++p)
11275 if (p->second.is_missing(soid, need) &&
11276 p->second.get_items().at(soid).have == alternate_need) {
11277 missing_loc.add_location(soid, p->first);
11278 }
11279 dout(10) << " will pull " << alternate_need << " or " << need
11280 << " from one of " << missing_loc.get_locations(soid)
11281 << dendl;
11282 }
11283 }
11284 break;
11285 }
11286 }
11287
11288 if (!recovering.count(soid)) {
11289 if (recovering.count(head)) {
11290 ++skipped;
11291 } else {
11292 int r = recover_missing(
11293 soid, need, get_recovery_op_priority(), h);
11294 switch (r) {
11295 case PULL_YES:
11296 ++started;
11297 break;
11298 case PULL_OTHER:
11299 ++started;
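   // fall through: PULL_OTHER means a different object (e.g. the head)
   // was pulled first, so this soid is counted as skipped as well.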
11300 case PULL_NONE:
11301 ++skipped;
11302 break;
11303 default:
11304 ceph_abort();
11305 }
11306 if (started >= max)
11307 break;
11308 }
11309 }
11310
11311 // only advance last_requested if we haven't skipped anything
11312 if (!skipped)
11313 pg_log.set_last_requested(v);
11314 }
11315
11316 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11317 return started;
11318}
11319
11320bool PrimaryLogPG::primary_error(
11321 const hobject_t& soid, eversion_t v)
11322{
11323 pg_log.missing_add(soid, v, eversion_t());
11324 pg_log.set_last_requested(0);
11325 missing_loc.remove_location(soid, pg_whoami);
11326 bool uhoh = true;
11327 assert(!actingbackfill.empty());
11328 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11329 i != actingbackfill.end();
11330 ++i) {
11331 if (*i == get_primary()) continue;
11332 pg_shard_t peer = *i;
11333 if (!peer_missing[peer].is_missing(soid, v)) {
11334 missing_loc.add_location(soid, peer);
11335 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11336 << ", there should be a copy on shard " << peer << dendl;
11337 uhoh = false;
11338 }
11339 }
11340 if (uhoh)
11341 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11342 else
11343 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11344 << ", will try copies on " << missing_loc.get_locations(soid);
11345 return uhoh;
11346}
11347
11348int PrimaryLogPG::prep_object_replica_pushes(
11349 const hobject_t& soid, eversion_t v,
11350 PGBackend::RecoveryHandle *h)
11351{
11352 assert(is_primary());
11353 dout(10) << __func__ << ": on " << soid << dendl;
11354
11355 // NOTE: we know we will get a valid oloc off of disk here.
11356 ObjectContextRef obc = get_object_context(soid, false);
11357 if (!obc) {
11358 primary_error(soid, v);
11359 return 0;
11360 }
11361
11362 if (!obc->get_recovery_read()) {
11363 dout(20) << "recovery delayed on " << soid
11364 << "; could not get rw_manager lock" << dendl;
11365 return 0;
11366 } else {
11367 dout(20) << "recovery got recovery read lock on " << soid
11368 << dendl;
11369 }
11370
11371 start_recovery_op(soid);
11372 assert(!recovering.count(soid));
11373 recovering.insert(make_pair(soid, obc));
11374
11375 /* We need this in case there is an in progress write on the object. In fact,
11376 * the only possible write is an update to the xattr due to a lost_revert --
11377 * a client write would be blocked since the object is degraded.
11378 * In almost all cases, therefore, this lock should be uncontended.
11379 */
11380 obc->ondisk_read_lock();
11381 int r = pgbackend->recover_object(
11382 soid,
11383 v,
11384 ObjectContextRef(),
11385 obc, // has snapset context
11386 h);
11387 obc->ondisk_read_unlock();
11388 if (r < 0) {
11389 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11390 primary_failed(soid);
11391 primary_error(soid, v);
11392 return 0;
11393 }
11394 return 1;
11395}
11396
11397uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11398{
11399 dout(10) << __func__ << "(" << max << ")" << dendl;
11400 uint64_t started = 0;
11401
11402 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11403
11404 // this is FAR from an optimal recovery order. pretty lame, really.
11405 assert(!actingbackfill.empty());
11406 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11407 i != actingbackfill.end();
11408 ++i) {
11409 if (*i == get_primary()) continue;
11410 pg_shard_t peer = *i;
11411 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11412 assert(pm != peer_missing.end());
11413 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11414 assert(pi != peer_info.end());
11415 size_t m_sz = pm->second.num_missing();
11416
11417 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11418 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11419
11420 // oldest first!
11421 const pg_missing_t &m(pm->second);
11422 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11423 p != m.get_rmissing().end() && started < max;
11424 ++p) {
11425 handle.reset_tp_timeout();
11426 const hobject_t soid(p->second);
11427
11428 if (missing_loc.is_unfound(soid)) {
11429 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11430 continue;
11431 }
11432
11433 if (soid > pi->second.last_backfill) {
11434 if (!recovering.count(soid)) {
11435 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11436 derr << __func__ << ": object added to missing set for backfill, but "
11437 << "is not in recovering, error!" << dendl;
11438 ceph_abort();
11439 }
11440 continue;
11441 }
11442
11443 if (recovering.count(soid)) {
11444 dout(10) << __func__ << ": already recovering " << soid << dendl;
11445 continue;
11446 }
11447
11448 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11449 dout(10) << __func__ << ": " << soid.get_head()
11450 << " still missing on primary" << dendl;
11451 continue;
11452 }
11453
11454 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11455 dout(10) << __func__ << ": " << soid.get_snapdir()
11456 << " still missing on primary" << dendl;
11457 continue;
11458 }
11459
11460 if (pg_log.get_missing().is_missing(soid)) {
11461 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11462 continue;
11463 }
11464
11465 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11466 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11467 started += prep_object_replica_pushes(soid, r->second.need,
11468 h);
11469 }
11470 }
11471
11472 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11473 return started;
11474}
11475
11476hobject_t PrimaryLogPG::earliest_peer_backfill() const
11477{
11478 hobject_t e = hobject_t::get_max();
11479 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11480 i != backfill_targets.end();
11481 ++i) {
11482 pg_shard_t peer = *i;
11483 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11484 peer_backfill_info.find(peer);
11485 assert(iter != peer_backfill_info.end());
11486 if (iter->second.begin < e)
11487 e = iter->second.begin;
11488 }
11489 return e;
11490}
11491
11492bool PrimaryLogPG::all_peer_done() const
11493{
11494 // Primary hasn't got any more objects
11495 assert(backfill_info.empty());
11496
11497 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11498 i != backfill_targets.end();
11499 ++i) {
11500 pg_shard_t bt = *i;
11501 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11502 peer_backfill_info.find(bt);
11503 assert(piter != peer_backfill_info.end());
11504 const BackfillInterval& pbi = piter->second;
11505 // See if peer has more to process
11506 if (!pbi.extends_to_end() || !pbi.empty())
11507 return false;
11508 }
11509 return true;
11510}
11511
11512/**
11513 * recover_backfill
11514 *
11515 * Invariants:
11516 *
11517 * backfilled: fully pushed to replica or present in replica's missing set (both
11518 * our copy and theirs).
11519 *
11520 * All objects on a backfill_target in
11521 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11522 * objects have been actually deleted and all logically-valid objects are replicated.
11523 * There may be PG objects in this interval yet to be backfilled.
11524 *
11525 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11526 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11527 *
11528 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11529 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11530 * interval remain on the backfill target.
11531 *
11532 * For a backfill target, all objects <= peer_info[target].last_backfill
11533 * have been backfilled to target
11534 *
11535 * There *MAY* be missing/outdated objects between last_backfill_started and
11536 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11537 * io created objects since the last scan. For this reason, we call
11538 * update_range() again before continuing backfill.
11539 */
11540uint64_t PrimaryLogPG::recover_backfill(
11541 uint64_t max,
11542 ThreadPool::TPHandle &handle, bool *work_started)
11543{
11544 dout(10) << "recover_backfill (" << max << ")"
11545 << " bft=" << backfill_targets
11546 << " last_backfill_started " << last_backfill_started
11547 << (new_backfill ? " new_backfill":"")
11548 << dendl;
11549 assert(!backfill_targets.empty());
11550
11551 // Initialize from prior backfill state
11552 if (new_backfill) {
11553 // on_activate() was called prior to getting here
11554 assert(last_backfill_started == earliest_backfill());
11555 new_backfill = false;
11556
11557 // initialize BackfillIntervals
11558 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11559 i != backfill_targets.end();
11560 ++i) {
11561 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11562 }
11563 backfill_info.reset(last_backfill_started);
11564
11565 backfills_in_flight.clear();
11566 pending_backfill_updates.clear();
11567 }
11568
11569 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11570 i != backfill_targets.end();
11571 ++i) {
11572 dout(10) << "peer osd." << *i
11573 << " info " << peer_info[*i]
11574 << " interval " << peer_backfill_info[*i].begin
11575 << "-" << peer_backfill_info[*i].end
11576 << " " << peer_backfill_info[*i].objects.size() << " objects"
11577 << dendl;
11578 }
11579
11580 // update our local interval to cope with recent changes
11581 backfill_info.begin = last_backfill_started;
11582 update_range(&backfill_info, handle);
11583
11584 unsigned ops = 0;
11585 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11586 set<hobject_t> add_to_stat;
11587
11588 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11589 i != backfill_targets.end();
11590 ++i) {
11591 peer_backfill_info[*i].trim_to(
11592 std::max(peer_info[*i].last_backfill, last_backfill_started));
11593 }
11594 backfill_info.trim_to(last_backfill_started);
11595
11596 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11597 while (ops < max) {
11598 if (backfill_info.begin <= earliest_peer_backfill() &&
11599 !backfill_info.extends_to_end() && backfill_info.empty()) {
11600 hobject_t next = backfill_info.end;
11601 backfill_info.reset(next);
11602 backfill_info.end = hobject_t::get_max();
11603 update_range(&backfill_info, handle);
11604 backfill_info.trim();
11605 }
11606
11607 dout(20) << " my backfill interval " << backfill_info << dendl;
11608
11609 bool sent_scan = false;
11610 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11611 i != backfill_targets.end();
11612 ++i) {
11613 pg_shard_t bt = *i;
11614 BackfillInterval& pbi = peer_backfill_info[bt];
11615
11616 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11617 if (pbi.begin <= backfill_info.begin &&
11618 !pbi.extends_to_end() && pbi.empty()) {
11619 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11620 epoch_t e = get_osdmap()->get_epoch();
11621 MOSDPGScan *m = new MOSDPGScan(
11622 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11623 spg_t(info.pgid.pgid, bt.shard),
11624 pbi.end, hobject_t());
11625 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11626 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11627 waiting_on_backfill.insert(bt);
11628 sent_scan = true;
11629 }
11630 }
11631
11632 // Count simultaneous scans as a single op and let those complete
11633 if (sent_scan) {
11634 ops++;
11635 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11636 break;
11637 }
11638
11639 if (backfill_info.empty() && all_peer_done()) {
11640 dout(10) << " reached end for both local and all peers" << dendl;
11641 break;
11642 }
11643
11644 // Get the object within the set of peers to operate on and
11645 // the set of targets to which that object applies.
11646 hobject_t check = earliest_peer_backfill();
11647
11648 if (check < backfill_info.begin) {
11649
11650 set<pg_shard_t> check_targets;
11651 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11652 i != backfill_targets.end();
11653 ++i) {
11654 pg_shard_t bt = *i;
11655 BackfillInterval& pbi = peer_backfill_info[bt];
11656 if (pbi.begin == check)
11657 check_targets.insert(bt);
11658 }
11659 assert(!check_targets.empty());
11660
11661 dout(20) << " BACKFILL removing " << check
11662 << " from peers " << check_targets << dendl;
11663 for (set<pg_shard_t>::iterator i = check_targets.begin();
11664 i != check_targets.end();
11665 ++i) {
11666 pg_shard_t bt = *i;
11667 BackfillInterval& pbi = peer_backfill_info[bt];
11668 assert(pbi.begin == check);
11669
11670 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11671 pbi.pop_front();
11672 }
11673
11674 /* This requires a bit of explanation. We compare head against
11675 * last_backfill to determine whether to send an operation
11676 * to the replica. A single write operation can touch up to three
11677 * objects: head, the snapdir, and a new clone which sorts closer to
11678 * head than any existing clone. If last_backfill points at a clone,
11679 * the transaction won't be sent and all 3 must lie on the right side
11680 * of the line (i.e., we'll backfill them later). If last_backfill
11681 * points at snapdir, it sorts greater than head, so we send the
11682 * transaction which is correct because all three must lie to the left
11683 * of the line.
11684 *
11685 * If it points at head, we have a bit of an issue. If head actually
11686 * exists, no problem, because any transaction which touches snapdir
11687 * must end up creating it (and deleting head), so sending the
11688 * operation won't pose a problem -- we'll end up having to scan it,
11689 * but it'll end up being the right version so we won't bother to
11690 * rebackfill it. However, if head doesn't exist, any write on head
11691 * will remove snapdir. For a replicated pool, this isn't a problem:
11692 * ENOENT on remove isn't an issue and it's in the backfill future anyway.
11693 * It only poses a problem for EC pools, because we never just delete
11694 * an object, we rename it into a rollback object. That operation
11695 * will end up crashing the osd with ENOENT. Tolerating the failure
11696 * wouldn't work either, even if snapdir exists, we'd be creating a
11697 * rollback object past the last_backfill line which wouldn't get
11698 * cleaned up (no rollback objects past the last_backfill line is an
11699 * existing important invariant). Thus, let's avoid the whole issue
11700 * by just not updating last_backfill_started here if head doesn't
11701 * exist and snapdir does. We aren't using up a recovery count here,
11702 * so we're going to recover snapdir immediately anyway. We'll only
11703 * fail "backward" if we fail to get the rw lock and that just means
11704 * we'll re-process this section of the hash space again.
11705 *
11706 * I'm choosing this hack here because the really "correct" answer is
11707 * going to be to unify snapdir and head into a single object (a
11708 * snapdir is really just a confusing way to talk about head existing
11709 * as a whiteout), but doing that is going to be a somewhat larger
11710 * undertaking.
11711 *
11712 * @see http://tracker.ceph.com/issues/17668
11713 */
11714 if (!(check.is_head() &&
11715 backfill_info.begin.is_snapdir() &&
11716 check == backfill_info.begin.get_head()))
11717 last_backfill_started = check;
11718
11719 // Don't increment ops here because deletions
11720 // are cheap and not replied to, unlike real recovery_ops,
11721 // and we can't increment ops without requeueing ourselves
11722 // for recovery.
11723 } else {
11724 eversion_t& obj_v = backfill_info.objects.begin()->second;
11725
11726 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
11727 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11728 i != backfill_targets.end();
11729 ++i) {
11730 pg_shard_t bt = *i;
11731 BackfillInterval& pbi = peer_backfill_info[bt];
11732 // Find all check peers that have the wrong version
11733 if (check == backfill_info.begin && check == pbi.begin) {
11734 if (pbi.objects.begin()->second != obj_v) {
11735 need_ver_targs.push_back(bt);
11736 } else {
11737 keep_ver_targs.push_back(bt);
11738 }
11739 } else {
11740 pg_info_t& pinfo = peer_info[bt];
11741
11742 // Only include peers whose backfill line we've caught up to;
11743 // otherwise, they only appear to be missing this object
11744 // because their pbi.begin > backfill_info.begin.
11745 if (backfill_info.begin > pinfo.last_backfill)
11746 missing_targs.push_back(bt);
11747 else
11748 skip_targs.push_back(bt);
11749 }
11750 }
11751
11752 if (!keep_ver_targs.empty()) {
11753 // These peers have version obj_v
11754 dout(20) << " BACKFILL keeping " << check
11755 << " with ver " << obj_v
11756 << " on peers " << keep_ver_targs << dendl;
11757 //assert(!waiting_for_degraded_object.count(check));
11758 }
11759 if (!need_ver_targs.empty() || !missing_targs.empty()) {
11760 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11761 assert(obc);
11762 if (obc->get_recovery_read()) {
11763 if (!need_ver_targs.empty()) {
11764 dout(20) << " BACKFILL replacing " << check
11765 << " with ver " << obj_v
11766 << " to peers " << need_ver_targs << dendl;
11767 }
11768 if (!missing_targs.empty()) {
11769 dout(20) << " BACKFILL pushing " << backfill_info.begin
11770 << " with ver " << obj_v
11771 << " to peers " << missing_targs << dendl;
11772 }
11773 vector<pg_shard_t> all_push = need_ver_targs;
11774 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
11775
11776 handle.reset_tp_timeout();
11777 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
11778 if (r < 0) {
11779 *work_started = true;
11780 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
11781 break;
11782 }
11783 ops++;
11784 } else {
11785 *work_started = true;
11786 dout(20) << "backfill blocking on " << backfill_info.begin
11787 << "; could not get rw_manager lock" << dendl;
11788 break;
11789 }
11790 }
11791 dout(20) << "need_ver_targs=" << need_ver_targs
11792 << " keep_ver_targs=" << keep_ver_targs << dendl;
11793 dout(20) << "backfill_targets=" << backfill_targets
11794 << " missing_targs=" << missing_targs
11795 << " skip_targs=" << skip_targs << dendl;
11796
11797 last_backfill_started = backfill_info.begin;
11798 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
11799 backfill_info.pop_front();
11800 vector<pg_shard_t> check_targets = need_ver_targs;
11801 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
11802 for (vector<pg_shard_t>::iterator i = check_targets.begin();
11803 i != check_targets.end();
11804 ++i) {
11805 pg_shard_t bt = *i;
11806 BackfillInterval& pbi = peer_backfill_info[bt];
11807 pbi.pop_front();
11808 }
11809 }
11810 }
11811
11812 hobject_t backfill_pos =
11813 std::min(backfill_info.begin, earliest_peer_backfill());
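  // Per the invariants documented above recover_backfill(): every object
  // strictly below backfill_pos has been handled for all backfill targets.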
11814
11815 for (set<hobject_t>::iterator i = add_to_stat.begin();
11816 i != add_to_stat.end();
11817 ++i) {
11818 ObjectContextRef obc = get_object_context(*i, false);
11819 assert(obc);
11820 pg_stat_t stat;
11821 add_object_context_to_pg_stat(obc, &stat);
11822 pending_backfill_updates[*i] = stat;
11823 }
11824 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
11825 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
11826 for (unsigned i = 0; i < to_remove.size(); ++i) {
11827 handle.reset_tp_timeout();
11828 const hobject_t& oid = to_remove[i].get<0>();
11829 eversion_t v = to_remove[i].get<1>();
11830 pg_shard_t peer = to_remove[i].get<2>();
11831 MOSDPGBackfillRemove *m;
11832 auto it = reqs.find(peer);
11833 if (it != reqs.end()) {
11834 m = it->second;
11835 } else {
11836 m = reqs[peer] = new MOSDPGBackfillRemove(
11837 spg_t(info.pgid.pgid, peer.shard),
11838 get_osdmap()->get_epoch());
11839 }
11840 m->ls.push_back(make_pair(oid, v));
11841
11842 if (oid <= last_backfill_started)
11843 pending_backfill_updates[oid]; // add empty stat!
11844 }
11845 for (auto p : reqs) {
11846 osd->send_message_osd_cluster(p.first.osd, p.second,
11847 get_osdmap()->get_epoch());
11848 }
11849 } else {
11850 // for jewel targets
11851 for (unsigned i = 0; i < to_remove.size(); ++i) {
11852 handle.reset_tp_timeout();
11853
11854 // ordered before any subsequent updates
11855 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
11856 to_remove[i].get<2>());
11857
11858 if (to_remove[i].get<0>() <= last_backfill_started)
11859 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
11860 }
11861 }
11862
11863 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11864
11865 dout(5) << "backfill_pos is " << backfill_pos << dendl;
11866 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
11867 i != backfills_in_flight.end();
11868 ++i) {
11869 dout(20) << *i << " is still in flight" << dendl;
11870 }
11871
11872 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
11873 backfill_pos : *(backfills_in_flight.begin());
11874 hobject_t new_last_backfill = earliest_backfill();
11875 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
11876 for (map<hobject_t, pg_stat_t>::iterator i =
11877 pending_backfill_updates.begin();
11878 i != pending_backfill_updates.end() &&
11879 i->first < next_backfill_to_complete;
11880 pending_backfill_updates.erase(i++)) {
11881 dout(20) << " pending_backfill_update " << i->first << dendl;
11882 assert(i->first > new_last_backfill);
11883 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
11884 j != backfill_targets.end();
11885 ++j) {
11886 pg_shard_t bt = *j;
11887 pg_info_t& pinfo = peer_info[bt];
11888 // Add stats to all peers that were missing the object
11889 if (i->first > pinfo.last_backfill)
11890 pinfo.stats.add(i->second);
11891 }
11892 new_last_backfill = i->first;
11893 }
11894 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
11895
11896 assert(!pending_backfill_updates.empty() ||
11897 new_last_backfill == last_backfill_started);
11898 if (pending_backfill_updates.empty() &&
11899 backfill_pos.is_max()) {
11900 assert(backfills_in_flight.empty());
11901 new_last_backfill = backfill_pos;
11902 last_backfill_started = backfill_pos;
11903 }
11904 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
11905
11906 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
11907 // all the backfill targets. Otherwise, we will move last_backfill up on
11908 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
11909 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11910 i != backfill_targets.end();
11911 ++i) {
11912 pg_shard_t bt = *i;
11913 pg_info_t& pinfo = peer_info[bt];
11914
11915 if (new_last_backfill > pinfo.last_backfill) {
11916 pinfo.set_last_backfill(new_last_backfill);
11917 epoch_t e = get_osdmap()->get_epoch();
11918 MOSDPGBackfill *m = NULL;
11919 if (pinfo.last_backfill.is_max()) {
11920 m = new MOSDPGBackfill(
11921 MOSDPGBackfill::OP_BACKFILL_FINISH,
11922 e,
11923 last_peering_reset,
11924 spg_t(info.pgid.pgid, bt.shard));
11925 // Use default priority here, must match sub_op priority
11926 /* pinfo.stats might be wrong if we did log-based recovery on the
11927 * backfilled portion in addition to continuing backfill.
11928 */
11929 pinfo.stats = info.stats;
11930 start_recovery_op(hobject_t::get_max());
11931 } else {
11932 m = new MOSDPGBackfill(
11933 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
11934 e,
11935 last_peering_reset,
11936 spg_t(info.pgid.pgid, bt.shard));
11937 // Use default priority here, must match sub_op priority
11938 }
11939 m->last_backfill = pinfo.last_backfill;
11940 m->stats = pinfo.stats;
11941 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11942 dout(10) << " peer " << bt
11943 << " num_objects now " << pinfo.stats.stats.sum.num_objects
11944 << " / " << info.stats.stats.sum.num_objects << dendl;
11945 }
11946 }
11947
11948 if (ops)
11949 *work_started = true;
11950 return ops;
11951}
11952
11953int PrimaryLogPG::prep_backfill_object_push(
11954 hobject_t oid, eversion_t v,
11955 ObjectContextRef obc,
11956 vector<pg_shard_t> peers,
11957 PGBackend::RecoveryHandle *h)
11958{
11959 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
11960 assert(!peers.empty());
11961
11962 backfills_in_flight.insert(oid);
11963 for (unsigned int i = 0 ; i < peers.size(); ++i) {
11964 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
11965 assert(bpm != peer_missing.end());
11966 bpm->second.add(oid, eversion_t(), eversion_t());
11967 }
11968
11969 assert(!recovering.count(oid));
11970
11971 start_recovery_op(oid);
11972 recovering.insert(make_pair(oid, obc));
11973
11974 // We need to take the read_lock here in order to flush in-progress writes
11975 obc->ondisk_read_lock();
11976 int r = pgbackend->recover_object(
11977 oid,
11978 v,
11979 ObjectContextRef(),
11980 obc,
11981 h);
11982 obc->ondisk_read_unlock();
11983 if (r < 0) {
11984 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
11985 primary_failed(oid);
11986 primary_error(oid, v);
11987 backfills_in_flight.erase(oid);
11988 missing_loc.add_missing(oid, v, eversion_t());
11989 }
11990 return r;
11991}
11992
11993void PrimaryLogPG::update_range(
11994 BackfillInterval *bi,
11995 ThreadPool::TPHandle &handle)
11996{
11997 int local_min = cct->_conf->osd_backfill_scan_min;
11998 int local_max = cct->_conf->osd_backfill_scan_max;
11999
12000 if (bi->version < info.log_tail) {
12001 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12002 << dendl;
12003 if (last_update_applied >= info.log_tail) {
12004 bi->version = last_update_applied;
12005 } else {
12006 osr->flush();
12007 bi->version = info.last_update;
12008 }
12009 scan_range(local_min, local_max, bi, handle);
12010 }
12011
12012 if (bi->version >= projected_last_update) {
12013 dout(10) << __func__<< ": bi is current " << dendl;
12014 assert(bi->version == projected_last_update);
12015 } else if (bi->version >= info.log_tail) {
12016 if (pg_log.get_log().empty() && projected_log.empty()) {
12017 /* Because we don't move log_tail on split, the log might be
12018 * empty even if log_tail != last_update. However, the only
12019 * way to get here with an empty log is if log_tail is actually
12020 * eversion_t(), because otherwise the entry which changed
12021 * last_update since the last scan would have to be present.
12022 */
12023 assert(bi->version == eversion_t());
12024 return;
12025 }
12026
12027 dout(10) << __func__<< ": bi is old, (" << bi->version
12028 << ") can be updated with log to projected_last_update "
12029 << projected_last_update << dendl;
12030
12031 auto func = [&](const pg_log_entry_t &e) {
12032 dout(10) << __func__ << ": updating from version " << e.version
12033 << dendl;
12034 const hobject_t &soid = e.soid;
12035 if (soid >= bi->begin &&
12036 soid < bi->end) {
12037 if (e.is_update()) {
12038 dout(10) << __func__ << ": " << e.soid << " updated to version "
12039 << e.version << dendl;
12040 bi->objects.erase(e.soid);
12041 bi->objects.insert(
12042 make_pair(
12043 e.soid,
12044 e.version));
12045 } else if (e.is_delete()) {
12046 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12047 bi->objects.erase(e.soid);
12048 }
12049 }
12050 };
12051 dout(10) << "scanning pg log first" << dendl;
12052 pg_log.get_log().scan_log_after(bi->version, func);
12053 dout(10) << "scanning projected log" << dendl;
12054 projected_log.scan_log_after(bi->version, func);
12055 bi->version = projected_last_update;
12056 } else {
12057 assert(0 == "scan_range should have raised bi->version past log_tail");
12058 }
12059}
12060
12061void PrimaryLogPG::scan_range(
12062 int min, int max, BackfillInterval *bi,
12063 ThreadPool::TPHandle &handle)
12064{
12065 assert(is_locked());
12066 dout(10) << "scan_range from " << bi->begin << dendl;
12067 bi->clear_objects();
12068
12069 vector<hobject_t> ls;
12070 ls.reserve(max);
12071 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12072 assert(r >= 0);
12073 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12074 dout(20) << ls << dendl;
12075
12076 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12077 handle.reset_tp_timeout();
12078 ObjectContextRef obc;
12079 if (is_primary())
12080 obc = object_contexts.lookup(*p);
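    // A cached context (kept on the primary only) already holds the current
    // object version, saving the OI_ATTR read from disk in the else branch.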
12081 if (obc) {
12082 bi->objects[*p] = obc->obs.oi.version;
12083 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12084 } else {
12085 bufferlist bl;
12086 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12087
12088 /* If the object does not exist here, it must have been removed
12089 * between the collection_list_partial and here. This can happen
12090 * for the first item in the range, which is usually last_backfill.
12091 */
12092 if (r == -ENOENT)
12093 continue;
12094
12095 assert(r >= 0);
12096 object_info_t oi(bl);
12097 bi->objects[*p] = oi.version;
12098 dout(20) << " " << *p << " " << oi.version << dendl;
12099 }
12100 }
12101}
12102
12103
12104/** check_local
12105 *
12106 * verifies that stray objects have been deleted
12107 */
12108void PrimaryLogPG::check_local()
12109{
12110 dout(10) << __func__ << dendl;
12111
12112 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12113
12114 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12115 return;
12116
12117 // just scan the log.
12118 set<hobject_t> did;
12119 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12120 p != pg_log.get_log().log.rend();
12121 ++p) {
12122 if (did.count(p->soid))
12123 continue;
12124 did.insert(p->soid);
12125
12126 if (p->is_delete()) {
12127 dout(10) << " checking " << p->soid
12128 << " at " << p->version << dendl;
12129 struct stat st;
12130 int r = osd->store->stat(
12131 ch,
12132 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12133 &st);
12134 if (r != -ENOENT) {
12135 derr << __func__ << " " << p->soid << " exists, but should have been "
12136 << "deleted" << dendl;
12137 assert(0 == "erroneously present object");
12138 }
12139 } else {
12140 // ignore old(+missing) objects
12141 }
12142 }
12143}
12144
12145
12146
12147// ===========================
12148// hit sets
12149
12150hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12151{
12152 ostringstream ss;
12153 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12154 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12155 info.pgid.ps(), info.pgid.pool(),
12156 cct->_conf->osd_hit_set_namespace);
12157 dout(20) << __func__ << " " << hoid << dendl;
12158 return hoid;
12159}
12160
12161hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12162 utime_t end,
12163 bool using_gmt)
12164{
12165 ostringstream ss;
12166 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12167 if (using_gmt) {
12168 start.gmtime(ss) << "_";
12169 end.gmtime(ss);
12170 } else {
12171 start.localtime(ss) << "_";
12172 end.localtime(ss);
12173 }
12174 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12175 info.pgid.ps(), info.pgid.pool(),
12176 cct->_conf->osd_hit_set_namespace);
12177 dout(20) << __func__ << " " << hoid << dendl;
12178 return hoid;
12179}
12180
12181void PrimaryLogPG::hit_set_clear()
12182{
12183 dout(20) << __func__ << dendl;
12184 hit_set.reset();
12185 hit_set_start_stamp = utime_t();
12186}
12187
12188void PrimaryLogPG::hit_set_setup()
12189{
12190 if (!is_active() ||
12191 !is_primary()) {
12192 hit_set_clear();
12193 return;
12194 }
12195
12196 if (is_active() && is_primary() &&
12197 (!pool.info.hit_set_count ||
12198 !pool.info.hit_set_period ||
12199 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12200 hit_set_clear();
12201
12202 // only primary is allowed to remove all the hit set objects
12203 hit_set_remove_all();
12204 return;
12205 }
12206
12207 // FIXME: discard any previous data for now
12208 hit_set_create();
12209
12210 // include any writes we know about from the pg log. this doesn't
12211 // capture reads, but it is better than nothing!
12212 hit_set_apply_log();
12213}
12214
12215void PrimaryLogPG::hit_set_remove_all()
12216{
12217 // If any archives are degraded we skip this
12218 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12219 p != info.hit_set.history.end();
12220 ++p) {
12221 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12222
12223 // Once we hit a degraded object just skip
12224 if (is_degraded_or_backfilling_object(aoid))
12225 return;
12226 if (scrubber.write_blocked_by_scrub(aoid))
12227 return;
12228 }
12229
12230 if (!info.hit_set.history.empty()) {
12231 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12232 assert(p != info.hit_set.history.rend());
12233 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12234 assert(!is_degraded_or_backfilling_object(oid));
12235 ObjectContextRef obc = get_object_context(oid, false);
12236 assert(obc);
12237
12238 OpContextUPtr ctx = simple_opc_create(obc);
12239 ctx->at_version = get_next_version();
12240 ctx->updated_hset_history = info.hit_set;
12241 utime_t now = ceph_clock_now();
12242 ctx->mtime = now;
12243 hit_set_trim(ctx, 0);
12244 simple_opc_submit(std::move(ctx));
12245 }
12246
12247 info.hit_set = pg_hit_set_history_t();
12248 if (agent_state) {
12249 agent_state->discard_hit_sets();
12250 }
12251}
12252
12253void PrimaryLogPG::hit_set_create()
12254{
12255 utime_t now = ceph_clock_now();
12256 // make a copy of the params to modify
12257 HitSet::Params params(pool.info.hit_set_params);
12258
12259 dout(20) << __func__ << " " << params << dendl;
12260 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12261 BloomHitSet::Params *p =
12262 static_cast<BloomHitSet::Params*>(params.impl.get());
12263
12264 // convert false positive rate so it holds up across the full period
12265 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
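    // Illustrative numbers: with hit_set_count == 4 and a configured fpp of
    // 0.04, each bin is built with fpp 0.01, so the chance of a false
    // positive across all four bins stays near 0.04 while each rate is small.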
12266 if (p->get_fpp() <= 0.0)
12267 p->set_fpp(.01); // fpp cannot be zero!
12268
12269 // if we don't have a specified size, estimate the target size based on the
12270 // previous bin!
12271 if (p->target_size == 0 && hit_set) {
12272 utime_t dur = now - hit_set_start_stamp;
12273 unsigned unique = hit_set->approx_unique_insert_count();
12274 dout(20) << __func__ << " previous set had approx " << unique
12275 << " unique items over " << dur << " seconds" << dendl;
12276 p->target_size = (double)unique * (double)pool.info.hit_set_period
12277 / (double)dur;
12278 }
12279 if (p->target_size <
12280 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12281 p->target_size = cct->_conf->osd_hit_set_min_size;
12282
12283 if (p->target_size
12284 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12285 p->target_size = cct->_conf->osd_hit_set_max_size;
12286
12287 p->seed = now.sec();
12288
12289 dout(10) << __func__ << " target_size " << p->target_size
12290 << " fpp " << p->get_fpp() << dendl;
12291 }
12292 hit_set.reset(new HitSet(params));
12293 hit_set_start_stamp = now;
12294}
12295
12296/**
12297 * apply log entries to set
12298 *
12299 * this would only happen after peering, to at least capture writes
12300 * during an interval that was potentially lost.
12301 */
12302bool PrimaryLogPG::hit_set_apply_log()
12303{
12304 if (!hit_set)
12305 return false;
12306
12307 eversion_t to = info.last_update;
12308 eversion_t from = info.hit_set.current_last_update;
12309 if (to <= from) {
12310 dout(20) << __func__ << " no update" << dendl;
12311 return false;
12312 }
12313
12314 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
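  // Walk the log newest-first: skip entries newer than 'to', then record
  // every object touched by entries in (from, to].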
12315 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12316 while (p != pg_log.get_log().log.rend() && p->version > to)
12317 ++p;
12318 while (p != pg_log.get_log().log.rend() && p->version > from) {
12319 hit_set->insert(p->soid);
12320 ++p;
12321 }
12322
12323 return true;
12324}
12325
12326void PrimaryLogPG::hit_set_persist()
12327{
12328 dout(10) << __func__ << dendl;
12329 bufferlist bl;
12330 unsigned max = pool.info.hit_set_count;
12331
12332 utime_t now = ceph_clock_now();
12333 hobject_t oid;
12334
12335 // If any archives are degraded we skip this persist request
12336 // account for the additional entry being added below
12337 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12338 p != info.hit_set.history.end();
12339 ++p) {
12340 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12341
12342 // Once we hit a degraded object just skip further trim
12343 if (is_degraded_or_backfilling_object(aoid))
12344 return;
12345 if (scrubber.write_blocked_by_scrub(aoid))
12346 return;
12347 }
12348
12349 // If backfill is in progress and we could possibly overlap with the
12350 // hit_set_* objects, back off. Since these all have
12351 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12352 // look just at that. This is necessary because our transactions
12353 // may include a modify of the new hit_set *and* a delete of the
12354 // old one, and this may span the backfill boundary.
12355 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12356 p != backfill_targets.end();
12357 ++p) {
12358 assert(peer_info.count(*p));
12359 const pg_info_t& pi = peer_info[*p];
12360 if (pi.last_backfill == hobject_t() ||
12361 pi.last_backfill.get_hash() == info.pgid.ps()) {
12362 dout(10) << __func__ << " backfill target osd." << *p
12363 << " last_backfill has not progressed past pgid ps"
12364 << dendl;
12365 return;
12366 }
12367 }
12368
12369
12370 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12371 new_hset.begin = hit_set_start_stamp;
12372 new_hset.end = now;
12373 oid = get_hit_set_archive_object(
12374 new_hset.begin,
12375 new_hset.end,
12376 new_hset.using_gmt);
12377
12378 // If the current object is degraded we skip this persist request
12379 if (scrubber.write_blocked_by_scrub(oid))
12380 return;
12381
12382 hit_set->seal();
12383 ::encode(*hit_set, bl);
12384 dout(20) << __func__ << " archive " << oid << dendl;
12385
12386 if (agent_state) {
12387 agent_state->add_hit_set(new_hset.begin, hit_set);
12388 uint32_t size = agent_state->hit_set_map.size();
12389 if (size >= pool.info.hit_set_count) {
12390 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12391 }
12392 hit_set_in_memory_trim(size);
12393 }
12394
12395 ObjectContextRef obc = get_object_context(oid, true);
12396 OpContextUPtr ctx = simple_opc_create(obc);
12397
12398 ctx->at_version = get_next_version();
12399 ctx->updated_hset_history = info.hit_set;
12400 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12401
12402 updated_hit_set_hist.current_last_update = info.last_update;
12403 new_hset.version = ctx->at_version;
12404
12405 updated_hit_set_hist.history.push_back(new_hset);
12406 hit_set_create();
12407
12408 // fabricate an object_info_t and SnapSet
12409 obc->obs.oi.version = ctx->at_version;
12410 obc->obs.oi.mtime = now;
12411 obc->obs.oi.size = bl.length();
12412 obc->obs.exists = true;
12413 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12414
12415 ctx->new_obs = obc->obs;
12416
12417 obc->ssc->snapset.head_exists = true;
12418 ctx->new_snapset = obc->ssc->snapset;
12419
12420 ctx->delta_stats.num_objects++;
12421 ctx->delta_stats.num_objects_hit_set_archive++;
12422 ctx->delta_stats.num_bytes += bl.length();
12423 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12424
12425 bufferlist bss;
12426 ::encode(ctx->new_snapset, bss);
12427 bufferlist boi(sizeof(ctx->new_obs.oi));
12428 ::encode(ctx->new_obs.oi, boi,
12429 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12430
12431 ctx->op_t->create(oid);
12432 if (bl.length()) {
12433 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12434 }
12435 map <string, bufferlist> attrs;
12436 attrs[OI_ATTR].claim(boi);
12437 attrs[SS_ATTR].claim(bss);
12438 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12439 ctx->log.push_back(
12440 pg_log_entry_t(
12441 pg_log_entry_t::MODIFY,
12442 oid,
12443 ctx->at_version,
12444 eversion_t(),
12445 0,
12446 osd_reqid_t(),
12447 ctx->mtime,
12448 0)
12449 );
12450
12451 hit_set_trim(ctx, max);
12452
12453 simple_opc_submit(std::move(ctx));
12454}
12455
12456void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12457{
12458 assert(ctx->updated_hset_history);
12459 pg_hit_set_history_t &updated_hit_set_hist =
12460 *(ctx->updated_hset_history);
12461 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12462 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12463 assert(p != updated_hit_set_hist.history.end());
12464 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12465
12466 assert(!is_degraded_or_backfilling_object(oid));
12467
12468 dout(20) << __func__ << " removing " << oid << dendl;
12469 ++ctx->at_version.version;
12470 ctx->log.push_back(
12471 pg_log_entry_t(pg_log_entry_t::DELETE,
12472 oid,
12473 ctx->at_version,
12474 p->version,
12475 0,
12476 osd_reqid_t(),
12477 ctx->mtime,
12478 0));
12479
12480 ctx->op_t->remove(oid);
12481 updated_hit_set_hist.history.pop_front();
12482
12483 ObjectContextRef obc = get_object_context(oid, false);
12484 assert(obc);
12485 --ctx->delta_stats.num_objects;
12486 --ctx->delta_stats.num_objects_hit_set_archive;
12487 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12488 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12489 }
12490}
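// Illustrative walkthrough of the loop above (hypothetical values, not from
// the source): with max == pool.info.hit_set_count == 4 and a history of 6
// entries, the loop runs twice, each pass deleting the oldest archive
// object, appending a DELETE log entry at the next version, popping the
// history front, and decrementing the object/byte stats:
//
//   history: [t0,t1) [t1,t2) [t2,t3) [t3,t4) [t4,t5) [t5,t6)   size 6 > 4
//   pass 1:  remove archive([t0,t1))                           size -> 5
//   pass 2:  remove archive([t1,t2))                           size -> 4 == max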
12491
12492void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12493{
12494 while (agent_state->hit_set_map.size() > max_in_memory) {
12495 agent_state->remove_oldest_hit_set();
12496 }
12497}
12498
12499
12500// =======================================
12501// cache agent
12502
12503void PrimaryLogPG::agent_setup()
12504{
12505 assert(is_locked());
12506 if (!is_active() ||
12507 !is_primary() ||
12508 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12509 pool.info.tier_of < 0 ||
12510 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12511 agent_clear();
12512 return;
12513 }
12514 if (!agent_state) {
12515 agent_state.reset(new TierAgentState);
12516
12517 // choose random starting position
12518 agent_state->position = hobject_t();
12519 agent_state->position.pool = info.pgid.pool();
12520 agent_state->position.set_hash(pool.info.get_random_pg_position(
12521 info.pgid.pgid,
12522 rand()));
12523 agent_state->start = agent_state->position;
12524
12525 dout(10) << __func__ << " allocated new state, position "
12526 << agent_state->position << dendl;
12527 } else {
12528 dout(10) << __func__ << " keeping existing state" << dendl;
12529 }
12530
12531 if (info.stats.stats_invalid) {
12532 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12533 }
12534
12535 agent_choose_mode();
12536}
12537
12538void PrimaryLogPG::agent_clear()
12539{
12540 agent_stop();
12541 agent_state.reset(NULL);
12542}
12543
12544// Return false if a full pass over the object hash space completed without
12545// operating on any objects (caller should delay before retrying)
12545bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12546{
12547 lock();
12548 if (!agent_state) {
12549 dout(10) << __func__ << " no agent state, stopping" << dendl;
12550 unlock();
12551 return true;
12552 }
12553
12554 assert(!deleting);
12555
12556 if (agent_state->is_idle()) {
12557 dout(10) << __func__ << " idle, stopping" << dendl;
12558 unlock();
12559 return true;
12560 }
12561
12562 osd->logger->inc(l_osd_agent_wake);
12563
12564 dout(10) << __func__
12565 << " max " << start_max
12566 << ", flush " << agent_state->get_flush_mode_name()
12567 << ", evict " << agent_state->get_evict_mode_name()
12568 << ", pos " << agent_state->position
12569 << dendl;
12570 assert(is_primary());
12571 assert(is_active());
12572
12573 agent_load_hit_sets();
12574
12575 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12576 assert(base_pool);
12577
12578 int ls_min = 1;
12579 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12580
12581 // list some objects. this conveniently lists clones (oldest to
12582 // newest) before heads... the same order we want to flush in.
12583 //
12584 // NOTE: do not flush the Sequencer. we will assume that the
12585 // listing we get back is imprecise.
12586 vector<hobject_t> ls;
12587 hobject_t next;
12588 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12589 &ls, &next);
12590 assert(r >= 0);
12591 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12592 int started = 0;
12593 for (vector<hobject_t>::iterator p = ls.begin();
12594 p != ls.end();
12595 ++p) {
12596 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12597 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12598 osd->logger->inc(l_osd_agent_skip);
12599 continue;
12600 }
12601 if (is_degraded_or_backfilling_object(*p)) {
12602 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12603 osd->logger->inc(l_osd_agent_skip);
12604 continue;
12605 }
12606 if (is_missing_object(p->get_head())) {
12607 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12608 osd->logger->inc(l_osd_agent_skip);
12609 continue;
12610 }
12611 ObjectContextRef obc = get_object_context(*p, false, NULL);
12612 if (!obc) {
12613 // we didn't flush; we may miss something here.
12614 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12615 osd->logger->inc(l_osd_agent_skip);
12616 continue;
12617 }
12618 if (!obc->obs.exists) {
12619 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12620 osd->logger->inc(l_osd_agent_skip);
12621 continue;
12622 }
12623 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12624 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12625 osd->logger->inc(l_osd_agent_skip);
12626 continue;
12627 }
12628 if (obc->is_blocked()) {
12629 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12630 osd->logger->inc(l_osd_agent_skip);
12631 continue;
12632 }
12633 if (obc->is_request_pending()) {
12634 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12635 osd->logger->inc(l_osd_agent_skip);
12636 continue;
12637 }
12638
12639 // be careful flushing omap to an EC pool.
12640 if (!base_pool->supports_omap() &&
12641 obc->obs.oi.is_omap()) {
12642 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12643 osd->logger->inc(l_osd_agent_skip);
12644 continue;
12645 }
12646
12647 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12648 agent_maybe_evict(obc, false))
12649 ++started;
12650 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12651 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12652 ++started;
12653 --agent_flush_quota;
12654 }
12655 if (started >= start_max) {
12656 // If finishing early, set "next" to the next object
12657 if (++p != ls.end())
12658 next = *p;
12659 break;
12660 }
12661 }
12662
12663 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12664 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12665 agent_state->hist_age = 0;
12666 agent_state->temp_hist.decay();
12667 }
12668
12669 // Total objects operated on so far
12670 int total_started = agent_state->started + started;
12671 bool need_delay = false;
12672
12673 dout(20) << __func__ << " start pos " << agent_state->position
12674 << " next start pos " << next
12675 << " started " << total_started << dendl;
12676
12677 // See if we've made a full pass over the object hash space
12678 // This might check at most ls_max objects a second time to notice that
12679 // we've checked every object at least once.
12680 if (agent_state->position < agent_state->start &&
12681 next >= agent_state->start) {
12682 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
12683 if (total_started == 0)
12684 need_delay = true;
12685 else
12686 total_started = 0;
12687 agent_state->start = next;
12688 }
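// Illustrative wrap-around case (hypothetical positions): the agent began a
// cycle at start hash S, walked to the end of the hash space, and position
// was reset to hobject_t() (the minimum).  Once a later listing yields
// next >= S while position < S, every object has been visited at least
// once; if nothing was started over that whole cycle, need_delay is set and
// we back off via agent_delay() below rather than spinning.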
12689 agent_state->started = total_started;
12690
12691 // See if we are starting from beginning
12692 if (next.is_max())
12693 agent_state->position = hobject_t();
12694 else
12695 agent_state->position = next;
12696
12697 // Discard old in memory HitSets
12698 hit_set_in_memory_trim(pool.info.hit_set_count);
12699
12700 if (need_delay) {
12701 assert(agent_state->delaying == false);
12702 agent_delay();
12703 unlock();
12704 return false;
12705 }
12706 agent_choose_mode();
12707 unlock();
12708 return true;
12709}
12710
12711void PrimaryLogPG::agent_load_hit_sets()
12712{
12713 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
12714 return;
12715 }
12716
12717 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
12718 dout(10) << __func__ << dendl;
12719 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12720 p != info.hit_set.history.end(); ++p) {
12721 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
12722 dout(10) << __func__ << " loading " << p->begin << "-"
12723 << p->end << dendl;
12724 if (!pool.info.is_replicated()) {
12725 // FIXME: EC not supported here yet
12726 derr << __func__ << " on non-replicated pool" << dendl;
12727 break;
12728 }
12729
12730 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12731 if (is_unreadable_object(oid)) {
12732 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
12733 break;
12734 }
12735
12736 ObjectContextRef obc = get_object_context(oid, false);
12737 if (!obc) {
12738 derr << __func__ << ": could not load hitset " << oid << dendl;
12739 break;
12740 }
12741
12742 bufferlist bl;
12743 {
12744 obc->ondisk_read_lock();
12745 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
12746 assert(r >= 0);
12747 obc->ondisk_read_unlock();
12748 }
12749 HitSetRef hs(new HitSet);
12750 bufferlist::iterator pbl = bl.begin();
12751 ::decode(*hs, pbl);
12752 agent_state->add_hit_set(p->begin.sec(), hs);
12753 }
12754 }
12755 }
12756}
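// Archived hit sets are loaded lazily: only when evict mode is active and
// the in-memory map has fewer entries than info.hit_set.history does the
// loop above read each missing archive object back from the store and
// decode it into agent_state.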
12757
12758bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
12759{
12760 if (!obc->obs.oi.is_dirty()) {
12761 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
12762 osd->logger->inc(l_osd_agent_skip);
12763 return false;
12764 }
12765 if (obc->obs.oi.is_cache_pinned()) {
12766 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12767 osd->logger->inc(l_osd_agent_skip);
12768 return false;
12769 }
12770
12771 utime_t now = ceph_clock_now();
12772 utime_t ob_local_mtime;
12773 if (obc->obs.oi.local_mtime != utime_t()) {
12774 ob_local_mtime = obc->obs.oi.local_mtime;
12775 } else {
12776 ob_local_mtime = obc->obs.oi.mtime;
12777 }
12778 bool evict_mode_full =
12779 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
12780 if (!evict_mode_full &&
12781 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
12782 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
12783 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12784 osd->logger->inc(l_osd_agent_skip);
12785 return false;
12786 }
12787
12788 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
12789 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
12790 osd->logger->inc(l_osd_agent_skip);
12791 return false;
12792 }
12793
12794 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
12795
12796 // FIXME: flush anything dirty, regardless of what distribution of
12797 // ages we expect.
12798
12799 hobject_t oid = obc->obs.oi.soid;
12800 osd->agent_start_op(oid);
12801 // no need to capture a pg ref, can't outlive fop or ctx
12802 std::function<void()> on_flush = [this, oid]() {
12803 osd->agent_finish_op(oid);
12804 };
12805
12806 int result = start_flush(
12807 OpRequestRef(), obc, false, NULL,
12808 on_flush);
12809 if (result != -EINPROGRESS) {
12810 on_flush();
12811 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
12812 << " with " << result << dendl;
12813 osd->logger->inc(l_osd_agent_skip);
12814 return false;
12815 }
12816
12817 osd->logger->inc(l_osd_agent_flush);
12818 return true;
12819}
12820
12821bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
12822{
12823 const hobject_t& soid = obc->obs.oi.soid;
12824 if (!after_flush && obc->obs.oi.is_dirty()) {
12825 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
12826 return false;
12827 }
12828 if (!obc->obs.oi.watchers.empty()) {
12829 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
12830 return false;
12831 }
12832 if (obc->is_blocked()) {
12833 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12834 return false;
12835 }
12836 if (obc->obs.oi.is_cache_pinned()) {
12837 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12838 return false;
12839 }
12840
12841 if (soid.snap == CEPH_NOSNAP) {
12842 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
12843 if (result < 0) {
12844 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
12845 return false;
12846 }
12847 }
12848
12849 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
12850 // is this object older than cache_min_evict_age?
12851 utime_t now = ceph_clock_now();
12852 utime_t ob_local_mtime;
12853 if (obc->obs.oi.local_mtime != utime_t()) {
12854 ob_local_mtime = obc->obs.oi.local_mtime;
12855 } else {
12856 ob_local_mtime = obc->obs.oi.mtime;
12857 }
12858 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
12859 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12860 osd->logger->inc(l_osd_agent_skip);
12861 return false;
12862 }
12863 // is this object old and/or cold enough?
12864 int temp = 0;
12865 uint64_t temp_upper = 0, temp_lower = 0;
12866 if (hit_set)
12867 agent_estimate_temp(soid, &temp);
12868 agent_state->temp_hist.add(temp);
12869 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
12870
12871 dout(20) << __func__
12872 << " temp " << temp
12873 << " pos " << temp_lower << "-" << temp_upper
12874 << ", evict_effort " << agent_state->evict_effort
12875 << dendl;
12876 dout(30) << "agent_state:\n";
12877 Formatter *f = Formatter::create("");
12878 f->open_object_section("agent_state");
12879 agent_state->dump(f);
12880 f->close_section();
12881 f->flush(*_dout);
12882 delete f;
12883 *_dout << dendl;
12884
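// The test below keeps the object when at least evict_effort (in micro
// units, i.e. millionths) of the sampled temperature mass is strictly
// hotter than it: 1000000 - temp_upper is the fraction of the histogram
// above this object's bucket.  e.g. (hypothetical numbers) with
// evict_effort 200000, an object survives whenever >= 20% of the sampled
// temperatures are hotter than its own.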
12885 if (1000000 - temp_upper >= agent_state->evict_effort)
12886 return false;
12887 }
12888
12889 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
12890 OpContextUPtr ctx = simple_opc_create(obc);
12891
12892 if (!ctx->lock_manager.get_lock_type(
12893 ObjectContext::RWState::RWWRITE,
12894 obc->obs.oi.soid,
12895 obc,
12896 OpRequestRef())) {
12897 close_op_ctx(ctx.release());
12898 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
12899 return false;
12900 }
12901
12902 osd->agent_start_evict_op();
12903 ctx->register_on_finish(
12904 [this]() {
12905 osd->agent_finish_evict_op();
12906 });
12907
12908 ctx->at_version = get_next_version();
12909 assert(ctx->new_obs.exists);
12910 int r = _delete_oid(ctx.get(), true, false);
12911 if (obc->obs.oi.is_omap())
12912 ctx->delta_stats.num_objects_omap--;
12913 ctx->delta_stats.num_evict++;
12914 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
12915 if (obc->obs.oi.is_dirty())
12916 --ctx->delta_stats.num_objects_dirty;
12917 assert(r == 0);
12918 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
12919 simple_opc_submit(std::move(ctx));
12920 osd->logger->inc(l_osd_tier_evict);
12921 osd->logger->inc(l_osd_agent_evict);
12922 return true;
12923}
12924
12925void PrimaryLogPG::agent_stop()
12926{
12927 dout(20) << __func__ << dendl;
12928 if (agent_state && !agent_state->is_idle()) {
12929 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
12930 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12931 osd->agent_disable_pg(this, agent_state->evict_effort);
12932 }
12933}
12934
12935void PrimaryLogPG::agent_delay()
12936{
12937 dout(20) << __func__ << dendl;
12938 if (agent_state && !agent_state->is_idle()) {
12939 assert(agent_state->delaying == false);
12940 agent_state->delaying = true;
12941 osd->agent_disable_pg(this, agent_state->evict_effort);
12942 }
12943}
12944
12945void PrimaryLogPG::agent_choose_mode_restart()
12946{
12947 dout(20) << __func__ << dendl;
12948 lock();
12949 if (agent_state && agent_state->delaying) {
12950 agent_state->delaying = false;
12951 agent_choose_mode(true);
12952 }
12953 unlock();
12954}
12955
12956bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
12957{
12958 bool requeued = false;
12959 // Let delay play out
12960 if (agent_state->delaying) {
12961 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
12962 return requeued;
12963 }
12964
12965 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12966 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
12967 unsigned evict_effort = 0;
12968
12969 if (info.stats.stats_invalid) {
12970 // idle; stats can't be trusted until we scrub.
12971 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
12972 goto skip_calc;
12973 }
12974
12975 {
12976 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
12977 assert(divisor > 0);
12978
12979 // adjust (effective) user objects down based on the number
12980 // of HitSet objects, which should not count toward our total since
12981 // they cannot be flushed.
12982 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
12983
12984 // also exclude omap objects if ec backing pool
12985 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12986 assert(base_pool);
12987 if (!base_pool->supports_omap())
12988 unflushable += info.stats.stats.sum.num_objects_omap;
12989
12990 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
12991 if (num_user_objects > unflushable)
12992 num_user_objects -= unflushable;
12993 else
12994 num_user_objects = 0;
12995
12996 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
12997 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
12998 num_user_bytes -= unflushable_bytes;
12999 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13000 num_user_bytes += num_overhead_bytes;
13001
13002 // also reduce the num_dirty by num_objects_omap
13003 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13004 if (!base_pool->supports_omap()) {
13005 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13006 num_dirty -= info.stats.stats.sum.num_objects_omap;
13007 else
13008 num_dirty = 0;
13009 }
13010
13011 dout(10) << __func__
13012 << " flush_mode: "
13013 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13014 << " evict_mode: "
13015 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13016 << " num_objects: " << info.stats.stats.sum.num_objects
13017 << " num_bytes: " << info.stats.stats.sum.num_bytes
13018 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13019 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13020 << " num_dirty: " << num_dirty
13021 << " num_user_objects: " << num_user_objects
13022 << " num_user_bytes: " << num_user_bytes
13023 << " num_overhead_bytes: " << num_overhead_bytes
13024 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13025 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13026 << dendl;
13027
13028 // get dirty, full ratios
13029 uint64_t dirty_micro = 0;
13030 uint64_t full_micro = 0;
13031 if (pool.info.target_max_bytes && num_user_objects > 0) {
13032 uint64_t avg_size = num_user_bytes / num_user_objects;
13033 dirty_micro =
13034 num_dirty * avg_size * 1000000 /
13035 MAX(pool.info.target_max_bytes / divisor, 1);
13036 full_micro =
13037 num_user_objects * avg_size * 1000000 /
13038 MAX(pool.info.target_max_bytes / divisor, 1);
13039 }
13040 if (pool.info.target_max_objects > 0) {
13041 uint64_t dirty_objects_micro =
13042 num_dirty * 1000000 /
13043 MAX(pool.info.target_max_objects / divisor, 1);
13044 if (dirty_objects_micro > dirty_micro)
13045 dirty_micro = dirty_objects_micro;
13046 uint64_t full_objects_micro =
13047 num_user_objects * 1000000 /
13048 MAX(pool.info.target_max_objects / divisor, 1);
13049 if (full_objects_micro > full_micro)
13050 full_micro = full_objects_micro;
13051 }
13052 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13053 << " full " << ((float)full_micro / 1000000.0)
13054 << dendl;
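// Worked example (hypothetical numbers, not from the source):
// target_max_bytes = 100 GB across divisor = 100 PGs gives a per-PG budget
// of 1 GB.  If this PG holds 200 user objects averaging 4 MB (800 MB),
// full_micro = 800 MB / 1 GB = 800000 (0.8); with 50 of them dirty,
// dirty_micro = 200 MB / 1 GB = 200000 (0.2).  The object-count targets
// are computed the same way, and the larger of the byte- and object-based
// ratios wins.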
13055
13056 // flush mode
13057 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13058 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13059 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13060 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13061 flush_target += flush_slop;
13062 flush_high_target += flush_slop;
13063 } else {
13064 flush_target -= MIN(flush_target, flush_slop);
13065 flush_high_target -= MIN(flush_high_target, flush_slop);
13066 }
13067
13068 if (dirty_micro > flush_high_target) {
13069 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13070 } else if (dirty_micro > flush_target) {
13071 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13072 }
13073
13074 // evict mode
13075 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13076 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13077 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13078 evict_target += evict_slop;
13079 else
13080 evict_target -= MIN(evict_target, evict_slop);
13081
13082 if (full_micro > 1000000) {
13083 // evict anything clean
13084 evict_mode = TierAgentState::EVICT_MODE_FULL;
13085 evict_effort = 1000000;
13086 } else if (full_micro > evict_target) {
13087 // set effort in [0..1] range based on where we are between evict_target and completely full
13088 evict_mode = TierAgentState::EVICT_MODE_SOME;
13089 uint64_t over = full_micro - evict_target;
13090 uint64_t span = 1000000 - evict_target;
13091 evict_effort = MAX(over * 1000000 / span,
13092 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13093
13094 // quantize effort to avoid too much reordering in the agent_queue.
13095 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13096 assert(inc > 0);
13097 uint64_t was = evict_effort;
13098 evict_effort -= evict_effort % inc;
13099 if (evict_effort < inc)
13100 evict_effort = inc;
13101 assert(evict_effort >= inc && evict_effort <= 1000000);
13102 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13103 }
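// Worked example of the interpolation above (hypothetical numbers): with
// evict_target = 800000 after slop and full_micro = 900000, over = 100000
// and span = 200000, giving a raw effort of 500000 (0.5), subject to the
// osd_agent_min_evict_effort floor.  With osd_agent_quantize_effort = 0.1
// the effort is then rounded down to a multiple of 100000 (here already
// 500000) and clamped to at least one quantum, limiting reordering in the
// shared agent_queue.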
13104 }
13105
13106 skip_calc:
13107 bool old_idle = agent_state->is_idle();
13108 if (flush_mode != agent_state->flush_mode) {
13109 dout(5) << __func__ << " flush_mode "
13110 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13111 << " -> "
13112 << TierAgentState::get_flush_mode_name(flush_mode)
13113 << dendl;
13114 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13115 osd->agent_inc_high_count();
13116 info.stats.stats.sum.num_flush_mode_high = 1;
13117 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13118 info.stats.stats.sum.num_flush_mode_low = 1;
13119 }
13120 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13121 osd->agent_dec_high_count();
13122 info.stats.stats.sum.num_flush_mode_high = 0;
13123 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13124 info.stats.stats.sum.num_flush_mode_low = 0;
13125 }
13126 agent_state->flush_mode = flush_mode;
13127 }
13128 if (evict_mode != agent_state->evict_mode) {
13129 dout(5) << __func__ << " evict_mode "
13130 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13131 << " -> "
13132 << TierAgentState::get_evict_mode_name(evict_mode)
13133 << dendl;
13134 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13135 is_active()) {
13136 if (op)
13137 requeue_op(op);
13138 requeue_ops(waiting_for_active);
13139 requeue_ops(waiting_for_scrub);
13140 requeue_ops(waiting_for_cache_not_full);
13141 objects_blocked_on_cache_full.clear();
13142 requeued = true;
13143 }
13144 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13145 info.stats.stats.sum.num_evict_mode_some = 1;
13146 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13147 info.stats.stats.sum.num_evict_mode_full = 1;
13148 }
13149 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13150 info.stats.stats.sum.num_evict_mode_some = 0;
13151 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13152 info.stats.stats.sum.num_evict_mode_full = 0;
13153 }
13154 agent_state->evict_mode = evict_mode;
13155 }
13156 uint64_t old_effort = agent_state->evict_effort;
13157 if (evict_effort != agent_state->evict_effort) {
13158 dout(5) << __func__ << " evict_effort "
13159 << ((float)agent_state->evict_effort / 1000000.0)
13160 << " -> "
13161 << ((float)evict_effort / 1000000.0)
13162 << dendl;
13163 agent_state->evict_effort = evict_effort;
13164 }
13165
13166 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13167 // (including flush). This is probably fine (they should be
13168 // correlated) but it is not precisely correct.
13169 if (agent_state->is_idle()) {
13170 if (!restart && !old_idle) {
13171 osd->agent_disable_pg(this, old_effort);
13172 }
13173 } else {
13174 if (restart || old_idle) {
13175 osd->agent_enable_pg(this, agent_state->evict_effort);
13176 } else if (old_effort != agent_state->evict_effort) {
13177 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13178 }
13179 }
13180 return requeued;
13181}
13182
13183void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13184{
13185 assert(hit_set);
13186 assert(temp);
13187 *temp = 0;
13188 if (hit_set->contains(oid))
13189 *temp = 1000000;
13190 unsigned i = 0;
13191 int last_n = pool.info.hit_set_search_last_n;
13192 for (map<time_t,HitSetRef>::reverse_iterator p =
13193 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13194 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13195 if (p->second->contains(oid)) {
13196 *temp += pool.info.get_grade(i);
13197 --last_n;
13198 }
13199 }
13200}
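// Illustrative example (hypothetical grades): an object present in the
// live hit_set starts at temp = 1000000; if it also appears in the newest
// archived set it gains pool.info.get_grade(0) on top, while a hit found
// only in the i-th newest archive contributes the smaller get_grade(i).
// hit_set_search_last_n bounds how many archived hits may contribute.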
13201
13202// Dup op detection
13203
13204bool PrimaryLogPG::already_complete(eversion_t v)
13205{
13206 dout(20) << __func__ << ": " << v << dendl;
13207 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13208 !i.end();
13209 ++i) {
13210 dout(20) << __func__ << ": " << **i << dendl;
13211 // skip copy from temp object ops
13212 if ((*i)->v == eversion_t()) {
13213 dout(20) << __func__ << ": " << **i
13214 << " version is empty" << dendl;
13215 continue;
13216 }
13217 if ((*i)->v > v) {
13218 dout(20) << __func__ << ": " << **i
13219 << " (*i)->v past v" << dendl;
13220 break;
13221 }
13222 if (!(*i)->all_committed) {
13223 dout(20) << __func__ << ": " << **i
13224 << " not committed, returning false"
13225 << dendl;
13226 return false;
13227 }
13228 }
13229 dout(20) << __func__ << ": returning true" << dendl;
13230 return true;
13231}
13232
13233bool PrimaryLogPG::already_ack(eversion_t v)
13234{
13235 dout(20) << __func__ << ": " << v << dendl;
13236 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13237 !i.end();
13238 ++i) {
13239 // skip copy from temp object ops
13240 if ((*i)->v == eversion_t()) {
13241 dout(20) << __func__ << ": " << **i
13242 << " version is empty" << dendl;
13243 continue;
13244 }
13245 if ((*i)->v > v) {
13246 dout(20) << __func__ << ": " << **i
13247 << " (*i)->v past v" << dendl;
13248 break;
13249 }
13250 if (!(*i)->all_applied) {
13251 dout(20) << __func__ << ": " << **i
13252 << " not applied, returning false"
13253 << dendl;
13254 return false;
13255 }
13256 }
13257 dout(20) << __func__ << ": returning true" << dendl;
13258 return true;
13259}
13260
13261
13262// ==========================================================================================
13263// SCRUB
13264
13265
13266bool PrimaryLogPG::_range_available_for_scrub(
13267 const hobject_t &begin, const hobject_t &end)
13268{
13269 pair<hobject_t, ObjectContextRef> next;
13270 next.second = object_contexts.lookup(begin);
13271 next.first = begin;
13272 bool more = true;
13273 while (more && next.first < end) {
13274 if (next.second && next.second->is_blocked()) {
13275 next.second->requeue_scrub_on_unblock = true;
13276 dout(10) << __func__ << ": scrub delayed, "
13277 << next.first << " is blocked"
13278 << dendl;
13279 return false;
13280 }
13281 more = object_contexts.get_next(next.first, &next);
13282 }
13283 return true;
13284}
13285
13286static bool doing_clones(const boost::optional<SnapSet> &snapset,
13287 const vector<snapid_t>::reverse_iterator &curclone) {
13288 return snapset && curclone != snapset.get().clones.rend();
13289}
13290
13291void PrimaryLogPG::log_missing(unsigned missing,
13292 const boost::optional<hobject_t> &head,
13293 LogChannelRef clog,
13294 const spg_t &pgid,
13295 const char *func,
13296 const char *mode,
13297 bool allow_incomplete_clones)
13298{
13299 assert(head);
13300 if (allow_incomplete_clones) {
13301 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13302 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13303 } else {
13304 clog->info() << mode << " " << pgid << " " << head.get()
13305 << " " << missing << " missing clone(s)";
13306 }
13307}
13308
13309unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13310 const boost::optional<SnapSet> &snapset,
13311 LogChannelRef clog,
13312 const spg_t &pgid,
13313 const char *mode,
13314 bool allow_incomplete_clones,
13315 boost::optional<snapid_t> target,
13316 vector<snapid_t>::reverse_iterator *curclone,
13317 inconsistent_snapset_wrapper &e)
13318{
13319 assert(head);
13320 assert(snapset);
13321 unsigned missing = 0;
13322
13323 // NOTE: clones are in descending order, thus **curclone > target test here
13324 hobject_t next_clone(head.get());
13325 while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13326 ++missing;
13327 // it is okay to be missing one or more clones in a cache tier.
13328 // skip higher-numbered clones in the list.
13329 if (!allow_incomplete_clones) {
13330 next_clone.snap = **curclone;
13331 clog->error() << mode << " " << pgid << " " << head.get()
13332 << " expected clone " << next_clone;
13333 ++scrubber.shallow_errors;
13334 e.set_clone_missing(next_clone.snap);
13335 }
13336 // Clones are descending
13337 ++(*curclone);
13338 }
13339 return missing;
13340}
13341
13342/*
13343 * Validate consistency of the object info and snap sets.
13344 *
13345 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13346 * the comparison of the objects is against multiple snapset.clones. There are
13347 * multiple clone lists and in between lists we expect head or snapdir.
13348 *
13349 * Example
13350 *
13351 * objects expected
13352 * ======= =======
13353 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13354 * obj2 head head/snapdir, head ok
13355 * [SnapSet clones 6 4 2 1]
13356 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13357 * obj2 snap 6 obj2 snap 6, match
13358 * obj2 snap 4 obj2 snap 4, match
13359 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13360 * [Snapset clones 3 1]
13361 * obj3 snap 3 obj3 snap 3 match
13362 * obj3 snap 1 obj3 snap 1 match
13363 * obj4 snapdir head/snapdir, snapdir ok
13364 * [Snapset clones 4]
13365 * EOL obj4 snap 4, (expected)
13366 */
13367void PrimaryLogPG::scrub_snapshot_metadata(
13368 ScrubMap &scrubmap,
13369 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13370{
13371 dout(10) << __func__ << dendl;
13372
13373 coll_t c(info.pgid);
13374 bool repair = state_test(PG_STATE_REPAIR);
13375 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13376 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13377 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13378
13379 /// snapsets to repair
13380 map<hobject_t,SnapSet> snapset_to_repair;
13381
13382 // traverse in reverse order.
13383 boost::optional<hobject_t> head;
13384 boost::optional<SnapSet> snapset; // If initialized so will head (above)
13385 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13386 unsigned missing = 0;
13387 inconsistent_snapset_wrapper soid_error, head_error;
13388
13389 bufferlist last_data;
13390
13391 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13392 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13393 const hobject_t& soid = p->first;
13394 soid_error = inconsistent_snapset_wrapper{soid};
13395 object_stat_sum_t stat;
13396 boost::optional<object_info_t> oi;
13397
13398 if (!soid.is_snapdir())
13399 stat.num_objects++;
13400
13401 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13402 stat.num_objects_hit_set_archive++;
13403
13404 if (soid.is_snap()) {
13405 // it's a clone
13406 stat.num_object_clones++;
13407 }
13408
13409 // basic checks.
13410 if (p->second.attrs.count(OI_ATTR) == 0) {
13411 oi = boost::none;
13412 osd->clog->error() << mode << " " << info.pgid << " " << soid
13413 << " no '" << OI_ATTR << "' attr";
13414 ++scrubber.shallow_errors;
13415 soid_error.set_oi_attr_missing();
13416 } else {
13417 bufferlist bv;
13418 bv.push_back(p->second.attrs[OI_ATTR]);
13419 try {
13420 oi = object_info_t(); // Initialize optional<> before decode into it
13421 oi.get().decode(bv);
13422 } catch (buffer::error& e) {
13423 oi = boost::none;
13424 osd->clog->error() << mode << " " << info.pgid << " " << soid
13425 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13426 ++scrubber.shallow_errors;
13427 soid_error.set_oi_attr_corrupted();
13428 soid_error.set_oi_attr_missing(); // not available either
13429 }
13430 }
13431
13432 if (oi) {
13433 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13434 osd->clog->error() << mode << " " << info.pgid << " " << soid
13435 << " on disk size (" << p->second.size
13436 << ") does not match object info size ("
13437 << oi->size << ") adjusted for ondisk to ("
13438 << pgbackend->be_get_ondisk_size(oi->size)
13439 << ")";
13440 soid_error.set_size_mismatch();
13441 ++scrubber.shallow_errors;
13442 }
13443
13444 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13445
13446 // A clone num_bytes will be added later when we have snapset
13447 if (!soid.is_snap()) {
13448 stat.num_bytes += oi->size;
13449 }
13450 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13451 stat.num_bytes_hit_set_archive += oi->size;
13452
13453 if (!soid.is_snapdir()) {
13454 if (oi->is_dirty())
13455 ++stat.num_objects_dirty;
13456 if (oi->is_whiteout())
13457 ++stat.num_whiteouts;
13458 if (oi->is_omap())
13459 ++stat.num_objects_omap;
13460 if (oi->is_cache_pinned())
13461 ++stat.num_objects_pinned;
13462 }
13463 } else {
13464 // pessimistic assumption that this object might contain a
13465 // legacy SnapSet
13466 stat.num_legacy_snapsets++;
13467 }
13468
13469 // Check for any problems while processing clones
13470 if (doing_clones(snapset, curclone)) {
13471 boost::optional<snapid_t> target;
13472 // Expecting an object with snap for current head
13473 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13474
13475 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13476 << soid << " while processing " << head.get() << dendl;
13477
13478 target = all_clones;
13479 } else {
13480 assert(soid.is_snap());
13481 target = soid.snap;
13482 }
13483
13484 // Log any clones we were expecting to be there up to target
13485 // This will set missing, but will be a no-op if soid.snap == **curclone.
13486 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13487 pool.info.allow_incomplete_clones(), target, &curclone,
13488 head_error);
13489 }
13490 bool expected;
13491 // Check doing_clones() again in case we ran process_clones_to()
13492 if (doing_clones(snapset, curclone)) {
13493 // A head/snapdir would have processed all clones above
13494 // or all greater than *curclone.
13495 assert(soid.is_snap() && *curclone <= soid.snap);
13496
13497 // After processing above clone snap should match the expected curclone
13498 expected = (*curclone == soid.snap);
13499 } else {
13500 // If we aren't doing clones any longer, then expecting head/snapdir
13501 expected = soid.has_snapset();
13502 }
13503 if (!expected) {
13504 // If we couldn't read the head's snapset, just ignore clones
13505 if (head && !snapset) {
13506 osd->clog->error() << mode << " " << info.pgid << " " << soid
13507 << " clone ignored due to missing snapset";
13508 } else {
13509 osd->clog->error() << mode << " " << info.pgid << " " << soid
13510 << " is an unexpected clone";
13511 }
13512 ++scrubber.shallow_errors;
13513 soid_error.set_headless();
13514 scrubber.store->add_snap_error(pool.id, soid_error);
13515 if (head && soid.get_head() == head->get_head())
13516 head_error.set_clone(soid.snap);
13517 continue;
13518 }
13519
13520 // new snapset?
13521 if (soid.has_snapset()) {
13522
13523 if (missing) {
13524 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13525 pool.info.allow_incomplete_clones());
13526 }
13527
13528 // Save previous head error information
13529 if (head && head_error.errors)
13530 scrubber.store->add_snap_error(pool.id, head_error);
13531 // Set this as a new head object
13532 head = soid;
13533 missing = 0;
13534 head_error = soid_error;
13535
13536 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13537
13538 if (p->second.attrs.count(SS_ATTR) == 0) {
13539 osd->clog->error() << mode << " " << info.pgid << " " << soid
13540 << " no '" << SS_ATTR << "' attr";
13541 ++scrubber.shallow_errors;
13542 snapset = boost::none;
13543 head_error.set_ss_attr_missing();
13544 } else {
13545 bufferlist bl;
13546 bl.push_back(p->second.attrs[SS_ATTR]);
13547 bufferlist::iterator blp = bl.begin();
13548 try {
13549 snapset = SnapSet(); // Initialize optional<> before decoding into it
13550 ::decode(snapset.get(), blp);
13551 } catch (buffer::error& e) {
13552 snapset = boost::none;
13553 osd->clog->error() << mode << " " << info.pgid << " " << soid
13554 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13555 ++scrubber.shallow_errors;
13556 head_error.set_ss_attr_corrupted();
13557 }
13558 }
13559
13560 if (snapset) {
13561 // what will be next?
13562 curclone = snapset->clones.rbegin();
13563
13564 if (!snapset->clones.empty()) {
13565 dout(20) << " snapset " << snapset.get() << dendl;
13566 if (snapset->seq == 0) {
13567 osd->clog->error() << mode << " " << info.pgid << " " << soid
13568 << " snaps.seq not set";
13569 ++scrubber.shallow_errors;
13570 head_error.set_snapset_mismatch();
13571 }
13572 }
13573
13574 if (soid.is_head() && !snapset->head_exists) {
13575 osd->clog->error() << mode << " " << info.pgid << " " << soid
13576 << " snapset.head_exists=false, but head exists";
13577 ++scrubber.shallow_errors;
13578 head_error.set_head_mismatch();
13579 }
13580 if (soid.is_snapdir() && snapset->head_exists) {
13581 osd->clog->error() << mode << " " << info.pgid << " " << soid
13582 << " snapset.head_exists=true, but snapdir exists";
13583 ++scrubber.shallow_errors;
13584 head_error.set_head_mismatch();
13585 }
13586
13587 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13588 if (soid.is_snapdir()) {
13589 dout(10) << " will move snapset to head from " << soid << dendl;
13590 snapset_to_repair[soid.get_head()] = *snapset;
13591 } else if (snapset->is_legacy()) {
13592 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13593 << dendl;
13594 snapset_to_repair[soid.get_head()] = *snapset;
13595 }
13596 } else {
13597 stat.num_legacy_snapsets++;
13598 }
13599 } else {
13600 // pessimistic assumption that this object might contain a
13601 // legacy SnapSet
13602 stat.num_legacy_snapsets++;
13603 }
13604 } else {
13605 assert(soid.is_snap());
13606 assert(head);
13607 assert(snapset);
13608 assert(soid.snap == *curclone);
13609
13610 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13611
13612 if (snapset->clone_size.count(soid.snap) == 0) {
13613 osd->clog->error() << mode << " " << info.pgid << " " << soid
13614 << " is missing in clone_size";
13615 ++scrubber.shallow_errors;
13616 soid_error.set_size_mismatch();
13617 } else {
13618 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13619 osd->clog->error() << mode << " " << info.pgid << " " << soid
13620 << " size " << oi->size << " != clone_size "
13621 << snapset->clone_size[*curclone];
13622 ++scrubber.shallow_errors;
13623 soid_error.set_size_mismatch();
13624 }
13625
13626 if (snapset->clone_overlap.count(soid.snap) == 0) {
13627 osd->clog->error() << mode << " " << info.pgid << " " << soid
13628 << " is missing in clone_overlap";
13629 ++scrubber.shallow_errors;
13630 soid_error.set_size_mismatch();
13631 } else {
13632 // This checking is based on get_clone_bytes(). The first 2 asserts
13633 // can't happen because we know we have a clone_size and
13634 // a clone_overlap. Now we check that the interval_set won't
13635 // cause the last assert.
13636 uint64_t size = snapset->clone_size.find(soid.snap)->second;
13637 const interval_set<uint64_t> &overlap =
13638 snapset->clone_overlap.find(soid.snap)->second;
13639 bool bad_interval_set = false;
13640 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13641 i != overlap.end(); ++i) {
13642 if (size < i.get_len()) {
13643 bad_interval_set = true;
13644 break;
13645 }
13646 size -= i.get_len();
13647 }
13648
13649 if (bad_interval_set) {
13650 osd->clog->error() << mode << " " << info.pgid << " " << soid
13651 << " bad interval_set in clone_overlap";
13652 ++scrubber.shallow_errors;
13653 soid_error.set_size_mismatch();
13654 } else {
13655 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13656 }
13657 }
13658 }
13659
13660 // migrate legacy_snaps to snapset?
13661 auto p = snapset_to_repair.find(soid.get_head());
13662 if (p != snapset_to_repair.end()) {
13663 if (!oi || oi->legacy_snaps.empty()) {
13664 osd->clog->error() << mode << " " << info.pgid << " " << soid
13665 << " has no oi or legacy_snaps; cannot convert "
13666 << *snapset;
13667 ++scrubber.shallow_errors;
13668 } else {
13669 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
13670 << " to snapset " << p->second << dendl;
13671 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13672 }
13673 }
13674
13675 // what's next?
13676 ++curclone;
13677 if (soid_error.errors)
13678 scrubber.store->add_snap_error(pool.id, soid_error);
13679 }
13680
13681 scrub_cstat.add(stat);
13682 }
13683
13684 if (doing_clones(snapset, curclone)) {
13685 dout(10) << __func__ << " " << mode << " " << info.pgid
13686 << " No more objects while processing " << head.get() << dendl;
13687
13688 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13689 pool.info.allow_incomplete_clones(), all_clones, &curclone,
13690 head_error);
13691 }
13692 // There could be missing found by the test above or even
13693 // before dropping out of the loop for the last head.
13694 if (missing) {
13695 log_missing(missing, head, osd->clog, info.pgid, __func__,
13696 mode, pool.info.allow_incomplete_clones());
13697 }
13698 if (head && head_error.errors)
13699 scrubber.store->add_snap_error(pool.id, head_error);
13700
13701 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
13702 missing_digest.begin();
13703 p != missing_digest.end();
13704 ++p) {
13705 if (p->first.is_snapdir())
13706 continue;
13707 dout(10) << __func__ << " recording digests for " << p->first << dendl;
13708 ObjectContextRef obc = get_object_context(p->first, false);
13709 if (!obc) {
13710 osd->clog->error() << info.pgid << " " << mode
13711 << " cannot get object context for "
13712 << p->first;
13713 continue;
13714 } else if (obc->obs.oi.soid != p->first) {
13715 osd->clog->error() << info.pgid << " " << mode
13716 << " object " << p->first
13717 << " has a valid oi attr with a mismatched name, "
13718 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13719 continue;
13720 }
13721 OpContextUPtr ctx = simple_opc_create(obc);
13722 ctx->at_version = get_next_version();
13723 ctx->mtime = utime_t(); // do not update mtime
13724 ctx->new_obs.oi.set_data_digest(p->second.first);
13725 ctx->new_obs.oi.set_omap_digest(p->second.second);
13726 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13727
13728 ctx->register_on_success(
13729 [this]() {
13730 dout(20) << "updating scrub digest" << dendl;
13731 if (--scrubber.num_digest_updates_pending == 0) {
13732 requeue_scrub();
13733 }
13734 });
13735
13736 simple_opc_submit(std::move(ctx));
13737 ++scrubber.num_digest_updates_pending;
13738 }
13739 for (auto& p : snapset_to_repair) {
13740 // cache pools may not have the clones, which means we won't know
13741 // what snaps they have. fake out the clone_snaps entries anyway (with
13742 // blank snap lists).
13743 p.second.head_exists = true;
13744 if (pool.info.allow_incomplete_clones()) {
13745 for (auto s : p.second.clones) {
13746 if (p.second.clone_snaps.count(s) == 0) {
13747 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
13748 << s << dendl;
13749 p.second.clone_snaps[s];
13750 }
13751 }
13752 }
13753 if (p.second.clones.size() != p.second.clone_snaps.size() ||
13754 p.second.is_legacy()) {
13755 // this happens if we encounter other errors above, like a missing
13756 // or extra clone.
13757 dout(10) << __func__ << " not writing snapset to " << p.first
13758 << " snapset " << p.second << " clones " << p.second.clones
13759 << "; didn't convert fully" << dendl;
13760 scrub_cstat.sum.num_legacy_snapsets++;
13761 continue;
13762 }
13763 dout(10) << __func__ << " writing snapset to " << p.first
13764 << " " << p.second << dendl;
13765 ObjectContextRef obc = get_object_context(p.first, true);
13766 if (!obc) {
13767 osd->clog->error() << info.pgid << " " << mode
13768 << " cannot get object context for "
13769 << p.first;
13770 continue;
13771 } else if (obc->obs.oi.soid != p.first) {
13772 osd->clog->error() << info.pgid << " " << mode
13773 << " object " << p.first
13774 << " has a valid oi attr with a mismatched name, "
13775 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13776 continue;
13777 }
13778 ObjectContextRef snapset_obc;
13779 if (!obc->obs.exists) {
13780 snapset_obc = get_object_context(p.first.get_snapdir(), false);
13781 if (!snapset_obc) {
13782 osd->clog->error() << info.pgid << " " << mode
13783 << " cannot get object context for "
13784 << p.first.get_snapdir();
13785 continue;
13786 }
13787 }
13788 OpContextUPtr ctx = simple_opc_create(obc);
13789 PGTransaction *t = ctx->op_t.get();
13790 ctx->snapset_obc = snapset_obc;
13791 ctx->at_version = get_next_version();
13792 ctx->mtime = utime_t(); // do not update mtime
13793 ctx->new_snapset = p.second;
13794 if (!ctx->new_obs.exists) {
13795 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
13796 ctx->new_obs.exists = true;
13797 ctx->new_snapset.head_exists = true;
13798 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
13799 ++ctx->delta_stats.num_whiteouts;
13800 ++ctx->delta_stats.num_objects;
13801 t->create(p.first);
13802 if (p.first < scrubber.start) {
13803 dout(20) << __func__ << " kludging around update outside of scrub range"
13804 << dendl;
13805 } else {
13806 scrub_cstat.add(ctx->delta_stats);
13807 }
13808 }
13809 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
13810 assert(!ctx->new_snapset.is_legacy());
13811 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13812 ctx->register_on_success(
13813 [this]() {
13814 dout(20) << "updating snapset" << dendl;
13815 if (--scrubber.num_digest_updates_pending == 0) {
13816 requeue_scrub();
13817 }
13818 });
13819
13820 simple_opc_submit(std::move(ctx));
13821 ++scrubber.num_digest_updates_pending;
13822 }
13823
13824 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
13825}
13826
13827void PrimaryLogPG::_scrub_clear_state()
13828{
13829 scrub_cstat = object_stat_collection_t();
13830}
13831
13832void PrimaryLogPG::_scrub_finish()
13833{
13834 bool repair = state_test(PG_STATE_REPAIR);
13835 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13836 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13837
13838 if (info.stats.stats_invalid) {
13839 info.stats.stats = scrub_cstat;
13840 info.stats.stats_invalid = false;
13841
13842 if (agent_state)
13843 agent_choose_mode();
13844 }
13845
13846 dout(10) << mode << " got "
13847 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13848 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13849 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13850 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13851 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13852 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13853 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13854 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
13855 << dendl;
13856
13857 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
13858 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
13859 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
13860 !info.stats.dirty_stats_invalid) ||
13861 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
13862 !info.stats.omap_stats_invalid) ||
13863 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
13864 !info.stats.pin_stats_invalid) ||
13865 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
13866 !info.stats.hitset_stats_invalid) ||
13867 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
13868 !info.stats.hitset_bytes_stats_invalid) ||
13869 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
13870 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
13871 osd->clog->error() << info.pgid << " " << mode
13872 << " stat mismatch, got "
13873 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13874 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13875 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13876 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13877 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13878 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13879 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
13880 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13881 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
13882 ++scrubber.shallow_errors;
13883
13884 if (repair) {
13885 ++scrubber.fixed;
13886 info.stats.stats = scrub_cstat;
13887 info.stats.dirty_stats_invalid = false;
13888 info.stats.omap_stats_invalid = false;
13889 info.stats.hitset_stats_invalid = false;
13890 info.stats.hitset_bytes_stats_invalid = false;
13891 publish_stats_to_osd();
13892 share_pg_info();
13893 }
13894 } else if (scrub_cstat.sum.num_legacy_snapsets !=
13895 info.stats.stats.sum.num_legacy_snapsets) {
13896 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
13897 << " from " << info.stats.stats.sum.num_legacy_snapsets
13898 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
13899 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
13900 publish_stats_to_osd();
13901 share_pg_info();
13902 }
13903 // Clear object context cache to get repair information
13904 if (repair)
13905 object_contexts.clear();
13906}
13907
13908bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
13909{
13910 return osd->check_osdmap_full(missing_on);
13911}
13912
13913int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
13914{
13915 // Only supports replicated pools
13916 assert(!pool.info.require_rollback());
13917 assert(is_primary());
13918
13919 dout(10) << __func__ << " " << soid
13920 << " peers osd.{" << actingbackfill << "}" << dendl;
13921
13922 if (!is_clean()) {
13923 block_for_clean(soid, op);
13924 return -EAGAIN;
13925 }
13926
13927 assert(!pg_log.get_missing().is_missing(soid));
13928 bufferlist bv;
13929 object_info_t oi;
13930 eversion_t v;
13931 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
13932 if (r < 0) {
13933 // Leave v and try to repair without a version, getting attr failed
13934 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
13935 << soid << " error=" << r << dendl;
13936 } else try {
13937 bufferlist::iterator bliter = bv.begin();
13938 ::decode(oi, bliter);
13939 v = oi.version;
13940 } catch (...) {
13941 // Leave v as default constructed. This will fail when sent to older OSDs, but
13942 // not much worse than failing here.
13943 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
13944 }
13945
13946 missing_loc.add_missing(soid, v, eversion_t());
13947 if (primary_error(soid, v)) {
13948 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
13949 // XXX: If we knew that there is no down osd which could include this
13950 // object, it would be nice if we could return EIO here.
13951 // If a "never fail" flag was available, that could be used
13952 // for rbd to NOT return EIO until object marked lost.
13953
13954 // Drop through to save this op in case an osd comes up with the object.
13955 }
13956
13957 // Restart the op after object becomes readable again
13958 waiting_for_unreadable_object[soid].push_back(op);
13959 op->mark_delayed("waiting for missing object");
13960
13961 if (!eio_errors_to_process) {
13962 eio_errors_to_process = true;
13963 assert(is_clean());
13964 queue_peering_event(
13965 CephPeeringEvtRef(
13966 std::make_shared<CephPeeringEvt>(
13967 get_osdmap()->get_epoch(),
13968 get_osdmap()->get_epoch(),
13969 DoRecovery())));
13970 } else {
13971 // A prior error must have already cleared clean state and queued recovery
13972 // or a map change has triggered re-peering.
13973 // Not inlining the recovery by calling maybe_kick_recovery(soid);
13974 dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
13975 }
13976
13977 return -EAGAIN;
13978}
13979
13980/*---SnapTrimmer Logging---*/
13981#undef dout_prefix
13982#define dout_prefix *_dout << pg->gen_prefix()
13983
13984void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
13985{
13986 ldout(pg->cct, 20) << "enter " << state_name << dendl;
13987}
13988
13989void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
13990{
13991 ldout(pg->cct, 20) << "exit " << state_name << dendl;
13992}
13993
13994/*---SnapTrimmer states---*/
13995#undef dout_prefix
13996#define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
13997 << "SnapTrimmer state<" << get_state_name() << ">: ")
13998
13999/* NotTrimming */
14000PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14001 : my_base(ctx),
14002 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14003{
14004 context< SnapTrimmer >().log_enter(state_name);
14005}
14006
14007void PrimaryLogPG::NotTrimming::exit()
14008{
14009 context< SnapTrimmer >().log_exit(state_name, enter_time);
14010}
14011
14012boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14013{
14014 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14015 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14016
14017 if (!(pg->is_primary() && pg->is_active())) {
14018 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14019 return discard_event();
14020 }
14021 if (!pg->is_clean() ||
14022 pg->snap_trimq.empty()) {
14023 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14024 return discard_event();
14025 }
14026 if (pg->scrubber.active) {
14027 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14028 return transit< WaitScrub >();
14029 } else {
14030 return transit< Trimming >();
14031 }
14032}
14033
boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
{
  PrimaryLogPG *pg = context< SnapTrimmer >().pg;
  ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;

  pending = nullptr;
  if (!context< SnapTrimmer >().can_trim()) {
    post_event(KickTrim());
    return transit< NotTrimming >();
  }

  context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
  ldout(pg->cct, 10) << "WaitReservation: trimming "
                     << pg->snap_trimq.range_start()
                     << dendl;
  return transit< AwaitAsyncWork >();
}

/* AwaitAsyncWork */
PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
  : my_base(ctx),
    NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
{
  auto *pg = context< SnapTrimmer >().pg;
  context< SnapTrimmer >().log_enter(state_name);
  context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
  pg->state_set(PG_STATE_SNAPTRIM);
  pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
  pg->publish_stats_to_osd();
}

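// DoSnapWork: runs from the snap-trim work queue. It pulls up to
// osd_pg_max_concurrent_snap_trims objects for the snap being trimmed from
// the snap mapper, submits one trim transaction per object, and tracks them
// in in_flight. -ENOENT from the snap mapper means the snap is fully
// trimmed: it is moved to purged_snaps, persisted, and shared with peers.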
boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
{
  PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
  snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
  auto &in_flight = context<Trimming>().in_flight;
  assert(in_flight.empty());

  assert(pg->is_primary() && pg->is_active());
  if (!context< SnapTrimmer >().can_trim()) {
    ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
    post_event(KickTrim());
    return transit< NotTrimming >();
  }

  ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;

  vector<hobject_t> to_trim;
  unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
  to_trim.reserve(max);
  int r = pg->snap_mapper.get_next_objects_to_trim(
    snap_to_trim,
    max,
    &to_trim);
  if (r != 0 && r != -ENOENT) {
    lderr(pg->cct) << "get_next_objects_to_trim returned "
                   << cpp_strerror(r) << dendl;
    assert(0 == "get_next_objects_to_trim returned an invalid code");
  } else if (r == -ENOENT) {
    // Done!
    ldout(pg->cct, 10) << "got ENOENT" << dendl;

    ldout(pg->cct, 10) << "adding snap " << snap_to_trim
                       << " to purged_snaps"
                       << dendl;
    pg->info.purged_snaps.insert(snap_to_trim);
    pg->snap_trimq.erase(snap_to_trim);
    ldout(pg->cct, 10) << "purged_snaps now "
                       << pg->info.purged_snaps << ", snap_trimq now "
                       << pg->snap_trimq << dendl;

    ObjectStore::Transaction t;
    pg->dirty_big_info = true;
    pg->write_if_dirty(t);
    int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
    assert(tr == 0);

    pg->share_pg_info();
    post_event(KickTrim());
    return transit< NotTrimming >();
  }
  assert(!to_trim.empty());

  for (auto &&object: to_trim) {
    // Get next
    ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
    OpContextUPtr ctx;
    int error = pg->trim_object(in_flight.empty(), object, &ctx);
    if (error) {
      if (error == -ENOLCK) {
        ldout(pg->cct, 10) << "could not get write lock on obj "
                           << object << dendl;
      } else {
        pg->state_set(PG_STATE_SNAPTRIM_ERROR);
        ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
      }
      if (!in_flight.empty()) {
        ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
        return transit< WaitRepops >();
      }
      if (error == -ENOLCK) {
        ldout(pg->cct, 10) << "waiting for it to clear"
                           << dendl;
        return transit< WaitRWLock >();
      } else {
        return transit< NotTrimming >();
      }
    }

    in_flight.insert(object);
    ctx->register_on_success(
      [pg, object, &in_flight]() {
        assert(in_flight.find(object) != in_flight.end());
        in_flight.erase(object);
        if (in_flight.empty()) {
          if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
            pg->snap_trimmer_machine.process_event(Reset());
          } else {
            pg->snap_trimmer_machine.process_event(RepopsComplete());
          }
        }
      });

    pg->simple_opc_submit(std::move(ctx));
  }

  return transit< WaitRepops >();
}

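/*
 * The *_maybe_cache helpers below funnel object attribute access through one
 * place. The write helpers record the mutation in the supplied PGTransaction;
 * for require_rollback() pools (i.e. erasure-coded pools) the read helpers
 * serve attributes from the ObjectContext's attr_cache, and otherwise go to
 * the pgbackend.
 *
 * Illustrative (hypothetical) caller, not taken from this file:
 *
 *   bufferlist bl;
 *   bl.append("some value");
 *   setattr_maybe_cache(ctx->obc, ctx, ctx->op_t.get(), "_user_key", bl);
 */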
void PrimaryLogPG::setattr_maybe_cache(
  ObjectContextRef obc,
  OpContext *op,
  PGTransaction *t,
  const string &key,
  bufferlist &val)
{
  t->setattr(obc->obs.oi.soid, key, val);
}

void PrimaryLogPG::setattrs_maybe_cache(
  ObjectContextRef obc,
  OpContext *op,
  PGTransaction *t,
  map<string, bufferlist> &attrs)
{
  t->setattrs(obc->obs.oi.soid, attrs);
}

void PrimaryLogPG::rmattr_maybe_cache(
  ObjectContextRef obc,
  OpContext *op,
  PGTransaction *t,
  const string &key)
{
  t->rmattr(obc->obs.oi.soid, key);
}

int PrimaryLogPG::getattr_maybe_cache(
  ObjectContextRef obc,
  const string &key,
  bufferlist *val)
{
  if (pool.info.require_rollback()) {
    map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
    if (i != obc->attr_cache.end()) {
      if (val)
        *val = i->second;
      return 0;
    } else {
      return -ENODATA;
    }
  }
  return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
}

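// Note on user_only below: user-visible xattrs are stored with a leading '_'
// in their on-disk key; when user_only is set, keys without that prefix are
// filtered out and the leading '_' is stripped from the returned names.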
int PrimaryLogPG::getattrs_maybe_cache(
  ObjectContextRef obc,
  map<string, bufferlist> *out,
  bool user_only)
{
  int r = 0;
  if (pool.info.require_rollback()) {
    if (out)
      *out = obc->attr_cache;
  } else {
    r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
  }
  if (out && user_only) {
    map<string, bufferlist> tmp;
    for (map<string, bufferlist>::iterator i = out->begin();
         i != out->end();
         ++i) {
      if (i->first.size() > 1 && i->first[0] == '_')
        tmp[i->first.substr(1)].claim(i->second);
    }
    tmp.swap(*out);
  }
  return r;
}

bool PrimaryLogPG::check_failsafe_full(ostream &ss)
{
  return osd->check_failsafe_full(ss);
}

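// boost::intrusive_ptr support: reference counting for PrimaryLogPGRef is
// routed through PG::get()/put() (tagged "intptr"), so these references
// participate in the PG's reference accounting; with PG_DEBUG_REFS enabled,
// the id-tracking variants allow individual refs to be traced.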
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }

#ifdef PG_DEBUG_REFS
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif

void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }