// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// virtual destructor so implementations can be deleted via base pointer
  ~CopyCallback() override {}
};
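
/*
 * Usage sketch (hypothetical subclass, not part of this file): a concrete
 * CopyCallback unpacks the (return code, CopyResults*) tuple in finish().
 * CopyFromCallback below is the real in-tree implementation.
 *
 *   struct MyCopyCallback : public PrimaryLogPG::CopyCallback {
 *     void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *       int r = results.get<0>();
 *       PrimaryLogPG::CopyResults *res = results.get<1>();
 *       if (r == 0) {
 *         // inspect res->object_size, res->user_version, ...
 *       }
 *       // per the contract above, the callback owns res
 *       delete res;
 *     }
 *   };
 */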

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}
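
/*
 * "Blessing" usage sketch (hypothetical caller, not from this file): wrap a
 * raw completion so that, when it fires, the PG lock is taken and the
 * callback is silently dropped if the PG has reset since the epoch captured
 * at wrap time.
 *
 *   Context *raw = new C_MyCompletion(...);
 *   Context *safe = pg->bless_context(raw);
 *   t->register_on_applied(safe);  // safe to fire even after a PG reset
 */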

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
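
/*
 * Async-read flow, as implemented above: op handlers queue
 * (offset, length, flags) extents on pending_async_reads;
 * start_async_reads() hands the whole batch to the PG backend with a
 * single OnReadComplete, and finish_read() re-runs execute_ctx() once
 * every in-flight read has completed. A hypothetical handler would
 * queue a read roughly like this:
 *
 *   ctx->pending_async_reads.push_back(
 *     make_pair(
 *       boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
 *       make_pair(&osd_op.outdata, new SomeCompletion(...))));
 */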

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

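/*
 * on_local_recover: invoked by the PG backend once a pushed/pulled object
 * (or delete) has been queued to the local store. It re-adds snap-mapper
 * entries for clones, handles the LOST_REVERT race by rewriting the
 * object_info to the revert event version, rolls the log forward past the
 * recovered version if needed, and on the primary marks the object as
 * recovered and requeues any ops waiting on it. Note the three completions
 * registered on the transaction: an on-applied-sync ondisk-write-unlock,
 * an on-applied notification, and an on-commit notification carrying
 * last_complete.
 */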
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
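
/*
 * Worked example (illustrative values, not taken from this file): with
 * osd_max_pg_log_entries = 10000 and
 * osd_force_recovery_pg_log_entries_factor = 1.3, the oldest missing
 * object (across the primary and all actingbackfill peers) is kicked
 * into recovery once the PG log grows past ~13000 entries, bounding how
 * long the log can keep growing while objects remain missing.
 */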

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
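
/*
 * Filter naming sketch (hypothetical names): the built-in types are
 * "parent" and "plain"; any other type must look like "<class>.<filter>",
 * e.g. a type of "hello.sayhi" opens object class "hello" and asks it for
 * its registered filter "sayhi", after which init() consumes the rest of
 * the encoded parameters.
 */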


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get()); // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

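        // Merge the backend listing (sentries) with this PG's missing set
        // so objects that are temporarily missing locally still show up;
        // each iteration advances whichever cursor holds the smaller object.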
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
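
/*
 * Worked example (illustrative numbers): with target = 3000 and an
 * approximate log size of 3500, num_to_trim = 500. If 500 is below
 * osd_pg_log_trim_min nothing is trimmed this round; otherwise
 * pg_trim_to advances past the 500 oldest entries, clamped to
 * limit = MIN(min_last_complete_ondisk, can_rollback_to).
 */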

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
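
/*
 * Note the clamping above: the acked [begin, end) range is intersected
 * with the PG's current hobject_t span, presumably so an ack computed
 * against a stale (e.g. pre-split) range never releases backoffs that
 * now belong to a different PG. E.g. if this PG now spans [B, E) and
 * the client acks [A, F), only [B, E) is released.
 */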

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE();
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();  // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  hobject_t head = m->get_hobj();
  head.snap = CEPH_NOSNAP;

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  SessionRef session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }
    session->put();  // get_priv() takes a ref, and so does the intrusive_ptr

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (op->rmw_flags == 0) {
    int r = osd->osd->init_op_flags(op);
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_replica())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

1971 // discard due to cluster full transition? (we discard any op that
1972 // originates before the cluster or pool is marked full; the client
1973 // will resend after the full flag is removed or if they expect the
1974 // op to succeed despite being full). The exceptions are FULL_FORCE
1975 // and FULL_TRY ops, which there is no reason to discard because they
1976 // bypass all full checks anyway. If this op isn't write-ordered,
1977 // we skip this check.
1978 // FIXME: we exclude mds writes for now.
1979 if (write_ordered && !(m->get_source().is_mds() ||
1980 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1981 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1982 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1983 dout(10) << __func__ << " discarding op sent before full " << m << " "
1984 << *m << dendl;
1985 return;
1986 }
1987 // The MDS should have stopped writing before this point. We can't
1988 // allow the OSD to become non-startable even if the MDS could still
1989 // be writing as part of file removals.
1990 ostringstream ss;
1991 if (write_ordered && osd->check_failsafe_full(ss)) {
1992 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1993 << ss.str()
1994 << dendl;
1995 return;
1996 }
1997 int64_t poolid = get_pgid().pool();
1998 if (op->may_write()) {
1999
2000 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2001 if (!pi) {
2002 return;
2003 }
2004
2005 // invalid?
2006 if (m->get_snapid() != CEPH_NOSNAP) {
2007 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2008 osd->reply_op_error(op, -EINVAL);
2009 return;
2010 }
2011
2012 // too big?
2013 if (cct->_conf->osd_max_write_size &&
2014 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2015 // journal can't hold commit!
2016 derr << "do_op msg data len " << m->get_data_len()
2017 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2018 << " on " << *m << dendl;
2019 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2020 return;
2021 }
2022 }
2023
2024 dout(10) << "do_op " << *m
2025 << (op->may_write() ? " may_write" : "")
2026 << (op->may_read() ? " may_read" : "")
2027 << (op->may_cache() ? " may_cache" : "")
2028 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2029 << " flags " << ceph_osd_flag_string(m->get_flags())
2030 << dendl;
2031
2032 // missing object?
2033 if (is_unreadable_object(head)) {
2034 if (!is_primary()) {
2035 osd->reply_op_error(op, -EAGAIN);
2036 return;
2037 }
2038 if (can_backoff &&
2039 (g_conf->osd_backoff_on_degraded ||
2040 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2041 add_backoff(session, head, head);
2042 maybe_kick_recovery(head);
2043 } else {
2044 wait_for_unreadable_object(head, op);
2045 }
2046 return;
2047 }
2048
2049 // degraded object?
2050 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2051 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2052 add_backoff(session, head, head);
2053 maybe_kick_recovery(head);
2054 } else {
2055 wait_for_degraded_object(head, op);
2056 }
2057 return;
2058 }
2059
2060 if (write_ordered &&
2061 scrubber.write_blocked_by_scrub(head)) {
2062 dout(20) << __func__ << ": waiting for scrub" << dendl;
2063 waiting_for_scrub.push_back(op);
2064 op->mark_delayed("waiting for scrub");
2065 return;
2066 }
2067
2068 // blocked on snap?
2069 map<hobject_t, snapid_t>::iterator blocked_iter =
2070 objects_blocked_on_degraded_snap.find(head);
2071 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2072 hobject_t to_wait_on(head);
2073 to_wait_on.snap = blocked_iter->second;
2074 wait_for_degraded_object(to_wait_on, op);
2075 return;
2076 }
2077 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2078 objects_blocked_on_snap_promotion.find(head);
2079 if (write_ordered &&
2080 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2081 wait_for_blocked_object(
2082 blocked_snap_promote_iter->second->obs.oi.soid,
2083 op);
2084 return;
2085 }
2086 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2087 block_write_on_full_cache(head, op);
2088 return;
2089 }
2090
2091 // missing snapdir?
2092 hobject_t snapdir = head.get_snapdir();
2093
2094 if (is_unreadable_object(snapdir)) {
2095 wait_for_unreadable_object(snapdir, op);
2096 return;
2097 }
2098
2099 // degraded object?
2100 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2101 wait_for_degraded_object(snapdir, op);
2102 return;
2103 }
2104
2105 // dup/resent?
2106 if (op->may_write() || op->may_cache()) {
2107 // warning: we will get back *a* request for this reqid, but not
2108 // necessarily the most recent. this happens with flush and
2109 // promote ops, but we can't possibly have both in our log while
2110 // the original request is still not stable on disk, so it doesn't
2111 // matter which one we get. (dup-check shape is sketched after do_op.)
2112 eversion_t version;
2113 version_t user_version;
2114 int return_code = 0;
2115 bool got = check_in_progress_op(
2116 m->get_reqid(), &version, &user_version, &return_code);
2117 if (got) {
2118 dout(3) << __func__ << " dup " << m->get_reqid()
2119 << " version " << version << dendl;
2120 if (already_complete(version)) {
2121 osd->reply_op_error(op, return_code, version, user_version);
2122 } else {
2123 dout(10) << " waiting for " << version << " to commit" << dendl;
2124 // always queue ondisk waiters, so that we can requeue if needed
2125 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2126 op->mark_delayed("waiting for ondisk");
2127 }
2128 return;
2129 }
2130 }
2131
2132 ObjectContextRef obc;
2133 bool can_create = op->may_write() || op->may_cache();
2134 hobject_t missing_oid;
2135 const hobject_t& oid = m->get_hobj();
2136
2137 // io blocked on obc?
2138 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2139 maybe_await_blocked_snapset(oid, op)) {
2140 return;
2141 }
2142
2143 int r = find_object_context(
2144 oid, &obc, can_create,
2145 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2146 &missing_oid);
2147
2148 if (r == -EAGAIN) {
2149 // If we're not the primary for this PG, the client ends up with
2150 // -EAGAIN; otherwise, we have to wait for the object.
2151 if (is_primary()) {
2152 // missing the specific snap we need; requeue and wait.
2153 assert(!op->may_write()); // only happens on a read/cache
2154 wait_for_unreadable_object(missing_oid, op);
2155 return;
2156 }
2157 } else if (r == 0) {
2158 if (is_unreadable_object(obc->obs.oi.soid)) {
2159 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2160 << " is unreadable, waiting" << dendl;
2161 wait_for_unreadable_object(obc->obs.oi.soid, op);
2162 return;
2163 }
2164
2165 // degraded object? (the check above was for head; this could be a clone)
2166 if (write_ordered &&
2167 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2168 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2169 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2170 << " is degraded, waiting" << dendl;
2171 wait_for_degraded_object(obc->obs.oi.soid, op);
2172 return;
2173 }
2174 }
2175
2176 bool in_hit_set = false;
2177 if (hit_set) {
2178 if (obc.get()) {
2179 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2180 in_hit_set = true;
2181 } else {
2182 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2183 in_hit_set = true;
2184 }
2185 if (!op->hitset_inserted) {
2186 hit_set->insert(oid);
2187 op->hitset_inserted = true;
2188 if (hit_set->is_full() ||
2189 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2190 hit_set_persist();
2191 }
2192 }
2193 }
2194
2195 if (agent_state) {
2196 if (agent_choose_mode(false, op))
2197 return;
2198 }
2199
2200 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2201 if (maybe_handle_manifest(op,
2202 write_ordered,
2203 obc))
2204 return;
2205 }
2206
2207 if (maybe_handle_cache(op,
2208 write_ordered,
2209 obc,
2210 r,
2211 missing_oid,
2212 false,
2213 in_hit_set))
2214 return;
2215
2216 if (r && (r != -ENOENT || !obc)) {
2217 // copy the reqids for copy get on ENOENT
2218 if (r == -ENOENT &&
2219 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2220 fill_in_copy_get_noent(op, oid, m->ops[0]);
2221 return;
2222 }
2223 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2224 if (op->may_write() &&
2225 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2226 record_write_error(op, oid, nullptr, r);
2227 } else {
2228 osd->reply_op_error(op, r);
2229 }
2230 return;
2231 }
2232
2233 // make sure locator is consistent
2234 object_locator_t oloc(obc->obs.oi.soid);
2235 if (m->get_object_locator() != oloc) {
2236 dout(10) << " provided locator " << m->get_object_locator()
2237 << " != object's " << obc->obs.oi.soid << dendl;
2238 osd->clog->warn() << "bad locator " << m->get_object_locator()
2239 << " on object " << oloc
2240 << " op " << *m;
2241 }
2242
2243 // io blocked on obc?
2244 if (obc->is_blocked() &&
2245 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2246 wait_for_blocked_object(obc->obs.oi.soid, op);
2247 return;
2248 }
2249
2250 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2251
2252 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2253 OSDOp& osd_op = *p;
2254
2255 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2256 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2257 m->get_snapid() != CEPH_SNAPDIR) {
2258 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2259 osd->reply_op_error(op, -EINVAL);
2260 return;
2261 }
2262 }
2263
2264 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2265
2266 if (!obc->obs.exists)
2267 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2268
2269 /* Due to obc caching, we might have a cached non-existent snapset_obc
2270 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2271 * do_op pipeline make decisions based on whether snapset_obc is
2272 * populated.
2273 */
2274 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2275 ctx->snapset_obc = ObjectContextRef();
2276
2277 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2278 dout(20) << __func__ << ": skipping rw locks" << dendl;
2279 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2280 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2281
2282 // verify there is in fact a flush in progress
2283 // FIXME: we could make this a stronger test.
2284 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2285 if (p == flush_ops.end()) {
2286 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2287 reply_ctx(ctx, -EINVAL);
2288 return;
2289 }
2290 } else if (!get_rw_locks(write_ordered, ctx)) {
2291 dout(20) << __func__ << " waiting for rw locks " << dendl;
2292 op->mark_delayed("waiting for rw locks");
2293 close_op_ctx(ctx);
2294 return;
2295 }
2296 dout(20) << __func__ << " obc " << *obc << dendl;
2297
2298 if (r) {
2299 dout(20) << __func__ << " returned an error: " << r << dendl;
2300 close_op_ctx(ctx);
2301 if (op->may_write() &&
2302 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2303 record_write_error(op, oid, nullptr, r);
2304 } else {
2305 osd->reply_op_error(op, r);
2306 }
2307 return;
2308 }
2309
2310 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2311 ctx->ignore_cache = true;
2312 }
2313
2314 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2315 // This object is lost. Reading from it returns an error.
2316 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2317 << " is lost" << dendl;
2318 reply_ctx(ctx, -ENFILE);
2319 return;
2320 }
2321 if (!op->may_write() &&
2322 !op->may_cache() &&
2323 (!obc->obs.exists ||
2324 ((m->get_snapid() != CEPH_SNAPDIR) &&
2325 obc->obs.oi.is_whiteout()))) {
2326 // copy the reqids for copy get on ENOENT
2327 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2328 fill_in_copy_get_noent(op, oid, m->ops[0]);
2329 close_op_ctx(ctx);
2330 return;
2331 }
2332 reply_ctx(ctx, -ENOENT);
2333 return;
2334 }
2335
2336 op->mark_started();
2337
2338 execute_ctx(ctx);
2339 utime_t prepare_latency = ceph_clock_now();
2340 prepare_latency -= op->get_dequeued_time();
2341 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2342 if (op->may_read() && op->may_write()) {
2343 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2344 } else if (op->may_read()) {
2345 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2346 } else if (op->may_write() || op->may_cache()) {
2347 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2348 }
2349
2350 // force recovery of the oldest missing object if too many logs
2351 maybe_force_recovery();
2352 }
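// do_op's dup/resent path above relies on check_in_progress_op(): if the
// client reqid was already applied we reply immediately, otherwise the op is
// parked on waiting_for_ondisk. A minimal sketch of that dup-detection shape
// (DupIndex and its reqid pair are hypothetical; the real lookup walks the
// pg log and in-progress repops):
namespace dup_op_sketch {
  using ReqId = std::pair<uint64_t, uint64_t>;  // (client id, tid) stand-in

  struct Result {
    uint64_t version = 0;
    int return_code = 0;
  };

  struct DupIndex {
    std::map<ReqId, Result> seen;  // hypothetical completed-op index
    // Returns true and fills *out if this reqid was already applied; the
    // caller can then resend the cached reply instead of re-executing.
    bool check(const ReqId& rid, Result* out) const {
      auto it = seen.find(rid);
      if (it == seen.end())
        return false;
      *out = it->second;
      return true;
    }
  };
}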
2353
2354 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2355 OpRequestRef op,
2356 bool write_ordered,
2357 ObjectContextRef obc)
2358 {
2359 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2360 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2361 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2362 return cache_result_t::NOOP;
2363 }
2364
2365 if (obc)
2366 dout(10) << __func__ << " " << obc->obs.oi << " "
2367 << (obc->obs.exists ? "exists" : "DNE")
2368 << dendl;
2369
2370 // if it is write-ordered and blocked, stop now
2371 if (obc.get() && obc->is_blocked() && write_ordered) {
2372 // we're already doing something with this object
2373 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2374 return cache_result_t::NOOP;
2375 }
2376
2377 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2378 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2379 OSDOp& osd_op = *p;
2380 ceph_osd_op& op = osd_op.op;
2381 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2382 return cache_result_t::NOOP;
2383 }
2384 }
2385
2386 switch (obc->obs.oi.manifest.type) {
2387 case object_manifest_t::TYPE_REDIRECT:
2388 if (op->may_write() || write_ordered) {
2389 do_proxy_write(op, obc->obs.oi.soid, obc);
2390 } else {
2391 do_proxy_read(op, obc);
2392 }
2393 return cache_result_t::HANDLED_PROXY;
2394 case object_manifest_t::TYPE_CHUNKED:
2395 default:
2396 assert(0 == "unrecognized manifest type");
2397 }
2398
2399 return cache_result_t::NOOP;
2400 }
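// The manifest dispatch above reduces to: REDIRECT manifests get proxied
// (writes when the op is write-ordered, reads otherwise); TYPE_CHUNKED is
// unhandled in this release. A compressed sketch of that decision as a pure
// function (the enums are illustrative):
namespace manifest_sketch {
  enum class ManifestType { REDIRECT, CHUNKED };
  enum class Action { PROXY_WRITE, PROXY_READ, UNSUPPORTED };

  Action choose(ManifestType t, bool may_write, bool write_ordered) {
    if (t == ManifestType::REDIRECT)
      return (may_write || write_ordered) ? Action::PROXY_WRITE
                                          : Action::PROXY_READ;
    return Action::UNSUPPORTED;  // TYPE_CHUNKED asserts out above
  }
}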
2401
2402 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2403 MOSDOpReply *orig_reply, int r)
2404 {
2405 dout(20) << __func__ << " r=" << r << dendl;
2406 assert(op->may_write());
2407 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2408 mempool::osd_pglog::list<pg_log_entry_t> entries;
2409 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2410 get_next_version(), eversion_t(), 0,
2411 reqid, utime_t(), r));
2412
2413 struct OnComplete {
2414 PrimaryLogPG *pg;
2415 OpRequestRef op;
2416 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2417 int r;
2418 OnComplete(
2419 PrimaryLogPG *pg,
2420 OpRequestRef op,
2421 MOSDOpReply *orig_reply,
2422 int r)
2423 : pg(pg), op(op),
2424 orig_reply(orig_reply, false /* take over ref */), r(r)
2425 {}
2426 void operator()() {
2427 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2428 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2429 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2430 MOSDOpReply *reply = orig_reply.detach();
2431 if (reply == nullptr) {
2432 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2433 flags, true);
2434 }
2435 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2436 pg->osd->send_message_osd_client(reply, m->get_connection());
2437 }
2438 };
2439
2440 ObcLockManager lock_manager;
2441 submit_log_entries(
2442 entries,
2443 std::move(lock_manager),
2444 boost::optional<std::function<void(void)> >(
2445 OnComplete(this, op, orig_reply, r)),
2446 op,
2447 r);
2448 }
2449
2450 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2451 OpRequestRef op,
2452 bool write_ordered,
2453 ObjectContextRef obc,
2454 int r, hobject_t missing_oid,
2455 bool must_promote,
2456 bool in_hit_set,
2457 ObjectContextRef *promote_obc)
2458 {
2459 // return quickly if caching is not enabled
2460 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2461 return cache_result_t::NOOP;
2462
2463 if (op &&
2464 op->get_req() &&
2465 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2466 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2467 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2468 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2469 return cache_result_t::NOOP;
2470 }
2471
2472 must_promote = must_promote || op->need_promote();
2473
2474 if (obc)
2475 dout(25) << __func__ << " " << obc->obs.oi << " "
2476 << (obc->obs.exists ? "exists" : "DNE")
2477 << " missing_oid " << missing_oid
2478 << " must_promote " << (int)must_promote
2479 << " in_hit_set " << (int)in_hit_set
2480 << dendl;
2481 else
2482 dout(25) << __func__ << " (no obc)"
2483 << " missing_oid " << missing_oid
2484 << " must_promote " << (int)must_promote
2485 << " in_hit_set " << (int)in_hit_set
2486 << dendl;
2487
2488 // if it is write-ordered and blocked, stop now
2489 if (obc.get() && obc->is_blocked() && write_ordered) {
2490 // we're already doing something with this object
2491 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2492 return cache_result_t::NOOP;
2493 }
2494
2495 if (r == -ENOENT && missing_oid == hobject_t()) {
2496 // we know this object is logically absent (e.g., an undefined clone)
2497 return cache_result_t::NOOP;
2498 }
2499
2500 if (obc.get() && obc->obs.exists) {
2501 osd->logger->inc(l_osd_op_cache_hit);
2502 return cache_result_t::NOOP;
2503 }
2504 if (!is_primary()) {
2505 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2506 osd->reply_op_error(op, -EAGAIN);
2507 return cache_result_t::REPLIED_WITH_EAGAIN;
2508 }
2509
2510 if (missing_oid == hobject_t() && obc.get()) {
2511 missing_oid = obc->obs.oi.soid;
2512 }
2513
2514 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2515 const object_locator_t oloc = m->get_object_locator();
2516
2517 if (op->need_skip_handle_cache()) {
2518 return cache_result_t::NOOP;
2519 }
2520
2521 // older versions do not proxy the feature bits.
2522 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2523 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2524 OpRequestRef promote_op;
2525
2526 switch (pool.info.cache_mode) {
2527 case pg_pool_t::CACHEMODE_WRITEBACK:
2528 if (agent_state &&
2529 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2530 if (!op->may_write() && !op->may_cache() &&
2531 !write_ordered && !must_promote) {
2532 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2533 do_proxy_read(op);
2534 return cache_result_t::HANDLED_PROXY;
2535 }
2536 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2537 block_write_on_full_cache(missing_oid, op);
2538 return cache_result_t::BLOCKED_FULL;
2539 }
2540
2541 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2542 promote_object(obc, missing_oid, oloc, op, promote_obc);
2543 return cache_result_t::BLOCKED_PROMOTE;
2544 }
2545
2546 if (op->may_write() || op->may_cache()) {
2547 if (can_proxy_write) {
2548 do_proxy_write(op, missing_oid);
2549 } else {
2550 // promote if we can't proxy the write
2551 promote_object(obc, missing_oid, oloc, op, promote_obc);
2552 return cache_result_t::BLOCKED_PROMOTE;
2553 }
2554
2555 // Promote too?
2556 if (!op->need_skip_promote() &&
2557 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2558 pool.info.min_write_recency_for_promote,
2559 OpRequestRef(),
2560 promote_obc)) {
2561 return cache_result_t::BLOCKED_PROMOTE;
2562 }
2563 return cache_result_t::HANDLED_PROXY;
2564 } else {
2565 do_proxy_read(op);
2566
2567 // Avoid duplicate promotion
2568 if (obc.get() && obc->is_blocked()) {
2569 if (promote_obc)
2570 *promote_obc = obc;
2571 return cache_result_t::BLOCKED_PROMOTE;
2572 }
2573
2574 // Promote too?
2575 if (!op->need_skip_promote()) {
2576 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2577 pool.info.min_read_recency_for_promote,
2578 promote_op, promote_obc);
2579 }
2580
2581 return cache_result_t::HANDLED_PROXY;
2582 }
2583 assert(0 == "unreachable");
2584 return cache_result_t::NOOP;
2585
2586 case pg_pool_t::CACHEMODE_FORWARD:
2587 // FIXME: this mode allows requests to be reordered.
2588 do_cache_redirect(op);
2589 return cache_result_t::HANDLED_REDIRECT;
2590
2591 case pg_pool_t::CACHEMODE_READONLY:
2592 // TODO: clean this case up
2593 if (!obc.get() && r == -ENOENT) {
2594 // we don't have the object and op's a read
2595 promote_object(obc, missing_oid, oloc, op, promote_obc);
2596 return cache_result_t::BLOCKED_PROMOTE;
2597 }
2598 if (!r) { // it must be a write
2599 do_cache_redirect(op);
2600 return cache_result_t::HANDLED_REDIRECT;
2601 }
2602 // crap, there was a failure of some kind
2603 return cache_result_t::NOOP;
2604
2605 case pg_pool_t::CACHEMODE_READFORWARD:
2606 // Do writeback to the cache tier for writes
2607 if (op->may_write() || write_ordered || must_promote) {
2608 if (agent_state &&
2609 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2610 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2611 block_write_on_full_cache(missing_oid, op);
2612 return cache_result_t::BLOCKED_FULL;
2613 }
2614 promote_object(obc, missing_oid, oloc, op, promote_obc);
2615 return cache_result_t::BLOCKED_PROMOTE;
2616 }
2617
2618 // It is a read; forward (redirect) it to the base pool
2619 do_cache_redirect(op);
2620 return cache_result_t::HANDLED_REDIRECT;
2621
2622 case pg_pool_t::CACHEMODE_PROXY:
2623 if (!must_promote) {
2624 if (op->may_write() || op->may_cache() || write_ordered) {
2625 if (can_proxy_write) {
2626 do_proxy_write(op, missing_oid);
2627 return cache_result_t::HANDLED_PROXY;
2628 }
2629 } else {
2630 do_proxy_read(op);
2631 return cache_result_t::HANDLED_PROXY;
2632 }
2633 }
2634 // ugh, we're forced to promote.
2635 if (agent_state &&
2636 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2637 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2638 block_write_on_full_cache(missing_oid, op);
2639 return cache_result_t::BLOCKED_FULL;
2640 }
2641 promote_object(obc, missing_oid, oloc, op, promote_obc);
2642 return cache_result_t::BLOCKED_PROMOTE;
2643
2644 case pg_pool_t::CACHEMODE_READPROXY:
2645 // Do writeback to the cache tier for writes
2646 if (op->may_write() || write_ordered || must_promote) {
2647 if (agent_state &&
2648 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2649 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2650 block_write_on_full_cache(missing_oid, op);
2651 return cache_result_t::BLOCKED_FULL;
2652 }
2653 promote_object(obc, missing_oid, oloc, op, promote_obc);
2654 return cache_result_t::BLOCKED_PROMOTE;
2655 }
2656
2657 // It is a read; proxy it to the base pool
2658 do_proxy_read(op);
2659 return cache_result_t::HANDLED_PROXY;
2660
2661 default:
2662 assert(0 == "unrecognized cache_mode");
2663 }
2664 return cache_result_t::NOOP;
2665 }
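// maybe_handle_cache_detail above is one large per-cache-mode switch. As a
// much-reduced sketch, here is the skeleton of the WRITEBACK leg only,
// deliberately omitting hit_set/recency handling and the skip-promote flags
// (Outcome and the booleans are illustrative):
namespace cache_mode_sketch {
  enum class Outcome { PROXY, BLOCK_FULL, PROMOTE };

  Outcome writeback(bool cache_full, bool is_write, bool can_proxy_write,
                    bool must_promote) {
    if (cache_full)
      return is_write ? Outcome::BLOCK_FULL : Outcome::PROXY;
    if (must_promote)
      return Outcome::PROMOTE;
    if (is_write && !can_proxy_write)
      return Outcome::PROMOTE;   // can't proxy the write, so promote instead
    return Outcome::PROXY;       // proxy now; a promotion may still be queued
  }
}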
2666
2667 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2668 const hobject_t& missing_oid,
2669 const object_locator_t& oloc,
2670 bool in_hit_set,
2671 uint32_t recency,
2672 OpRequestRef promote_op,
2673 ObjectContextRef *promote_obc)
2674 {
2675 dout(20) << __func__ << " missing_oid " << missing_oid
2676 << " in_hit_set " << in_hit_set << dendl;
2677
2678 switch (recency) {
2679 case 0:
2680 break;
2681 case 1:
2682 // Check if in the current hit set
2683 if (in_hit_set) {
2684 break;
2685 } else {
2686 // not promoting
2687 return false;
2688 }
2689 break;
2690 default:
2691 {
2692 unsigned count = (int)in_hit_set;
2693 if (count) {
2694 // Check if in other hit sets
2695 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2696 for (map<time_t,HitSetRef>::reverse_iterator itor =
2697 agent_state->hit_set_map.rbegin();
2698 itor != agent_state->hit_set_map.rend();
2699 ++itor) {
2700 if (!itor->second->contains(oid)) {
2701 break;
2702 }
2703 ++count;
2704 if (count >= recency) {
2705 break;
2706 }
2707 }
2708 }
2709 if (count >= recency) {
2710 break;
2711 }
2712 return false; // not promoting
2713 }
2714 break;
2715 }
2716
2717 if (osd->promote_throttle()) {
2718 dout(10) << __func__ << " promote throttled" << dendl;
2719 return false;
2720 }
2721 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2722 return true;
2723 }
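// The recency test above wants the object present in the `recency` most
// recent hit sets, counting back from the newest and stopping at the first
// miss. A standalone sketch of that consecutive-count rule (the newest-first
// vector layout is illustrative):
namespace recency_sketch {
  // hit[0] is the current hit set; larger indices are older archived sets.
  bool should_promote(const std::vector<bool>& hit, uint32_t recency) {
    if (recency == 0)
      return true;               // recency 0: promote on any access
    uint32_t count = 0;
    for (bool h : hit) {
      if (!h)
        break;                   // a gap ends the consecutive run
      if (++count >= recency)
        return true;
    }
    return false;
  }
}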
2724
2725 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2726 {
2727 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2728 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2729 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2730 get_osdmap()->get_epoch(), flags, false);
2731 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2732 reply->set_redirect(redir);
2733 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2734 << op << dendl;
2735 m->get_connection()->send_message(reply);
2736 return;
2737 }
2738
2739 struct C_ProxyRead : public Context {
2740 PrimaryLogPGRef pg;
2741 hobject_t oid;
2742 epoch_t last_peering_reset;
2743 ceph_tid_t tid;
2744 PrimaryLogPG::ProxyReadOpRef prdop;
2745 utime_t start;
2746 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2747 const PrimaryLogPG::ProxyReadOpRef& prd)
2748 : pg(p), oid(o), last_peering_reset(lpr),
2749 tid(0), prdop(prd), start(ceph_clock_now())
2750 {}
2751 void finish(int r) override {
2752 if (prdop->canceled)
2753 return;
2754 pg->lock();
2755 if (prdop->canceled) {
2756 pg->unlock();
2757 return;
2758 }
2759 if (last_peering_reset == pg->get_last_peering_reset()) {
2760 pg->finish_proxy_read(oid, tid, r);
2761 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2762 }
2763 pg->unlock();
2764 }
2765 };
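#include <mutex>  // sketch-only dependency

// C_ProxyRead above is an instance of the guarded-completion pattern used
// for objecter callbacks: test the canceled flag before and after taking the
// PG lock, and only deliver the result if no peering reset happened since
// submission. A generic sketch of that pattern (Target is illustrative):
namespace guarded_completion_sketch {
  struct Target {
    std::mutex mtx;
    uint64_t last_reset = 0;     // bumped on every peering reset
    bool canceled = false;
    void deliver(int r) { (void)r; /* act on the result */ }
  };

  void finish(Target& t, uint64_t reset_at_submit, int r) {
    if (t.canceled)              // cheap unlocked early-out
      return;
    std::lock_guard<std::mutex> l(t.mtx);
    if (t.canceled)              // recheck under the lock
      return;
    if (reset_at_submit == t.last_reset)
      t.deliver(r);              // still in the same peering interval
  }
}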
2766
2767 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2768 {
2769 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2770 // stash the result in the request's OSDOp vector
2771 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2772 object_locator_t oloc;
2773 hobject_t soid;
2774 /* extensible tier */
2775 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2776 switch (obc->obs.oi.manifest.type) {
2777 case object_manifest_t::TYPE_REDIRECT:
2778 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2779 soid = obc->obs.oi.manifest.redirect_target;
2780 break;
2781 case object_manifest_t::TYPE_CHUNKED:
2782 default:
2783 assert(0 == "unrecognized manifest type");
2784 }
2785 } else {
2786 /* proxy */
2787 soid = m->get_hobj();
2788 oloc = object_locator_t(m->get_object_locator());
2789 oloc.pool = pool.info.tier_of;
2790 }
2791 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2792
2793 // pass through some original flags that make sense.
2794 // - leave out redirection and balancing flags since we are
2795 // already proxying through the primary
2796 // - leave off read/write/exec flags that are derived from the op
2797 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2798 CEPH_OSD_FLAG_ORDERSNAP |
2799 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2800 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2801
2802 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2803
2804 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2805
2806 ObjectOperation obj_op;
2807 obj_op.dup(prdop->ops);
2808
2809 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2810 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2811 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2812 ceph_osd_op op = obj_op.ops[i].op;
2813 switch (op.op) {
2814 case CEPH_OSD_OP_READ:
2815 case CEPH_OSD_OP_SYNC_READ:
2816 case CEPH_OSD_OP_SPARSE_READ:
2817 case CEPH_OSD_OP_CHECKSUM:
2818 case CEPH_OSD_OP_CMPEXT:
2819 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2820 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2821 }
2822 }
2823 }
2824
2825 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2826 prdop);
2827 ceph_tid_t tid = osd->objecter->read(
2828 soid.oid, oloc, obj_op,
2829 m->get_snapid(), NULL,
2830 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2831 &prdop->user_version,
2832 &prdop->data_offset,
2833 m->get_features());
2834 fin->tid = tid;
2835 prdop->objecter_tid = tid;
2836 proxyread_ops[tid] = prdop;
2837 in_progress_proxy_ops[soid].push_back(op);
2838 }
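// do_proxy_read above builds the objecter flags by OR-ing the cache-bypass
// bits with a whitelist of ordering/snap-related client flags; balancing and
// redirect flags are deliberately dropped since we already proxy through the
// primary. A sketch of that masking (the flag values are illustrative, not
// the wire constants):
namespace proxy_flags_sketch {
  enum : unsigned {
    F_IGNORE_CACHE   = 1u << 0,
    F_IGNORE_OVERLAY = 1u << 1,
    F_RWORDERED      = 1u << 2,
    F_ORDERSNAP      = 1u << 3,
    F_ENFORCE_SNAPC  = 1u << 4,
    F_MAP_SNAP_CLONE = 1u << 5,
  };

  unsigned proxy_read_flags(unsigned client_flags) {
    unsigned flags = F_IGNORE_CACHE | F_IGNORE_OVERLAY;  // always bypass tiering
    flags |= client_flags & (F_RWORDERED | F_ORDERSNAP |
                             F_ENFORCE_SNAPC | F_MAP_SNAP_CLONE);
    return flags;
  }
}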
2839
2840 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2841 {
2842 dout(10) << __func__ << " " << oid << " tid " << tid
2843 << " " << cpp_strerror(r) << dendl;
2844
2845 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2846 if (p == proxyread_ops.end()) {
2847 dout(10) << __func__ << " no proxyread_op found" << dendl;
2848 return;
2849 }
2850 ProxyReadOpRef prdop = p->second;
2851 if (tid != prdop->objecter_tid) {
2852 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2853 << " tid " << prdop->objecter_tid << dendl;
2854 return;
2855 }
2856 if (oid != prdop->soid) {
2857 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2858 << " soid " << prdop->soid << dendl;
2859 return;
2860 }
2861 proxyread_ops.erase(tid);
2862
2863 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2864 if (q == in_progress_proxy_ops.end()) {
2865 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2866 return;
2867 }
2868 assert(q->second.size());
2869 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2870 q->second.end(),
2871 prdop->op);
2872 assert(it != q->second.end());
2873 OpRequestRef op = *it;
2874 q->second.erase(it);
2875 if (q->second.size() == 0) {
2876 in_progress_proxy_ops.erase(oid);
2877 }
2878
2879 osd->logger->inc(l_osd_tier_proxy_read);
2880
2881 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2882 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2883 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2884 ctx->user_at_version = prdop->user_version;
2885 ctx->data_off = prdop->data_offset;
2886 ctx->ignore_log_op_stats = true;
2887 complete_read_ctx(r, ctx);
2888 }
2889
2890 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2891 {
2892 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2893 if (p == in_progress_proxy_ops.end())
2894 return;
2895
2896 list<OpRequestRef>& ls = p->second;
2897 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2898 requeue_ops(ls);
2899 in_progress_proxy_ops.erase(p);
2900 }
2901
2902 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2903 {
2904 dout(10) << __func__ << " " << prdop->soid << dendl;
2905 prdop->canceled = true;
2906
2907 // cancel objecter op, if we can
2908 if (prdop->objecter_tid) {
2909 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2910 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2911 prdop->ops[i].outdata.clear();
2912 }
2913 proxyread_ops.erase(prdop->objecter_tid);
2914 prdop->objecter_tid = 0;
2915 }
2916 }
2917
2918 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2919 {
2920 dout(10) << __func__ << dendl;
2921
2922 // cancel proxy reads
2923 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2924 while (p != proxyread_ops.end()) {
2925 cancel_proxy_read((p++)->second);
2926 }
2927
2928 // cancel proxy writes
2929 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2930 while (q != proxywrite_ops.end()) {
2931 cancel_proxy_write((q++)->second);
2932 }
2933
2934 if (requeue) {
2935 map<hobject_t, list<OpRequestRef>>::iterator p =
2936 in_progress_proxy_ops.begin();
2937 while (p != in_progress_proxy_ops.end()) {
2938 list<OpRequestRef>& ls = p->second;
2939 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2940 << " requests" << dendl;
2941 requeue_ops(ls);
2942 in_progress_proxy_ops.erase(p++);
2943 }
2944 } else {
2945 in_progress_proxy_ops.clear();
2946 }
2947 }
2948
2949 struct C_ProxyWrite_Commit : public Context {
2950 PrimaryLogPGRef pg;
2951 hobject_t oid;
2952 epoch_t last_peering_reset;
2953 ceph_tid_t tid;
2954 PrimaryLogPG::ProxyWriteOpRef pwop;
2955 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2956 const PrimaryLogPG::ProxyWriteOpRef& pw)
2957 : pg(p), oid(o), last_peering_reset(lpr),
2958 tid(0), pwop(pw)
2959 {}
2960 void finish(int r) override {
2961 if (pwop->canceled)
2962 return;
2963 pg->lock();
2964 if (pwop->canceled) {
2965 pg->unlock();
2966 return;
2967 }
2968 if (last_peering_reset == pg->get_last_peering_reset()) {
2969 pg->finish_proxy_write(oid, tid, r);
2970 }
2971 pg->unlock();
2972 }
2973 };
2974
2975 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2976 {
2977 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2978 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2979 object_locator_t oloc;
2980 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2981 hobject_t soid;
2982 /* extensible tier */
2983 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2984 switch (obc->obs.oi.manifest.type) {
2985 case object_manifest_t::TYPE_REDIRECT:
2986 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2987 soid = obc->obs.oi.manifest.redirect_target;
2988 break;
2989 case object_manifest_t::TYPE_CHUNKED:
2990 default:
2991 assert(0 == "unrecognized manifest type");
2992 }
2993 } else {
2994 /* proxy */
2995 soid = m->get_hobj();
2996 oloc = object_locator_t(m->get_object_locator());
2997 oloc.pool = pool.info.tier_of;
2998 }
2999
3000 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3001 if (!(op->may_write() || op->may_cache())) {
3002 flags |= CEPH_OSD_FLAG_RWORDERED;
3003 }
3004 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3005
3006 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3007 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3008 pwop->mtime = m->get_mtime();
3009
3010 ObjectOperation obj_op;
3011 obj_op.dup(pwop->ops);
3012
3013 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3014 this, soid, get_last_peering_reset(), pwop);
3015 ceph_tid_t tid = osd->objecter->mutate(
3016 soid.oid, oloc, obj_op, snapc,
3017 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3018 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3019 &pwop->user_version, pwop->reqid);
3020 fin->tid = tid;
3021 pwop->objecter_tid = tid;
3022 proxywrite_ops[tid] = pwop;
3023 in_progress_proxy_ops[soid].push_back(op);
3024 }
3025
3026 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3027 {
3028 dout(10) << __func__ << " " << oid << " tid " << tid
3029 << " " << cpp_strerror(r) << dendl;
3030
3031 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3032 if (p == proxywrite_ops.end()) {
3033 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3034 return;
3035 }
3036 ProxyWriteOpRef pwop = p->second;
3037 assert(tid == pwop->objecter_tid);
3038 assert(oid == pwop->soid);
3039
3040 proxywrite_ops.erase(tid);
3041
3042 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3043 if (q == in_progress_proxy_ops.end()) {
3044 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3045 delete pwop->ctx;
3046 pwop->ctx = NULL;
3047 return;
3048 }
3049 list<OpRequestRef>& in_progress_op = q->second;
3050 assert(in_progress_op.size());
3051 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3052 in_progress_op.end(),
3053 pwop->op);
3054 assert(it != in_progress_op.end());
3055 in_progress_op.erase(it);
3056 if (in_progress_op.size() == 0) {
3057 in_progress_proxy_ops.erase(oid);
3058 }
3059
3060 osd->logger->inc(l_osd_tier_proxy_write);
3061
3062 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3063 assert(m != NULL);
3064
3065 if (!pwop->sent_reply) {
3066 // send commit.
3067 MOSDOpReply *reply = pwop->ctx->reply;
3068 if (reply)
3069 pwop->ctx->reply = NULL;
3070 else {
3071 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3072 reply->set_reply_versions(eversion_t(), pwop->user_version);
3073 }
3074 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3075 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3076 osd->send_message_osd_client(reply, m->get_connection());
3077 pwop->sent_reply = true;
3078 pwop->ctx->op->mark_commit_sent();
3079 }
3080
3081 delete pwop->ctx;
3082 pwop->ctx = NULL;
3083 }
3084
3085 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3086 {
3087 dout(10) << __func__ << " " << pwop->soid << dendl;
3088 pwop->canceled = true;
3089
3090 // cancel objecter op, if we can
3091 if (pwop->objecter_tid) {
3092 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3093 delete pwop->ctx;
3094 pwop->ctx = NULL;
3095 proxywrite_ops.erase(pwop->objecter_tid);
3096 pwop->objecter_tid = 0;
3097 }
3098 }
3099
3100 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3101 ObjectContextRef obc;
3102 PrimaryLogPG *pg;
3103 utime_t start;
3104 public:
3105 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3106 : obc(obc_),
3107 pg(pg_),
3108 start(ceph_clock_now()) {}
3109
3110 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3111 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3112 int r = results.get<0>();
3113 pg->finish_promote(r, results_data, obc);
3114 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3115 }
3116 };
3117
3118 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3119 const hobject_t& missing_oid,
3120 const object_locator_t& oloc,
3121 OpRequestRef op,
3122 ObjectContextRef *promote_obc)
3123 {
3124 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3125 assert(hoid != hobject_t());
3126 if (scrubber.write_blocked_by_scrub(hoid)) {
3127 dout(10) << __func__ << " " << hoid
3128 << " blocked by scrub" << dendl;
3129 if (op) {
3130 waiting_for_scrub.push_back(op);
3131 op->mark_delayed("waiting for scrub");
3132 dout(10) << __func__ << " " << hoid
3133 << " placing op in waiting_for_scrub" << dendl;
3134 } else {
3135 dout(10) << __func__ << " " << hoid
3136 << " no op, dropping on the floor" << dendl;
3137 }
3138 return;
3139 }
3140 if (!obc) { // we need to create an ObjectContext
3141 assert(missing_oid != hobject_t());
3142 obc = get_object_context(missing_oid, true);
3143 }
3144 if (promote_obc)
3145 *promote_obc = obc;
3146
3147 /*
3148 * If proxy reads are still in flight for this object when the promote
3149 * starts, don't use DONTNEED for the source read.
3150 */
3151 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3152 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3153 if (q == in_progress_proxy_ops.end()) {
3154 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3155 }
3156
3157 PromoteCallback *cb = new PromoteCallback(obc, this);
3158 object_locator_t my_oloc = oloc;
3159 my_oloc.pool = pool.info.tier_of;
3160
3161 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3162 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3163 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3164 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3165 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3166 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3167 src_fadvise_flags, 0);
3168
3169 assert(obc->is_blocked());
3170
3171 if (op)
3172 wait_for_blocked_object(obc->obs.oi.soid, op);
3173 info.stats.stats.sum.num_promote++;
3174 }
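// promote_object above chooses the fadvise flags for the copy-from source
// read: SEQUENTIAL always, plus DONTNEED only when no proxy reads are in
// flight for the object, so the base tier's cache stays warm for them. A
// sketch of that choice (flag values are illustrative):
namespace promote_fadvise_sketch {
  enum : unsigned {
    FADV_SEQUENTIAL = 1u << 0,
    FADV_DONTNEED   = 1u << 1,
  };

  unsigned src_fadvise(bool proxy_reads_in_flight) {
    unsigned f = FADV_SEQUENTIAL;
    if (!proxy_reads_in_flight)
      f |= FADV_DONTNEED;        // no other readers; let the base tier evict
    return f;
  }
}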
3175
3176 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3177 {
3178 FUNCTRACE();
3179 dout(10) << __func__ << " " << ctx << dendl;
3180 ctx->reset_obs(ctx->obc);
3181 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3182 OpRequestRef op = ctx->op;
3183 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3184 ObjectContextRef obc = ctx->obc;
3185 const hobject_t& soid = obc->obs.oi.soid;
3186
3187 // this method must be idempotent since we may call it several times
3188 // before we finally apply the resulting transaction.
3189 ctx->op_t.reset(new PGTransaction);
3190
3191 if (op->may_write() || op->may_cache()) {
3192 // snap
3193 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3194 pool.info.is_pool_snaps_mode()) {
3195 // use pool's snapc
3196 ctx->snapc = pool.snapc;
3197 } else {
3198 // client specified snapc
3199 ctx->snapc.seq = m->get_snap_seq();
3200 ctx->snapc.snaps = m->get_snaps();
3201 filter_snapc(ctx->snapc.snaps);
3202 }
3203 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3204 ctx->snapc.seq < obc->ssc->snapset.seq) {
3205 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3206 << " < snapset seq " << obc->ssc->snapset.seq
3207 << " on " << obc->obs.oi.soid << dendl;
3208 reply_ctx(ctx, -EOLDSNAPC);
3209 return;
3210 }
3211
3212 // version
3213 ctx->at_version = get_next_version();
3214 ctx->mtime = m->get_mtime();
3215
3216 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3217 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3218 << " snapc " << ctx->snapc
3219 << " snapset " << obc->ssc->snapset
3220 << dendl;
3221 } else {
3222 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3223 << " ov " << obc->obs.oi.version
3224 << dendl;
3225 }
3226
3227 if (!ctx->user_at_version)
3228 ctx->user_at_version = obc->obs.oi.user_version;
3229 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3230
3231 if (op->may_read()) {
3232 dout(10) << " taking ondisk_read_lock" << dendl;
3233 obc->ondisk_read_lock();
3234 }
3235
3236 {
3237 #ifdef WITH_LTTNG
3238 osd_reqid_t reqid = ctx->op->get_reqid();
3239 #endif
3240 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3241 reqid.name._num, reqid.tid, reqid.inc);
3242 }
3243
3244 int result = prepare_transaction(ctx);
3245
3246 {
3247 #ifdef WITH_LTTNG
3248 osd_reqid_t reqid = ctx->op->get_reqid();
3249 #endif
3250 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3251 reqid.name._num, reqid.tid, reqid.inc);
3252 }
3253
3254 if (op->may_read()) {
3255 dout(10) << " dropping ondisk_read_lock" << dendl;
3256 obc->ondisk_read_unlock();
3257 }
3258
3259 bool pending_async_reads = !ctx->pending_async_reads.empty();
3260 if (result == -EINPROGRESS || pending_async_reads) {
3261 // come back later.
3262 if (pending_async_reads) {
3263 in_progress_async_reads.push_back(make_pair(op, ctx));
3264 ctx->start_async_reads(this);
3265 }
3266 return;
3267 }
3268
3269 if (result == -EAGAIN) {
3270 // clean up after the ctx
3271 close_op_ctx(ctx);
3272 return;
3273 }
3274
3275 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3276 // prepare the reply
3277 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3278 successful_write);
3279
3280 // Write operations aren't allowed to return a data payload because
3281 // we can't do so reliably. If the client has to resend the request
3282 // and it has already been applied, we will return 0 with no
3283 // payload. Non-deterministic behavior is no good. However, it is
3284 // possible to construct an operation that does a read, does a guard
3285 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3286 // with the write, or return a CMPXATTR and the read value.
3287 if (successful_write) {
3288 // write. normalize the result code.
3289 dout(20) << " zeroing write result code " << result << dendl;
3290 result = 0;
3291 }
3292 ctx->reply->set_result(result);
3293
3294 // read or error?
3295 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3296 // finish side-effects
3297 if (result >= 0)
3298 do_osd_op_effects(ctx, m->get_connection());
3299
3300 complete_read_ctx(result, ctx);
3301 return;
3302 }
3303
3304 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3305
3306 assert(op->may_write() || op->may_cache());
3307
3308 // trim log?
3309 calc_trim_to();
3310
3311 // verify that we are doing this in order?
3312 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3313 !pool.info.is_tier() && !pool.info.has_tiers()) {
3314 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3315 ceph_tid_t t = m->get_tid();
3316 client_t n = m->get_source().num();
3317 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3318 if (p == cm.end()) {
3319 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3320 cm[n] = t;
3321 } else {
3322 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3323 if (p->second > t) {
3324 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3325 assert(0 == "out of order op");
3326 }
3327 p->second = t;
3328 }
3329 }
3330
3331 if (ctx->update_log_only) {
3332 if (result >= 0)
3333 do_osd_op_effects(ctx, m->get_connection());
3334
3335 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3336 // save just what we need from ctx
3337 MOSDOpReply *reply = ctx->reply;
3338 ctx->reply = nullptr;
3339 reply->claim_op_out_data(*ctx->ops);
3340 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3341 close_op_ctx(ctx);
3342
3343 if (result == -ENOENT) {
3344 reply->set_enoent_reply_versions(info.last_update,
3345 info.last_user_version);
3346 }
3347 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3348 // append to pg log for dup detection - don't save buffers for now
3349 record_write_error(op, soid, reply, result);
3350 return;
3351 }
3352
3353 // No need to capture a PG ref; repop cancel will handle that.
3354 // We can capture ctx by pointer; it's owned by the repop.
3355 ctx->register_on_commit(
3356 [m, ctx, this](){
3357 if (ctx->op)
3358 log_op_stats(
3359 ctx);
3360
3361 if (m && !ctx->sent_reply) {
3362 MOSDOpReply *reply = ctx->reply;
3363 if (reply)
3364 ctx->reply = nullptr;
3365 else {
3366 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3367 reply->set_reply_versions(ctx->at_version,
3368 ctx->user_at_version);
3369 }
3370 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3371 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3372 osd->send_message_osd_client(reply, m->get_connection());
3373 ctx->sent_reply = true;
3374 ctx->op->mark_commit_sent();
3375 }
3376 });
3377 ctx->register_on_success(
3378 [ctx, this]() {
3379 do_osd_op_effects(
3380 ctx,
3381 ctx->op ? ctx->op->get_req()->get_connection() :
3382 ConnectionRef());
3383 });
3384 ctx->register_on_finish(
3385 [ctx, this]() {
3386 delete ctx;
3387 });
3388
3389 // issue replica writes
3390 ceph_tid_t rep_tid = osd->get_tid();
3391
3392 RepGather *repop = new_repop(ctx, obc, rep_tid);
3393
3394 issue_repop(repop, ctx);
3395 eval_repop(repop);
3396 repop->put();
3397 }
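// For writes, execute_ctx above picks the snap context: the pool's snapc in
// pool-snaps mode (unless the client set ENFORCE_SNAPC), otherwise the
// client-supplied seq/snaps filtered of deleted snapshots; ORDERSNAP then
// rejects a context older than the object's snapset with -EOLDSNAPC. A
// compact sketch of that selection (types and names are illustrative):
namespace snapc_sketch {
  struct SnapContext {
    uint64_t seq = 0;
    std::vector<uint64_t> snaps;
  };

  // Returns false to signal the -EOLDSNAPC case under ORDERSNAP semantics.
  bool choose_snapc(bool pool_snaps_mode, bool enforce_client_snapc,
                    bool ordersnap, uint64_t snapset_seq,
                    const SnapContext& pool_sc, const SnapContext& client_sc,
                    SnapContext* out) {
    *out = (pool_snaps_mode && !enforce_client_snapc) ? pool_sc : client_sc;
    if (ordersnap && out->seq < snapset_seq)
      return false;              // stale client snapc; make the client resend
    return true;
  }
}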
3398
3399 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3400 release_object_locks(ctx->lock_manager);
3401
3402 ctx->op_t.reset();
3403
3404 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3405 ctx->on_finish.erase(p++)) {
3406 (*p)();
3407 }
3408 delete ctx;
3409 }
3410
3411 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3412 {
3413 if (ctx->op)
3414 osd->reply_op_error(ctx->op, r);
3415 close_op_ctx(ctx);
3416 }
3417
3418 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3419 {
3420 if (ctx->op)
3421 osd->reply_op_error(ctx->op, r, v, uv);
3422 close_op_ctx(ctx);
3423 }
3424
3425 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3426 {
3427 OpRequestRef op = ctx->op;
3428 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3429
3430 utime_t now = ceph_clock_now();
3431 utime_t latency = now;
3432 latency -= ctx->op->get_req()->get_recv_stamp();
3433 utime_t process_latency = now;
3434 process_latency -= ctx->op->get_dequeued_time();
3435
3436 uint64_t inb = ctx->bytes_written;
3437 uint64_t outb = ctx->bytes_read;
3438
3439 osd->logger->inc(l_osd_op);
3440
3441 osd->logger->inc(l_osd_op_outb, outb);
3442 osd->logger->inc(l_osd_op_inb, inb);
3443 osd->logger->tinc(l_osd_op_lat, latency);
3444 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3445
3446 if (op->may_read() && op->may_write()) {
3447 osd->logger->inc(l_osd_op_rw);
3448 osd->logger->inc(l_osd_op_rw_inb, inb);
3449 osd->logger->inc(l_osd_op_rw_outb, outb);
3450 osd->logger->tinc(l_osd_op_rw_lat, latency);
3451 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3452 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3453 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3454 } else if (op->may_read()) {
3455 osd->logger->inc(l_osd_op_r);
3456 osd->logger->inc(l_osd_op_r_outb, outb);
3457 osd->logger->tinc(l_osd_op_r_lat, latency);
3458 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3459 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3460 } else if (op->may_write() || op->may_cache()) {
3461 osd->logger->inc(l_osd_op_w);
3462 osd->logger->inc(l_osd_op_w_inb, inb);
3463 osd->logger->tinc(l_osd_op_w_lat, latency);
3464 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3465 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3466 } else
3467 ceph_abort();
3468
3469 dout(15) << "log_op_stats " << *m
3470 << " inb " << inb
3471 << " outb " << outb
3472 << " lat " << latency << dendl;
3473 }
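#include <chrono>  // sketch-only dependency

// log_op_stats above tracks two latencies per op: end-to-end (now minus the
// message receive stamp) and processing (now minus the dequeue time). A
// sketch of the same split with std::chrono (OpStamps is illustrative):
namespace op_latency_sketch {
  using Clock = std::chrono::steady_clock;

  struct OpStamps {
    Clock::time_point recv;      // when the message arrived
    Clock::time_point dequeued;  // when a worker picked it up
  };

  std::pair<Clock::duration, Clock::duration>
  latencies(const OpStamps& s, Clock::time_point now = Clock::now()) {
    return { now - s.recv,       // queue wait + execution
             now - s.dequeued }; // execution only
  }
}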
3474
3475 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3476 {
3477 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3478 assert(have_same_or_newer_map(m->map_epoch));
3479 assert(m->get_type() == MSG_OSD_SUBOP);
3480 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3481
3482 if (!is_peered()) {
3483 waiting_for_peered.push_back(op);
3484 op->mark_delayed("waiting for active");
3485 return;
3486 }
3487
3488 const OSDOp *first = NULL;
3489 if (m->ops.size() >= 1) {
3490 first = &m->ops[0];
3491 }
3492
3493 if (first) {
3494 switch (first->op.op) {
3495 case CEPH_OSD_OP_DELETE:
3496 sub_op_remove(op);
3497 return;
3498 case CEPH_OSD_OP_SCRUB_RESERVE:
3499 handle_scrub_reserve_request(op);
3500 return;
3501 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3502 handle_scrub_reserve_release(op);
3503 return;
3504 case CEPH_OSD_OP_SCRUB_MAP:
3505 sub_op_scrub_map(op);
3506 return;
3507 }
3508 }
3509 }
3510
3511 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3512 {
3513 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3514 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3515 if (r->ops.size() >= 1) {
3516 const OSDOp& first = r->ops[0];
3517 switch (first.op.op) {
3518 case CEPH_OSD_OP_SCRUB_RESERVE:
3519 {
3520 pg_shard_t from = r->from;
3521 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3522 bool reserved;
3523 ::decode(reserved, p);
3524 if (reserved) {
3525 handle_scrub_reserve_grant(op, from);
3526 } else {
3527 handle_scrub_reserve_reject(op, from);
3528 }
3529 }
3530 return;
3531 }
3532 }
3533 }
3534
3535 void PrimaryLogPG::do_scan(
3536 OpRequestRef op,
3537 ThreadPool::TPHandle &handle)
3538 {
3539 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3540 assert(m->get_type() == MSG_OSD_PG_SCAN);
3541 dout(10) << "do_scan " << *m << dendl;
3542
3543 op->mark_started();
3544
3545 switch (m->op) {
3546 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3547 {
3548 ostringstream ss;
3549 if (osd->check_backfill_full(ss)) {
3550 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3551 queue_peering_event(
3552 CephPeeringEvtRef(
3553 std::make_shared<CephPeeringEvt>(
3554 get_osdmap()->get_epoch(),
3555 get_osdmap()->get_epoch(),
3556 BackfillTooFull())));
3557 return;
3558 }
3559
3560 BackfillInterval bi;
3561 bi.begin = m->begin;
3562 // No need to flush; there won't be any in-progress writes occurring
3563 // past m->begin
3564 scan_range(
3565 cct->_conf->osd_backfill_scan_min,
3566 cct->_conf->osd_backfill_scan_max,
3567 &bi,
3568 handle);
3569 MOSDPGScan *reply = new MOSDPGScan(
3570 MOSDPGScan::OP_SCAN_DIGEST,
3571 pg_whoami,
3572 get_osdmap()->get_epoch(), m->query_epoch,
3573 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3574 ::encode(bi.objects, reply->get_data());
3575 osd->send_message_osd_cluster(reply, m->get_connection());
3576 }
3577 break;
3578
3579 case MOSDPGScan::OP_SCAN_DIGEST:
3580 {
3581 pg_shard_t from = m->from;
3582
3583 // Check that from is in backfill_targets vector
3584 assert(is_backfill_targets(from));
3585
3586 BackfillInterval& bi = peer_backfill_info[from];
3587 bi.begin = m->begin;
3588 bi.end = m->end;
3589 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3590
3591 // take care to preserve ordering!
3592 bi.clear_objects();
3593 ::decode_noclear(bi.objects, p);
3594
3595 if (waiting_on_backfill.erase(from)) {
3596 if (waiting_on_backfill.empty()) {
3597 assert(peer_backfill_info.size() == backfill_targets.size());
3598 finish_recovery_op(hobject_t::get_max());
3599 }
3600 } else {
3601 // we canceled backfill for a while due to a too-full condition, and
3602 // this is an extra response from a non-too-full peer
3603 }
3604 }
3605 break;
3606 }
3607 }
3608
3609 void PrimaryLogPG::do_backfill(OpRequestRef op)
3610 {
3611 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3612 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3613 dout(10) << "do_backfill " << *m << dendl;
3614
3615 op->mark_started();
3616
3617 switch (m->op) {
3618 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3619 {
3620 assert(cct->_conf->osd_kill_backfill_at != 1);
3621
3622 MOSDPGBackfill *reply = new MOSDPGBackfill(
3623 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3624 get_osdmap()->get_epoch(),
3625 m->query_epoch,
3626 spg_t(info.pgid.pgid, get_primary().shard));
3627 reply->set_priority(get_recovery_op_priority());
3628 osd->send_message_osd_cluster(reply, m->get_connection());
3629 queue_peering_event(
3630 CephPeeringEvtRef(
3631 std::make_shared<CephPeeringEvt>(
3632 get_osdmap()->get_epoch(),
3633 get_osdmap()->get_epoch(),
3634 RecoveryDone())));
3635 }
3636 // fall-thru
3637
3638 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3639 {
3640 assert(cct->_conf->osd_kill_backfill_at != 2);
3641
3642 info.set_last_backfill(m->last_backfill);
3643 info.stats = m->stats;
3644
3645 ObjectStore::Transaction t;
3646 dirty_info = true;
3647 write_if_dirty(t);
3648 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3649 assert(tr == 0);
3650 }
3651 break;
3652
3653 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3654 {
3655 assert(is_primary());
3656 assert(cct->_conf->osd_kill_backfill_at != 3);
3657 finish_recovery_op(hobject_t::get_max());
3658 }
3659 break;
3660 }
3661 }
3662
3663 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3664 {
3665 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3666 op->get_req());
3667 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3668 dout(7) << __func__ << " " << m->ls << dendl;
3669
3670 op->mark_started();
3671
3672 ObjectStore::Transaction t;
3673 for (auto& p : m->ls) {
3674 remove_snap_mapped_object(t, p.first);
3675 }
3676 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3677 assert(r == 0);
3678 }
3679
3680 int PrimaryLogPG::trim_object(
3681 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3682 {
3683 *ctxp = NULL;
3684 // load clone info
3685 bufferlist bl;
3686 ObjectContextRef obc = get_object_context(coid, false, NULL);
3687 if (!obc || !obc->ssc || !obc->ssc->exists) {
3688 osd->clog->error() << __func__ << ": Can not trim " << coid
3689 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3690 return -ENOENT;
3691 }
3692
3693 hobject_t snapoid(
3694 coid.oid, coid.get_key(),
3695 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3696 info.pgid.pool(), coid.get_namespace());
3697 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3698 if (!snapset_obc) {
3699 osd->clog->error() << __func__ << ": Can not trim " << coid
3700 << " repair needed, no snapset obc for " << snapoid;
3701 return -ENOENT;
3702 }
3703
3704 SnapSet& snapset = obc->ssc->snapset;
3705
3706 bool legacy = snapset.is_legacy() ||
3707 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3708
3709 object_info_t &coi = obc->obs.oi;
3710 set<snapid_t> old_snaps;
3711 if (legacy) {
3712 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3713 } else {
3714 auto p = snapset.clone_snaps.find(coid.snap);
3715 if (p == snapset.clone_snaps.end()) {
3716 osd->clog->error() << "No clone_snaps in snapset " << snapset
3717 << " for object " << coid << "\n";
3718 return -ENOENT;
3719 }
3720 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3721 snapset.clone_snaps[coid.snap].end());
3722 }
3723 if (old_snaps.empty()) {
3724 osd->clog->error() << "No object info snaps for object " << coid;
3725 return -ENOENT;
3726 }
3727
3728 dout(10) << coid << " old_snaps " << old_snaps
3729 << " old snapset " << snapset << dendl;
3730 if (snapset.seq == 0) {
3731 osd->clog->error() << "No snapset.seq for object " << coid;
3732 return -ENOENT;
3733 }
3734
3735 set<snapid_t> new_snaps;
3736 for (set<snapid_t>::iterator i = old_snaps.begin();
3737 i != old_snaps.end();
3738 ++i) {
3739 if (!pool.info.is_removed_snap(*i))
3740 new_snaps.insert(*i);
3741 }
3742
3743 vector<snapid_t>::iterator p = snapset.clones.end();
3744
3745 if (new_snaps.empty()) {
3746 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3747 if (p == snapset.clones.end()) {
3748 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3749 return -ENOENT;
3750 }
3751 }
3752
3753 OpContextUPtr ctx = simple_opc_create(obc);
3754 ctx->snapset_obc = snapset_obc;
3755
3756 if (!ctx->lock_manager.get_snaptrimmer_write(
3757 coid,
3758 obc,
3759 first)) {
3760 close_op_ctx(ctx.release());
3761 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3762 return -ENOLCK;
3763 }
3764
3765 if (!ctx->lock_manager.get_snaptrimmer_write(
3766 snapoid,
3767 snapset_obc,
3768 first)) {
3769 close_op_ctx(ctx.release());
3770 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3771 return -ENOLCK;
3772 }
3773
3774 ctx->at_version = get_next_version();
3775
3776 PGTransaction *t = ctx->op_t.get();
3777
3778 if (new_snaps.empty()) {
3779 // remove clone
3780 dout(10) << coid << " snaps " << old_snaps << " -> "
3781 << new_snaps << " ... deleting" << dendl;
3782
3783 // ...from snapset
3784 assert(p != snapset.clones.end());
3785
3786 snapid_t last = coid.snap;
3787 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3788
3789 if (p != snapset.clones.begin()) {
3790 // not the oldest... merge overlap into next older clone
3791 vector<snapid_t>::iterator n = p - 1;
3792 hobject_t prev_coid = coid;
3793 prev_coid.snap = *n;
3794 bool adjust_prev_bytes = is_present_clone(prev_coid);
3795
3796 if (adjust_prev_bytes)
3797 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3798
3799 snapset.clone_overlap[*n].intersection_of(
3800 snapset.clone_overlap[*p]);
3801
3802 if (adjust_prev_bytes)
3803 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3804 }
3805 ctx->delta_stats.num_objects--;
3806 if (coi.is_dirty())
3807 ctx->delta_stats.num_objects_dirty--;
3808 if (coi.is_omap())
3809 ctx->delta_stats.num_objects_omap--;
3810 if (coi.is_whiteout()) {
3811 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3812 ctx->delta_stats.num_whiteouts--;
3813 }
3814 ctx->delta_stats.num_object_clones--;
3815 if (coi.is_cache_pinned())
3816 ctx->delta_stats.num_objects_pinned--;
3817 obc->obs.exists = false;
3818
3819 snapset.clones.erase(p);
3820 snapset.clone_overlap.erase(last);
3821 snapset.clone_size.erase(last);
3822 snapset.clone_snaps.erase(last);
3823
3824 ctx->log.push_back(
3825 pg_log_entry_t(
3826 pg_log_entry_t::DELETE,
3827 coid,
3828 ctx->at_version,
3829 ctx->obs->oi.version,
3830 0,
3831 osd_reqid_t(),
3832 ctx->mtime,
3833 0)
3834 );
3835 t->remove(coid);
3836 t->update_snaps(
3837 coid,
3838 old_snaps,
3839 new_snaps);
3840
3841 coi = object_info_t(coid);
3842
3843 ctx->at_version.version++;
3844 } else {
3845 // save adjusted snaps for this object
3846 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3847 if (legacy) {
3848 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3849 } else {
3850 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3851 new_snaps.rend());
3852 // we still do a 'modify' event on this object just to trigger a
3853 // snapmapper.update ... :(
3854 }
3855
3856 coi.prior_version = coi.version;
3857 coi.version = ctx->at_version;
3858 bl.clear();
3859 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3860 t->setattr(coid, OI_ATTR, bl);
3861
3862 ctx->log.push_back(
3863 pg_log_entry_t(
3864 pg_log_entry_t::MODIFY,
3865 coid,
3866 coi.version,
3867 coi.prior_version,
3868 0,
3869 osd_reqid_t(),
3870 ctx->mtime,
3871 0)
3872 );
3873 ctx->at_version.version++;
3874
3875 t->update_snaps(
3876 coid,
3877 old_snaps,
3878 new_snaps);
3879 }
3880
3881 // save head snapset
3882 dout(10) << coid << " new snapset " << snapset << " on "
3883 << snapset_obc->obs.oi << dendl;
3884 if (snapset.clones.empty() &&
3885 (!snapset.head_exists ||
3886 (snapset_obc->obs.oi.is_whiteout() &&
3887 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3888 !snapset_obc->obs.oi.is_cache_pinned()))) {
3889 // NOTE: this arguably constitutes minor interference with the
3890 // tiering agent if this is a cache tier since a snap trim event
3891 // is effectively evicting a whiteout we might otherwise want to
3892 // keep around.
3893 dout(10) << coid << " removing " << snapoid << dendl;
3894 ctx->log.push_back(
3895 pg_log_entry_t(
3896 pg_log_entry_t::DELETE,
3897 snapoid,
3898 ctx->at_version,
3899 ctx->snapset_obc->obs.oi.version,
3900 0,
3901 osd_reqid_t(),
3902 ctx->mtime,
3903 0)
3904 );
3905 if (snapoid.is_head()) {
3906 derr << "removing snap head" << dendl;
3907 object_info_t& oi = ctx->snapset_obc->obs.oi;
3908 ctx->delta_stats.num_objects--;
3909 if (oi.is_dirty()) {
3910 ctx->delta_stats.num_objects_dirty--;
3911 }
3912 if (oi.is_omap())
3913 ctx->delta_stats.num_objects_omap--;
3914 if (oi.is_whiteout()) {
3915 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3916 ctx->delta_stats.num_whiteouts--;
3917 }
3918 if (oi.is_cache_pinned()) {
3919 ctx->delta_stats.num_objects_pinned--;
3920 }
3921 }
3922 ctx->snapset_obc->obs.exists = false;
3923 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3924 t->remove(snapoid);
3925 } else {
3926 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3927 snapset.filter(pool.info);
3928 dout(10) << coid << " writing updated snapset on " << snapoid
3929 << ", snapset is " << snapset << dendl;
3930 ctx->log.push_back(
3931 pg_log_entry_t(
3932 pg_log_entry_t::MODIFY,
3933 snapoid,
3934 ctx->at_version,
3935 ctx->snapset_obc->obs.oi.version,
3936 0,
3937 osd_reqid_t(),
3938 ctx->mtime,
3939 0)
3940 );
3941
3942 ctx->snapset_obc->obs.oi.prior_version =
3943 ctx->snapset_obc->obs.oi.version;
3944 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3945
3946 map <string, bufferlist> attrs;
3947 bl.clear();
3948 ::encode(snapset, bl);
3949 attrs[SS_ATTR].claim(bl);
3950
3951 bl.clear();
3952 ::encode(ctx->snapset_obc->obs.oi, bl,
3953 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3954 attrs[OI_ATTR].claim(bl);
3955 t->setattrs(snapoid, attrs);
3956 }
3957
3958 *ctxp = std::move(ctx);
3959 return 0;
3960 }
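
// Illustration only (not compiled): the clone_overlap merge performed in
// trim_object when a middle clone is removed, in isolation.  The next-older
// clone's recorded overlap can only shrink under intersection:
//
//   interval_set<uint64_t> prev;  // overlap recorded for the older clone
//   interval_set<uint64_t> gone;  // overlap recorded for the trimmed clone
//   prev.insert(0, 4096);
//   gone.insert(2048, 4096);
//   prev.intersection_of(gone);   // prev is now {2048~2048}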
3961
3962 void PrimaryLogPG::kick_snap_trim()
3963 {
3964 assert(is_active());
3965 assert(is_primary());
3966 if (is_clean() && !snap_trimq.empty()) {
3967 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3968 snap_trimmer_machine.process_event(KickTrim());
3969 }
3970 }
3971
3972 void PrimaryLogPG::snap_trimmer_scrub_complete()
3973 {
3974 if (is_primary() && is_active() && is_clean()) {
3975 assert(!snap_trimq.empty());
3976 snap_trimmer_machine.process_event(ScrubComplete());
3977 }
3978 }
3979
3980 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3981 {
3982 if (deleting || pg_has_reset_since(queued)) {
3983 return;
3984 }
3985
3986 assert(is_primary());
3987
3988 dout(10) << "snap_trimmer posting" << dendl;
3989 snap_trimmer_machine.process_event(DoSnapWork());
3990 dout(10) << "snap_trimmer complete" << dendl;
3991 return;
3992 }
3993
3994 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3995 {
3996 __u64 v2;
3997
3998 string v2s(xattr.c_str(), xattr.length());
3999 if (v2s.length())
4000 v2 = strtoull(v2s.c_str(), NULL, 10);
4001 else
4002 v2 = 0;
4003
4004 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4005
4006 switch (op) {
4007 case CEPH_OSD_CMPXATTR_OP_EQ:
4008 return (v1 == v2);
4009 case CEPH_OSD_CMPXATTR_OP_NE:
4010 return (v1 != v2);
4011 case CEPH_OSD_CMPXATTR_OP_GT:
4012 return (v1 > v2);
4013 case CEPH_OSD_CMPXATTR_OP_GTE:
4014 return (v1 >= v2);
4015 case CEPH_OSD_CMPXATTR_OP_LT:
4016 return (v1 < v2);
4017 case CEPH_OSD_CMPXATTR_OP_LTE:
4018 return (v1 <= v2);
4019 default:
4020 return -EINVAL;
4021 }
4022 }
4023
4024 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4025 {
4026 string v2s(xattr.c_str(), xattr.length());
4027
4028 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4029
4030 switch (op) {
4031 case CEPH_OSD_CMPXATTR_OP_EQ:
4032 return (v1s.compare(v2s) == 0);
4033 case CEPH_OSD_CMPXATTR_OP_NE:
4034 return (v1s.compare(v2s) != 0);
4035 case CEPH_OSD_CMPXATTR_OP_GT:
4036 return (v1s.compare(v2s) > 0);
4037 case CEPH_OSD_CMPXATTR_OP_GTE:
4038 return (v1s.compare(v2s) >= 0);
4039 case CEPH_OSD_CMPXATTR_OP_LT:
4040 return (v1s.compare(v2s) < 0);
4041 case CEPH_OSD_CMPXATTR_OP_LTE:
4042 return (v1s.compare(v2s) <= 0);
4043 default:
4044 return -EINVAL;
4045 }
4046 }
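
// Illustration only (not compiled): how a librados client typically drives
// these xattr comparisons.  The object and xattr names are made up; when the
// guard fails, the whole op returns -ECANCELED, matching the CMPXATTR
// handling in do_osd_ops below:
//
//   librados::ObjectWriteOperation wop;
//   wop.cmpxattr("version", CEPH_OSD_CMPXATTR_OP_GTE, (uint64_t)42);
//   wop.setxattr("state", state_bl);        // applied only if the guard holds
//   int r = ioctx.operate("myobject", &wop);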
4047
4048 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4049 {
4050 ceph_osd_op& op = osd_op.op;
4051 vector<OSDOp> write_ops(1);
4052 OSDOp& write_op = write_ops[0];
4053 uint64_t write_length = op.writesame.length;
4054 int result = 0;
4055
4056 if (!write_length)
4057 return 0;
4058
4059 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4060 return -EINVAL;
4061
4062 if (op.writesame.data_length != osd_op.indata.length()) {
4063 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4064 return -EINVAL;
4065 }
4066
4067 while (write_length) {
4068 write_op.indata.append(osd_op.indata);
4069 write_length -= op.writesame.data_length;
4070 }
4071
4072 write_op.op.op = CEPH_OSD_OP_WRITE;
4073 write_op.op.extent.offset = op.writesame.offset;
4074 write_op.op.extent.length = op.writesame.length;
4075 result = do_osd_ops(ctx, write_ops);
4076 if (result < 0)
4077 derr << "do_writesame do_osd_ops failed " << result << dendl;
4078
4079 return result;
4080 }
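
// Illustration only (not compiled): the expansion do_writesame performs.  A
// writesame of length 4096 with a 512-byte pattern becomes one plain
// CEPH_OSD_OP_WRITE whose payload repeats the pattern eight times:
//
//   bufferlist pattern;
//   pattern.append(string(512, 'x'));    // stand-in for osd_op.indata
//   bufferlist expanded;
//   for (uint64_t left = 4096; left; left -= 512)
//     expanded.append(pattern);
//   assert(expanded.length() == 4096);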
4081
4082 // ========================================================================
4083 // low level osd ops
4084
4085 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4086 {
4087 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4088 bufferlist header, vals;
4089 int r = _get_tmap(ctx, &header, &vals);
4090 if (r < 0) {
4091 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4092 r = 0;
4093 return r;
4094 }
4095
4096 vector<OSDOp> ops(3);
4097
4098 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4099 ops[0].op.extent.offset = 0;
4100 ops[0].op.extent.length = 0;
4101
4102 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4103 ops[1].indata.claim(header);
4104
4105 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4106 ops[2].indata.claim(vals);
4107
4108 return do_osd_ops(ctx, ops);
4109 }
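
// Illustration only (not compiled): the legacy tmap object data returned by
// _get_tmap is a header blob followed by a sorted key/value map (the same
// layout do_tmapup_slow decodes below), so the conversion above can hand the
// two pieces straight to OMAPSETHEADER / OMAPSETVALS:
//
//   bufferlist::iterator p = tmap_bl.begin();
//   bufferlist header;
//   map<string, bufferlist> kv;
//   ::decode(header, p);
//   ::decode(kv, p);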
4110
4111 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4112 bufferlist& bl)
4113 {
4114 // decode
4115 bufferlist header;
4116 map<string, bufferlist> m;
4117 if (bl.length()) {
4118 bufferlist::iterator p = bl.begin();
4119 ::decode(header, p);
4120 ::decode(m, p);
4121 assert(p.end());
4122 }
4123
4124 // do the update(s)
4125 while (!bp.end()) {
4126 __u8 op;
4127 string key;
4128 ::decode(op, bp);
4129
4130 switch (op) {
4131 case CEPH_OSD_TMAP_SET: // insert key
4132 {
4133 ::decode(key, bp);
4134 bufferlist data;
4135 ::decode(data, bp);
4136 m[key] = data;
4137 }
4138 break;
4139 case CEPH_OSD_TMAP_RM: // remove key
4140 ::decode(key, bp);
4141 if (!m.count(key)) {
4142 return -ENOENT;
4143 }
4144 m.erase(key);
4145 break;
4146 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4147 ::decode(key, bp);
4148 m.erase(key);
4149 break;
4150 case CEPH_OSD_TMAP_HDR: // update header
4151 {
4152 ::decode(header, bp);
4153 }
4154 break;
4155 default:
4156 return -EINVAL;
4157 }
4158 }
4159
4160 // reencode
4161 bufferlist obl;
4162 ::encode(header, obl);
4163 ::encode(m, obl);
4164
4165 // write it out
4166 vector<OSDOp> nops(1);
4167 OSDOp& newop = nops[0];
4168 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4169 newop.op.extent.offset = 0;
4170 newop.op.extent.length = obl.length();
4171 newop.indata = obl;
4172 do_osd_ops(ctx, nops);
4173 osd_op.outdata.claim(newop.outdata);
4174 return 0;
4175 }
4176
4177 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4178 {
4179 bufferlist::iterator orig_bp = bp;
4180 int result = 0;
4181 if (bp.end()) {
4182 dout(10) << "tmapup is a no-op" << dendl;
4183 } else {
4184 // read the whole object
4185 vector<OSDOp> nops(1);
4186 OSDOp& newop = nops[0];
4187 newop.op.op = CEPH_OSD_OP_READ;
4188 newop.op.extent.offset = 0;
4189 newop.op.extent.length = 0;
4190 result = do_osd_ops(ctx, nops);
4191
4192 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4193
4194 dout(30) << " starting is \n";
4195 newop.outdata.hexdump(*_dout);
4196 *_dout << dendl;
4197
4198 bufferlist::iterator ip = newop.outdata.begin();
4199 bufferlist obl;
4200
4201 dout(30) << "the update command is: \n";
4202 osd_op.indata.hexdump(*_dout);
4203 *_dout << dendl;
4204
4205 // header
4206 bufferlist header;
4207 __u32 nkeys = 0;
4208 if (newop.outdata.length()) {
4209 ::decode(header, ip);
4210 ::decode(nkeys, ip);
4211 }
4212 dout(10) << "tmapup header " << header.length() << dendl;
4213
4214 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4215 ++bp;
4216 ::decode(header, bp);
4217 dout(10) << "tmapup new header " << header.length() << dendl;
4218 }
4219
4220 ::encode(header, obl);
4221
4222 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4223
4224 // update keys
4225 bufferlist newkeydata;
4226 string nextkey, last_in_key;
4227 bufferlist nextval;
4228 bool have_next = false;
4229 if (!ip.end()) {
4230 have_next = true;
4231 ::decode(nextkey, ip);
4232 ::decode(nextval, ip);
4233 }
4234 while (!bp.end() && !result) {
4235 __u8 op;
4236 string key;
4237 try {
4238 ::decode(op, bp);
4239 ::decode(key, bp);
4240 }
4241 catch (buffer::error& e) {
4242 return -EINVAL;
4243 }
4244 if (key < last_in_key) {
4245 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4246 << "', falling back to an inefficient (unsorted) update" << dendl;
4247 bp = orig_bp;
4248 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4249 }
4250 last_in_key = key;
4251
4252 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4253
4254 // skip existing intervening keys
4255 bool key_exists = false;
4256 while (have_next && !key_exists) {
4257 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4258 if (nextkey > key)
4259 break;
4260 if (nextkey < key) {
4261 // copy untouched.
4262 ::encode(nextkey, newkeydata);
4263 ::encode(nextval, newkeydata);
4264 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4265 } else {
4266 // don't copy; discard old value. and stop.
4267 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4268 key_exists = true;
4269 nkeys--;
4270 }
4271 if (!ip.end()) {
4272 ::decode(nextkey, ip);
4273 ::decode(nextval, ip);
4274 } else {
4275 have_next = false;
4276 }
4277 }
4278
4279 if (op == CEPH_OSD_TMAP_SET) {
4280 bufferlist val;
4281 try {
4282 ::decode(val, bp);
4283 }
4284 catch (buffer::error& e) {
4285 return -EINVAL;
4286 }
4287 ::encode(key, newkeydata);
4288 ::encode(val, newkeydata);
4289 dout(20) << " set " << key << " " << val.length() << dendl;
4290 nkeys++;
4291 } else if (op == CEPH_OSD_TMAP_CREATE) {
4292 if (key_exists) {
4293 return -EEXIST;
4294 }
4295 bufferlist val;
4296 try {
4297 ::decode(val, bp);
4298 }
4299 catch (buffer::error& e) {
4300 return -EINVAL;
4301 }
4302 ::encode(key, newkeydata);
4303 ::encode(val, newkeydata);
4304 dout(20) << " create " << key << " " << val.length() << dendl;
4305 nkeys++;
4306 } else if (op == CEPH_OSD_TMAP_RM) {
4307 // do nothing.
4308 if (!key_exists) {
4309 return -ENOENT;
4310 }
4311 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4312 // do nothing
4313 } else {
4314 dout(10) << " invalid tmap op " << (int)op << dendl;
4315 return -EINVAL;
4316 }
4317 }
4318
4319 // copy remaining
4320 if (have_next) {
4321 ::encode(nextkey, newkeydata);
4322 ::encode(nextval, newkeydata);
4323 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4324 }
4325 if (!ip.end()) {
4326 bufferlist rest;
4327 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4328 dout(20) << " keep trailing " << rest.length()
4329 << " at " << newkeydata.length() << dendl;
4330 newkeydata.claim_append(rest);
4331 }
4332
4333 // encode final key count + key data
4334 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4335 ::encode(nkeys, obl);
4336 obl.claim_append(newkeydata);
4337
4338 if (0) {
4339 dout(30) << " final is \n";
4340 obl.hexdump(*_dout);
4341 *_dout << dendl;
4342
4343 // sanity check
4344 bufferlist::iterator tp = obl.begin();
4345 bufferlist h;
4346 ::decode(h, tp);
4347 map<string,bufferlist> d;
4348 ::decode(d, tp);
4349 assert(tp.end());
4350 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4351 }
4352
4353 // write it out
4354 if (!result) {
4355 dout(20) << "tmapput write " << obl.length() << dendl;
4356 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4357 newop.op.extent.offset = 0;
4358 newop.op.extent.length = obl.length();
4359 newop.indata = obl;
4360 do_osd_ops(ctx, nops);
4361 osd_op.outdata.claim(newop.outdata);
4362 }
4363 }
4364 return result;
4365 }
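
// Illustration only (not compiled): the TMAPUP update stream consumed above
// is a sequence of (op byte, key[, value]) records.  A client would build one
// roughly like this; keeping the keys sorted stays on the fast path, while
// unsorted input falls back to do_tmapup_slow:
//
//   bufferlist updates;
//   __u8 c = CEPH_OSD_TMAP_SET;
//   ::encode(c, updates);
//   ::encode(string("alpha"), updates);
//   bufferlist val;
//   val.append("value");
//   ::encode(val, updates);
//   c = CEPH_OSD_TMAP_RM;
//   ::encode(c, updates);
//   ::encode(string("beta"), updates);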
4366
4367 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4368 {
4369 if (offset >= max ||
4370 length > max ||
4371 offset + length > max)
4372 return -EFBIG;
4373
4374 return 0;
4375 }
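
// Illustration only (not compiled): a typical call from the write paths in
// do_osd_ops below, rejecting extents past the configured object size:
//
//   result = check_offset_and_length(
//     op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
//   if (result < 0)
//     break;  // -EFBIG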
4376
4377 struct FillInVerifyExtent : public Context {
4378 ceph_le64 *r;
4379 int32_t *rval;
4380 bufferlist *outdatap;
4381 boost::optional<uint32_t> maybe_crc;
4382 uint64_t size;
4383 OSDService *osd;
4384 hobject_t soid;
4385 __le32 flags;
4386 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4387 boost::optional<uint32_t> mc, uint64_t size,
4388 OSDService *osd, hobject_t soid, __le32 flags) :
4389 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4390 size(size), osd(osd), soid(soid), flags(flags) {}
4391 void finish(int len) override {
4392 *r = len;
4393 if (len < 0) {
4394 *rval = len;
4395 return;
4396 }
4397 *rval = 0;
4398
4399 // whole object? can we verify the checksum?
4400 if (maybe_crc && *r == size) {
4401 uint32_t crc = outdatap->crc32c(-1);
4402 if (maybe_crc != crc) {
4403 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4404 << " != expected 0x" << *maybe_crc
4405 << std::dec << " on " << soid;
4406 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4407 *rval = -EIO;
4408 *r = 0;
4409 }
4410 }
4411 }
4412 }
4413 };
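
// Illustration only (not compiled): the digest convention FillInVerifyExtent
// relies on.  oi.data_digest is a crc32c over the full object contents with
// seed -1, so it can only be verified when the read covered the whole object:
//
//   uint32_t crc = outdatap->crc32c(-1);
//   bool ok = (maybe_crc && *maybe_crc == crc);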
4414
4415 struct ToSparseReadResult : public Context {
4416 int* result;
4417 bufferlist* data_bl;
4418 uint64_t data_offset;
4419 ceph_le64* len;
4420 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4421 ceph_le64* len)
4422 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4423 void finish(int r) override {
4424 if (r < 0) {
4425 *result = r;
4426 return;
4427 }
4428 *result = 0;
4429 *len = r;
4430 bufferlist outdata;
4431 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4432 ::encode(extents, outdata);
4433 ::encode_destructively(*data_bl, outdata);
4434 data_bl->swap(outdata);
4435 }
4436 };
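
// Illustration only (not compiled): ToSparseReadResult leaves the output in
// the standard sparse-read wire format, an extent map followed by the packed
// data, which a reader unpacks like this:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   map<uint64_t, uint64_t> extents;  // offset -> length
//   bufferlist data;
//   ::decode(extents, p);
//   ::decode(data, p);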
4437
4438 template<typename V>
4439 static string list_keys(const map<string, V>& m) {
4440 string s;
4441 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4442 if (!s.empty()) {
4443 s.push_back(',');
4444 }
4445 s.append(itr->first);
4446 }
4447 return s;
4448 }
4449
4450 template<typename T>
4451 static string list_entries(const T& m) {
4452 string s;
4453 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4454 if (!s.empty()) {
4455 s.push_back(',');
4456 }
4457 s.append(*itr);
4458 }
4459 return s;
4460 }
4461
4462 void PrimaryLogPG::maybe_create_new_object(
4463 OpContext *ctx,
4464 bool ignore_transaction)
4465 {
4466 ObjectState& obs = ctx->new_obs;
4467 if (!obs.exists) {
4468 ctx->delta_stats.num_objects++;
4469 obs.exists = true;
4470 assert(!obs.oi.is_whiteout());
4471 obs.oi.new_object();
4472 if (!ignore_transaction)
4473 ctx->op_t->create(obs.oi.soid);
4474 } else if (obs.oi.is_whiteout()) {
4475 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4476 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4477 --ctx->delta_stats.num_whiteouts;
4478 }
4479 }
4480
4481 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4482 OSDOp& osd_op;
4483
4484 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4485 }
4486
4487 int execute() override {
4488 return osd_op.rval;
4489 }
4490 };
4491
4492 struct C_ChecksumRead : public Context {
4493 PrimaryLogPG *primary_log_pg;
4494 OSDOp &osd_op;
4495 Checksummer::CSumType csum_type;
4496 bufferlist init_value_bl;
4497 ceph_le64 read_length;
4498 bufferlist read_bl;
4499 Context *fill_extent_ctx;
4500
4501 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4502 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4503 boost::optional<uint32_t> maybe_crc, uint64_t size,
4504 OSDService *osd, hobject_t soid, __le32 flags)
4505 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4506 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4507 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4508 &read_bl, maybe_crc, size,
4509 osd, soid, flags)) {
4510 }
4511 ~C_ChecksumRead() override {
4512 delete fill_extent_ctx;
4513 }
4514
4515 void finish(int r) override {
4516 fill_extent_ctx->complete(r);
4517 fill_extent_ctx = nullptr;
4518
4519 if (osd_op.rval >= 0) {
4520 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4521 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4522 &init_value_bl_it, read_bl);
4523 }
4524 }
4525 };
4526
4527 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4528 bufferlist::iterator *bl_it)
4529 {
4530 dout(20) << __func__ << dendl;
4531
4532 auto& op = osd_op.op;
4533 if (op.checksum.chunk_size > 0) {
4534 if (op.checksum.length == 0) {
4535 dout(10) << __func__ << ": length required when chunk size provided"
4536 << dendl;
4537 return -EINVAL;
4538 }
4539 if (op.checksum.length % op.checksum.chunk_size != 0) {
4540 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4541 return -EINVAL;
4542 }
4543 }
4544
4545 auto& oi = ctx->new_obs.oi;
4546 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4547 // zeroed offset+length implies checksumming the whole object
4548 op.checksum.length = oi.size;
4549 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4550 return -EOVERFLOW;
4551 }
4552
4553 Checksummer::CSumType csum_type;
4554 switch (op.checksum.type) {
4555 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4556 csum_type = Checksummer::CSUM_XXHASH32;
4557 break;
4558 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4559 csum_type = Checksummer::CSUM_XXHASH64;
4560 break;
4561 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4562 csum_type = Checksummer::CSUM_CRC32C;
4563 break;
4564 default:
4565 dout(10) << __func__ << ": unknown crc type ("
4566 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4567 return -EINVAL;
4568 }
4569
4570 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4571 if (bl_it->get_remaining() < csum_init_value_size) {
4572 dout(10) << __func__ << ": init value not provided" << dendl;
4573 return -EINVAL;
4574 }
4575
4576 bufferlist init_value_bl;
4577 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4578 csum_init_value_size);
4579 bl_it->advance(csum_init_value_size);
4580
4581 if (pool.info.require_rollback() && op.checksum.length > 0) {
4582 // If there is a data digest and it is possible we are reading the
4583 // entire object, pass the digest.
4584 boost::optional<uint32_t> maybe_crc;
4585 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4586 op.checksum.length >= oi.size) {
4587 maybe_crc = oi.data_digest;
4588 }
4589
4590 // async read
4591 auto& soid = oi.soid;
4592 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4593 std::move(init_value_bl), maybe_crc,
4594 oi.size, osd, soid, op.flags);
4595
4596 ctx->pending_async_reads.push_back({
4597 {op.checksum.offset, op.checksum.length, op.flags},
4598 {&checksum_ctx->read_bl, checksum_ctx}});
4599
4600 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4601 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4602 new ReadFinisher(osd_op));
4603 return -EINPROGRESS;
4604 }
4605
4606 // sync read
4607 std::vector<OSDOp> read_ops(1);
4608 auto& read_op = read_ops[0];
4609 if (op.checksum.length > 0) {
4610 read_op.op.op = CEPH_OSD_OP_READ;
4611 read_op.op.flags = op.flags;
4612 read_op.op.extent.offset = op.checksum.offset;
4613 read_op.op.extent.length = op.checksum.length;
4614 read_op.op.extent.truncate_size = 0;
4615 read_op.op.extent.truncate_seq = 0;
4616
4617 int r = do_osd_ops(ctx, read_ops);
4618 if (r < 0) {
4619 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4620 return r;
4621 }
4622 }
4623
4624 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4625 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4626 read_op.outdata);
4627 }
4628
4629 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4630 Checksummer::CSumType csum_type,
4631 bufferlist::iterator *init_value_bl_it,
4632 const bufferlist &read_bl) {
4633 dout(20) << __func__ << dendl;
4634
4635 auto& op = osd_op.op;
4636
4637 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4638 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4639 << op.checksum.length << dendl;
4640 return -EINVAL;
4641 }
4642
4643 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4644 op.checksum.chunk_size : read_bl.length());
4645 uint32_t csum_count = (csum_chunk_size > 0 ?
4646 read_bl.length() / csum_chunk_size : 0);
4647
4648 bufferlist csum;
4649 bufferptr csum_data;
4650 if (csum_count > 0) {
4651 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4652 csum_data = buffer::create(csum_value_size * csum_count);
4653 csum_data.zero();
4654 csum.append(csum_data);
4655
4656 switch (csum_type) {
4657 case Checksummer::CSUM_XXHASH32:
4658 {
4659 Checksummer::xxhash32::init_value_t init_value;
4660 ::decode(init_value, *init_value_bl_it);
4661 Checksummer::calculate<Checksummer::xxhash32>(
4662 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4663 &csum_data);
4664 }
4665 break;
4666 case Checksummer::CSUM_XXHASH64:
4667 {
4668 Checksummer::xxhash64::init_value_t init_value;
4669 ::decode(init_value, *init_value_bl_it);
4670 Checksummer::calculate<Checksummer::xxhash64>(
4671 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4672 &csum_data);
4673 }
4674 break;
4675 case Checksummer::CSUM_CRC32C:
4676 {
4677 Checksummer::crc32c::init_value_t init_value;
4678 ::decode(init_value, *init_value_bl_it);
4679 Checksummer::calculate<Checksummer::crc32c>(
4680 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4681 &csum_data);
4682 }
4683 break;
4684 default:
4685 break;
4686 }
4687 }
4688
4689 ::encode(csum_count, osd_op.outdata);
4690 osd_op.outdata.claim_append(csum);
4691 return 0;
4692 }
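
// Illustration only (not compiled): the CHECKSUM reply assembled above is a
// count followed by the packed checksum values.  For CSUM_CRC32C a client can
// decode it as below; each value is a raw little-endian integer, not an
// individually length-prefixed blob:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   uint32_t count;
//   ::decode(count, p);
//   vector<uint32_t> values(count);
//   for (auto& v : values)
//     ::decode(v, p);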
4693
4694 struct C_ExtentCmpRead : public Context {
4695 PrimaryLogPG *primary_log_pg;
4696 OSDOp &osd_op;
4697 ceph_le64 read_length;
4698 bufferlist read_bl;
4699 Context *fill_extent_ctx;
4700
4701 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4702 boost::optional<uint32_t> maybe_crc, uint64_t size,
4703 OSDService *osd, hobject_t soid, __le32 flags)
4704 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4705 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4706 &read_bl, maybe_crc, size,
4707 osd, soid, flags)) {
4708 }
4709 ~C_ExtentCmpRead() override {
4710 delete fill_extent_ctx;
4711 }
4712
4713 void finish(int r) override {
4714 if (r == -ENOENT) {
4715 osd_op.rval = 0;
4716 read_bl.clear();
4717 delete fill_extent_ctx;
4718 } else {
4719 fill_extent_ctx->complete(r);
4720 }
4721 fill_extent_ctx = nullptr;
4722
4723 if (osd_op.rval >= 0) {
4724 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4725 }
4726 }
4727 };
4728
4729 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4730 {
4731 dout(20) << __func__ << dendl;
4732 ceph_osd_op& op = osd_op.op;
4733
4734 auto& oi = ctx->new_obs.oi;
4735 uint64_t size = oi.size;
4736 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4737 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4738 size = op.extent.truncate_size;
4739 }
4740
4741 if (op.extent.offset >= size) {
4742 op.extent.length = 0;
4743 } else if (op.extent.offset + op.extent.length > size) {
4744 op.extent.length = size - op.extent.offset;
4745 }
4746
4747 if (op.extent.length == 0) {
4748 dout(20) << __func__ << " zero length extent" << dendl;
4749 return finish_extent_cmp(osd_op, bufferlist{});
4750 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4751 dout(20) << __func__ << " object DNE" << dendl;
4752 return finish_extent_cmp(osd_op, {});
4753 } else if (pool.info.require_rollback()) {
4754 // If there is a data digest and it is possible we are reading the
4755 // entire object, pass the digest.
4756 boost::optional<uint32_t> maybe_crc;
4757 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4758 op.checksum.length >= oi.size) {
4759 maybe_crc = oi.data_digest;
4760 }
4761
4762 // async read
4763 auto& soid = oi.soid;
4764 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4765 osd, soid, op.flags);
4766 ctx->pending_async_reads.push_back({
4767 {op.extent.offset, op.extent.length, op.flags},
4768 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4769
4770 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4771
4772 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4773 new ReadFinisher(osd_op));
4774 return -EINPROGRESS;
4775 }
4776
4777 // sync read
4778 vector<OSDOp> read_ops(1);
4779 OSDOp& read_op = read_ops[0];
4780
4781 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4782 read_op.op.extent.offset = op.extent.offset;
4783 read_op.op.extent.length = op.extent.length;
4784 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4785 read_op.op.extent.truncate_size = op.extent.truncate_size;
4786
4787 int result = do_osd_ops(ctx, read_ops);
4788 if (result < 0) {
4789 derr << __func__ << " failed " << result << dendl;
4790 return result;
4791 }
4792 return finish_extent_cmp(osd_op, read_op.outdata);
4793 }
4794
4795 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4796 {
4797 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4798 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4799 if (osd_op.indata[idx] != read_byte) {
4800 return (-MAX_ERRNO - idx);
4801 }
4802 }
4803
4804 return 0;
4805 }
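
// Illustration only (not compiled): a CMPEXT mismatch is reported by folding
// the offset of the first differing byte into the return value; a caller
// recovers it like this:
//
//   int rval = finish_extent_cmp(osd_op, read_bl);
//   if (rval <= -MAX_ERRNO) {
//     uint64_t first_mismatch = (uint64_t)(-rval) - MAX_ERRNO;
//   }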
4806
4807 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4808 dout(20) << __func__ << dendl;
4809 auto& op = osd_op.op;
4810 auto& oi = ctx->new_obs.oi;
4811 auto& soid = oi.soid;
4812 __u32 seq = oi.truncate_seq;
4813 uint64_t size = oi.size;
4814 bool trimmed_read = false;
4815
4816 // are we beyond truncate_size?
4817 if ( (seq < op.extent.truncate_seq) &&
4818 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4819 size = op.extent.truncate_size;
4820
4821 if (op.extent.length == 0) // a length of zero means read the whole object
4822 op.extent.length = size;
4823
4824 if (op.extent.offset >= size) {
4825 op.extent.length = 0;
4826 trimmed_read = true;
4827 } else if (op.extent.offset + op.extent.length > size) {
4828 op.extent.length = size - op.extent.offset;
4829 trimmed_read = true;
4830 }
4831
4832 // read into a buffer
4833 int result = 0;
4834 if (trimmed_read && op.extent.length == 0) {
4835 // the read was trimmed to zero and we are expected to do nothing;
4836 // a read of 0 bytes does *not* do nothing (it reads the whole
4837 // object), which is why the trimmed_read flag is needed
4838 } else if (pool.info.require_rollback()) {
4839 boost::optional<uint32_t> maybe_crc;
4840 // If there is a data digest and it is possible we are reading the
4841 // entire object, pass the digest. FillInVerifyExtent will check
4842 // oi.size again.
4843 if (oi.is_data_digest() && op.extent.offset == 0 &&
4844 op.extent.length >= oi.size)
4845 maybe_crc = oi.data_digest;
4846 ctx->pending_async_reads.push_back(
4847 make_pair(
4848 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4849 make_pair(&osd_op.outdata,
4850 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4851 &osd_op.outdata, maybe_crc, oi.size,
4852 osd, soid, op.flags))));
4853 dout(10) << " async_read noted for " << soid << dendl;
4854
4855 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4856 new ReadFinisher(osd_op));
4857 } else {
4858 int r = pgbackend->objects_read_sync(
4859 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4860 if (r == -EIO) {
4861 r = rep_repair_primary_object(soid, ctx->op);
4862 }
4863 if (r >= 0)
4864 op.extent.length = r;
4865 else {
4866 result = r;
4867 op.extent.length = 0;
4868 }
4869 dout(10) << " read got " << r << " / " << op.extent.length
4870 << " bytes from obj " << soid << dendl;
4871
4872 // whole object? can we verify the checksum?
4873 if (op.extent.length == oi.size && oi.is_data_digest()) {
4874 uint32_t crc = osd_op.outdata.crc32c(-1);
4875 if (oi.data_digest != crc) {
4876 osd->clog->error() << info.pgid << std::hex
4877 << " full-object read crc 0x" << crc
4878 << " != expected 0x" << oi.data_digest
4879 << std::dec << " on " << soid;
4880 // FIXME fall back to replica or something?
4881 result = -EIO;
4882 }
4883 }
4884 }
4885
4886 // XXX for an async read, op.extent.length is the requested length;
4887 // on error it is changed to 0 after the error comes back.
4888 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4889 ctx->delta_stats.num_rd++;
4890 return result;
4891 }
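
// Worked example (illustrative numbers) for the truncate clamping in do_read:
// with oi.size = 100 and oi.truncate_seq = 1, a client read of 40~30 carrying
// truncate_seq = 2 / truncate_size = 50 treats the size as 50 (since
// 40 + 30 > 50) and trims the read to 40~10.  A read of 60~10 would be
// trimmed to length 0 and, thanks to trimmed_read, return no data instead of
// being reinterpreted as "read the whole object".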
4892
4893 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4894 dout(20) << __func__ << dendl;
4895 auto& op = osd_op.op;
4896 auto& oi = ctx->new_obs.oi;
4897 auto& soid = oi.soid;
4898
4899 if (op.extent.truncate_seq) {
4900 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4901 return -EINVAL;
4902 }
4903
4904 ++ctx->num_read;
4905 if (pool.info.ec_pool()) {
4906 // translate sparse read to a normal one if not supported
4907 uint64_t offset = op.extent.offset;
4908 uint64_t length = op.extent.length;
4909 if (offset > oi.size) {
4910 length = 0;
4911 } else if (offset + length > oi.size) {
4912 length = oi.size - offset;
4913 }
4914
4915 if (length > 0) {
4916 ctx->pending_async_reads.push_back(
4917 make_pair(
4918 boost::make_tuple(offset, length, op.flags),
4919 make_pair(
4920 &osd_op.outdata,
4921 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4922 &op.extent.length))));
4923 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4924
4925 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4926 new ReadFinisher(osd_op));
4927 } else {
4928 dout(10) << " sparse read ended up empty for " << soid << dendl;
4929 map<uint64_t, uint64_t> extents;
4930 ::encode(extents, osd_op.outdata);
4931 }
4932 } else {
4933 // read into a buffer
4934 map<uint64_t, uint64_t> m;
4935 uint32_t total_read = 0;
4936 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4937 info.pgid.shard),
4938 op.extent.offset, op.extent.length, m);
4939 if (r < 0) {
4940 return r;
4941 }
4942
4943 map<uint64_t, uint64_t>::iterator miter;
4944 bufferlist data_bl;
4945 uint64_t last = op.extent.offset;
4946 for (miter = m.begin(); miter != m.end(); ++miter) {
4947 // verify hole?
4948 if (cct->_conf->osd_verify_sparse_read_holes &&
4949 last < miter->first) {
4950 bufferlist t;
4951 uint64_t len = miter->first - last;
4952 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4953 if (r < 0) {
4954 osd->clog->error() << coll << " " << soid
4955 << " sparse-read failed to read: "
4956 << r;
4957 } else if (!t.is_zero()) {
4958 osd->clog->error() << coll << " " << soid
4959 << " sparse-read found data in hole "
4960 << last << "~" << len;
4961 }
4962 }
4963
4964 bufferlist tmpbl;
4965 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4966 op.flags, &tmpbl);
4967 if (r == -EIO) {
4968 r = rep_repair_primary_object(soid, ctx->op);
4969 }
4970 if (r < 0) {
4971 return r;
4972 }
4973
4974 // this usually happens when we get an extent that exceeds the actual
4975 // file size
4976 if (r < (int)miter->second)
4977 miter->second = r;
4978 total_read += r;
4979 dout(10) << "sparse-read " << miter->first << "@" << miter->second
4980 << dendl;
4981 data_bl.claim_append(tmpbl);
4982 last = miter->first + r;
4983 }
4984
4985 if (r < 0) {
4986 return r;
4987 }
4988
4989 // verify trailing hole?
4990 if (cct->_conf->osd_verify_sparse_read_holes) {
4991 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4992 if (last < end) {
4993 bufferlist t;
4994 uint64_t len = end - last;
4995 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4996 if (r < 0) {
4997 osd->clog->error() << coll << " " << soid
4998 << " sparse-read failed to read: " << r;
4999 } else if (!t.is_zero()) {
5000 osd->clog->error() << coll << " " << soid
5001 << " sparse-read found data in hole "
5002 << last << "~" << len;
5003 }
5004 }
5005 }
5006
5007 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5008 // and while there may not be many whole objects at first, more and more
5009 // whole objects accumulate with continued use, so verifying the
5010 // full-object digest on a sparse read makes sense.
5011 if (total_read == oi.size && oi.is_data_digest()) {
5012 uint32_t crc = data_bl.crc32c(-1);
5013 if (oi.data_digest != crc) {
5014 osd->clog->error() << info.pgid << std::hex
5015 << " full-object read crc 0x" << crc
5016 << " != expected 0x" << oi.data_digest
5017 << std::dec << " on " << soid;
5018 // FIXME fall back to replica or something?
5019 return -EIO;
5020 }
5021 }
5022
5023 op.extent.length = total_read;
5024
5025 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5026 ::encode_destructively(data_bl, osd_op.outdata);
5027
5028 dout(10) << " sparse_read got " << total_read << " bytes from object "
5029 << soid << dendl;
5030 }
5031
5032 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5033 ctx->delta_stats.num_rd++;
5034 return 0;
5035 }
5036
5037 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5038 {
5039 int result = 0;
5040 SnapSetContext *ssc = ctx->obc->ssc;
5041 ObjectState& obs = ctx->new_obs;
5042 object_info_t& oi = obs.oi;
5043 const hobject_t& soid = oi.soid;
5044
5045 PGTransaction* t = ctx->op_t.get();
5046
5047 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5048
5049 ctx->current_osd_subop_num = 0;
5050 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5051 OSDOp& osd_op = *p;
5052 ceph_osd_op& op = osd_op.op;
5053
5054 OpFinisher* op_finisher = nullptr;
5055 {
5056 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5057 if (op_finisher_it != ctx->op_finishers.end()) {
5058 op_finisher = op_finisher_it->second.get();
5059 }
5060 }
5061
5062 // TODO: check endianness (__le32 vs uint32_t, etc.)
5063 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5064 // but the code in this function seems to treat them as native-endian. What should the
5065 // tracepoints do?
5066 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5067
5068 dout(10) << "do_osd_op " << osd_op << dendl;
5069
5070 bufferlist::iterator bp = osd_op.indata.begin();
5071
5072 // user-visible modification?
5073 switch (op.op) {
5074 // non user-visible modifications
5075 case CEPH_OSD_OP_WATCH:
5076 case CEPH_OSD_OP_CACHE_EVICT:
5077 case CEPH_OSD_OP_CACHE_FLUSH:
5078 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5079 case CEPH_OSD_OP_UNDIRTY:
5080 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5081 case CEPH_OSD_OP_CACHE_PIN:
5082 case CEPH_OSD_OP_CACHE_UNPIN:
5083 case CEPH_OSD_OP_SET_REDIRECT:
5084 break;
5085 default:
5086 if (op.op & CEPH_OSD_OP_MODE_WR)
5087 ctx->user_modify = true;
5088 }
5089
5090 // munge -1 truncate to 0 truncate
5091 if (ceph_osd_op_uses_extent(op.op) &&
5092 op.extent.truncate_seq == 1 &&
5093 op.extent.truncate_size == (-1ULL)) {
5094 op.extent.truncate_size = 0;
5095 op.extent.truncate_seq = 0;
5096 }
5097
5098 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5099 if (op.op == CEPH_OSD_OP_ZERO &&
5100 obs.exists &&
5101 op.extent.offset < cct->_conf->osd_max_object_size &&
5102 op.extent.length >= 1 &&
5103 op.extent.length <= cct->_conf->osd_max_object_size &&
5104 op.extent.offset + op.extent.length >= oi.size) {
5105 if (op.extent.offset >= oi.size) {
5106 // no-op
5107 goto fail;
5108 }
5109 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5110 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5111 op.op = CEPH_OSD_OP_TRUNCATE;
5112 }
5113
5114 switch (op.op) {
5115
5116 // --- READS ---
5117
5118 case CEPH_OSD_OP_CMPEXT:
5119 ++ctx->num_read;
5120 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5121 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5122 op.extent.length, op.extent.truncate_size,
5123 op.extent.truncate_seq);
5124
5125 if (op_finisher == nullptr) {
5126 result = do_extent_cmp(ctx, osd_op);
5127 } else {
5128 result = op_finisher->execute();
5129 }
5130 break;
5131
5132 case CEPH_OSD_OP_SYNC_READ:
5133 if (pool.info.require_rollback()) {
5134 result = -EOPNOTSUPP;
5135 break;
5136 }
5137 // fall through
5138 case CEPH_OSD_OP_READ:
5139 ++ctx->num_read;
5140 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5141 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5142 op.extent.length, op.extent.truncate_size,
5143 op.extent.truncate_seq);
5144 if (op_finisher == nullptr) {
5145 if (!ctx->data_off) {
5146 ctx->data_off = op.extent.offset;
5147 }
5148 result = do_read(ctx, osd_op);
5149 } else {
5150 result = op_finisher->execute();
5151 }
5152 break;
5153
5154 case CEPH_OSD_OP_CHECKSUM:
5155 ++ctx->num_read;
5156 {
5157 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5158 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5159 op.checksum.offset, op.checksum.length,
5160 op.checksum.chunk_size);
5161
5162 if (op_finisher == nullptr) {
5163 result = do_checksum(ctx, osd_op, &bp);
5164 } else {
5165 result = op_finisher->execute();
5166 }
5167 }
5168 break;
5169
5170 /* map extents */
5171 case CEPH_OSD_OP_MAPEXT:
5172 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5173 if (pool.info.require_rollback()) {
5174 result = -EOPNOTSUPP;
5175 break;
5176 }
5177 ++ctx->num_read;
5178 {
5179 // read into a buffer
5180 bufferlist bl;
5181 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5182 info.pgid.shard),
5183 op.extent.offset, op.extent.length, bl);
5184 osd_op.outdata.claim(bl);
5185 if (r < 0)
5186 result = r;
5187 else
5188 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5189 ctx->delta_stats.num_rd++;
5190 dout(10) << " map_extents done on object " << soid << dendl;
5191 }
5192 break;
5193
5194 /* map extents */
5195 case CEPH_OSD_OP_SPARSE_READ:
5196 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5197 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5198 op.extent.length, op.extent.truncate_size,
5199 op.extent.truncate_seq);
5200 if (op_finisher == nullptr) {
5201 result = do_sparse_read(ctx, osd_op);
5202 } else {
5203 result = op_finisher->execute();
5204 }
5205 break;
5206
5207 case CEPH_OSD_OP_CALL:
5208 {
5209 string cname, mname;
5210 bufferlist indata;
5211 try {
5212 bp.copy(op.cls.class_len, cname);
5213 bp.copy(op.cls.method_len, mname);
5214 bp.copy(op.cls.indata_len, indata);
5215 } catch (buffer::error& e) {
5216 dout(10) << "call unable to decode class + method + indata" << dendl;
5217 dout(30) << "in dump: ";
5218 osd_op.indata.hexdump(*_dout);
5219 *_dout << dendl;
5220 result = -EINVAL;
5221 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5222 break;
5223 }
5224 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5225
5226 ClassHandler::ClassData *cls;
5227 result = osd->class_handler->open_class(cname, &cls);
5228 assert(result == 0); // init_op_flags() already verified this works.
5229
5230 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5231 if (!method) {
5232 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5233 result = -EOPNOTSUPP;
5234 break;
5235 }
5236
5237 int flags = method->get_flags();
5238 if (flags & CLS_METHOD_WR)
5239 ctx->user_modify = true;
5240
5241 bufferlist outdata;
5242 dout(10) << "call method " << cname << "." << mname << dendl;
5243 int prev_rd = ctx->num_read;
5244 int prev_wr = ctx->num_write;
5245 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5246
5247 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5248 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5249 result = -EIO;
5250 break;
5251 }
5252 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5253 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5254 result = -EIO;
5255 break;
5256 }
5257
5258 dout(10) << "method called response length=" << outdata.length() << dendl;
5259 op.extent.length = outdata.length();
5260 osd_op.outdata.claim_append(outdata);
5261 dout(30) << "out dump: ";
5262 osd_op.outdata.hexdump(*_dout);
5263 *_dout << dendl;
5264 }
5265 break;
5266
5267 case CEPH_OSD_OP_STAT:
5268 // note: stat does not require RD
5269 {
5270 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5271
5272 if (obs.exists && !oi.is_whiteout()) {
5273 ::encode(oi.size, osd_op.outdata);
5274 ::encode(oi.mtime, osd_op.outdata);
5275 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5276 } else {
5277 result = -ENOENT;
5278 dout(10) << "stat oi object does not exist" << dendl;
5279 }
5280
5281 ctx->delta_stats.num_rd++;
5282 }
5283 break;
5284
5285 case CEPH_OSD_OP_ISDIRTY:
5286 ++ctx->num_read;
5287 {
5288 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5289 bool is_dirty = obs.oi.is_dirty();
5290 ::encode(is_dirty, osd_op.outdata);
5291 ctx->delta_stats.num_rd++;
5292 result = 0;
5293 }
5294 break;
5295
5296 case CEPH_OSD_OP_UNDIRTY:
5297 ++ctx->num_write;
5298 {
5299 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5300 if (oi.is_dirty()) {
5301 ctx->undirty = true; // see make_writeable()
5302 ctx->modify = true;
5303 ctx->delta_stats.num_wr++;
5304 }
5305 result = 0;
5306 }
5307 break;
5308
5309 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5310 ++ctx->num_write;
5311 {
5312 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5313 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5314 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5315 result = -EINVAL;
5316 break;
5317 }
5318 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5319 result = -EINVAL;
5320 break;
5321 }
5322 if (!obs.exists) {
5323 result = 0;
5324 break;
5325 }
5326 if (oi.is_cache_pinned()) {
5327 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5328 result = -EPERM;
5329 break;
5330 }
5331 if (oi.is_dirty()) {
5332 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5333 if (result == -EINPROGRESS)
5334 result = -EAGAIN;
5335 } else {
5336 result = 0;
5337 }
5338 }
5339 break;
5340
5341 case CEPH_OSD_OP_CACHE_FLUSH:
5342 ++ctx->num_write;
5343 {
5344 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5345 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5346 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5347 result = -EINVAL;
5348 break;
5349 }
5350 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5351 result = -EINVAL;
5352 break;
5353 }
5354 if (!obs.exists) {
5355 result = 0;
5356 break;
5357 }
5358 if (oi.is_cache_pinned()) {
5359 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
5360 result = -EPERM;
5361 break;
5362 }
5363 hobject_t missing;
5364 if (oi.is_dirty()) {
5365 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5366 if (result == -EINPROGRESS)
5367 result = -EAGAIN;
5368 } else {
5369 result = 0;
5370 }
5371 // Check the special return value for which start_flush has set 'missing'
5372 if (result == -ENOENT) {
5373 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5374 assert(!missing.is_min());
5375 wait_for_unreadable_object(missing, ctx->op);
5376 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5377 result = -EAGAIN;
5378 }
5379 }
5380 break;
5381
5382 case CEPH_OSD_OP_CACHE_EVICT:
5383 ++ctx->num_write;
5384 {
5385 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5386 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5387 result = -EINVAL;
5388 break;
5389 }
5390 if (!obs.exists) {
5391 result = 0;
5392 break;
5393 }
5394 if (oi.is_cache_pinned()) {
5395 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
5396 result = -EPERM;
5397 break;
5398 }
5399 if (oi.is_dirty()) {
5400 result = -EBUSY;
5401 break;
5402 }
5403 if (!oi.watchers.empty()) {
5404 result = -EBUSY;
5405 break;
5406 }
5407 if (soid.snap == CEPH_NOSNAP) {
5408 result = _verify_no_head_clones(soid, ssc->snapset);
5409 if (result < 0)
5410 break;
5411 }
5412 result = _delete_oid(ctx, true, false);
5413 if (result >= 0) {
5414 // mark that this is a cache eviction to avoid triggering normal
5415 // make_writeable() clone or snapdir object creation in finish_ctx()
5416 ctx->cache_evict = true;
5417 }
5418 osd->logger->inc(l_osd_tier_evict);
5419 }
5420 break;
5421
5422 case CEPH_OSD_OP_GETXATTR:
5423 ++ctx->num_read;
5424 {
5425 string aname;
5426 bp.copy(op.xattr.name_len, aname);
5427 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5428 string name = "_" + aname;
5429 int r = getattr_maybe_cache(
5430 ctx->obc,
5431 name,
5432 &(osd_op.outdata));
5433 if (r >= 0) {
5434 op.xattr.value_len = osd_op.outdata.length();
5435 result = 0;
5436 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5437 } else
5438 result = r;
5439
5440 ctx->delta_stats.num_rd++;
5441 }
5442 break;
5443
5444 case CEPH_OSD_OP_GETXATTRS:
5445 ++ctx->num_read;
5446 {
5447 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5448 map<string, bufferlist> out;
5449 result = getattrs_maybe_cache(
5450 ctx->obc,
5451 &out);
5452
5453 bufferlist bl;
5454 ::encode(out, bl);
5455 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5456 ctx->delta_stats.num_rd++;
5457 osd_op.outdata.claim_append(bl);
5458 }
5459 break;
5460
5461 case CEPH_OSD_OP_CMPXATTR:
5462 ++ctx->num_read;
5463 {
5464 string aname;
5465 bp.copy(op.xattr.name_len, aname);
5466 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5467 string name = "_" + aname;
5468 name[op.xattr.name_len + 1] = 0;
5469
5470 bufferlist xattr;
5471 result = getattr_maybe_cache(
5472 ctx->obc,
5473 name,
5474 &xattr);
5475 if (result < 0 && result != -EEXIST && result != -ENODATA)
5476 break;
5477
5478 ctx->delta_stats.num_rd++;
5479 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5480
5481 switch (op.xattr.cmp_mode) {
5482 case CEPH_OSD_CMPXATTR_MODE_STRING:
5483 {
5484 string val;
5485 bp.copy(op.xattr.value_len, val);
5486 val[op.xattr.value_len] = 0;
5487 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5488 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5489 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5490 }
5491 break;
5492
5493 case CEPH_OSD_CMPXATTR_MODE_U64:
5494 {
5495 uint64_t u64val;
5496 try {
5497 ::decode(u64val, bp);
5498 }
5499 catch (buffer::error& e) {
5500 result = -EINVAL;
5501 goto fail;
5502 }
5503 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5504 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5505 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5506 }
5507 break;
5508
5509 default:
5510 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5511 result = -EINVAL;
5512 }
5513
5514 if (!result) {
5515 dout(10) << "comparison returned false" << dendl;
5516 result = -ECANCELED;
5517 break;
5518 }
5519 if (result < 0) {
5520 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5521 break;
5522 }
5523
5524 dout(10) << "comparison returned true" << dendl;
5525 }
5526 break;
5527
5528 case CEPH_OSD_OP_ASSERT_VER:
5529 ++ctx->num_read;
5530 {
5531 uint64_t ver = op.assert_ver.ver;
5532 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5533 if (!ver)
5534 result = -EINVAL;
5535 else if (ver < oi.user_version)
5536 result = -ERANGE;
5537 else if (ver > oi.user_version)
5538 result = -EOVERFLOW;
5539 }
5540 break;
5541
5542 case CEPH_OSD_OP_LIST_WATCHERS:
5543 ++ctx->num_read;
5544 {
5545 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5546 obj_list_watch_response_t resp;
5547
5548 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5549 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5550 ++oi_iter) {
5551 dout(20) << "key cookie=" << oi_iter->first.first
5552 << " entity=" << oi_iter->first.second << " "
5553 << oi_iter->second << dendl;
5554 assert(oi_iter->first.first == oi_iter->second.cookie);
5555 assert(oi_iter->first.second.is_client());
5556
5557 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5558 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5559 resp.entries.push_back(wi);
5560 }
5561
5562 resp.encode(osd_op.outdata, ctx->get_features());
5563 result = 0;
5564
5565 ctx->delta_stats.num_rd++;
5566 break;
5567 }
5568
5569 case CEPH_OSD_OP_LIST_SNAPS:
5570 ++ctx->num_read;
5571 {
5572 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5573 obj_list_snap_response_t resp;
5574
5575 if (!ssc) {
5576 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5577 }
5578 assert(ssc);
5579
5580 int clonecount = ssc->snapset.clones.size();
5581 if (ssc->snapset.head_exists)
5582 clonecount++;
5583 resp.clones.reserve(clonecount);
5584 for (auto clone_iter = ssc->snapset.clones.begin();
5585 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5586 clone_info ci;
5587 ci.cloneid = *clone_iter;
5588
5589 hobject_t clone_oid = soid;
5590 clone_oid.snap = *clone_iter;
5591
5592 if (!ssc->snapset.is_legacy()) {
5593 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5594 if (p == ssc->snapset.clone_snaps.end()) {
5595 osd->clog->error() << "osd." << osd->whoami
5596 << ": inconsistent clone_snaps found for oid "
5597 << soid << " clone " << *clone_iter
5598 << " snapset " << ssc->snapset;
5599 result = -EINVAL;
5600 break;
5601 }
5602 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5603 ci.snaps.push_back(*q);
5604 }
5605 } else {
5606 /* No need to take a lock here. We are only inspecting state cached
5607 * in the ObjectContext, so we aren't performing an actual read unless
5608 * the clone obc is not already loaded (in which case, it cannot have
5609 * an in progress write). We also do not risk exposing uncommitted
5610 * state since we do have a read lock on the head object or snapdir,
5611 * which we would have to write lock in order to make user visible
5612 * modifications to the snapshot state (snap trim related mutations
5613 * are not user visible).
5614 */
5615 if (is_missing_object(clone_oid)) {
5616 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5617 wait_for_unreadable_object(clone_oid, ctx->op);
5618 result = -EAGAIN;
5619 break;
5620 }
5621
5622 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5623 if (!clone_obc) {
5624 if (maybe_handle_cache(
5625 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5626 // promoting the clone
5627 result = -EAGAIN;
5628 } else {
5629 osd->clog->error() << "osd." << osd->whoami
5630 << ": missing clone " << clone_oid
5631 << " for oid "
5632 << soid;
5633 // should not happen
5634 result = -ENOENT;
5635 }
5636 break;
5637 }
5638 for (vector<snapid_t>::reverse_iterator p =
5639 clone_obc->obs.oi.legacy_snaps.rbegin();
5640 p != clone_obc->obs.oi.legacy_snaps.rend();
5641 ++p) {
5642 ci.snaps.push_back(*p);
5643 }
5644 }
5645
5646 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5647
5648 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5649 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5650 if (coi == ssc->snapset.clone_overlap.end()) {
5651 osd->clog->error() << "osd." << osd->whoami
5652 << ": inconsistent clone_overlap found for oid "
5653 << soid << " clone " << *clone_iter;
5654 result = -EINVAL;
5655 break;
5656 }
5657 const interval_set<uint64_t> &o = coi->second;
5658 ci.overlap.reserve(o.num_intervals());
5659 for (interval_set<uint64_t>::const_iterator r = o.begin();
5660 r != o.end(); ++r) {
5661 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5662 r.get_len()));
5663 }
5664
5665 map<snapid_t, uint64_t>::const_iterator si;
5666 si = ssc->snapset.clone_size.find(ci.cloneid);
5667 if (si == ssc->snapset.clone_size.end()) {
5668 osd->clog->error() << "osd." << osd->whoami
5669 << ": inconsistent clone_size found for oid "
5670 << soid << " clone " << *clone_iter;
5671 result = -EINVAL;
5672 break;
5673 }
5674 ci.size = si->second;
5675
5676 resp.clones.push_back(ci);
5677 }
5678 if (result < 0) {
5679 break;
5680 }
5681 if (ssc->snapset.head_exists &&
5682 !ctx->obc->obs.oi.is_whiteout()) {
5683 assert(obs.exists);
5684 clone_info ci;
5685 ci.cloneid = CEPH_NOSNAP;
5686
5687 //Size for HEAD is oi.size
5688 ci.size = oi.size;
5689
5690 resp.clones.push_back(ci);
5691 }
5692 resp.seq = ssc->snapset.seq;
5693
5694 resp.encode(osd_op.outdata);
5695 result = 0;
5696
5697 ctx->delta_stats.num_rd++;
5698 break;
5699 }
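/*
 * Illustrative sketch (assuming the librados C++ API): the response
 * assembled above is what self-managed-snapshot clients consume via
 *
 *   librados::snap_set_t ss;
 *   int r = ioctx.list_snaps(oid, &ss);
 *
 * (typically with the IoCtx read context set to SNAP_DIR), yielding one
 * clone entry per clone plus a CEPH_NOSNAP entry for the head.
 */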
5700
5701 case CEPH_OSD_OP_NOTIFY:
5702 ++ctx->num_read;
5703 {
5704 uint32_t timeout;
5705 bufferlist bl;
5706
5707 try {
5708 uint32_t ver; // obsolete
5709 ::decode(ver, bp);
5710 ::decode(timeout, bp);
5711 ::decode(bl, bp);
5712 } catch (const buffer::error &e) {
5713 timeout = 0;
5714 }
5715 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5716 if (!timeout)
5717 timeout = cct->_conf->osd_default_notify_timeout;
5718
5719 notify_info_t n;
5720 n.timeout = timeout;
5721 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5722 n.cookie = op.watch.cookie;
5723 n.bl = bl;
5724 ctx->notifies.push_back(n);
5725
5726 // return our unique notify id to the client
5727 ::encode(n.notify_id, osd_op.outdata);
5728 }
5729 break;
5730
5731 case CEPH_OSD_OP_NOTIFY_ACK:
5732 ++ctx->num_read;
5733 {
5734 try {
5735 uint64_t notify_id = 0;
5736 uint64_t watch_cookie = 0;
5737 ::decode(notify_id, bp);
5738 ::decode(watch_cookie, bp);
5739 bufferlist reply_bl;
5740 if (!bp.end()) {
5741 ::decode(reply_bl, bp);
5742 }
5743 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5744 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5745 ctx->notify_acks.push_back(ack);
5746 } catch (const buffer::error &e) {
5747 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5748 OpContext::NotifyAck ack(
5749 // op.watch.cookie is actually the notify_id for historical reasons
5750 op.watch.cookie
5751 );
5752 ctx->notify_acks.push_back(ack);
5753 }
5754 }
5755 break;
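/*
 * Illustrative sketch (assuming the librados C++ API): the NOTIFY op
 * above allocates a cluster-unique notify_id and hands it back in
 * outdata; a client round-trip is roughly
 *
 *   bufferlist payload, reply;
 *   int r = ioctx.notify2(oid, payload, 10000, &reply);  // 10s timeout
 *
 * and each watcher that handles the notify answers with NOTIFY_ACK
 * (librados notify_ack()), which is what populates ctx->notify_acks.
 */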
5756
5757 case CEPH_OSD_OP_SETALLOCHINT:
5758 ++ctx->num_write;
5759 {
5760 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5761 maybe_create_new_object(ctx);
5762 oi.expected_object_size = op.alloc_hint.expected_object_size;
5763 oi.expected_write_size = op.alloc_hint.expected_write_size;
5764 oi.alloc_hint_flags = op.alloc_hint.flags;
5765 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5766 op.alloc_hint.expected_write_size,
5767 op.alloc_hint.flags);
5768 ctx->delta_stats.num_wr++;
5769 result = 0;
5770 }
5771 break;
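/*
 * Illustrative sketch (assuming librados C++): the hint above comes from
 * clients via e.g.
 *
 *   ioctx.set_alloc_hint2(oid, 4 << 20, 4 << 20,
 *                         LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE);
 *
 * It is purely advisory: the ObjectStore may use it to size blobs or
 * preallocate, but correctness never depends on it, hence the
 * unconditional result = 0.
 */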
5772
5773
5774 // --- WRITES ---
5775
5776 // -- object data --
5777
5778 case CEPH_OSD_OP_WRITE:
5779 ++ctx->num_write;
5780 { // write
5781 __u32 seq = oi.truncate_seq;
5782 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5783 if (op.extent.length != osd_op.indata.length()) {
5784 result = -EINVAL;
5785 break;
5786 }
5787
5788 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5789 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5790
5791 if (pool.info.requires_aligned_append() &&
5792 (op.extent.offset % pool.info.required_alignment() != 0)) {
5793 result = -EOPNOTSUPP;
5794 break;
5795 }
5796
5797 if (!obs.exists) {
5798 if (pool.info.requires_aligned_append() && op.extent.offset) {
5799 result = -EOPNOTSUPP;
5800 break;
5801 }
5802 } else if (op.extent.offset != oi.size &&
5803 pool.info.requires_aligned_append()) {
5804 result = -EOPNOTSUPP;
5805 break;
5806 }
5807
5808 if (seq && (seq > op.extent.truncate_seq) &&
5809 (op.extent.offset + op.extent.length > oi.size)) {
5810 // old write, arrived after trimtrunc
5811 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5812 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5813 << ", adjusting write length to " << op.extent.length << dendl;
5814 bufferlist t;
5815 t.substr_of(osd_op.indata, 0, op.extent.length);
5816 osd_op.indata.swap(t);
5817 }
5818 if (op.extent.truncate_seq > seq) {
5819 // write arrives before trimtrunc
5820 if (obs.exists && !oi.is_whiteout()) {
5821 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5822 << ", truncating to " << op.extent.truncate_size << dendl;
5823 t->truncate(soid, op.extent.truncate_size);
5824 oi.truncate_seq = op.extent.truncate_seq;
5825 oi.truncate_size = op.extent.truncate_size;
5826 if (op.extent.truncate_size != oi.size) {
5827 ctx->delta_stats.num_bytes -= oi.size;
5828 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5829 oi.size = op.extent.truncate_size;
5830 }
5831 } else {
5832 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5833 << ", but object is new" << dendl;
5834 oi.truncate_seq = op.extent.truncate_seq;
5835 oi.truncate_size = op.extent.truncate_size;
5836 }
5837 }
5838 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5839 if (result < 0)
5840 break;
5841
5842 maybe_create_new_object(ctx);
5843
5844 if (op.extent.length == 0) {
5845 if (op.extent.offset > oi.size) {
5846 t->truncate(
5847 soid, op.extent.offset);
5848 } else {
5849 t->nop(soid);
5850 }
5851 } else {
5852 t->write(
5853 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5854 }
5855
5856 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5857 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5858 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5859 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5860 else
5861 obs.oi.clear_data_digest();
5862 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5863 op.extent.offset, op.extent.length);
5864
5865 }
5866 break;
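/*
 * Note on the digest maintenance above (a sketch, relying only on
 * bufferlist::crc32c chaining): a whole-object digest can be extended
 * across a pure append because
 *
 *   uint32_t d = a.crc32c(-1);   // digest of contents a
 *   d = b.crc32c(d);             // == digest of a followed by b
 *
 * so a full overwrite seeds a fresh digest, an append extends it, and
 * any other partial overwrite forces clear_data_digest().
 */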
5867
5868 case CEPH_OSD_OP_WRITEFULL:
5869 ++ctx->num_write;
5870 { // write full object
5871 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5872
5873 if (op.extent.length != osd_op.indata.length()) {
5874 result = -EINVAL;
5875 break;
5876 }
5877 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5878 if (result < 0)
5879 break;
5880
5881 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5882 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5883
5884 maybe_create_new_object(ctx);
5885 if (pool.info.require_rollback()) {
5886 t->truncate(soid, 0);
5887 } else if (obs.exists && op.extent.length < oi.size) {
5888 t->truncate(soid, op.extent.length);
5889 }
5890 if (op.extent.length) {
5891 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5892 }
5893 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5894
5895 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5896 0, op.extent.length, true);
5897 }
5898 break;
5899
5900 case CEPH_OSD_OP_WRITESAME:
5901 ++ctx->num_write;
5902 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5903 result = do_writesame(ctx, osd_op);
5904 break;
5905
5906 case CEPH_OSD_OP_ROLLBACK:
5907 ++ctx->num_write;
5908 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5909 result = _rollback_to(ctx, op);
5910 break;
5911
5912 case CEPH_OSD_OP_ZERO:
5913 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5914 if (pool.info.requires_aligned_append()) {
5915 result = -EOPNOTSUPP;
5916 break;
5917 }
5918 ++ctx->num_write;
5919 { // zero
5920 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5921 if (result < 0)
5922 break;
5923 assert(op.extent.length);
5924 if (obs.exists && !oi.is_whiteout()) {
5925 t->zero(soid, op.extent.offset, op.extent.length);
5926 interval_set<uint64_t> ch;
5927 ch.insert(op.extent.offset, op.extent.length);
5928 ctx->modified_ranges.union_of(ch);
5929 ctx->delta_stats.num_wr++;
5930 oi.clear_data_digest();
5931 } else {
5932 // no-op
5933 }
5934 }
5935 break;
5936 case CEPH_OSD_OP_CREATE:
5937 ++ctx->num_write;
5938 {
5939 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5940 int flags = le32_to_cpu(op.flags);
5941 if (obs.exists && !oi.is_whiteout() &&
5942 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5943 result = -EEXIST; /* this is an exclusive create */
5944 } else {
5945 if (osd_op.indata.length()) {
5946 bufferlist::iterator p = osd_op.indata.begin();
5947 string category;
5948 try {
5949 ::decode(category, p);
5950 }
5951 catch (buffer::error& e) {
5952 result = -EINVAL;
5953 goto fail;
5954 }
5955 // category is no longer implemented.
5956 }
5957 if (result >= 0) {
5958 maybe_create_new_object(ctx);
5959 t->nop(soid);
5960 }
5961 }
5962 }
5963 break;
5964
5965 case CEPH_OSD_OP_TRIMTRUNC:
5966 op.extent.offset = op.extent.truncate_size;
5967 // falling through
5968
5969 case CEPH_OSD_OP_TRUNCATE:
5970 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5971 if (pool.info.requires_aligned_append()) {
5972 result = -EOPNOTSUPP;
5973 break;
5974 }
5975 ++ctx->num_write;
5976 {
5977 // truncate
5978 if (!obs.exists || oi.is_whiteout()) {
5979 dout(10) << " object dne, truncate is a no-op" << dendl;
5980 break;
5981 }
5982
5983 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5984 result = -EFBIG;
5985 break;
5986 }
5987
5988 if (op.extent.truncate_seq) {
5989 assert(op.extent.offset == op.extent.truncate_size);
5990 if (op.extent.truncate_seq <= oi.truncate_seq) {
5991 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5992 << ", no-op" << dendl;
5993 break; // old
5994 }
5995 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5996 << ", truncating" << dendl;
5997 oi.truncate_seq = op.extent.truncate_seq;
5998 oi.truncate_size = op.extent.truncate_size;
5999 }
6000
6001 maybe_create_new_object(ctx);
6002 t->truncate(soid, op.extent.offset);
6003 if (oi.size > op.extent.offset) {
6004 interval_set<uint64_t> trim;
6005 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6006 ctx->modified_ranges.union_of(trim);
6007 }
6008 if (op.extent.offset != oi.size) {
6009 ctx->delta_stats.num_bytes -= oi.size;
6010 ctx->delta_stats.num_bytes += op.extent.offset;
6011 oi.size = op.extent.offset;
6012 }
6013 ctx->delta_stats.num_wr++;
6014 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6015
6016 oi.clear_data_digest();
6017 }
6018 break;
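/*
 * Worked example of the truncate_seq ordering (sketch): a client sends
 * WRITE(truncate_seq=1) and then TRIMTRUNC(seq=2). If the truncate is
 * applied first, the stale write later arrives with seq 1 < current 2 and
 * is clipped in the CEPH_OSD_OP_WRITE handler; if the write lands first,
 * the truncate with the larger seq wins here. Both orders converge on the
 * same final object.
 */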
6019
6020 case CEPH_OSD_OP_DELETE:
6021 ++ctx->num_write;
6022 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6023 {
6024 result = _delete_oid(ctx, false, ctx->ignore_cache);
6025 }
6026 break;
6027
6028 case CEPH_OSD_OP_WATCH:
6029 ++ctx->num_write;
6030 {
6031 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6032 op.watch.cookie, op.watch.op);
6033 if (!obs.exists) {
6034 result = -ENOENT;
6035 break;
6036 }
6037 uint64_t cookie = op.watch.cookie;
6038 entity_name_t entity = ctx->reqid.name;
6039 ObjectContextRef obc = ctx->obc;
6040
6041 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6042 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6043 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6044 dout(10) << "watch: oi.user_version=" << oi.user_version << dendl;
6045 dout(10) << "watch: peer_addr="
6046 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6047
6048 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6049 if (op.watch.timeout != 0) {
6050 timeout = op.watch.timeout;
6051 }
6052
6053 watch_info_t w(cookie, timeout,
6054 ctx->op->get_req()->get_connection()->get_peer_addr());
6055 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6056 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6057 if (oi.watchers.count(make_pair(cookie, entity))) {
6058 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6059 } else {
6060 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6061 oi.watchers[make_pair(cookie, entity)] = w;
6062 t->nop(soid); // make sure we update the object_info on disk!
6063 }
6064 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6065 ctx->watch_connects.push_back(make_pair(w, will_ping));
6066 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6067 if (!oi.watchers.count(make_pair(cookie, entity))) {
6068 result = -ENOTCONN;
6069 break;
6070 }
6071 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6072 ctx->watch_connects.push_back(make_pair(w, true));
6073 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6074 /* Note: WATCH with PING doesn't cause may_write() to return true,
6075 * so if there is nothing else in the transaction, this is going
6076 * to run do_osd_op_effects, but not write out a log entry */
6077 if (!oi.watchers.count(make_pair(cookie, entity))) {
6078 result = -ENOTCONN;
6079 break;
6080 }
6081 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6082 obc->watchers.find(make_pair(cookie, entity));
6083 if (p == obc->watchers.end() ||
6084 !p->second->is_connected()) {
6085 // client needs to reconnect
6086 result = -ETIMEDOUT;
6087 break;
6088 }
6089 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6090 p->second->got_ping(ceph_clock_now());
6091 result = 0;
6092 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6093 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6094 oi.watchers.find(make_pair(cookie, entity));
6095 if (oi_iter != oi.watchers.end()) {
6096 dout(10) << " removed watch " << oi_iter->second << " by "
6097 << entity << dendl;
6098 oi.watchers.erase(oi_iter);
6099 t->nop(soid); // update oi on disk
6100 ctx->watch_disconnects.push_back(
6101 watch_disconnect_t(cookie, entity, false));
6102 } else {
6103 dout(10) << " can't remove: no watch by " << entity << dendl;
6104 }
6105 }
6106 }
6107 break;
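/*
 * Illustrative sketch (assuming the librados C++ watch API): the
 * WATCH_OP_WATCH branch above persists the watch in oi.watchers, and the
 * client side looks roughly like
 *
 *   uint64_t cookie;
 *   int r = ioctx.watch2(oid, &cookie, &watch_ctx);  // registers watch
 *   ...                                              // librados pings
 *   ioctx.unwatch2(cookie);                          // WATCH_OP_UNWATCH
 *
 * A WATCH_OP_PING that finds no connected Watch returns -ETIMEDOUT,
 * telling the client to re-register via WATCH_OP_RECONNECT.
 */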
6108
6109 case CEPH_OSD_OP_CACHE_PIN:
6110 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6111 if ((!pool.info.is_tier() ||
6112 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6113 result = -EINVAL;
6114 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6115 break;
6116 }
6117 ++ctx->num_write;
6118 {
6119 if (!obs.exists || oi.is_whiteout()) {
6120 result = -ENOENT;
6121 break;
6122 }
6123
6124 if (!oi.is_cache_pinned()) {
6125 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6126 ctx->modify = true;
6127 ctx->delta_stats.num_objects_pinned++;
6128 ctx->delta_stats.num_wr++;
6129 }
6130 result = 0;
6131 }
6132 break;
6133
6134 case CEPH_OSD_OP_CACHE_UNPIN:
6135 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6136 if ((!pool.info.is_tier() ||
6137 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6138 result = -EINVAL;
6139 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6140 break;
6141 }
6142 ++ctx->num_write;
6143 {
6144 if (!obs.exists || oi.is_whiteout()) {
6145 result = -ENOENT;
6146 break;
6147 }
6148
6149 if (oi.is_cache_pinned()) {
6150 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6151 ctx->modify = true;
6152 ctx->delta_stats.num_objects_pinned--;
6153 ctx->delta_stats.num_wr++;
6154 }
6155 result = 0;
6156 }
6157 break;
6158
6159 case CEPH_OSD_OP_SET_REDIRECT:
6160 ++ctx->num_write;
6161 {
6162 if (pool.info.is_tier()) {
6163 result = -EINVAL;
6164 break;
6165 }
6166 if (!obs.exists) {
6167 result = -ENOENT;
6168 break;
6169 }
6170 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6171 result = -EOPNOTSUPP;
6172 break;
6173 }
6174
6175 object_t target_name;
6176 object_locator_t target_oloc;
6177 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6178 version_t target_version = op.copy_from.src_version;
6179 try {
6180 ::decode(target_name, bp);
6181 ::decode(target_oloc, bp);
6182 }
6183 catch (buffer::error& e) {
6184 result = -EINVAL;
6185 goto fail;
6186 }
6187 pg_t raw_pg;
6188 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6189 hobject_t target(target_name, target_oloc.key, target_snapid,
6190 raw_pg.ps(), raw_pg.pool(),
6191 target_oloc.nspace);
6192 if (target == soid) {
6193 dout(20) << " set-redirect self is invalid" << dendl;
6194 result = -EINVAL;
6195 break;
6196 }
6197 oi.set_flag(object_info_t::FLAG_MANIFEST);
6198 oi.manifest.redirect_target = target;
6199 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6200 t->truncate(soid, 0);
6201 if (oi.is_omap() && pool.info.supports_omap()) {
6202 t->omap_clear(soid);
6203 obs.oi.clear_omap_digest();
6204 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6205 }
6206 ctx->delta_stats.num_bytes -= oi.size;
6207 oi.size = 0;
6208 oi.new_object();
6209 oi.user_version = target_version;
6210 ctx->user_at_version = target_version;
6211 /* rm_attrs */
6212 map<string,bufferlist> rmattrs;
6213 result = getattrs_maybe_cache(ctx->obc,
6214 &rmattrs);
6215 if (result < 0) {
6216 return result;
6217 }
6218 map<string, bufferlist>::iterator iter;
6219 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6220 const string& name = iter->first;
6221 t->rmattr(soid, name);
6222 }
6223 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6224 }
6225
6226 break;
6227
6228 // -- object attrs --
6229
6230 case CEPH_OSD_OP_SETXATTR:
6231 ++ctx->num_write;
6232 {
6233 if (cct->_conf->osd_max_attr_size > 0 &&
6234 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6235 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6236 result = -EFBIG;
6237 break;
6238 }
6239 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6240 cct->_conf->osd_max_attr_name_len);
6241 if (op.xattr.name_len > max_name_len) {
6242 result = -ENAMETOOLONG;
6243 break;
6244 }
6245 maybe_create_new_object(ctx);
6246 string aname;
6247 bp.copy(op.xattr.name_len, aname);
6248 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6249 string name = "_" + aname;
6250 bufferlist bl;
6251 bp.copy(op.xattr.value_len, bl);
6252 t->setattr(soid, name, bl);
6253 ctx->delta_stats.num_wr++;
6254 }
6255 break;
6256
6257 case CEPH_OSD_OP_RMXATTR:
6258 ++ctx->num_write;
6259 {
6260 string aname;
6261 bp.copy(op.xattr.name_len, aname);
6262 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6263 if (!obs.exists || oi.is_whiteout()) {
6264 result = -ENOENT;
6265 break;
6266 }
6267 string name = "_" + aname;
6268 t->rmattr(soid, name);
6269 ctx->delta_stats.num_wr++;
6270 }
6271 break;
6272
6273
6274 // -- fancy writers --
6275 case CEPH_OSD_OP_APPEND:
6276 {
6277 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6278 // just do it inline; this works because we are happy to execute
6279 // fancy ops on replicas as well.
6280 vector<OSDOp> nops(1);
6281 OSDOp& newop = nops[0];
6282 newop.op.op = CEPH_OSD_OP_WRITE;
6283 newop.op.extent.offset = oi.size;
6284 newop.op.extent.length = op.extent.length;
6285 newop.op.extent.truncate_seq = oi.truncate_seq;
6286 newop.indata = osd_op.indata;
6287 result = do_osd_ops(ctx, nops);
6288 osd_op.outdata.claim(newop.outdata);
6289 }
6290 break;
6291
6292 case CEPH_OSD_OP_STARTSYNC:
6293 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6294 t->nop(soid);
6295 break;
6296
6297
6298 // -- trivial map --
6299 case CEPH_OSD_OP_TMAPGET:
6300 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6301 if (pool.info.require_rollback()) {
6302 result = -EOPNOTSUPP;
6303 break;
6304 }
6305 {
6306 vector<OSDOp> nops(1);
6307 OSDOp& newop = nops[0];
6308 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6309 newop.op.extent.offset = 0;
6310 newop.op.extent.length = 0;
6311 do_osd_ops(ctx, nops);
6312 osd_op.outdata.claim(newop.outdata);
6313 }
6314 break;
6315
6316 case CEPH_OSD_OP_TMAPPUT:
6317 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6318 if (pool.info.require_rollback()) {
6319 result = -EOPNOTSUPP;
6320 break;
6321 }
6322 {
6327 // verify sort order
6328 bool unsorted = false;
6329 if (true) {
6330 bufferlist header;
6331 ::decode(header, bp);
6332 uint32_t n;
6333 ::decode(n, bp);
6334 string last_key;
6335 while (n--) {
6336 string key;
6337 ::decode(key, bp);
6338 dout(10) << "tmapput key " << key << dendl;
6339 bufferlist val;
6340 ::decode(val, bp);
6341 if (key < last_key) {
6342 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6343 unsorted = true;
6344 break;
6345 }
6346 last_key = key;
6347 }
6348 }
6349
6350 // write it
6351 vector<OSDOp> nops(1);
6352 OSDOp& newop = nops[0];
6353 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6354 newop.op.extent.offset = 0;
6355 newop.op.extent.length = osd_op.indata.length();
6356 newop.indata = osd_op.indata;
6357
6358 if (unsorted) {
6359 bp = osd_op.indata.begin();
6360 bufferlist header;
6361 map<string, bufferlist> m;
6362 ::decode(header, bp);
6363 ::decode(m, bp);
6364 assert(bp.end());
6365 bufferlist newbl;
6366 ::encode(header, newbl);
6367 ::encode(m, newbl);
6368 newop.indata = newbl;
6369 }
6370 result = do_osd_ops(ctx, nops);
6371 assert(result == 0);
6372 }
6373 break;
6374
6375 case CEPH_OSD_OP_TMAPUP:
6376 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6377 if (pool.info.require_rollback()) {
6378 result = -EOPNOTSUPP;
6379 break;
6380 }
6381 ++ctx->num_write;
6382 result = do_tmapup(ctx, bp, osd_op);
6383 break;
6384
6385 case CEPH_OSD_OP_TMAP2OMAP:
6386 ++ctx->num_write;
6387 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6388 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6389 break;
6390
6391 // OMAP Read ops
6392 case CEPH_OSD_OP_OMAPGETKEYS:
6393 ++ctx->num_read;
6394 {
6395 string start_after;
6396 uint64_t max_return;
6397 try {
6398 ::decode(start_after, bp);
6399 ::decode(max_return, bp);
6400 }
6401 catch (buffer::error& e) {
6402 result = -EINVAL;
6403 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6404 goto fail;
6405 }
6406 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6407 max_return = cct->_conf->osd_max_omap_entries_per_request;
6408 }
6409 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6410
6411 bufferlist bl;
6412 uint32_t num = 0;
6413 bool truncated = false;
6414 if (oi.is_omap()) {
6415 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6416 coll, ghobject_t(soid)
6417 );
6418 assert(iter);
6419 iter->upper_bound(start_after);
6420 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6421 if (num >= max_return ||
6422 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6423 truncated = true;
6424 break;
6425 }
6426 ::encode(iter->key(), bl);
6427 }
6428 } // else return empty out_set
6429 ::encode(num, osd_op.outdata);
6430 osd_op.outdata.claim_append(bl);
6431 ::encode(truncated, osd_op.outdata);
6432 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6433 ctx->delta_stats.num_rd++;
6434 }
6435 break;
6436
6437 case CEPH_OSD_OP_OMAPGETVALS:
6438 ++ctx->num_read;
6439 {
6440 string start_after;
6441 uint64_t max_return;
6442 string filter_prefix;
6443 try {
6444 ::decode(start_after, bp);
6445 ::decode(max_return, bp);
6446 ::decode(filter_prefix, bp);
6447 }
6448 catch (buffer::error& e) {
6449 result = -EINVAL;
6450 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6451 goto fail;
6452 }
6453 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6454 max_return = cct->_conf->osd_max_omap_entries_per_request;
6455 }
6456 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6457
6458 uint32_t num = 0;
6459 bool truncated = false;
6460 bufferlist bl;
6461 if (oi.is_omap()) {
6462 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6463 coll, ghobject_t(soid)
6464 );
6465 if (!iter) {
6466 result = -ENOENT;
6467 goto fail;
6468 }
6469 iter->upper_bound(start_after);
6470 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6471 for (num = 0;
6472 iter->valid() &&
6473 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6474 ++num, iter->next(false)) {
6475 dout(20) << "Found key " << iter->key() << dendl;
6476 if (num >= max_return ||
6477 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6478 truncated = true;
6479 break;
6480 }
6481 ::encode(iter->key(), bl);
6482 ::encode(iter->value(), bl);
6483 }
6484 } // else return empty out_set
6485 ::encode(num, osd_op.outdata);
6486 osd_op.outdata.claim_append(bl);
6487 ::encode(truncated, osd_op.outdata);
6488 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6489 ctx->delta_stats.num_rd++;
6490 }
6491 break;
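/*
 * Illustrative pagination sketch (assuming librados C++): the truncated
 * flag encoded above is what clients loop on, roughly
 *
 *   std::string after;
 *   bool more = true;
 *   while (more) {
 *     std::map<std::string, bufferlist> vals;
 *     librados::ObjectReadOperation rd;
 *     rd.omap_get_vals2(after, 1024, &vals, &more, nullptr);
 *     ioctx.operate(oid, &rd, nullptr);
 *     if (!vals.empty())
 *       after = vals.rbegin()->first;
 *   }
 */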
6492
6493 case CEPH_OSD_OP_OMAPGETHEADER:
6494 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6495 if (!oi.is_omap()) {
6496 // return empty header
6497 break;
6498 }
6499 ++ctx->num_read;
6500 {
6501 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6502 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6503 ctx->delta_stats.num_rd++;
6504 }
6505 break;
6506
6507 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6508 ++ctx->num_read;
6509 {
6510 set<string> keys_to_get;
6511 try {
6512 ::decode(keys_to_get, bp);
6513 }
6514 catch (buffer::error& e) {
6515 result = -EINVAL;
6516 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6517 goto fail;
6518 }
6519 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6520 map<string, bufferlist> out;
6521 if (oi.is_omap()) {
6522 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6523 } // else return empty omap entries
6524 ::encode(out, osd_op.outdata);
6525 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6526 ctx->delta_stats.num_rd++;
6527 }
6528 break;
6529
6530 case CEPH_OSD_OP_OMAP_CMP:
6531 ++ctx->num_read;
6532 {
6533 if (!obs.exists || oi.is_whiteout()) {
6534 result = -ENOENT;
6535 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6536 break;
6537 }
6538 map<string, pair<bufferlist, int> > assertions;
6539 try {
6540 ::decode(assertions, bp);
6541 }
6542 catch (buffer::error& e) {
6543 result = -EINVAL;
6544 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6545 goto fail;
6546 }
6547 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6548
6549 map<string, bufferlist> out;
6550
6551 if (oi.is_omap()) {
6552 set<string> to_get;
6553 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6554 i != assertions.end();
6555 ++i)
6556 to_get.insert(i->first);
6557 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6558 to_get, &out);
6559 if (r < 0) {
6560 result = r;
6561 break;
6562 }
6563 } // else leave out empty
6564
6565 //Should set num_rd_kb based on encode length of map
6566 ctx->delta_stats.num_rd++;
6567
6568 int r = 0;
6569 bufferlist empty;
6570 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6571 i != assertions.end();
6572 ++i) {
6573 auto out_entry = out.find(i->first);
6574 bufferlist &bl = (out_entry != out.end()) ?
6575 out_entry->second : empty;
6576 switch (i->second.second) {
6577 case CEPH_OSD_CMPXATTR_OP_EQ:
6578 if (!(bl == i->second.first)) {
6579 r = -ECANCELED;
6580 }
6581 break;
6582 case CEPH_OSD_CMPXATTR_OP_LT:
6583 if (!(bl < i->second.first)) {
6584 r = -ECANCELED;
6585 }
6586 break;
6587 case CEPH_OSD_CMPXATTR_OP_GT:
6588 if (!(bl > i->second.first)) {
6589 r = -ECANCELED;
6590 }
6591 break;
6592 default:
6593 r = -EINVAL;
6594 break;
6595 }
6596 if (r < 0)
6597 break;
6598 }
6599 if (r < 0) {
6600 result = r;
6601 }
6602 }
6603 break;
6604
6605 // OMAP Write ops
6606 case CEPH_OSD_OP_OMAPSETVALS:
6607 if (!pool.info.supports_omap()) {
6608 result = -EOPNOTSUPP;
6609 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6610 break;
6611 }
6612 ++ctx->num_write;
6613 {
6614 maybe_create_new_object(ctx);
6615 bufferlist to_set_bl;
6616 try {
6617 decode_str_str_map_to_bl(bp, &to_set_bl);
6618 }
6619 catch (buffer::error& e) {
6620 result = -EINVAL;
6621 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6622 goto fail;
6623 }
6624 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6625 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6626 dout(20) << "setting vals: " << dendl;
6627 map<string,bufferlist> to_set;
6628 bufferlist::iterator pt = to_set_bl.begin();
6629 ::decode(to_set, pt);
6630 for (map<string, bufferlist>::iterator i = to_set.begin();
6631 i != to_set.end();
6632 ++i) {
6633 dout(20) << "\t" << i->first << dendl;
6634 }
6635 }
6636 t->omap_setkeys(soid, to_set_bl);
6637 ctx->delta_stats.num_wr++;
6638 }
6639 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6640 obs.oi.clear_omap_digest();
6641 break;
6642
6643 case CEPH_OSD_OP_OMAPSETHEADER:
6644 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6645 if (!pool.info.supports_omap()) {
6646 result = -EOPNOTSUPP;
6647 break;
6648 }
6649 ++ctx->num_write;
6650 {
6651 maybe_create_new_object(ctx);
6652 t->omap_setheader(soid, osd_op.indata);
6653 ctx->delta_stats.num_wr++;
6654 }
6655 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6656 obs.oi.clear_omap_digest();
6657 break;
6658
6659 case CEPH_OSD_OP_OMAPCLEAR:
6660 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6661 if (!pool.info.supports_omap()) {
6662 result = -EOPNOTSUPP;
6663 break;
6664 }
6665 ++ctx->num_write;
6666 {
6667 if (!obs.exists || oi.is_whiteout()) {
6668 result = -ENOENT;
6669 break;
6670 }
6671 if (oi.is_omap()) {
6672 t->omap_clear(soid);
6673 ctx->delta_stats.num_wr++;
6674 obs.oi.clear_omap_digest();
6675 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6676 }
6677 }
6678 break;
6679
6680 case CEPH_OSD_OP_OMAPRMKEYS:
6681 if (!pool.info.supports_omap()) {
6682 result = -EOPNOTSUPP;
6683 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6684 break;
6685 }
6686 ++ctx->num_write;
6687 {
6688 if (!obs.exists || oi.is_whiteout()) {
6689 result = -ENOENT;
6690 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6691 break;
6692 }
6693 bufferlist to_rm_bl;
6694 try {
6695 decode_str_set_to_bl(bp, &to_rm_bl);
6696 }
6697 catch (buffer::error& e) {
6698 result = -EINVAL;
6699 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6700 goto fail;
6701 }
6702 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6703 t->omap_rmkeys(soid, to_rm_bl);
6704 ctx->delta_stats.num_wr++;
6705 }
6706 obs.oi.clear_omap_digest();
6707 break;
6708
6709 case CEPH_OSD_OP_COPY_GET:
6710 ++ctx->num_read;
6711 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6712 soid.snap.val);
6713 if (op_finisher == nullptr) {
6714 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6715 } else {
6716 result = op_finisher->execute();
6717 }
6718 break;
6719
6720 case CEPH_OSD_OP_COPY_FROM:
6721 ++ctx->num_write;
6722 {
6723 object_t src_name;
6724 object_locator_t src_oloc;
6725 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6726 version_t src_version = op.copy_from.src_version;
6727 try {
6728 ::decode(src_name, bp);
6729 ::decode(src_oloc, bp);
6730 }
6731 catch (buffer::error& e) {
6732 result = -EINVAL;
6733 tracepoint(osd,
6734 do_osd_op_pre_copy_from,
6735 soid.oid.name.c_str(),
6736 soid.snap.val,
6737 "???",
6738 0,
6739 "???",
6740 "???",
6741 0,
6742 src_snapid,
6743 src_version);
6744 goto fail;
6745 }
6746 tracepoint(osd,
6747 do_osd_op_pre_copy_from,
6748 soid.oid.name.c_str(),
6749 soid.snap.val,
6750 src_name.name.c_str(),
6751 src_oloc.pool,
6752 src_oloc.key.c_str(),
6753 src_oloc.nspace.c_str(),
6754 src_oloc.hash,
6755 src_snapid,
6756 src_version);
6757 if (op_finisher == nullptr) {
6758 // start
6759 pg_t raw_pg;
6760 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6761 hobject_t src(src_name, src_oloc.key, src_snapid,
6762 raw_pg.ps(), raw_pg.pool(),
6763 src_oloc.nspace);
6764 if (src == soid) {
6765 dout(20) << " copy from self is invalid" << dendl;
6766 result = -EINVAL;
6767 break;
6768 }
6769 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6770 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6771 new CopyFromFinisher(cb));
6772 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6773 op.copy_from.flags,
6774 false,
6775 op.copy_from.src_fadvise_flags,
6776 op.flags);
6777 result = -EINPROGRESS;
6778 } else {
6779 // finish
6780 result = op_finisher->execute();
6781 assert(result == 0);
6782
6783 // COPY_FROM cannot be executed multiple times -- it must restart
6784 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6785 }
6786 }
6787 break;
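/*
 * Note on the two-phase shape above (sketch): the first pass queues an
 * async copy via start_copy() and returns -EINPROGRESS; once the copy
 * completes, the op vector is re-executed and this case finds the
 * CopyFromFinisher in ctx->op_finishers, completing synchronously. The
 * finisher is then erased because a restarted COPY_FROM must begin a
 * brand-new copy.
 */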
6788
6789 default:
6790 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6791 dout(1) << "unrecognized osd op " << op.op
6792 << " " << ceph_osd_op_name(op.op)
6793 << dendl;
6794 result = -EOPNOTSUPP;
6795 }
6796
6797 fail:
6798 osd_op.rval = result;
6799 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6800 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6801 result = 0;
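/*
 * Sketch of the FAILOK contract: the op's negative rval was already
 * recorded above, but result is zeroed so the remaining ops in the
 * compound request keep executing instead of aborting at the break below.
 */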
6802
6803 if (result < 0)
6804 break;
6805 }
6806 return result;
6807 }
6808
6809 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6810 {
6811 if (ctx->new_obs.oi.size == 0) {
6812 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6813 return -ENODATA;
6814 }
6815 vector<OSDOp> nops(1);
6816 OSDOp &newop = nops[0];
6817 newop.op.op = CEPH_OSD_OP_TMAPGET;
6818 do_osd_ops(ctx, nops);
6819 try {
6820 bufferlist::iterator i = newop.outdata.begin();
6821 ::decode(*header, i);
6822 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6823 } catch (...) {
6824 dout(20) << "failed to decode tmap for " << ctx->new_obs.oi.soid
6825 << dendl;
6826 return -EINVAL;
6827 }
6828 dout(20) << "successfully decoded tmap for " << ctx->new_obs.oi.soid
6829 << dendl;
6830 return 0;
6831 }
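/*
 * Sketch of the tmap format split apart above: the object data is an
 * encoded header bufferlist followed by string->bufferlist entries, so a
 * standalone decode looks roughly like
 *
 *   bufferlist::iterator p = data.begin();
 *   bufferlist header;
 *   map<string, bufferlist> kv;
 *   ::decode(header, p);
 *   ::decode(kv, p);
 *
 * (the same shape the TMAPPUT re-sort path relies on).
 */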
6832
6833 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6834 const SnapSet& ss)
6835 {
6836 // verify that all clones have been evicted
6837 dout(20) << __func__ << " verifying clones are absent "
6838 << ss << dendl;
6839 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6840 p != ss.clones.end();
6841 ++p) {
6842 hobject_t clone_oid = soid;
6843 clone_oid.snap = *p;
6844 if (is_missing_object(clone_oid))
6845 return -EBUSY;
6846 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6847 if (clone_obc && clone_obc->obs.exists) {
6848 dout(10) << __func__ << " cannot evict head before clone "
6849 << clone_oid << dendl;
6850 return -EBUSY;
6851 }
6852 if (copy_ops.count(clone_oid)) {
6853 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6854 << clone_oid << dendl;
6855 return -EBUSY;
6856 }
6857 }
6858 return 0;
6859 }
6860
6861 inline int PrimaryLogPG::_delete_oid(
6862 OpContext *ctx,
6863 bool no_whiteout, // no whiteouts, no matter what.
6864 bool try_no_whiteout) // try not to whiteout
6865 {
6866 SnapSet& snapset = ctx->new_snapset;
6867 ObjectState& obs = ctx->new_obs;
6868 object_info_t& oi = obs.oi;
6869 const hobject_t& soid = oi.soid;
6870 PGTransaction* t = ctx->op_t.get();
6871
6872 // cache: set whiteout on delete?
6873 bool whiteout = false;
6874 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6875 && !no_whiteout
6876 && !try_no_whiteout) {
6877 whiteout = true;
6878 }
6879 bool legacy;
6880 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6881 legacy = false;
6882 // in luminous or later, we can't delete the head if there are
6883 // clones. we trust the caller passing no_whiteout has already
6884 // verified they don't exist.
6885 if (!snapset.clones.empty() ||
6886 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6887 if (no_whiteout) {
6888 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6889 << dendl;
6890 } else {
6891 dout(20) << __func__ << " has or will have clones; will whiteout"
6892 << dendl;
6893 whiteout = true;
6894 }
6895 }
6896 } else {
6897 legacy = true;
6898 }
6899 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6900 << " no_whiteout=" << (int)no_whiteout
6901 << " try_no_whiteout=" << (int)try_no_whiteout
6902 << dendl;
6903 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6904 return -ENOENT;
6905
6906 t->remove(soid);
6907
6908 if (oi.size > 0) {
6909 interval_set<uint64_t> ch;
6910 ch.insert(0, oi.size);
6911 ctx->modified_ranges.union_of(ch);
6912 }
6913
6914 ctx->delta_stats.num_wr++;
6915 if (soid.is_snap()) {
6916 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6917 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6918 } else {
6919 ctx->delta_stats.num_bytes -= oi.size;
6920 }
6921 oi.size = 0;
6922 oi.new_object();
6923
6924 // disconnect all watchers
6925 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6926 oi.watchers.begin();
6927 p != oi.watchers.end();
6928 ++p) {
6929 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6930 ctx->watch_disconnects.push_back(
6931 watch_disconnect_t(p->first.first, p->first.second, true));
6932 }
6933 oi.watchers.clear();
6934
6935 if (whiteout) {
6936 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6937 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6938 ctx->delta_stats.num_whiteouts++;
6939 t->create(soid);
6940 osd->logger->inc(l_osd_tier_whiteout);
6941 return 0;
6942 }
6943
6944 // delete the head
6945 ctx->delta_stats.num_objects--;
6946 if (soid.is_snap())
6947 ctx->delta_stats.num_object_clones--;
6948 if (oi.is_whiteout()) {
6949 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6950 ctx->delta_stats.num_whiteouts--;
6951 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6952 }
6953 if (oi.is_cache_pinned()) {
6954 ctx->delta_stats.num_objects_pinned--;
6955 }
6956 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6957 snapset.head_exists = false;
6958 }
6959 obs.exists = false;
6960 return 0;
6961 }
6962
6963 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6964 {
6965 SnapSet& snapset = ctx->new_snapset;
6966 ObjectState& obs = ctx->new_obs;
6967 object_info_t& oi = obs.oi;
6968 const hobject_t& soid = oi.soid;
6969 PGTransaction* t = ctx->op_t.get();
6970 snapid_t snapid = (uint64_t)op.snap.snapid;
6971 hobject_t missing_oid;
6972
6973 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6974
6975 ObjectContextRef rollback_to;
6976 int ret = find_object_context(
6977 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6978 soid.get_namespace()),
6979 &rollback_to, false, false, &missing_oid);
6980 if (ret == -EAGAIN) {
6981 /* clone must be missing */
6982 assert(is_degraded_or_backfilling_object(missing_oid));
6983 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6984 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6985 block_write_on_degraded_snap(missing_oid, ctx->op);
6986 return ret;
6987 }
6988 {
6989 ObjectContextRef promote_obc;
6990 cache_result_t tier_mode_result;
6991 if (obs.exists && obs.oi.has_manifest()) {
6992 tier_mode_result =
6993 maybe_handle_manifest_detail(
6994 ctx->op,
6995 true,
6996 rollback_to);
6997 } else {
6998 tier_mode_result =
6999 maybe_handle_cache_detail(
7000 ctx->op,
7001 true,
7002 rollback_to,
7003 ret,
7004 missing_oid,
7005 true,
7006 false,
7007 &promote_obc);
7008 }
7009 switch (tier_mode_result) {
7010 case cache_result_t::NOOP:
7011 break;
7012 case cache_result_t::BLOCKED_PROMOTE:
7013 assert(promote_obc);
7014 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7015 return -EAGAIN;
7016 case cache_result_t::BLOCKED_FULL:
7017 block_write_on_full_cache(soid, ctx->op);
7018 return -EAGAIN;
7019 case cache_result_t::REPLIED_WITH_EAGAIN:
7020 assert(0 == "this can't happen, no rollback on replica");
7021 default:
7022 assert(0 == "must promote was set, other values are not valid");
7023 return -EAGAIN;
7024 }
7025 }
7026
7027 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7028 // there's no snapshot here, or there's no object.
7029 // if there's no snapshot, we delete the object; otherwise, do nothing.
7030 dout(20) << "_rollback_to deleting head on " << soid.oid
7031 << " because got ENOENT|whiteout on find_object_context" << dendl;
7032 if (ctx->obc->obs.oi.watchers.size()) {
7033 // Cannot delete an object with watchers
7034 ret = -EBUSY;
7035 } else {
7036 _delete_oid(ctx, false, false);
7037 ret = 0;
7038 }
7039 } else if (ret) {
7040 // ummm....huh? It *can't* return anything else at time of writing.
7041 assert(0 == "unexpected error code in _rollback_to");
7042 } else { //we got our context, let's use it to do the rollback!
7043 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7044 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7045 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7046 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7047 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7048 ret = -EAGAIN;
7049 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7050 // rolling back to the head; we just need to clone it.
7051 ctx->modify = true;
7052 } else {
7053 /* 1) Delete current head
7054 * 2) Clone correct snapshot into head
7055 * 3) Calculate clone_overlaps by following overlaps
7056 * forward from rollback snapshot */
7057 dout(10) << "_rollback_to deleting " << soid.oid
7058 << " and rolling back to old snap" << dendl;
7059
7060 if (obs.exists) {
7061 t->remove(soid);
7062 }
7063 t->clone(soid, rollback_to_sobject);
7064 snapset.head_exists = true;
7065 t->add_obc(rollback_to);
7066
7067 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7068 snapset.clone_overlap.lower_bound(snapid);
7069 assert(iter != snapset.clone_overlap.end());
7070 interval_set<uint64_t> overlaps = iter->second;
7071 for ( ;
7072 iter != snapset.clone_overlap.end();
7073 ++iter)
7074 overlaps.intersection_of(iter->second);
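/*
 * Worked example (sketch): rolling back to snap 3 with
 * clone_overlap = {3: [0,4096), 5: [0,1024)}, the intersection above
 * leaves [0,1024): only bytes untouched by *every* later clone still
 * match the rollback source, and everything else is folded into
 * ctx->modified_ranges below.
 */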
7075
7076 if (obs.oi.size > 0) {
7077 interval_set<uint64_t> modified;
7078 modified.insert(0, obs.oi.size);
7079 overlaps.intersection_of(modified);
7080 modified.subtract(overlaps);
7081 ctx->modified_ranges.union_of(modified);
7082 }
7083
7084 // Adjust the cached objectcontext
7085 maybe_create_new_object(ctx, true);
7086 ctx->delta_stats.num_bytes -= obs.oi.size;
7087 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7088 obs.oi.size = rollback_to->obs.oi.size;
7089 if (rollback_to->obs.oi.is_data_digest())
7090 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7091 else
7092 obs.oi.clear_data_digest();
7093 if (rollback_to->obs.oi.is_omap_digest())
7094 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7095 else
7096 obs.oi.clear_omap_digest();
7097
7098 if (rollback_to->obs.oi.is_omap()) {
7099 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7100 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7101 } else {
7102 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7103 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7104 }
7105
7106 snapset.head_exists = true;
7107 }
7108 }
7109 return ret;
7110 }
7111
7112 void PrimaryLogPG::_make_clone(
7113 OpContext *ctx,
7114 PGTransaction* t,
7115 ObjectContextRef obc,
7116 const hobject_t& head, const hobject_t& coid,
7117 object_info_t *poi)
7118 {
7119 bufferlist bv;
7120 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7121
7122 t->clone(coid, head);
7123 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7124 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7125 }
7126
7127 void PrimaryLogPG::make_writeable(OpContext *ctx)
7128 {
7129 const hobject_t& soid = ctx->obs->oi.soid;
7130 SnapContext& snapc = ctx->snapc;
7131
7132 // clone?
7133 assert(soid.snap == CEPH_NOSNAP);
7134 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7135 << " snapc=" << snapc << dendl;
7136
7137 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7138 if (ctx->new_obs.exists) {
7139 // we will mark the object dirty
7140 if (ctx->undirty && was_dirty) {
7141 dout(20) << " clearing DIRTY flag" << dendl;
7142 assert(ctx->new_obs.oi.is_dirty());
7143 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7144 --ctx->delta_stats.num_objects_dirty;
7145 osd->logger->inc(l_osd_tier_clean);
7146 } else if (!was_dirty && !ctx->undirty) {
7147 dout(20) << " setting DIRTY flag" << dendl;
7148 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7149 ++ctx->delta_stats.num_objects_dirty;
7150 osd->logger->inc(l_osd_tier_dirty);
7151 }
7152 } else {
7153 if (was_dirty) {
7154 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7155 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7156 --ctx->delta_stats.num_objects_dirty;
7157 }
7158 }
7159
7160 if ((ctx->new_obs.exists &&
7161 ctx->new_obs.oi.is_omap()) &&
7162 (!ctx->obc->obs.exists ||
7163 !ctx->obc->obs.oi.is_omap())) {
7164 ++ctx->delta_stats.num_objects_omap;
7165 }
7166 if ((!ctx->new_obs.exists ||
7167 !ctx->new_obs.oi.is_omap()) &&
7168 (ctx->obc->obs.exists &&
7169 ctx->obc->obs.oi.is_omap())) {
7170 --ctx->delta_stats.num_objects_omap;
7171 }
7172
7173 // use newer snapc?
7174 if (ctx->new_snapset.seq > snapc.seq) {
7175 snapc.seq = ctx->new_snapset.seq;
7176 snapc.snaps = ctx->new_snapset.snaps;
7177 filter_snapc(snapc.snaps);
7178 dout(10) << " using newer snapc " << snapc << dendl;
7179 }
7180
7181 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7182 snapc.snaps.size() && // there are snaps
7183 !ctx->cache_evict &&
7184 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7185 // clone
7186 hobject_t coid = soid;
7187 coid.snap = snapc.seq;
7188
7189 unsigned l;
7190 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7191
7192 vector<snapid_t> snaps(l);
7193 for (unsigned i=0; i<l; i++)
7194 snaps[i] = snapc.snaps[i];
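/*
 * Worked example (sketch): with snapc.snaps = {8,7,5,3} (newest first)
 * and new_snapset.seq == 5, the loop above stops at l == 2, so the new
 * clone records snaps {8,7}; 5 and 3 are already covered by an earlier
 * clone.
 */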
7195
7196 // prepare clone
7197 object_info_t static_snap_oi(coid);
7198 object_info_t *snap_oi;
7199 if (is_primary()) {
7200 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7201 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7202 ctx->clone_obc->obs.oi = static_snap_oi;
7203 ctx->clone_obc->obs.exists = true;
7204 ctx->clone_obc->ssc = ctx->obc->ssc;
7205 ctx->clone_obc->ssc->ref++;
7206 if (pool.info.require_rollback())
7207 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7208 snap_oi = &ctx->clone_obc->obs.oi;
7209 bool got = ctx->lock_manager.get_write_greedy(
7210 coid,
7211 ctx->clone_obc,
7212 ctx->op);
7213 assert(got);
7214 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7215 } else {
7216 snap_oi = &static_snap_oi;
7217 }
7218 snap_oi->version = ctx->at_version;
7219 snap_oi->prior_version = ctx->obs->oi.version;
7220 snap_oi->copy_user_bits(ctx->obs->oi);
7221
7222 bool legacy = ctx->new_snapset.is_legacy() ||
7223 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7224 if (legacy) {
7225 snap_oi->legacy_snaps = snaps;
7226 }
7227
7228 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7229
7230 ctx->delta_stats.num_objects++;
7231 if (snap_oi->is_dirty()) {
7232 ctx->delta_stats.num_objects_dirty++;
7233 osd->logger->inc(l_osd_tier_dirty);
7234 }
7235 if (snap_oi->is_omap())
7236 ctx->delta_stats.num_objects_omap++;
7237 if (snap_oi->is_cache_pinned())
7238 ctx->delta_stats.num_objects_pinned++;
7239 ctx->delta_stats.num_object_clones++;
7240 ctx->new_snapset.clones.push_back(coid.snap);
7241 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7242 if (!legacy) {
7243 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7244 }
7245
7246 // clone_overlap should contain an entry for each clone
7247 // (an empty interval_set if there is no overlap)
7248 ctx->new_snapset.clone_overlap[coid.snap];
7249 if (ctx->obs->oi.size)
7250 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7251
7252 // log clone
7253 dout(10) << " cloning v " << ctx->obs->oi.version
7254 << " to " << coid << " v " << ctx->at_version
7255 << " snaps=" << snaps
7256 << " snapset=" << ctx->new_snapset << dendl;
7257 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7258 ctx->obs->oi.version,
7259 ctx->obs->oi.user_version,
7260 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7261 ::encode(snaps, ctx->log.back().snaps);
7262
7263 ctx->at_version.version++;
7264 }
7265
7266 // update most recent clone_overlap and usage stats
7267 if (ctx->new_snapset.clones.size() > 0) {
7268 /* we need to check whether the most recent clone exists; if it has been
7269 * evicted, it is not included in the stats */
7270 hobject_t last_clone_oid = soid;
7271 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7272 if (is_present_clone(last_clone_oid)) {
7273 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7274 ctx->modified_ranges.intersection_of(newest_overlap);
7275 // modified_ranges is still in use by the clone
7276 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7277 newest_overlap.subtract(ctx->modified_ranges);
7278 }
7279 }
7280
7281 // update snapset with latest snap context
7282 ctx->new_snapset.seq = snapc.seq;
7283 ctx->new_snapset.snaps = snapc.snaps;
7284 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7285 // pessimistic assumption that this is a net-new legacy SnapSet
7286 ctx->delta_stats.num_legacy_snapsets++;
7287 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7288 } else if (ctx->new_snapset.is_legacy()) {
7289 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7290 }
7291 dout(20) << "make_writeable " << soid
7292 << " done, snapset=" << ctx->new_snapset << dendl;
7293 }
7294
7295
7296 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7297 interval_set<uint64_t>& modified, uint64_t offset,
7298 uint64_t length, bool write_full)
7299 {
7300 interval_set<uint64_t> ch;
7301 if (write_full) {
7302 if (oi.size)
7303 ch.insert(0, oi.size);
7304 } else if (length)
7305 ch.insert(offset, length);
7306 modified.union_of(ch);
7307 if (write_full || offset + length > oi.size) {
7308 uint64_t new_size = offset + length;
7309 delta_stats.num_bytes -= oi.size;
7310 delta_stats.num_bytes += new_size;
7311 oi.size = new_size;
7312 }
7313 delta_stats.num_wr++;
7314 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7315 }
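/*
 * Example (sketch): with oi.size == 4096 and a non-full write of length
 * 4096 at offset 6144, `modified` gains [6144, 10240), oi.size becomes
 * 10240, num_bytes moves by -4096 + 10240 = +6144, and num_wr_kb grows by
 * SHIFT_ROUND_UP(4096, 10) == 4.
 */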
7316
7317 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7318 {
7319 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7320 delta_stats.num_bytes += p.get_len();
7321 }
7322 }
7323
7324 void PrimaryLogPG::complete_disconnect_watches(
7325 ObjectContextRef obc,
7326 const list<watch_disconnect_t> &to_disconnect)
7327 {
7328 for (list<watch_disconnect_t>::const_iterator i =
7329 to_disconnect.begin();
7330 i != to_disconnect.end();
7331 ++i) {
7332 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7333 auto watchers_entry = obc->watchers.find(watcher);
7334 if (watchers_entry != obc->watchers.end()) {
7335 WatchRef watch = watchers_entry->second;
7336 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7337 obc->watchers.erase(watcher);
7338 watch->remove(i->send_disconnect);
7339 } else {
7340 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7341 << watcher << dendl;
7342 }
7343 }
7344 }
7345
7346 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7347 {
7348 entity_name_t entity = ctx->reqid.name;
7349 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7350
7351 // disconnects first
7352 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7353
7354 assert(conn);
7355
7356 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7357 if (!session.get())
7358 return;
7359 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7360
7361 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7362 i != ctx->watch_connects.end();
7363 ++i) {
7364 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7365 dout(15) << "do_osd_op_effects applying watch connect on session "
7366 << session.get() << " watcher " << watcher << dendl;
7367 WatchRef watch;
7368 if (ctx->obc->watchers.count(watcher)) {
7369 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7370 << dendl;
7371 watch = ctx->obc->watchers[watcher];
7372 } else {
7373 dout(15) << "do_osd_op_effects new watcher " << watcher
7374 << dendl;
7375 watch = Watch::makeWatchRef(
7376 this, osd, ctx->obc, i->first.timeout_seconds,
7377 i->first.cookie, entity, conn->get_peer_addr());
7378 ctx->obc->watchers.insert(
7379 make_pair(
7380 watcher,
7381 watch));
7382 }
7383 watch->connect(conn, i->second);
7384 }
7385
7386 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7387 p != ctx->notifies.end();
7388 ++p) {
7389 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7390 ConnectionRef conn(ctx->op->get_req()->get_connection());
7391 NotifyRef notif(
7392 Notify::makeNotifyRef(
7393 conn,
7394 ctx->reqid.name.num(),
7395 p->bl,
7396 p->timeout,
7397 p->cookie,
7398 p->notify_id,
7399 ctx->obc->obs.oi.user_version,
7400 osd));
7401 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7402 ctx->obc->watchers.begin();
7403 i != ctx->obc->watchers.end();
7404 ++i) {
7405 dout(10) << "starting notify on watch " << i->first << dendl;
7406 i->second->start_notify(notif);
7407 }
7408 notif->init();
7409 }
7410
7411 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7412 p != ctx->notify_acks.end();
7413 ++p) {
7414 if (p->watch_cookie)
7415 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7416 else
7417 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7418 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7419 ctx->obc->watchers.begin();
7420 i != ctx->obc->watchers.end();
7421 ++i) {
7422 if (i->first.second != entity) continue;
7423 if (p->watch_cookie &&
7424 p->watch_cookie.get() != i->first.first) continue;
7425 dout(10) << "acking notify on watch " << i->first << dendl;
7426 i->second->notify_ack(p->notify_id, p->reply_bl);
7427 }
7428 }
7429 }
7430
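// Generate a temp object name that is unique per PG, role, and OSD
// instance (monc global_id), plus a per-PG sequence number.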
7431 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7432 {
7433 ostringstream ss;
7434 ss << "temp_" << info.pgid << "_" << get_role()
7435 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7436 hobject_t hoid = target.make_temp_hobject(ss.str());
7437 dout(20) << __func__ << " " << hoid << dendl;
7438 return hoid;
7439 }
7440
7441 hobject_t PrimaryLogPG::get_temp_recovery_object(
7442 const hobject_t& target,
7443 eversion_t version)
7444 {
7445 ostringstream ss;
7446 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7447 << "_" << version
7448 << "_" << info.history.same_interval_since
7449 << "_" << target.snap;
7450 // pgid + version + interval + snapid is unique, and short
7451 hobject_t hoid = target.make_temp_hobject(ss.str());
7452 dout(20) << __func__ << " " << hoid << dendl;
7453 return hoid;
7454 }
7455
7456 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7457 {
7458 assert(!ctx->ops->empty());
7459
7460 const hobject_t& soid = ctx->obs->oi.soid;
7461
7462 // valid snap context?
7463 if (!ctx->snapc.is_valid()) {
7464 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7465 return -EINVAL;
7466 }
7467
7468 // prepare the actual mutation
7469 int result = do_osd_ops(ctx, *ctx->ops);
7470 if (result < 0) {
7471 if (ctx->op->may_write() &&
7472 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7473 // need to save the error code in the pg log, to detect dup ops,
7474 // but do nothing else
7475 ctx->update_log_only = true;
7476 }
7477 return result;
7478 }
7479
7480 // read-op? write-op noop? done?
7481 if (ctx->op_t->empty() && !ctx->modify) {
7482 unstable_stats.add(ctx->delta_stats);
7483 if (ctx->op->may_write() &&
7484 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7485 ctx->update_log_only = true;
7486 }
7487 return result;
7488 }
7489
7490 // check for full
7491 if ((ctx->delta_stats.num_bytes > 0 ||
7492 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7493 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7494 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7495 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7496 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7497 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7498 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7499 << dendl;
7500 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7501 // they tried, they failed.
7502 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7503 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7504 } else {
7505 // drop request
7506 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7507 return -EAGAIN;
7508 }
7509 }
7510
7511 // clone, if necessary
7512 if (soid.snap == CEPH_NOSNAP)
7513 make_writeable(ctx);
7514
7515 finish_ctx(ctx,
7516 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7517 pg_log_entry_t::DELETE);
7518
7519 return result;
7520 }
7521
7522 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7523 {
7524 const hobject_t& soid = ctx->obs->oi.soid;
7525 dout(20) << __func__ << " " << soid << " " << ctx
7526 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7527 << dendl;
7528 utime_t now = ceph_clock_now();
7529
7530 // snapset
7531 bufferlist bss;
7532
7533 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7534 ::encode(ctx->new_snapset, bss);
7535 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7536 !ctx->new_snapset.is_legacy());
7537
7538 if (ctx->new_obs.exists) {
7539 if (!ctx->obs->exists) {
7540 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7541 hobject_t snapoid = soid.get_snapdir();
7542 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7543 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7544 ctx->at_version,
7545 ctx->snapset_obc->obs.oi.version,
7546 0, osd_reqid_t(), ctx->mtime, 0));
7547 ctx->op_t->remove(snapoid);
7548
7549 ctx->at_version.version++;
7550
7551 ctx->snapset_obc->obs.exists = false;
7552 }
7553 }
7554 } else if (!ctx->new_snapset.clones.empty() &&
7555 !ctx->cache_evict &&
7556 !ctx->new_snapset.head_exists &&
7557 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7558 // save snapset on _snap
7559 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7560 info.pgid.pool(), soid.get_namespace());
7561 dout(10) << " final snapset " << ctx->new_snapset
7562 << " in " << snapoid << dendl;
7563 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7564 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7565 ctx->at_version,
7566 eversion_t(),
7567 0, osd_reqid_t(), ctx->mtime, 0));
7568
7569 if (!ctx->snapset_obc)
7570 ctx->snapset_obc = get_object_context(snapoid, true);
7571 bool got = false;
7572 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7573 got = ctx->lock_manager.get_write_greedy(
7574 snapoid,
7575 ctx->snapset_obc,
7576 ctx->op);
7577 } else {
7578 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7579 got = ctx->lock_manager.get_lock_type(
7580 ObjectContext::RWState::RWEXCL,
7581 snapoid,
7582 ctx->snapset_obc,
7583 ctx->op);
7584 }
7585 assert(got);
7586 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7587 ctx->snapset_obc->obs.exists = true;
7588 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7589 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7590 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7591 ctx->snapset_obc->obs.oi.local_mtime = now;
7592
7593 map<string, bufferlist> attrs;
7594 bufferlist bv(sizeof(ctx->new_obs.oi));
7595 ::encode(ctx->snapset_obc->obs.oi, bv,
7596 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7597 ctx->op_t->create(snapoid);
7598 attrs[OI_ATTR].claim(bv);
7599 attrs[SS_ATTR].claim(bss);
7600 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7601 ctx->at_version.version++;
7602 }
7603 }
7604
7605 // finish and log the op.
7606 if (ctx->user_modify) {
7607 // update the user_version for any modify ops, except for the watch op
7608 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7609 /* In order for new clients and old clients to interoperate properly
7610 * when exchanging versions, we need to lower bound the user_version
7611 * (which our new clients pay proper attention to)
7612 * by the at_version (which is all the old clients can ever see). */
7613 if (ctx->at_version.version > ctx->user_at_version)
7614 ctx->user_at_version = ctx->at_version.version;
7615 ctx->new_obs.oi.user_version = ctx->user_at_version;
7616 }
7617 ctx->bytes_written = ctx->op_t->get_bytes_written();
7618
7619 if (ctx->new_obs.exists) {
7620 // on the head object
7621 ctx->new_obs.oi.version = ctx->at_version;
7622 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7623 ctx->new_obs.oi.last_reqid = ctx->reqid;
7624 if (ctx->mtime != utime_t()) {
7625 ctx->new_obs.oi.mtime = ctx->mtime;
7626 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7627 ctx->new_obs.oi.local_mtime = now;
7628 } else {
7629 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7630 }
7631
7632 map <string, bufferlist> attrs;
7633 bufferlist bv(sizeof(ctx->new_obs.oi));
7634 ::encode(ctx->new_obs.oi, bv,
7635 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7636 attrs[OI_ATTR].claim(bv);
7637
7638 if (soid.snap == CEPH_NOSNAP) {
7639 dout(10) << " final snapset " << ctx->new_snapset
7640 << " in " << soid << dendl;
7641 attrs[SS_ATTR].claim(bss);
7642 } else {
7643 dout(10) << " no snapset (this is a clone)" << dendl;
7644 }
7645 ctx->op_t->setattrs(soid, attrs);
7646 } else {
7647 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7648 }
7649
7650 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7651 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7652
7653 // append to log
7654 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7655 ctx->obs->oi.version,
7656 ctx->user_at_version, ctx->reqid,
7657 ctx->mtime, 0));
7658 if (soid.snap < CEPH_NOSNAP) {
7659 switch (log_op_type) {
7660 case pg_log_entry_t::MODIFY:
7661 case pg_log_entry_t::PROMOTE:
7662 case pg_log_entry_t::CLEAN:
7663 if (legacy_snapset) {
7664 dout(20) << __func__ << " encoding legacy_snaps "
7665 << ctx->new_obs.oi.legacy_snaps
7666 << dendl;
7667 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7668 } else {
7669 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7670 << dendl;
7671 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7672 }
7673 break;
7674 default:
7675 break;
7676 }
7677 }
7678
7679 if (!ctx->extra_reqids.empty()) {
7680 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7681 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7682 }
7683
7684 // apply new object state.
7685 ctx->obc->obs = ctx->new_obs;
7686
7687 if (soid.is_head() && !ctx->obc->obs.exists &&
7688 (!maintain_ssc || ctx->cache_evict)) {
7689 ctx->obc->ssc->exists = false;
7690 ctx->obc->ssc->snapset = SnapSet();
7691 } else {
7692 ctx->obc->ssc->exists = true;
7693 ctx->obc->ssc->snapset = ctx->new_snapset;
7694 }
7695 }
7696
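// Fold delta_stats into our own PG stats and into each backfill
// target's stats if the target already has this object
// (soid <= last_backfill); updates for objects still being
// backfilled are parked in pending_backfill_updates.  While a scrub
// is active, writes below the chunk being scrubbed also go into
// scrub_cstat.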
7697 void PrimaryLogPG::apply_stats(
7698 const hobject_t &soid,
7699 const object_stat_sum_t &delta_stats) {
7700
7701 info.stats.stats.add(delta_stats);
7702
7703 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7704 i != backfill_targets.end();
7705 ++i) {
7706 pg_shard_t bt = *i;
7707 pg_info_t& pinfo = peer_info[bt];
7708 if (soid <= pinfo.last_backfill)
7709 pinfo.stats.stats.add(delta_stats);
7710 else if (soid <= last_backfill_started)
7711 pending_backfill_updates[soid].stats.add(delta_stats);
7712 }
7713
7714 if (is_primary() && scrubber.active) {
7715 if (soid < scrubber.start) {
7716 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7717 << "," << scrubber.end << ")" << dendl;
7718 scrub_cstat.add(delta_stats);
7719 } else {
7720 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7721 << "," << scrubber.end << ")" << dendl;
7722 }
7723 }
7724 }
7725
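// Finish a read-only op: fold per-op rvals into the overall result
// (FAILOK ops don't fail the request), attach the out data and reply
// versions, and send the reply to the client.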
7726 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7727 {
7728 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7729 assert(ctx->async_reads_complete());
7730
7731 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7732 p != ctx->ops->end() && result >= 0; ++p) {
7733 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7734 result = p->rval;
7735 break;
7736 }
7737 ctx->bytes_read += p->outdata.length();
7738 }
7739 ctx->reply->claim_op_out_data(*ctx->ops);
7740 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7741
7742 MOSDOpReply *reply = ctx->reply;
7743 ctx->reply = nullptr;
7744
7745 if (result >= 0) {
7746 if (!ctx->ignore_log_op_stats) {
7747 log_op_stats(ctx);
7748 publish_stats_to_osd();
7749 }
7750
7751 // on read, return the current object version
7752 if (ctx->obs) {
7753 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7754 } else {
7755 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7756 }
7757 } else if (result == -ENOENT) {
7758 // on ENOENT, set a floor for what the next user version will be.
7759 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7760 }
7761
7762 reply->set_result(result);
7763 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7764 osd->send_message_osd_client(reply, m->get_connection());
7765 close_op_ctx(ctx);
7766 }
7767
7768 // ========================================================================
7769 // copyfrom
7770
7771 struct C_Copyfrom : public Context {
7772 PrimaryLogPGRef pg;
7773 hobject_t oid;
7774 epoch_t last_peering_reset;
7775 ceph_tid_t tid;
7776 PrimaryLogPG::CopyOpRef cop;
7777 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7778 const PrimaryLogPG::CopyOpRef& c)
7779 : pg(p), oid(o), last_peering_reset(lpr),
7780 tid(0), cop(c)
7781 {}
7782 void finish(int r) override {
7783 if (r == -ECANCELED)
7784 return;
7785 pg->lock();
7786 if (last_peering_reset == pg->get_last_peering_reset()) {
7787 pg->process_copy_chunk(oid, tid, r);
7788 }
7789 pg->unlock();
7790 }
7791 };
7792
7793 struct C_CopyFrom_AsyncReadCb : public Context {
7794 OSDOp *osd_op;
7795 object_copy_data_t reply_obj;
7796 uint64_t features;
7797 size_t len;
7798 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7799 osd_op(osd_op), features(features), len(0) {}
7800 void finish(int r) override {
7801 osd_op->rval = r;
7802 if (r < 0) {
7803 return;
7804 }
7805
7806 assert(len > 0);
7807 assert(len <= reply_obj.data.length());
7808 bufferlist bl;
7809 bl.substr_of(reply_obj.data, 0, len);
7810 reply_obj.data.swap(bl);
7811 ::encode(reply_obj, osd_op->outdata, features);
7812 }
7813 };
7814
7815 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7816 OSDOp& osd_op, ObjectContextRef &obc)
7817 {
7818 object_info_t& oi = obc->obs.oi;
7819 hobject_t& soid = oi.soid;
7820 int result = 0;
7821 object_copy_cursor_t cursor;
7822 uint64_t out_max;
7823 try {
7824 ::decode(cursor, bp);
7825 ::decode(out_max, bp);
7826 }
7827 catch (buffer::error& e) {
7828 result = -EINVAL;
7829 return result;
7830 }
7831
7832 const MOSDOp *op = static_cast<const MOSDOp*>(ctx->op->get_req());
7833 uint64_t features = op->get_features();
7834
7835 bool async_read_started = false;
7836 object_copy_data_t _reply_obj;
7837 C_CopyFrom_AsyncReadCb *cb = NULL;
7838 if (pool.info.require_rollback()) {
7839 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7840 }
7841 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7842 // size, mtime
7843 reply_obj.size = oi.size;
7844 reply_obj.mtime = oi.mtime;
7845 assert(obc->ssc);
7846 if (soid.snap < CEPH_NOSNAP) {
7847 if (obc->ssc->snapset.is_legacy()) {
7848 reply_obj.snaps = oi.legacy_snaps;
7849 } else {
7850 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7851 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7852 reply_obj.snaps = p->second;
7853 }
7854 } else {
7855 reply_obj.snap_seq = obc->ssc->snapset.seq;
7856 }
7857 if (oi.is_data_digest()) {
7858 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7859 reply_obj.data_digest = oi.data_digest;
7860 }
7861 if (oi.is_omap_digest()) {
7862 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7863 reply_obj.omap_digest = oi.omap_digest;
7864 }
7865 reply_obj.truncate_seq = oi.truncate_seq;
7866 reply_obj.truncate_size = oi.truncate_size;
7867
7868 // attrs
7869 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7870 if (!cursor.attr_complete) {
7871 result = getattrs_maybe_cache(
7872 ctx->obc,
7873 &out_attrs);
7874 if (result < 0) {
7875 if (cb) {
7876 delete cb;
7877 }
7878 return result;
7879 }
7880 cursor.attr_complete = true;
7881 dout(20) << " got attrs" << dendl;
7882 }
7883
7884 int64_t left = out_max - osd_op.outdata.length();
7885
7886 // data
7887 bufferlist& bl = reply_obj.data;
7888 if (left > 0 && !cursor.data_complete) {
7889 if (cursor.data_offset < oi.size) {
7890 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7891 if (cb) {
7892 async_read_started = true;
7893 ctx->pending_async_reads.push_back(
7894 make_pair(
7895 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7896 make_pair(&bl, cb)));
7897 cb->len = max_read;
7898
7899 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7900 new ReadFinisher(osd_op));
7901 result = -EINPROGRESS;
7902
7903 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7904 } else {
7905 result = pgbackend->objects_read_sync(
7906 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7907 if (result < 0)
7908 return result;
7909 }
7910 left -= max_read;
7911 cursor.data_offset += max_read;
7912 }
7913 if (cursor.data_offset == oi.size) {
7914 cursor.data_complete = true;
7915 dout(20) << " got data" << dendl;
7916 }
7917 assert(cursor.data_offset <= oi.size);
7918 }
7919
7920 // omap
7921 uint32_t omap_keys = 0;
7922 if (!pool.info.supports_omap() || !oi.is_omap()) {
7923 cursor.omap_complete = true;
7924 } else {
7925 if (left > 0 && !cursor.omap_complete) {
7926 assert(cursor.data_complete);
7927 if (cursor.omap_offset.empty()) {
7928 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7929 &reply_obj.omap_header);
7930 }
7931 bufferlist omap_data;
7932 ObjectMap::ObjectMapIterator iter =
7933 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7934 assert(iter);
7935 iter->upper_bound(cursor.omap_offset);
7936 for (; iter->valid(); iter->next(false)) {
7937 ++omap_keys;
7938 ::encode(iter->key(), omap_data);
7939 ::encode(iter->value(), omap_data);
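// the encoding prepends a 4-byte length header to each key and
// value, hence the "+ 4"s in the space accounting below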
7940 left -= iter->key().length() + 4 + iter->value().length() + 4;
7941 if (left <= 0)
7942 break;
7943 }
7944 if (omap_keys) {
7945 ::encode(omap_keys, reply_obj.omap_data);
7946 reply_obj.omap_data.claim_append(omap_data);
7947 }
7948 if (iter->valid()) {
7949 cursor.omap_offset = iter->key();
7950 } else {
7951 cursor.omap_complete = true;
7952 dout(20) << " got omap" << dendl;
7953 }
7954 }
7955 }
7956
7957 if (cursor.is_complete()) {
7958 // include reqids only in the final step. this is a bit fragile
7959 // but it works...
7960 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7961 dout(20) << " got reqids" << dendl;
7962 }
7963
7964 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7965 << " " << out_attrs.size() << " attrs"
7966 << " " << bl.length() << " bytes"
7967 << " " << reply_obj.omap_header.length() << " omap header bytes"
7968 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7969 << omap_keys << " keys"
7970 << " " << reply_obj.reqids.size() << " reqids"
7971 << dendl;
7972 reply_obj.cursor = cursor;
7973 if (!async_read_started) {
7974 ::encode(reply_obj, osd_op.outdata, features);
7975 }
7976 if (cb && !async_read_started) {
7977 delete cb;
7978 }
7979
7980 if (result > 0) {
7981 result = 0;
7982 }
7983 return result;
7984 }
7985
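// COPY_GET on a nonexistent object: reply -ENOENT, but still include
// the recent reqids from the pg log so the copying side can detect
// dup ops against a since-deleted object.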
7986 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7987 OSDOp& osd_op)
7988 {
7989 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7990 // be careful not to modify anything else that will upset a racing
7991 // operator<<
7992 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7993 uint64_t features = m->get_features();
7994 object_copy_data_t reply_obj;
7995
7996 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7997 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7998 ::encode(reply_obj, osd_op.outdata, features);
7999 osd_op.rval = -ENOENT;
8000 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8001 reply->claim_op_out_data(m->ops);
8002 reply->set_result(-ENOENT);
8003 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8004 osd->send_message_osd_client(reply, m->get_connection());
8005 }
8006
8007 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8008 hobject_t src, object_locator_t oloc,
8009 version_t version, unsigned flags,
8010 bool mirror_snapset,
8011 unsigned src_obj_fadvise_flags,
8012 unsigned dest_obj_fadvise_flags)
8013 {
8014 const hobject_t& dest = obc->obs.oi.soid;
8015 dout(10) << __func__ << " " << dest
8016 << " from " << src << " " << oloc << " v" << version
8017 << " flags " << flags
8018 << (mirror_snapset ? " mirror_snapset" : "")
8019 << dendl;
8020
8021 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8022 src.snap == CEPH_SNAPDIR));
8023
8024 // cancel a previous in-progress copy?
8025 if (copy_ops.count(dest)) {
8026 // FIXME: if the src etc match, we could avoid restarting from the
8027 // beginning.
8028 CopyOpRef cop = copy_ops[dest];
8029 cancel_copy(cop, false);
8030 }
8031
8032 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8033 mirror_snapset, src_obj_fadvise_flags,
8034 dest_obj_fadvise_flags));
8035 copy_ops[dest] = cop;
8036 obc->start_block();
8037
8038 _copy_some(obc, cop);
8039 }
8040
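// Issue (or continue) a single copy-get chunk against the source
// object, translating the COPY_FROM flags into objecter op flags.
// With mirror_snapset, the first chunk also lists the source's snaps.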
8041 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8042 {
8043 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8044
8045 unsigned flags = 0;
8046 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8047 flags |= CEPH_OSD_FLAG_FLUSH;
8048 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8049 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8050 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8051 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8052 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8053 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8054 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8055 flags |= CEPH_OSD_FLAG_RWORDERED;
8056
8057 C_GatherBuilder gather(cct);
8058
8059 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8060 // list snaps too.
8061 assert(cop->src.snap == CEPH_NOSNAP);
8062 ObjectOperation op;
8063 op.list_snaps(&cop->results.snapset, NULL);
8064 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8065 CEPH_SNAPDIR, NULL,
8066 flags, gather.new_sub(), NULL);
8067 cop->objecter_tid2 = tid;
8068 }
8069
8070 ObjectOperation op;
8071 if (cop->results.user_version) {
8072 op.assert_version(cop->results.user_version);
8073 } else {
8074 // we should learn the version after the first chunk, if we didn't know
8075 // it already!
8076 assert(cop->cursor.is_initial());
8077 }
8078 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8079 &cop->results.object_size, &cop->results.mtime,
8080 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8081 &cop->results.snaps, &cop->results.snap_seq,
8082 &cop->results.flags,
8083 &cop->results.source_data_digest,
8084 &cop->results.source_omap_digest,
8085 &cop->results.reqids,
8086 &cop->results.truncate_seq,
8087 &cop->results.truncate_size,
8088 &cop->rval);
8089 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8090
8091 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8092 get_last_peering_reset(), cop);
8093 gather.set_finisher(new C_OnFinisher(fin,
8094 &osd->objecter_finisher));
8095
8096 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8097 cop->src.snap, NULL,
8098 flags,
8099 gather.new_sub(),
8100 // discover the object version if we don't know it yet
8101 cop->results.user_version ? NULL : &cop->results.user_version);
8102 fin->tid = tid;
8103 cop->objecter_tid = tid;
8104 gather.activate();
8105 }
8106
8107 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8108 {
8109 dout(10) << __func__ << " " << oid << " tid " << tid
8110 << " " << cpp_strerror(r) << dendl;
8111 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8112 if (p == copy_ops.end()) {
8113 dout(10) << __func__ << " no copy_op found" << dendl;
8114 return;
8115 }
8116 CopyOpRef cop = p->second;
8117 if (tid != cop->objecter_tid) {
8118 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8119 << " tid " << cop->objecter_tid << dendl;
8120 return;
8121 }
8122
8123 if (cop->omap_data.length() || cop->omap_header.length())
8124 cop->results.has_omap = true;
8125
8126 if (r >= 0 && !pool.info.supports_omap() &&
8127 (cop->omap_data.length() || cop->omap_header.length())) {
8128 r = -EOPNOTSUPP;
8129 }
8130 cop->objecter_tid = 0;
8131 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
8132 ObjectContextRef& cobc = cop->obc;
8133
8134 if (r < 0)
8135 goto out;
8136
8137 assert(cop->rval >= 0);
8138
8139 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8140 // verify snap hasn't been deleted
8141 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8142 while (p != cop->results.snaps.end()) {
8143 if (pool.info.is_removed_snap(*p)) {
8144 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8145 << dendl;
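// erase *p in place: shift the remaining snaps left one slot
// and shrink the vector by one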
8146 for (vector<snapid_t>::iterator q = p + 1;
8147 q != cop->results.snaps.end();
8148 ++q)
8149 *(q - 1) = *q;
8150 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8151 } else {
8152 ++p;
8153 }
8154 }
8155 if (cop->results.snaps.empty()) {
8156 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8157 r = -ENOENT;
8158 goto out;
8159 }
8160 }
8161
8162 assert(cop->rval >= 0);
8163
8164 if (!cop->temp_cursor.data_complete) {
8165 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8166 }
8167 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8168 if (cop->omap_header.length()) {
8169 cop->results.omap_digest =
8170 cop->omap_header.crc32c(cop->results.omap_digest);
8171 }
8172 if (cop->omap_data.length()) {
8173 bufferlist keys;
8174 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8175 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8176 }
8177 }
8178
8179 if (!cop->temp_cursor.attr_complete) {
8180 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8181 p != cop->attrs.end();
8182 ++p) {
8183 cop->results.attrs[string("_") + p->first] = p->second;
8184 }
8185 cop->attrs.clear();
8186 }
8187
8188 if (!cop->cursor.is_complete()) {
8189 // write out what we have so far
8190 if (cop->temp_cursor.is_initial()) {
8191 assert(!cop->results.started_temp_obj);
8192 cop->results.started_temp_obj = true;
8193 cop->results.temp_oid = generate_temp_object(oid);
8194 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8195 }
8196 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8197 OpContextUPtr ctx = simple_opc_create(tempobc);
8198 if (cop->temp_cursor.is_initial()) {
8199 ctx->new_temp_oid = cop->results.temp_oid;
8200 }
8201 _write_copy_chunk(cop, ctx->op_t.get());
8202 simple_opc_submit(std::move(ctx));
8203 dout(10) << __func__ << " fetching more" << dendl;
8204 _copy_some(cobc, cop);
8205 return;
8206 }
8207
8208 // verify digests?
8209 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8210 dout(20) << __func__ << std::hex
8211 << " got digest: rx data 0x" << cop->results.data_digest
8212 << " omap 0x" << cop->results.omap_digest
8213 << ", source: data 0x" << cop->results.source_data_digest
8214 << " omap 0x" << cop->results.source_omap_digest
8215 << std::dec
8216 << " flags " << cop->results.flags
8217 << dendl;
8218 }
8219 if (cop->results.is_data_digest() &&
8220 cop->results.data_digest != cop->results.source_data_digest) {
8221 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8222 << " != source 0x" << cop->results.source_data_digest << std::dec
8223 << dendl;
8224 osd->clog->error() << info.pgid << " copy from " << cop->src
8225 << " to " << cop->obc->obs.oi.soid << std::hex
8226 << " data digest 0x" << cop->results.data_digest
8227 << " != source 0x" << cop->results.source_data_digest
8228 << std::dec;
8229 r = -EIO;
8230 goto out;
8231 }
8232 if (cop->results.is_omap_digest() &&
8233 cop->results.omap_digest != cop->results.source_omap_digest) {
8234 derr << __func__ << std::hex
8235 << " omap digest 0x" << cop->results.omap_digest
8236 << " != source 0x" << cop->results.source_omap_digest
8237 << std::dec << dendl;
8238 osd->clog->error() << info.pgid << " copy from " << cop->src
8239 << " to " << cop->obc->obs.oi.soid << std::hex
8240 << " omap digest 0x" << cop->results.omap_digest
8241 << " != source 0x" << cop->results.source_omap_digest
8242 << std::dec;
8243 r = -EIO;
8244 goto out;
8245 }
8246 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8247 derr << __func__ << " injecting copyfrom failure" << dendl;
8248 r = -EIO;
8249 goto out;
8250 }
8251
8252 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8253 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8254 ObjectState& obs = cop->obc->obs;
8255 if (cop->temp_cursor.is_initial()) {
8256 dout(20) << "fill_in_final_tx: writing "
8257 << "directly to final object" << dendl;
8258 // write directly to final object
8259 cop->results.temp_oid = obs.oi.soid;
8260 _write_copy_chunk(cop, t);
8261 } else {
8262 // finish writing to temp object, then move into place
8263 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8264 _write_copy_chunk(cop, t);
8265 t->rename(obs.oi.soid, cop->results.temp_oid);
8266 }
8267 t->setattrs(obs.oi.soid, cop->results.attrs);
8268 });
8269
8270 dout(20) << __func__ << " success; committing" << dendl;
8271
8272 out:
8273 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8274 CopyCallbackResults results(r, &cop->results);
8275 cop->cb->complete(results);
8276
8277 copy_ops.erase(cobc->obs.oi.soid);
8278 cobc->stop_block();
8279
8280 if (r < 0 && cop->results.started_temp_obj) {
8281 dout(10) << __func__ << " deleting partial temp object "
8282 << cop->results.temp_oid << dendl;
8283 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8284 OpContextUPtr ctx = simple_opc_create(tempobc);
8285 ctx->op_t->remove(cop->results.temp_oid);
8286 ctx->discard_temp_oid = cop->results.temp_oid;
8287 simple_opc_submit(std::move(ctx));
8288 }
8289
8290 // cancel and requeue proxy ops on this object
8291 if (!r) {
8292 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8293 it != proxyread_ops.end();) {
8294 if (it->second->soid == cobc->obs.oi.soid) {
8295 cancel_proxy_read((it++)->second);
8296 } else {
8297 ++it;
8298 }
8299 }
8300 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8301 it != proxywrite_ops.end();) {
8302 if (it->second->soid == cobc->obs.oi.soid) {
8303 cancel_proxy_write((it++)->second);
8304 } else {
8305 ++it;
8306 }
8307 }
8308 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8309 }
8310
8311 kick_object_context_blocked(cobc);
8312 }
8313
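// Persist the chunk we just fetched into the (temp) object: create
// it on the first chunk, append the data -- trimmed back to the
// required alignment for pools that need aligned appends (e.g.
// erasure-coded pools) -- and apply any omap header/keys.  Finally,
// advance temp_cursor to match cursor.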
8314 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8315 {
8316 dout(20) << __func__ << " " << cop
8317 << " " << cop->attrs.size() << " attrs"
8318 << " " << cop->data.length() << " bytes"
8319 << " " << cop->omap_header.length() << " omap header bytes"
8320 << " " << cop->omap_data.length() << " omap data bytes"
8321 << dendl;
8322 if (!cop->temp_cursor.attr_complete) {
8323 t->create(cop->results.temp_oid);
8324 }
8325 if (!cop->temp_cursor.data_complete) {
8326 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8327 cop->cursor.data_offset);
8328 if (pool.info.requires_aligned_append() &&
8329 !cop->cursor.data_complete) {
8330 /**
8331 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8332 * to pick it up on the next pass.
8333 */
8334 assert(cop->temp_cursor.data_offset %
8335 pool.info.required_alignment() == 0);
8336 if (cop->data.length() % pool.info.required_alignment() != 0) {
8337 uint64_t to_trim =
8338 cop->data.length() % pool.info.required_alignment();
8339 bufferlist bl;
8340 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8341 cop->data.swap(bl);
8342 cop->cursor.data_offset -= to_trim;
8343 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8344 cop->cursor.data_offset);
8345 }
8346 }
8347 if (cop->data.length()) {
8348 t->write(
8349 cop->results.temp_oid,
8350 cop->temp_cursor.data_offset,
8351 cop->data.length(),
8352 cop->data,
8353 cop->dest_obj_fadvise_flags);
8354 }
8355 cop->data.clear();
8356 }
8357 if (pool.info.supports_omap()) {
8358 if (!cop->temp_cursor.omap_complete) {
8359 if (cop->omap_header.length()) {
8360 t->omap_setheader(
8361 cop->results.temp_oid,
8362 cop->omap_header);
8363 cop->omap_header.clear();
8364 }
8365 if (cop->omap_data.length()) {
8366 map<string,bufferlist> omap;
8367 bufferlist::iterator p = cop->omap_data.begin();
8368 ::decode(omap, p);
8369 t->omap_setkeys(cop->results.temp_oid, omap);
8370 cop->omap_data.clear();
8371 }
8372 }
8373 } else {
8374 assert(cop->omap_header.length() == 0);
8375 assert(cop->omap_data.length() == 0);
8376 }
8377 cop->temp_cursor = cop->cursor;
8378 }
8379
8380 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8381 {
8382 OpContext *ctx = cb->ctx;
8383 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8384
8385 ObjectState& obs = ctx->new_obs;
8386 if (obs.exists) {
8387 dout(20) << __func__ << ": exists, removing" << dendl;
8388 ctx->op_t->remove(obs.oi.soid);
8389 } else {
8390 ctx->delta_stats.num_objects++;
8391 obs.exists = true;
8392 }
8393 if (cb->is_temp_obj_used()) {
8394 ctx->discard_temp_oid = cb->results->temp_oid;
8395 }
8396 cb->results->fill_in_final_tx(ctx->op_t.get());
8397
8398 // CopyFromCallback fills this in for us
8399 obs.oi.user_version = ctx->user_at_version;
8400
8401 obs.oi.set_data_digest(cb->results->data_digest);
8402 obs.oi.set_omap_digest(cb->results->omap_digest);
8403
8404 obs.oi.truncate_seq = cb->results->truncate_seq;
8405 obs.oi.truncate_size = cb->results->truncate_size;
8406
8407 ctx->extra_reqids = cb->results->reqids;
8408
8409 // cache: clear whiteout?
8410 if (obs.oi.is_whiteout()) {
8411 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8412 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8413 --ctx->delta_stats.num_whiteouts;
8414 }
8415
8416 if (cb->results->has_omap) {
8417 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8418 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8419 } else {
8420 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8421 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8422 }
8423
8424 interval_set<uint64_t> ch;
8425 if (obs.oi.size > 0)
8426 ch.insert(0, obs.oi.size);
8427 ctx->modified_ranges.union_of(ch);
8428
8429 if (cb->get_data_size() != obs.oi.size) {
8430 ctx->delta_stats.num_bytes -= obs.oi.size;
8431 obs.oi.size = cb->get_data_size();
8432 ctx->delta_stats.num_bytes += obs.oi.size;
8433 }
8434 ctx->delta_stats.num_wr++;
8435 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8436
8437 osd->logger->inc(l_osd_copyfrom);
8438 }
8439
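// Complete a promotion into this (cache) tier.  -ENOENT from the
// base tier becomes a whiteout for a head object; for a clone it
// means the clone was trimmed, so we prune it from the snapset
// instead.  Otherwise the copied object is written locally under a
// PROMOTE log entry.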
8440 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8441 ObjectContextRef obc)
8442 {
8443 const hobject_t& soid = obc->obs.oi.soid;
8444 dout(10) << __func__ << " " << soid << " r=" << r
8445 << " uv" << results->user_version << dendl;
8446
8447 if (r == -ECANCELED) {
8448 return;
8449 }
8450
8451 if (r != -ENOENT && soid.is_snap()) {
8452 if (results->snaps.empty()) {
8453 // we must have read "snap" content from the head object in
8454 // the base pool. use snap_seq to construct what snaps should
8455 // be for this clone (what it was before we evicted the clean
8456 // clone from this pool, and what it will be when we flush and
8457 // the clone eventually happens in the base pool).
8458 SnapSet& snapset = obc->ssc->snapset;
8459 vector<snapid_t>::iterator p = snapset.snaps.begin();
8460 while (p != snapset.snaps.end() && *p > soid.snap)
8461 ++p;
8462 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8463 results->snaps.push_back(*p);
8464 ++p;
8465 }
8466 }
8467
8468 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8469 filter_snapc(results->snaps);
8470
8471 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8472 if (results->snaps.empty()) {
8473 dout(20) << __func__
8474 << " snaps are empty, clone is invalid,"
8475 << " setting r to ENOENT" << dendl;
8476 r = -ENOENT;
8477 }
8478 }
8479
8480 if (r < 0 && results->started_temp_obj) {
8481 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8482 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8483 assert(tempobc);
8484 OpContextUPtr ctx = simple_opc_create(tempobc);
8485 ctx->op_t->remove(results->temp_oid);
8486 simple_opc_submit(std::move(ctx));
8487 results->started_temp_obj = false;
8488 }
8489
8490 if (r == -ENOENT && soid.is_snap()) {
8491 dout(10) << __func__
8492 << ": enoent while trying to promote clone, " << soid
8493 << " must have been trimmed, removing from snapset"
8494 << dendl;
8495 hobject_t head(soid.get_head());
8496 ObjectContextRef obc = get_object_context(head, false);
8497 assert(obc);
8498
8499 OpContextUPtr tctx = simple_opc_create(obc);
8500 tctx->at_version = get_next_version();
8501 filter_snapc(tctx->new_snapset.snaps);
8502 vector<snapid_t> new_clones;
8503 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8504 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8505 i != tctx->new_snapset.clones.end();
8506 ++i) {
8507 if (*i != soid.snap) {
8508 new_clones.push_back(*i);
8509 auto p = tctx->new_snapset.clone_snaps.find(*i);
8510 if (p != tctx->new_snapset.clone_snaps.end()) {
8511 new_clone_snaps[*i] = p->second;
8512 }
8513 }
8514 }
8515 tctx->new_snapset.clones.swap(new_clones);
8516 tctx->new_snapset.clone_overlap.erase(soid.snap);
8517 tctx->new_snapset.clone_size.erase(soid.snap);
8518 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8519
8520 // take RWWRITE lock for duration of our local write. ignore starvation.
8521 if (!tctx->lock_manager.take_write_lock(
8522 head,
8523 obc)) {
8524 assert(0 == "problem!");
8525 }
8526 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8527
8528 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8529
8530 simple_opc_submit(std::move(tctx));
8531 return;
8532 }
8533
8534 bool whiteout = false;
8535 if (r == -ENOENT) {
8536 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8537 dout(10) << __func__ << " whiteout " << soid << dendl;
8538 whiteout = true;
8539 }
8540
8541 if (r < 0 && !whiteout) {
8542 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8543 // pass error to everyone blocked on this object
8544 // FIXME: this is pretty sloppy, but at this point we got
8545 // something unexpected and don't have many other options.
8546 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8547 waiting_for_blocked_object.find(soid);
8548 if (blocked_iter != waiting_for_blocked_object.end()) {
8549 while (!blocked_iter->second.empty()) {
8550 osd->reply_op_error(blocked_iter->second.front(), r);
8551 blocked_iter->second.pop_front();
8552 }
8553 waiting_for_blocked_object.erase(blocked_iter);
8554 }
8555 return;
8556 }
8557
8558 osd->promote_finish(results->object_size);
8559
8560 OpContextUPtr tctx = simple_opc_create(obc);
8561 tctx->at_version = get_next_version();
8562
8563 ++tctx->delta_stats.num_objects;
8564 if (soid.snap < CEPH_NOSNAP)
8565 ++tctx->delta_stats.num_object_clones;
8566 tctx->new_obs.exists = true;
8567
8568 tctx->extra_reqids = results->reqids;
8569
8570 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8571 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8572
8573 if (whiteout) {
8574 // create a whiteout
8575 tctx->op_t->create(soid);
8576 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8577 ++tctx->delta_stats.num_whiteouts;
8578 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8579 osd->logger->inc(l_osd_tier_whiteout);
8580 } else {
8581 if (results->has_omap) {
8582 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8583 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8584 ++tctx->delta_stats.num_objects_omap;
8585 }
8586
8587 results->fill_in_final_tx(tctx->op_t.get());
8588 if (results->started_temp_obj) {
8589 tctx->discard_temp_oid = results->temp_oid;
8590 }
8591 tctx->new_obs.oi.size = results->object_size;
8592 tctx->new_obs.oi.user_version = results->user_version;
8593 // we don't care whether the source object has data or omap digests
8594 if (results->object_size)
8595 tctx->new_obs.oi.set_data_digest(results->data_digest);
8596 if (results->has_omap)
8597 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8598 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8599 tctx->new_obs.oi.truncate_size = results->truncate_size;
8600
8601 if (soid.snap != CEPH_NOSNAP) {
8602 if (legacy_snapset) {
8603 tctx->new_obs.oi.legacy_snaps = results->snaps;
8604 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8605 } else {
8606 // it's already in the snapset
8607 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8608 }
8609 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8610 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8611 results->object_size);
8612 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8613
8614 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8615 } else {
8616 tctx->delta_stats.num_bytes += results->object_size;
8617 }
8618 }
8619
8620 if (results->mirror_snapset) {
8621 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8622 tctx->new_snapset.from_snap_set(
8623 results->snapset,
8624 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8625 }
8626 tctx->new_snapset.head_exists = true;
8627 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8628
8629 // take RWWRITE lock for duration of our local write. ignore starvation.
8630 if (!tctx->lock_manager.take_write_lock(
8631 obc->obs.oi.soid,
8632 obc)) {
8633 assert(0 == "problem!");
8634 }
8635 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8636
8637 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8638
8639 simple_opc_submit(std::move(tctx));
8640
8641 osd->logger->inc(l_osd_tier_promote);
8642
8643 if (agent_state &&
8644 agent_state->is_idle())
8645 agent_choose_mode();
8646 }
8647
8648 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8649 {
8650 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8651 << " from " << cop->src << " " << cop->oloc
8652 << " v" << cop->results.user_version << dendl;
8653
8654 // cancel objecter op, if we can
8655 if (cop->objecter_tid) {
8656 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8657 cop->objecter_tid = 0;
8658 if (cop->objecter_tid2) {
8659 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8660 cop->objecter_tid2 = 0;
8661 }
8662 }
8663
8664 copy_ops.erase(cop->obc->obs.oi.soid);
8665 cop->obc->stop_block();
8666
8667 kick_object_context_blocked(cop->obc);
8668 cop->results.should_requeue = requeue;
8669 CopyCallbackResults result(-ECANCELED, &cop->results);
8670 cop->cb->complete(result);
8671
8672 // There may still be an objecter callback referencing this copy op.
8673 // That callback will not need the obc since it's been canceled, and
8674 // we need the obc reference to go away prior to flush.
8675 cop->obc = ObjectContextRef();
8676 }
8677
8678 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8679 {
8680 dout(10) << __func__ << dendl;
8681 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8682 while (p != copy_ops.end()) {
8683 // requeue this op? can I queue up all of them?
8684 cancel_copy((p++)->second, requeue);
8685 }
8686 }
8687
8688
8689 // ========================================================================
8690 // flush
8691 //
8692 // Flush a dirty object in the cache tier by writing it back to the
8693 // base tier. The sequence looks like:
8694 //
8695 // * send a copy-from operation to the base tier to copy the current
8696 // version of the object
8697 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8698 // * on completion, we check if the object has been modified. if so,
8699 // just reply with -EAGAIN.
8700 // * try to take a write lock so we can clear the dirty flag. if this
8701 // fails, wait and retry
8702 // * start a repop that clears the bit.
8703 //
8704 // If we have to wait, we will retry by coming back through the
8705 // start_flush method. We check if a flush is already in progress
8706 // and, if so, try to finish it by rechecking the version and trying
8707 // to clear the dirty bit.
8708 //
8709 // In order for the cache-flush (a write op) to not block the copy-get
8710 // from reading the object, the client *must* set the SKIPRWLOCKS
8711 // flag.
8712 //
8713 // NOTE: normally writes are strictly ordered for the client, but
8714 // flushes are special in that they can be reordered with respect to
8715 // other writes. In particular, we can't have a flush request block
8716 // an update to the cache pool object!
8717
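// A minimal client-side sketch of the try-flush described above
// (illustrative only, not part of this file; assumes the librados
// C++ API):
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c =
//     librados::Rados::aio_create_completion();
//   ioctx.aio_operate("obj", c, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_SKIPRWLOCKS, // see NOTE above
//                     nullptr);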
8718 struct C_Flush : public Context {
8719 PrimaryLogPGRef pg;
8720 hobject_t oid;
8721 epoch_t last_peering_reset;
8722 ceph_tid_t tid;
8723 utime_t start;
8724 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8725 : pg(p), oid(o), last_peering_reset(lpr),
8726 tid(0), start(ceph_clock_now())
8727 {}
8728 void finish(int r) override {
8729 if (r == -ECANCELED)
8730 return;
8731 pg->lock();
8732 if (last_peering_reset == pg->get_last_peering_reset()) {
8733 pg->finish_flush(oid, tid, r);
8734 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8735 }
8736 pg->unlock();
8737 }
8738 };
8739
8740 int PrimaryLogPG::start_flush(
8741 OpRequestRef op, ObjectContextRef obc,
8742 bool blocking, hobject_t *pmissing,
8743 boost::optional<std::function<void()>> &&on_flush)
8744 {
8745 const object_info_t& oi = obc->obs.oi;
8746 const hobject_t& soid = oi.soid;
8747 dout(10) << __func__ << " " << soid
8748 << " v" << oi.version
8749 << " uv" << oi.user_version
8750 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8751 << dendl;
8752
8753 // get a filtered snapset; we need to drop snaps that have been removed
8754 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8755
8756 // verify there are no older dirty clones that would still need a flush
8757 {
8758 dout(20) << " snapset " << snapset << dendl;
8759 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8760 while (p != snapset.clones.rend() && *p >= soid.snap)
8761 ++p;
8762 if (p != snapset.clones.rend()) {
8763 hobject_t next = soid;
8764 next.snap = *p;
8765 assert(next.snap < soid.snap);
8766 if (pg_log.get_missing().is_missing(next)) {
8767 dout(10) << __func__ << " missing clone is " << next << dendl;
8768 if (pmissing)
8769 *pmissing = next;
8770 return -ENOENT;
8771 }
8772 ObjectContextRef older_obc = get_object_context(next, false);
8773 if (older_obc) {
8774 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8775 << dendl;
8776 if (older_obc->obs.oi.is_dirty()) {
8777 dout(10) << __func__ << " next oldest clone is dirty: "
8778 << older_obc->obs.oi << dendl;
8779 return -EBUSY;
8780 }
8781 } else {
8782 dout(20) << __func__ << " next oldest clone " << next
8783 << " is not present; implicitly clean" << dendl;
8784 }
8785 } else {
8786 dout(20) << __func__ << " no older clones" << dendl;
8787 }
8788 }
8789
8790 if (blocking)
8791 obc->start_block();
8792
8793 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8794 if (p != flush_ops.end()) {
8795 FlushOpRef fop = p->second;
8796 if (fop->op == op) {
8797 // we couldn't take the write lock on a cache-try-flush before;
8798 // now we are trying again for the lock.
8799 return try_flush_mark_clean(fop);
8800 }
8801 if (fop->flushed_version == obc->obs.oi.user_version &&
8802 (fop->blocking || !blocking)) {
8803 // nonblocking can join anything
8804 // blocking can only join a blocking flush
8805 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8806 if (op)
8807 fop->dup_ops.push_back(op);
8808 return -EAGAIN; // clean up this ctx; op will retry later
8809 }
8810
8811 // cancel current flush since it will fail anyway, or because we
8812 // are blocking and the existing flush is nonblocking.
8813 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8814 if (fop->op)
8815 osd->reply_op_error(fop->op, -EBUSY);
8816 while (!fop->dup_ops.empty()) {
8817 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8818 fop->dup_ops.pop_front();
8819 }
8820 cancel_flush(fop, false);
8821 }
8822
8823 /**
8824 * In general, we need to send a delete and a copyfrom.
8825 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8826 * where 4 is marked as clean. To flush 10, we have to:
8827 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8828 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8829 *
8830 * There is a complicating case. Suppose there had been a clone 7
8831 * for snaps [7, 6] which has since been trimmed because those snaps no longer exist.
8832 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8833 * the delete, the snap will be promoted to 5, and the head will become
8834 * a snapdir. When the copy-from goes through, we'll end up with
8835 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8836 *
8837 * Another complication is the case where there is an interval change
8838 * after doing the delete and the flush but before marking the object
8839 * clean. We'll happily delete head and then recreate it at the same
8840 * sequence number, which works out ok.
8841 */
8842
8843 SnapContext snapc, dsnapc;
8844 if (snapset.seq != 0) {
8845 if (soid.snap == CEPH_NOSNAP) {
8846 snapc.seq = snapset.seq;
8847 snapc.snaps = snapset.snaps;
8848 } else {
8849 snapid_t min_included_snap;
8850 if (snapset.is_legacy()) {
8851 min_included_snap = oi.legacy_snaps.back();
8852 } else {
8853 auto p = snapset.clone_snaps.find(soid.snap);
8854 assert(p != snapset.clone_snaps.end());
8855 min_included_snap = p->second.back();
8856 }
8857 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8858 }
8859
8860 snapid_t prev_snapc = 0;
8861 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8862 citer != snapset.clones.rend();
8863 ++citer) {
8864 if (*citer < soid.snap) {
8865 prev_snapc = *citer;
8866 break;
8867 }
8868 }
8869
8870 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8871 }
8872
8873 object_locator_t base_oloc(soid);
8874 base_oloc.pool = pool.info.tier_of;
8875
8876 if (dsnapc.seq < snapc.seq) {
8877 ObjectOperation o;
8878 o.remove();
8879 osd->objecter->mutate(
8880 soid.oid,
8881 base_oloc,
8882 o,
8883 dsnapc,
8884 ceph::real_clock::from_ceph_timespec(oi.mtime),
8885 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8886 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8887 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8888 }
8889
8890 FlushOpRef fop(std::make_shared<FlushOp>());
8891 fop->obc = obc;
8892 fop->flushed_version = oi.user_version;
8893 fop->blocking = blocking;
8894 fop->on_flush = std::move(on_flush);
8895 fop->op = op;
8896
8897 ObjectOperation o;
8898 if (oi.is_whiteout()) {
8899 fop->removal = true;
8900 o.remove();
8901 } else {
8902 object_locator_t oloc(soid);
8903 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8904 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8905 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8906 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8907 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8908 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8909
8910 // hint that the base tier need not cache this data after the flush
8911 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8912 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8913 }
8914 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8915
8916 ceph_tid_t tid = osd->objecter->mutate(
8917 soid.oid, base_oloc, o, snapc,
8918 ceph::real_clock::from_ceph_timespec(oi.mtime),
8919 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8920 new C_OnFinisher(fin,
8921 &osd->objecter_finisher));
8922 /* we're under the pg lock and fin->finish() is grabbing that */
8923 fin->tid = tid;
8924 fop->objecter_tid = tid;
8925
8926 flush_ops[soid] = fop;
8927 info.stats.stats.sum.num_flush++;
8928 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8929 return -EINPROGRESS;
8930 }
8931
8932 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8933 {
8934 dout(10) << __func__ << " " << oid << " tid " << tid
8935 << " " << cpp_strerror(r) << dendl;
8936 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8937 if (p == flush_ops.end()) {
8938 dout(10) << __func__ << " no flush_op found" << dendl;
8939 return;
8940 }
8941 FlushOpRef fop = p->second;
8942 if (tid != fop->objecter_tid) {
8943 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8944 << " tid " << fop->objecter_tid << dendl;
8945 return;
8946 }
8947 ObjectContextRef obc = fop->obc;
8948 fop->objecter_tid = 0;
8949
8950 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8951 if (fop->op)
8952 osd->reply_op_error(fop->op, -EBUSY);
8953 if (fop->blocking) {
8954 obc->stop_block();
8955 kick_object_context_blocked(obc);
8956 }
8957
8958 if (!fop->dup_ops.empty()) {
8959 dout(20) << __func__ << " requeueing dups" << dendl;
8960 requeue_ops(fop->dup_ops);
8961 }
8962 if (fop->on_flush) {
8963 (*(fop->on_flush))();
8964 fop->on_flush = boost::none;
8965 }
8966 flush_ops.erase(oid);
8967 return;
8968 }
8969
8970 r = try_flush_mark_clean(fop);
8971 if (r == -EBUSY && fop->op) {
8972 osd->reply_op_error(fop->op, r);
8973 }
8974 }
8975
8976 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8977 {
8978 ObjectContextRef obc = fop->obc;
8979 const hobject_t& oid = obc->obs.oi.soid;
8980
8981 if (fop->blocking) {
8982 obc->stop_block();
8983 kick_object_context_blocked(obc);
8984 }
8985
8986 if (fop->flushed_version != obc->obs.oi.user_version ||
8987 !obc->obs.exists) {
8988 if (obc->obs.exists)
8989 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8990 << " != current " << obc->obs.oi.user_version
8991 << dendl;
8992 else
8993 dout(10) << __func__ << " object no longer exists" << dendl;
8994
8995 if (!fop->dup_ops.empty()) {
8996 dout(20) << __func__ << " requeueing dups" << dendl;
8997 requeue_ops(fop->dup_ops);
8998 }
8999 if (fop->on_flush) {
9000 (*(fop->on_flush))();
9001 fop->on_flush = boost::none;
9002 }
9003 flush_ops.erase(oid);
9004 if (fop->blocking)
9005 osd->logger->inc(l_osd_tier_flush_fail);
9006 else
9007 osd->logger->inc(l_osd_tier_try_flush_fail);
9008 return -EBUSY;
9009 }
9010
9011 if (!fop->blocking &&
9012 scrubber.write_blocked_by_scrub(oid)) {
9013 if (fop->op) {
9014 dout(10) << __func__ << " blocked by scrub" << dendl;
9015 requeue_op(fop->op);
9016 requeue_ops(fop->dup_ops);
9017 return -EAGAIN; // will retry
9018 } else {
9019 osd->logger->inc(l_osd_tier_try_flush_fail);
9020 cancel_flush(fop, false);
9021 return -ECANCELED;
9022 }
9023 }
9024
9025 // successfully flushed, can we evict this object?
9026 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9027 agent_maybe_evict(obc, true)) {
9028 osd->logger->inc(l_osd_tier_clean);
9029 if (fop->on_flush) {
9030 (*(fop->on_flush))();
9031 fop->on_flush = boost::none;
9032 }
9033 flush_ops.erase(oid);
9034 return 0;
9035 }
9036
9037 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9038 OpContextUPtr ctx = simple_opc_create(fop->obc);
9039
9040 // successfully flushed; try to clear the dirty bit.
9041 // take the write lock explicitly: this ctx was created internally,
9042 // so no lock was acquired for it during normal op processing.
9043 if (ctx->lock_manager.get_lock_type(
9044 ObjectContext::RWState::RWWRITE,
9045 oid,
9046 obc,
9047 fop->op)) {
9048 dout(20) << __func__ << " took write lock" << dendl;
9049 } else if (fop->op) {
9050 dout(10) << __func__ << " waiting on write lock" << dendl;
9051 close_op_ctx(ctx.release());
9052 requeue_op(fop->op);
9053 requeue_ops(fop->dup_ops);
9054 return -EAGAIN; // will retry
9055 } else {
9056 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9057 close_op_ctx(ctx.release());
9058 osd->logger->inc(l_osd_tier_try_flush_fail);
9059 cancel_flush(fop, false);
9060 return -ECANCELED;
9061 }
9062
9063 if (fop->on_flush) {
9064 ctx->register_on_finish(*(fop->on_flush));
9065 fop->on_flush = boost::none;
9066 }
9067
9068 ctx->at_version = get_next_version();
9069
9070 ctx->new_obs = obc->obs;
9071 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9072 --ctx->delta_stats.num_objects_dirty;
9073
9074 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9075
9076 osd->logger->inc(l_osd_tier_clean);
9077
9078 if (!fop->dup_ops.empty() || fop->op) {
9079 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9080 list<OpRequestRef> ls;
9081 if (fop->op)
9082 ls.push_back(fop->op);
9083 ls.splice(ls.end(), fop->dup_ops);
9084 requeue_ops(ls);
9085 }
9086
9087 simple_opc_submit(std::move(ctx));
9088
9089 flush_ops.erase(oid);
9090
9091 if (fop->blocking)
9092 osd->logger->inc(l_osd_tier_flush);
9093 else
9094 osd->logger->inc(l_osd_tier_try_flush);
9095
9096 return -EINPROGRESS;
9097 }
9098
9099 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9100 {
9101 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9102 << fop->objecter_tid << dendl;
9103 if (fop->objecter_tid) {
9104 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9105 fop->objecter_tid = 0;
9106 }
9107 if (fop->blocking) {
9108 fop->obc->stop_block();
9109 kick_object_context_blocked(fop->obc);
9110 }
9111 if (requeue) {
9112 if (fop->op)
9113 requeue_op(fop->op);
9114 requeue_ops(fop->dup_ops);
9115 }
9116 if (fop->on_flush) {
9117 (*(fop->on_flush))();
9118 fop->on_flush = boost::none;
9119 }
9120 flush_ops.erase(fop->obc->obs.oi.soid);
9121 }
9122
9123 void PrimaryLogPG::cancel_flush_ops(bool requeue)
9124 {
9125 dout(10) << __func__ << dendl;
9126 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9127 while (p != flush_ops.end()) {
9128 cancel_flush((p++)->second, requeue);
9129 }
9130 }
9131
9132 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9133 {
9134 if (!pool.info.allow_incomplete_clones())
9135 return true;
9136 if (is_missing_object(coid))
9137 return true;
9138 ObjectContextRef obc = get_object_context(coid, false);
9139 return obc && obc->obs.exists;
9140 }
9141
9142 // ========================================================================
9143 // rep op gather
9144
9145 class C_OSD_RepopApplied : public Context {
9146 PrimaryLogPGRef pg;
9147 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9148 public:
9149 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9150 : pg(pg), repop(repop) {}
9151 void finish(int) override {
9152 pg->repop_all_applied(repop.get());
9153 }
9154 };
9155
9156
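/*
 * A RepGather completes in two phases: "applied" (the transaction is
 * visible/readable on all shards) and "committed" (durable on all
 * shards).  C_OSD_RepopApplied (above) and C_OSD_RepopCommit (below)
 * each set the corresponding flag and re-run eval_repop(), which only
 * marks the repop done once both all_applied and all_committed are set.
 * Backends that apply at commit time set applies_with_commit, in which
 * case repop_all_committed() sets both flags itself.
 */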
9157 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9158 {
9159 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9160 << dendl;
9161 assert(!repop->applies_with_commit);
9162 repop->all_applied = true;
9163 if (!repop->rep_aborted) {
9164 eval_repop(repop);
9165 }
9166 }
9167
9168 class C_OSD_RepopCommit : public Context {
9169 PrimaryLogPGRef pg;
9170 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9171 public:
9172 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9173 : pg(pg), repop(repop) {}
9174 void finish(int) override {
9175 pg->repop_all_committed(repop.get());
9176 }
9177 };
9178
9179 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9180 {
9181 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9182 << dendl;
9183 repop->all_committed = true;
9184 if (repop->applies_with_commit) {
9185 assert(!repop->all_applied);
9186 repop->all_applied = true;
9187 }
9188
9189 if (!repop->rep_aborted) {
9190 if (repop->v != eversion_t()) {
9191 last_update_ondisk = repop->v;
9192 last_complete_ondisk = repop->pg_local_last_complete;
9193 }
9194 eval_repop(repop);
9195 }
9196 }
9197
9198 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9199 {
9200 dout(10) << "op_applied version " << applied_version << dendl;
9201 if (applied_version == eversion_t())
9202 return;
9203 assert(applied_version > last_update_applied);
9204 assert(applied_version <= info.last_update);
9205 last_update_applied = applied_version;
9206 if (is_primary()) {
9207 if (scrubber.active) {
9208 if (last_update_applied >= scrubber.subset_last_update) {
9209 if (ops_blocked_by_scrub()) {
9210 requeue_scrub(true);
9211 } else {
9212 requeue_scrub(false);
9213 }
9214
9215 }
9216 } else {
9217 assert(scrubber.start == scrubber.end);
9218 }
9219 } else {
9220 if (scrubber.active_rep_scrub) {
9221 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9222 scrubber.active_rep_scrub->get_req())->scrub_to) {
9223 osd->enqueue_back(
9224 info.pgid,
9225 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9226 scrubber.active_rep_scrub = OpRequestRef();
9227 }
9228 }
9229 }
9230 }
9231
9232 void PrimaryLogPG::eval_repop(RepGather *repop)
9233 {
9234 const MOSDOp *m = NULL;
9235 if (repop->op)
9236 m = static_cast<const MOSDOp *>(repop->op->get_req());
9237
9238 if (m)
9239 dout(10) << "eval_repop " << *repop
9240 << (repop->rep_done ? " DONE" : "")
9241 << dendl;
9242 else
9243 dout(10) << "eval_repop " << *repop << " (no op)"
9244 << (repop->rep_done ? " DONE" : "")
9245 << dendl;
9246
9247 if (repop->rep_done)
9248 return;
9249
9250 // ondisk?
9251 if (repop->all_committed) {
9252 dout(10) << " commit: " << *repop << dendl;
9253 for (auto p = repop->on_committed.begin();
9254 p != repop->on_committed.end();
9255 repop->on_committed.erase(p++)) {
9256 (*p)();
9257 }
9258 // send dup commits, in order
9259 if (waiting_for_ondisk.count(repop->v)) {
9260 assert(waiting_for_ondisk.begin()->first == repop->v);
9261 for (list<pair<OpRequestRef, version_t> >::iterator i =
9262 waiting_for_ondisk[repop->v].begin();
9263 i != waiting_for_ondisk[repop->v].end();
9264 ++i) {
9265 osd->reply_op_error(i->first, repop->r, repop->v,
9266 i->second);
9267 }
9268 waiting_for_ondisk.erase(repop->v);
9269 }
9270 }
9271
9272 // applied?
9273 if (repop->all_applied) {
9274 if (repop->applies_with_commit) {
9275 assert(repop->on_applied.empty());
9276 }
9277 dout(10) << " applied: " << *repop << " " << dendl;
9278 for (auto p = repop->on_applied.begin();
9279 p != repop->on_applied.end();
9280 repop->on_applied.erase(p++)) {
9281 (*p)();
9282 }
9283 }
9284
9285 // done.
9286 if (repop->all_applied && repop->all_committed) {
9287 repop->rep_done = true;
9288
9289 publish_stats_to_osd();
9290 calc_min_last_complete_ondisk();
9291
9292 dout(10) << " removing " << *repop << dendl;
9293 assert(!repop_queue.empty());
9294 dout(20) << " q front is " << *repop_queue.front() << dendl;
9295 if (repop_queue.front() != repop) {
9296 if (!repop->applies_with_commit) {
9297 dout(0) << " removing " << *repop << dendl;
9298 dout(0) << " q front is " << *repop_queue.front() << dendl;
9299 assert(repop_queue.front() == repop);
9300 }
9301 } else {
9302 RepGather *to_remove = nullptr;
9303 while (!repop_queue.empty() &&
9304 (to_remove = repop_queue.front())->rep_done) {
9305 repop_queue.pop_front();
9306 for (auto p = to_remove->on_success.begin();
9307 p != to_remove->on_success.end();
9308 to_remove->on_success.erase(p++)) {
9309 (*p)();
9310 }
9311 remove_repop(to_remove);
9312 }
9313 }
9314 }
9315 }
9316
9317 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9318 {
9319 FUNCTRACE();
9320 const hobject_t& soid = ctx->obs->oi.soid;
9321 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9322 << " o " << soid
9323 << dendl;
9324
9325 repop->v = ctx->at_version;
9326 if (ctx->at_version > eversion_t()) {
9327 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9328 i != actingbackfill.end();
9329 ++i) {
9330 if (*i == get_primary()) continue;
9331 pg_info_t &pinfo = peer_info[*i];
9332 // keep peer_info up to date
9333 if (pinfo.last_complete == pinfo.last_update)
9334 pinfo.last_complete = ctx->at_version;
9335 pinfo.last_update = ctx->at_version;
9336 }
9337 }
9338
9339 ctx->obc->ondisk_write_lock();
9340
9341 bool unlock_snapset_obc = false;
9342 ctx->op_t->add_obc(ctx->obc);
9343 if (ctx->clone_obc) {
9344 ctx->clone_obc->ondisk_write_lock();
9345 ctx->op_t->add_obc(ctx->clone_obc);
9346 }
9347 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9348 ctx->obc->obs.oi.soid) {
9349 ctx->snapset_obc->ondisk_write_lock();
9350 unlock_snapset_obc = true;
9351 ctx->op_t->add_obc(ctx->snapset_obc);
9352 }
9353
9354 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9355 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9356 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9357 ctx->obc,
9358 ctx->clone_obc,
9359 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9360 if (!(ctx->log.empty())) {
9361 assert(ctx->at_version >= projected_last_update);
9362 projected_last_update = ctx->at_version;
9363 }
9364 for (auto &&entry: ctx->log) {
9365 projected_log.add(entry);
9366 }
9367 pgbackend->submit_transaction(
9368 soid,
9369 ctx->delta_stats,
9370 ctx->at_version,
9371 std::move(ctx->op_t),
9372 pg_trim_to,
9373 min_last_complete_ondisk,
9374 ctx->log,
9375 ctx->updated_hset_history,
9376 onapplied_sync,
9377 on_all_applied,
9378 on_all_commit,
9379 repop->rep_tid,
9380 ctx->reqid,
9381 ctx->op);
9382 }
9383
9384 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9385 OpContext *ctx, ObjectContextRef obc,
9386 ceph_tid_t rep_tid)
9387 {
9388 if (ctx->op)
9389 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9390 else
9391 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9392
9393 RepGather *repop = new RepGather(
9394 ctx, rep_tid, info.last_complete, false);
9395
9396 repop->start = ceph_clock_now();
9397
9398 repop_queue.push_back(&repop->queue_item);
9399 repop->get();
9400
9401 osd->logger->inc(l_osd_op_wip);
9402
9403 dout(10) << __func__ << ": " << *repop << dendl;
9404 return repop;
9405 }
9406
9407 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9408 eversion_t version,
9409 int r,
9410 ObcLockManager &&manager,
9411 OpRequestRef &&op,
9412 boost::optional<std::function<void(void)> > &&on_complete)
9413 {
9414 RepGather *repop = new RepGather(
9415 std::move(manager),
9416 std::move(op),
9417 std::move(on_complete),
9418 osd->get_tid(),
9419 info.last_complete,
9420 true,
9421 r);
9422 repop->v = version;
9423
9424 repop->start = ceph_clock_now();
9425
9426 repop_queue.push_back(&repop->queue_item);
9427
9428 osd->logger->inc(l_osd_op_wip);
9429
9430 dout(10) << __func__ << ": " << *repop << dendl;
9431 return boost::intrusive_ptr<RepGather>(repop);
9432 }
9433
9434 void PrimaryLogPG::remove_repop(RepGather *repop)
9435 {
9436 dout(20) << __func__ << " " << *repop << dendl;
9437
9438 for (auto p = repop->on_finish.begin();
9439 p != repop->on_finish.end();
9440 repop->on_finish.erase(p++)) {
9441 (*p)();
9442 }
9443
9444 release_object_locks(
9445 repop->lock_manager);
9446 repop->put();
9447
9448 osd->logger->dec(l_osd_op_wip);
9449 }
9450
9451 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9452 {
9453 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9454 ceph_tid_t rep_tid = osd->get_tid();
9455 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9456 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9457 ctx->op_t.reset(new PGTransaction());
9458 ctx->mtime = ceph_clock_now();
9459 return ctx;
9460 }
9461
9462 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9463 {
9464 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9465 dout(20) << __func__ << " " << repop << dendl;
9466 issue_repop(repop, ctx.get());
9467 eval_repop(repop);
9468 calc_trim_to();
9469 repop->put();
9470 }
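// Sketch of the internal-update pattern built on simple_opc_create()/
// simple_opc_submit() (illustrative; try_flush_mark_clean() above and
// handle_watch_timeout() below follow this shape):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   ctx->new_obs = obc->obs;                  // copy, then mutate state
//   ctx->log.push_back(pg_log_entry_t(...));  // describe the change
//   simple_opc_submit(std::move(ctx));        // issue_repop + eval_repop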
9471
9472
9473 void PrimaryLogPG::submit_log_entries(
9474 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9475 ObcLockManager &&manager,
9476 boost::optional<std::function<void(void)> > &&_on_complete,
9477 OpRequestRef op,
9478 int r)
9479 {
9480 dout(10) << __func__ << " " << entries << dendl;
9481 assert(is_primary());
9482
9483 eversion_t version;
9484 if (!entries.empty()) {
9485 assert(entries.rbegin()->version >= projected_last_update);
9486 version = projected_last_update = entries.rbegin()->version;
9487 }
9488
9489 boost::intrusive_ptr<RepGather> repop;
9490 boost::optional<std::function<void(void)> > on_complete;
9491 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9492 repop = new_repop(
9493 version,
9494 r,
9495 std::move(manager),
9496 std::move(op),
9497 std::move(_on_complete));
9498 } else {
9499 on_complete = std::move(_on_complete);
9500 }
9501
9502 pgbackend->call_write_ordered(
9503 [this, entries, repop, on_complete]() {
9504 ObjectStore::Transaction t;
9505 eversion_t old_last_update = info.last_update;
9506 merge_new_log_entries(entries, t);
9507
9508
9509 set<pg_shard_t> waiting_on;
9510 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9511 i != actingbackfill.end();
9512 ++i) {
9513 pg_shard_t peer(*i);
9514 if (peer == pg_whoami) continue;
9515 assert(peer_missing.count(peer));
9516 assert(peer_info.count(peer));
9517 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9518 assert(repop);
9519 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9520 entries,
9521 spg_t(info.pgid.pgid, i->shard),
9522 pg_whoami.shard,
9523 get_osdmap()->get_epoch(),
9524 last_peering_reset,
9525 repop->rep_tid);
9526 osd->send_message_osd_cluster(
9527 peer.osd, m, get_osdmap()->get_epoch());
9528 waiting_on.insert(peer);
9529 } else {
9530 MOSDPGLog *m = new MOSDPGLog(
9531 peer.shard, pg_whoami.shard,
9532 info.last_update.epoch,
9533 info);
9534 m->log.log = entries;
9535 m->log.tail = old_last_update;
9536 m->log.head = info.last_update;
9537 osd->send_message_osd_cluster(
9538 peer.osd, m, get_osdmap()->get_epoch());
9539 }
9540 }
9541 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9542 ceph_tid_t rep_tid = repop->rep_tid;
9543 waiting_on.insert(pg_whoami);
9544 log_entry_update_waiting_on.insert(
9545 make_pair(
9546 rep_tid,
9547 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9548 ));
9549 struct OnComplete : public Context {
9550 PrimaryLogPGRef pg;
9551 ceph_tid_t rep_tid;
9552 epoch_t epoch;
9553 OnComplete(
9554 PrimaryLogPGRef pg,
9555 ceph_tid_t rep_tid,
9556 epoch_t epoch)
9557 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9558 void finish(int) override {
9559 pg->lock();
9560 if (!pg->pg_has_reset_since(epoch)) {
9561 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9562 assert(it != pg->log_entry_update_waiting_on.end());
9563 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9564 assert(it2 != it->second.waiting_on.end());
9565 it->second.waiting_on.erase(it2);
9566 if (it->second.waiting_on.empty()) {
9567 pg->repop_all_committed(it->second.repop.get());
9568 pg->log_entry_update_waiting_on.erase(it);
9569 }
9570 }
9571 pg->unlock();
9572 }
9573 };
9574 t.register_on_commit(
9575 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9576 } else {
9577 if (on_complete) {
9578 struct OnComplete : public Context {
9579 PrimaryLogPGRef pg;
9580 std::function<void(void)> on_complete;
9581 epoch_t epoch;
9582 OnComplete(
9583 PrimaryLogPGRef pg,
9584 const std::function<void(void)> &on_complete,
9585 epoch_t epoch)
9586 : pg(pg),
9587 on_complete(std::move(on_complete)),
9588 epoch(epoch) {}
9589 void finish(int) override {
9590 pg->lock();
9591 if (!pg->pg_has_reset_since(epoch))
9592 on_complete();
9593 pg->unlock();
9594 }
9595 };
9596 t.register_on_complete(
9597 new OnComplete{
9598 this, *on_complete, get_osdmap()->get_epoch()
9599 });
9600 }
9601 }
9602 t.register_on_applied(
9603 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9604 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9605 assert(r == 0);
9606 });
9607 }
9608
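/*
 * submit_log_entries() protocol summary: on clusters with
 * require_osd_release >= jewel, each peer receives an
 * MOSDPGUpdateLogMissing, and the repop completes only once every peer
 * plus the local commit have acked (see do_update_log_missing_reply()).
 * On older clusters the entries are simply broadcast as an MOSDPGLog
 * tail/head segment with no per-peer ack tracking.
 */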
9609 void PrimaryLogPG::cancel_log_updates()
9610 {
9611 // get rid of all the LogUpdateCtx so their references to repops are
9612 // dropped
9613 log_entry_update_waiting_on.clear();
9614 }
9615
9616 // -------------------------------------------------------
9617
9618 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9619 {
9620 pair<hobject_t, ObjectContextRef> i;
9621 while (object_contexts.get_next(i.first, &i)) {
9622 ObjectContextRef obc(i.second);
9623 get_obc_watchers(obc, pg_watchers);
9624 }
9625 }
9626
9627 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9628 {
9629 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9630 obc->watchers.begin();
9631 j != obc->watchers.end();
9632 ++j) {
9633 obj_watch_item_t owi;
9634
9635 owi.obj = obc->obs.oi.soid;
9636 owi.wi.addr = j->second->get_peer_addr();
9637 owi.wi.name = j->second->get_entity();
9638 owi.wi.cookie = j->second->get_cookie();
9639 owi.wi.timeout_seconds = j->second->get_timeout();
9640
9641 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9642 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9643
9644 pg_watchers.push_back(owi);
9645 }
9646 }
9647
9648 void PrimaryLogPG::check_blacklisted_watchers()
9649 {
9650 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9651 pair<hobject_t, ObjectContextRef> i;
9652 while (object_contexts.get_next(i.first, &i))
9653 check_blacklisted_obc_watchers(i.second);
9654 }
9655
9656 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9657 {
9658 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9659 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9660 obc->watchers.begin();
9661 k != obc->watchers.end();
9662 ) {
9663 // Advance the iterator now so handle_watch_timeout() can erase the element
9664 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9665 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9666 entity_addr_t ea = j->second->get_peer_addr();
9667 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9668 if (get_osdmap()->is_blacklisted(ea)) {
9669 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9670 assert(j->second->get_pg() == this);
9671 j->second->unregister_cb();
9672 handle_watch_timeout(j->second);
9673 }
9674 }
9675 }
9676
9677 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9678 {
9679 assert(is_active());
9680 assert((recovering.count(obc->obs.oi.soid) ||
9681 !is_missing_object(obc->obs.oi.soid)) ||
9682 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9683 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9684 pg_log_entry_t::LOST_REVERT &&
9685 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9686 obc->obs.oi.version));
9687
9688 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9689 assert(obc->watchers.empty());
9690 // populate unconnected_watchers
9691 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9692 obc->obs.oi.watchers.begin();
9693 p != obc->obs.oi.watchers.end();
9694 ++p) {
9695 utime_t expire = info.stats.last_became_active;
9696 expire += p->second.timeout_seconds;
9697 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9698 WatchRef watch(
9699 Watch::makeWatchRef(
9700 this, osd, obc, p->second.timeout_seconds, p->first.first,
9701 p->first.second, p->second.addr));
9702 watch->disconnect();
9703 obc->watchers.insert(
9704 make_pair(
9705 make_pair(p->first.first, p->first.second),
9706 watch));
9707 }
9708 // Look for watchers from blacklisted clients and drop them
9709 check_blacklisted_obc_watchers(obc);
9710 }
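// An unconnected watcher registered above expires at
//   info.stats.last_became_active + watch_info_t::timeout_seconds;
// if the client has not reconnected by then, handle_watch_timeout()
// below removes the watch via an internally generated MODIFY op.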
9711
9712 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9713 {
9714 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9715 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9716
9717 if (!is_active()) {
9718 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9719 return;
9720 }
9721 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9722 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9723 watch->get_delayed_cb()
9724 );
9725 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9726 << obc->obs.oi.soid
9727 << dendl;
9728 return;
9729 }
9730
9731 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9732 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9733 << obc->obs.oi.soid
9734 << dendl;
9735 scrubber.add_callback(
9736 watch->get_delayed_cb() // re-runs handle_watch_timeout() once scrub completes
9737 );
9738 return;
9739 }
9740
9741 OpContextUPtr ctx = simple_opc_create(obc);
9742 ctx->at_version = get_next_version();
9743
9744 object_info_t& oi = ctx->new_obs.oi;
9745 oi.watchers.erase(make_pair(watch->get_cookie(),
9746 watch->get_entity()));
9747
9748 list<watch_disconnect_t> watch_disconnects = {
9749 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9750 };
9751 ctx->register_on_success(
9752 [this, obc, watch_disconnects]() {
9753 complete_disconnect_watches(obc, watch_disconnects);
9754 });
9755
9756
9757 PGTransaction *t = ctx->op_t.get();
9758 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9759 ctx->at_version,
9760 oi.version,
9761 0,
9762 osd_reqid_t(), ctx->mtime, 0));
9763
9764 oi.prior_version = obc->obs.oi.version;
9765 oi.version = ctx->at_version;
9766 bufferlist bl;
9767 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9768 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9769
9770 // apply new object state.
9771 ctx->obc->obs = ctx->new_obs;
9772
9773 // no ctx->delta_stats
9774 simple_opc_submit(std::move(ctx));
9775 }
9776
9777 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9778 SnapSetContext *ssc)
9779 {
9780 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9781 assert(obc->destructor_callback == NULL);
9782 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9783 obc->obs.oi = oi;
9784 obc->obs.exists = false;
9785 obc->ssc = ssc;
9786 if (ssc)
9787 register_snapset_context(ssc);
9788 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9789 if (is_active())
9790 populate_obc_watchers(obc);
9791 return obc;
9792 }
9793
9794 ObjectContextRef PrimaryLogPG::get_object_context(
9795 const hobject_t& soid,
9796 bool can_create,
9797 const map<string, bufferlist> *attrs)
9798 {
9799 assert(
9800 attrs || !pg_log.get_missing().is_missing(soid) ||
9801 // or this is a revert... see recover_primary()
9802 (pg_log.get_log().objects.count(soid) &&
9803 pg_log.get_log().objects.find(soid)->second->op ==
9804 pg_log_entry_t::LOST_REVERT));
9805 ObjectContextRef obc = object_contexts.lookup(soid);
9806 osd->logger->inc(l_osd_object_ctx_cache_total);
9807 if (obc) {
9808 osd->logger->inc(l_osd_object_ctx_cache_hit);
9809 dout(10) << __func__ << ": found obc in cache: " << obc
9810 << dendl;
9811 } else {
9812 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9813 // check disk
9814 bufferlist bv;
9815 if (attrs) {
9816 assert(attrs->count(OI_ATTR));
9817 bv = attrs->find(OI_ATTR)->second;
9818 } else {
9819 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9820 if (r < 0) {
9821 if (!can_create) {
9822 dout(10) << __func__ << ": no obc for soid "
9823 << soid << " and !can_create"
9824 << dendl;
9825 return ObjectContextRef(); // -ENOENT!
9826 }
9827
9828 dout(10) << __func__ << ": no obc for soid "
9829 << soid << " but can_create"
9830 << dendl;
9831 // new object.
9832 object_info_t oi(soid);
9833 SnapSetContext *ssc = get_snapset_context(
9834 soid, true, 0, false);
9835 assert(ssc);
9836 obc = create_object_context(oi, ssc);
9837 dout(10) << __func__ << ": " << obc << " " << soid
9838 << " " << obc->rwstate
9839 << " oi: " << obc->obs.oi
9840 << " ssc: " << obc->ssc
9841 << " snapset: " << obc->ssc->snapset << dendl;
9842 return obc;
9843 }
9844 }
9845
9846 object_info_t oi;
9847 try {
9848 bufferlist::iterator bliter = bv.begin();
9849 ::decode(oi, bliter);
9850 } catch (...) {
9851 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9852 return ObjectContextRef(); // -ENOENT!
9853 }
9854
9855 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9856
9857 obc = object_contexts.lookup_or_create(oi.soid);
9858 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9859 obc->obs.oi = oi;
9860 obc->obs.exists = true;
9861
9862 obc->ssc = get_snapset_context(
9863 soid, true,
9864 soid.has_snapset() ? attrs : 0);
9865
9866 if (is_active())
9867 populate_obc_watchers(obc);
9868
9869 if (pool.info.require_rollback()) {
9870 if (attrs) {
9871 obc->attr_cache = *attrs;
9872 } else {
9873 int r = pgbackend->objects_get_attrs(
9874 soid,
9875 &obc->attr_cache);
9876 assert(r == 0);
9877 }
9878 }
9879
9880 dout(10) << __func__ << ": creating obc from disk: " << obc
9881 << dendl;
9882 }
9883
9884 // XXX: Caller doesn't expect this
9885 if (obc->ssc == NULL) {
9886 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9887 return ObjectContextRef(); // -ENOENT!
9888 }
9889
9890 dout(10) << __func__ << ": " << obc << " " << soid
9891 << " " << obc->rwstate
9892 << " oi: " << obc->obs.oi
9893 << " exists: " << (int)obc->obs.exists
9894 << " ssc: " << obc->ssc
9895 << " snapset: " << obc->ssc->snapset << dendl;
9896 return obc;
9897 }
9898
9899 void PrimaryLogPG::context_registry_on_change()
9900 {
9901 pair<hobject_t, ObjectContextRef> i;
9902 while (object_contexts.get_next(i.first, &i)) {
9903 ObjectContextRef obc(i.second);
9904 if (obc) {
9905 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9906 obc->watchers.begin();
9907 j != obc->watchers.end();
9908 obc->watchers.erase(j++)) {
9909 j->second->discard();
9910 }
9911 }
9912 }
9913 }
9914
9915
9916 /*
9917 * If we return an error, and set *pmissing, then promoting that
9918 * object may help.
9919 *
9920 * If we return -EAGAIN, we will always set *pmissing to the missing
9921 * object to wait for.
9922 *
9923 * If we return an error but do not set *pmissing, then we know the
9924 * object does not exist.
9925 */
9926 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9927 ObjectContextRef *pobc,
9928 bool can_create,
9929 bool map_snapid_to_clone,
9930 hobject_t *pmissing)
9931 {
9932 FUNCTRACE();
9933 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9934 // want the head?
9935 if (oid.snap == CEPH_NOSNAP) {
9936 ObjectContextRef obc = get_object_context(oid, can_create);
9937 if (!obc) {
9938 if (pmissing)
9939 *pmissing = oid;
9940 return -ENOENT;
9941 }
9942 dout(10) << "find_object_context " << oid
9943 << " @" << oid.snap
9944 << " oi=" << obc->obs.oi
9945 << dendl;
9946 *pobc = obc;
9947
9948 return 0;
9949 }
9950
9951 hobject_t head = oid.get_head();
9952
9953 // want the snapdir?
9954 if (oid.snap == CEPH_SNAPDIR) {
9955 // return head or snapdir, whichever exists.
9956 ObjectContextRef headobc = get_object_context(head, can_create);
9957 ObjectContextRef obc = headobc;
9958 if (!obc || !obc->obs.exists)
9959 obc = get_object_context(oid, can_create);
9960 if (!obc || !obc->obs.exists) {
9961 // if we have neither, we would want to promote the head.
9962 if (pmissing)
9963 *pmissing = head;
9964 if (pobc)
9965 *pobc = headobc; // may be null
9966 return -ENOENT;
9967 }
9968 dout(10) << "find_object_context " << oid
9969 << " @" << oid.snap
9970 << " oi=" << obc->obs.oi
9971 << dendl;
9972 *pobc = obc;
9973
9974 // always populate ssc for SNAPDIR...
9975 if (!obc->ssc)
9976 obc->ssc = get_snapset_context(
9977 oid, true);
9978 return 0;
9979 }
9980
9981 // we want a snap
9982 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9983 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9984 return -ENOENT;
9985 }
9986
9987 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9988 if (!ssc || !(ssc->exists || can_create)) {
9989 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9990 if (pmissing)
9991 *pmissing = head; // start by getting the head
9992 if (ssc)
9993 put_snapset_context(ssc);
9994 return -ENOENT;
9995 }
9996
9997 if (map_snapid_to_clone) {
9998 dout(10) << "find_object_context " << oid << " @" << oid.snap
9999 << " snapset " << ssc->snapset
10000 << " map_snapid_to_clone=true" << dendl;
10001 if (oid.snap > ssc->snapset.seq) {
10002 // the head must already be readable
10003 ObjectContextRef obc = get_object_context(head, false);
10004 dout(10) << "find_object_context " << oid << " @" << oid.snap
10005 << " snapset " << ssc->snapset
10006 << " maps to head" << dendl;
10007 *pobc = obc;
10008 put_snapset_context(ssc);
10009 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10010 } else {
10011 vector<snapid_t>::const_iterator citer = std::find(
10012 ssc->snapset.clones.begin(),
10013 ssc->snapset.clones.end(),
10014 oid.snap);
10015 if (citer == ssc->snapset.clones.end()) {
10016 dout(10) << "find_object_context " << oid << " @" << oid.snap
10017 << " snapset " << ssc->snapset
10018 << " maps to nothing" << dendl;
10019 put_snapset_context(ssc);
10020 return -ENOENT;
10021 }
10022
10023 dout(10) << "find_object_context " << oid << " @" << oid.snap
10024 << " snapset " << ssc->snapset
10025 << " maps to " << oid << dendl;
10026
10027 if (pg_log.get_missing().is_missing(oid)) {
10028 dout(10) << "find_object_context " << oid << " @" << oid.snap
10029 << " snapset " << ssc->snapset
10030 << " " << oid << " is missing" << dendl;
10031 if (pmissing)
10032 *pmissing = oid;
10033 put_snapset_context(ssc);
10034 return -EAGAIN;
10035 }
10036
10037 ObjectContextRef obc = get_object_context(oid, false);
10038 if (!obc || !obc->obs.exists) {
10039 dout(10) << "find_object_context " << oid << " @" << oid.snap
10040 << " snapset " << ssc->snapset
10041 << " " << oid << " is not present" << dendl;
10042 if (pmissing)
10043 *pmissing = oid;
10044 put_snapset_context(ssc);
10045 return -ENOENT;
10046 }
10047 dout(10) << "find_object_context " << oid << " @" << oid.snap
10048 << " snapset " << ssc->snapset
10049 << " " << oid << " HIT" << dendl;
10050 *pobc = obc;
10051 put_snapset_context(ssc);
10052 return 0;
10053 }
10054 ceph_abort(); //unreachable
10055 }
10056
10057 dout(10) << "find_object_context " << oid << " @" << oid.snap
10058 << " snapset " << ssc->snapset << dendl;
10059
10060 // head?
10061 if (oid.snap > ssc->snapset.seq) {
10062 if (ssc->snapset.head_exists) {
10063 ObjectContextRef obc = get_object_context(head, false);
10064 dout(10) << "find_object_context " << head
10065 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10066 << " -- HIT " << obc->obs
10067 << dendl;
10068 if (!obc->ssc)
10069 obc->ssc = ssc;
10070 else {
10071 assert(ssc == obc->ssc);
10072 put_snapset_context(ssc);
10073 }
10074 *pobc = obc;
10075 return 0;
10076 }
10077 dout(10) << "find_object_context " << head
10078 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10079 << " but head dne -- DNE"
10080 << dendl;
10081 put_snapset_context(ssc);
10082 return -ENOENT;
10083 }
10084
10085 // which clone would it be?
10086 unsigned k = 0;
10087 while (k < ssc->snapset.clones.size() &&
10088 ssc->snapset.clones[k] < oid.snap)
10089 k++;
10090 if (k == ssc->snapset.clones.size()) {
10091 dout(10) << "find_object_context no clones with last >= oid.snap "
10092 << oid.snap << " -- DNE" << dendl;
10093 put_snapset_context(ssc);
10094 return -ENOENT;
10095 }
10096 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10097 info.pgid.pool(), oid.get_namespace());
10098
10099 if (pg_log.get_missing().is_missing(soid)) {
10100 dout(20) << "find_object_context " << soid << " missing, try again later"
10101 << dendl;
10102 if (pmissing)
10103 *pmissing = soid;
10104 put_snapset_context(ssc);
10105 return -EAGAIN;
10106 }
10107
10108 ObjectContextRef obc = get_object_context(soid, false);
10109 if (!obc || !obc->obs.exists) {
10110 if (pmissing)
10111 *pmissing = soid;
10112 put_snapset_context(ssc);
10113 if (is_degraded_or_backfilling_object(soid)) {
10114 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10115 return -EAGAIN;
10116 } else {
10117 dout(20) << __func__ << " missing clone " << soid << dendl;
10118 return -ENOENT;
10119 }
10120 }
10121
10122 if (!obc->ssc) {
10123 obc->ssc = ssc;
10124 } else {
10125 assert(obc->ssc == ssc);
10126 put_snapset_context(ssc);
10127 }
10128 ssc = 0;
10129
10130 // clone
10131 dout(20) << "find_object_context " << soid
10132 << " snapset " << obc->ssc->snapset
10133 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10134 << dendl;
10135 snapid_t first, last;
10136 if (obc->ssc->snapset.is_legacy()) {
10137 first = obc->obs.oi.legacy_snaps.back();
10138 last = obc->obs.oi.legacy_snaps.front();
10139 } else {
10140 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10141 assert(p != obc->ssc->snapset.clone_snaps.end());
10142 first = p->second.back();
10143 last = p->second.front();
10144 }
10145 if (first <= oid.snap) {
10146 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10147 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10148 *pobc = obc;
10149 return 0;
10150 } else {
10151 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10152 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10153 return -ENOENT;
10154 }
10155 }
10156
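// Caller-side sketch (illustrative) of the contract documented above
// find_object_context():
//
//   hobject_t missing;
//   ObjectContextRef obc;
//   int r = find_object_context(oid, &obc, false, false, &missing);
//   if (r == -EAGAIN) {
//     // missing is always set: wait for it to recover, then retry
//   } else if (r < 0 && missing != hobject_t()) {
//     // promoting 'missing' may help
//   } else if (r < 0) {
//     // the object definitely does not exist
//   }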
10157 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10158 {
10159 if (obc->ssc)
10160 put_snapset_context(obc->ssc);
10161 }
10162
10163 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10164 {
10165 object_info_t& oi = obc->obs.oi;
10166
10167 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10168 object_stat_sum_t stat;
10169
10170 stat.num_bytes += oi.size;
10171
10172 if (oi.soid.snap != CEPH_SNAPDIR)
10173 stat.num_objects++;
10174 if (oi.is_dirty())
10175 stat.num_objects_dirty++;
10176 if (oi.is_whiteout())
10177 stat.num_whiteouts++;
10178 if (oi.is_omap())
10179 stat.num_objects_omap++;
10180 if (oi.is_cache_pinned())
10181 stat.num_objects_pinned++;
10182
10183 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10184 stat.num_object_clones++;
10185
10186 if (!obc->ssc)
10187 obc->ssc = get_snapset_context(oi.soid, false);
10188 assert(obc->ssc);
10189
10190 // subtract off clone overlap
10191 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10192 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10193 for (interval_set<uint64_t>::const_iterator r = o.begin();
10194 r != o.end();
10195 ++r) {
10196 stat.num_bytes -= r.get_len();
10197 }
10198 }
10199 }
10200
10201 // add it in
10202 pgstat->stats.sum.add(stat);
10203 }
10204
10205 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10206 {
10207 const hobject_t& soid = obc->obs.oi.soid;
10208 if (obc->is_blocked()) {
10209 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10210 return;
10211 }
10212
10213 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10214 if (p != waiting_for_blocked_object.end()) {
10215 list<OpRequestRef>& ls = p->second;
10216 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10217 requeue_ops(ls);
10218 waiting_for_blocked_object.erase(p);
10219 }
10220
10221 map<hobject_t, ObjectContextRef>::iterator i =
10222 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10223 if (i != objects_blocked_on_snap_promotion.end()) {
10224 assert(i->second == obc);
10225 objects_blocked_on_snap_promotion.erase(i);
10226 }
10227
10228 if (obc->requeue_scrub_on_unblock) {
10229 obc->requeue_scrub_on_unblock = false;
10230 requeue_scrub();
10231 }
10232 }
10233
10234 SnapSetContext *PrimaryLogPG::get_snapset_context(
10235 const hobject_t& oid,
10236 bool can_create,
10237 const map<string, bufferlist> *attrs,
10238 bool oid_existed)
10239 {
10240 Mutex::Locker l(snapset_contexts_lock);
10241 SnapSetContext *ssc;
10242 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10243 oid.get_snapdir());
10244 if (p != snapset_contexts.end()) {
10245 if (can_create || p->second->exists) {
10246 ssc = p->second;
10247 } else {
10248 return NULL;
10249 }
10250 } else {
10251 bufferlist bv;
10252 if (!attrs) {
10253 int r = -ENOENT;
10254 if (!(oid.is_head() && !oid_existed))
10255 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10256 if (r < 0) {
10257 // no SS_ATTR on the head; try the snapdir object
10258 if (!(oid.is_snapdir() && !oid_existed))
10259 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10260 if (r < 0 && !can_create)
10261 return NULL;
10262 }
10263 } else {
10264 assert(attrs->count(SS_ATTR));
10265 bv = attrs->find(SS_ATTR)->second;
10266 }
10267 ssc = new SnapSetContext(oid.get_snapdir());
10268 _register_snapset_context(ssc);
10269 if (bv.length()) {
10270 bufferlist::iterator bvp = bv.begin();
10271 try {
10272 ssc->snapset.decode(bvp);
10273 } catch (buffer::error& e) {
10274 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10275 return NULL;
10276 }
10277 ssc->exists = true;
10278 } else {
10279 ssc->exists = false;
10280 }
10281 }
10282 assert(ssc);
10283 ssc->ref++;
10284 return ssc;
10285 }
10286
10287 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10288 {
10289 Mutex::Locker l(snapset_contexts_lock);
10290 --ssc->ref;
10291 if (ssc->ref == 0) {
10292 if (ssc->registered)
10293 snapset_contexts.erase(ssc->oid);
10294 delete ssc;
10295 }
10296 }
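// SnapSetContext is manually refcounted: every get_snapset_context()
// must be balanced by a put_snapset_context(), unless the reference is
// handed over to an ObjectContext (obc->ssc), in which case
// object_context_destructor_callback() releases it when the obc dies.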
10297
10298 /** pull - request object from a peer
10299 */
10300
10301 /*
10302 * Return values:
10303 * NONE - didn't pull anything
10304 * YES - pulled what the caller wanted
10305 * OTHER - needed to pull something else first (_head or _snapdir)
10306 */
10307 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10308
10309 int PrimaryLogPG::recover_missing(
10310 const hobject_t &soid, eversion_t v,
10311 int priority,
10312 PGBackend::RecoveryHandle *h)
10313 {
10314 if (missing_loc.is_unfound(soid)) {
10315 dout(7) << "pull " << soid
10316 << " v " << v
10317 << " but it is unfound" << dendl;
10318 return PULL_NONE;
10319 }
10320
10321 if (missing_loc.is_deleted(soid)) {
10322 start_recovery_op(soid);
10323 assert(!recovering.count(soid));
10324 recovering.insert(make_pair(soid, ObjectContextRef()));
10325 epoch_t cur_epoch = get_osdmap()->get_epoch();
10326 remove_missing_object(soid, v, new FunctionContext(
10327 [=](int) {
10328 lock();
10329 if (!pg_has_reset_since(cur_epoch)) {
10330 bool object_missing = false;
10331 for (const auto& shard : actingbackfill) {
10332 if (shard == pg_whoami)
10333 continue;
10334 if (peer_missing[shard].is_missing(soid)) {
10335 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10336 object_missing = true;
10337 break;
10338 }
10339 }
10340 if (!object_missing) {
10341 object_stat_sum_t stat_diff;
10342 stat_diff.num_objects_recovered = 1;
10343 on_global_recover(soid, stat_diff, true);
10344 } else {
10345 auto recovery_handle = pgbackend->open_recovery_op();
10346 pgbackend->recover_delete_object(soid, v, recovery_handle);
10347 pgbackend->run_recovery_op(recovery_handle, priority);
10348 }
10349 }
10350 unlock();
10351 }));
10352 return PULL_YES;
10353 }
10354
10355 // is this a snapped object? if so, consult the snapset; we may not need the entire object!
10356 ObjectContextRef obc;
10357 ObjectContextRef head_obc;
10358 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10359 // do we have the head and/or snapdir?
10360 hobject_t head = soid.get_head();
10361 if (pg_log.get_missing().is_missing(head)) {
10362 if (recovering.count(head)) {
10363 dout(10) << " missing but already recovering head " << head << dendl;
10364 return PULL_NONE;
10365 } else {
10366 int r = recover_missing(
10367 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10368 h);
10369 if (r != PULL_NONE)
10370 return PULL_OTHER;
10371 return PULL_NONE;
10372 }
10373 }
10374 head = soid.get_snapdir();
10375 if (pg_log.get_missing().is_missing(head)) {
10376 if (recovering.count(head)) {
10377 dout(10) << " missing but already recovering snapdir " << head << dendl;
10378 return PULL_NONE;
10379 } else {
10380 int r = recover_missing(
10381 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10382 h);
10383 if (r != PULL_NONE)
10384 return PULL_OTHER;
10385 return PULL_NONE;
10386 }
10387 }
10388
10389 // we must have one or the other
10390 head_obc = get_object_context(
10391 soid.get_head(),
10392 false,
10393 0);
10394 if (!head_obc)
10395 head_obc = get_object_context(
10396 soid.get_snapdir(),
10397 false,
10398 0);
10399 assert(head_obc);
10400 }
10401 start_recovery_op(soid);
10402 assert(!recovering.count(soid));
10403 recovering.insert(make_pair(soid, obc));
10404 int r = pgbackend->recover_object(
10405 soid,
10406 v,
10407 head_obc,
10408 obc,
10409 h);
10410 // This is only a pull, which shouldn't return an error
10411 assert(r >= 0);
10412 return PULL_YES;
10413 }
10414
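// Caller-side sketch (illustrative) of the PULL_* contract above:
//
//   switch (recover_missing(soid, need, priority, h)) {
//   case PULL_YES:   ++started; break;  // a pull for soid itself was queued
//   case PULL_OTHER: ++started; break;  // pulled head/snapdir first; soid later
//   case PULL_NONE:  break;             // unfound, or already recovering
//   }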
10415 void PrimaryLogPG::send_remove_op(
10416 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10417 {
10418 ceph_tid_t tid = osd->get_tid();
10419 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10420
10421 dout(10) << "send_remove_op " << oid << " from osd." << peer
10422 << " tid " << tid << dendl;
10423
10424 MOSDSubOp *subop = new MOSDSubOp(
10425 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10426 oid, CEPH_OSD_FLAG_ACK,
10427 get_osdmap()->get_epoch(), tid, v);
10428 subop->ops = vector<OSDOp>(1);
10429 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10430
10431 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10432 }
10433
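/*
 * Remove a missing (deleted) object locally.  Two chained transactions
 * are used: the first removes the object and its snap mapping; its
 * completion callback re-takes the PG lock and, if the PG has not been
 * reset since, queues a second transaction that records the local
 * recovery and finally fires on_complete.  If the PG has reset,
 * on_complete fires with -EAGAIN instead.
 */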
10434 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10435 eversion_t v, Context *on_complete)
10436 {
10437 dout(20) << __func__ << " " << soid << " " << v << dendl;
10438 assert(on_complete != nullptr);
10439 // delete locally
10440 ObjectStore::Transaction t;
10441 remove_snap_mapped_object(t, soid);
10442
10443 ObjectRecoveryInfo recovery_info;
10444 recovery_info.soid = soid;
10445 recovery_info.version = v;
10446
10447 epoch_t cur_epoch = get_osdmap()->get_epoch();
10448 t.register_on_complete(new FunctionContext(
10449 [=](int) {
10450 lock();
10451 if (!pg_has_reset_since(cur_epoch)) {
10452 ObjectStore::Transaction t2;
10453 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10454 t2.register_on_complete(on_complete);
10455 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10456 assert(r == 0);
10457 unlock();
10458 } else {
10459 unlock();
10460 on_complete->complete(-EAGAIN);
10461 }
10462 }));
10463 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10464 assert(r == 0);
10465 }
10466
10467 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10468 {
10469 dout(10) << "finish_degraded_object " << oid << dendl;
10470 if (callbacks_for_degraded_object.count(oid)) {
10471 list<Context*> contexts;
10472 contexts.swap(callbacks_for_degraded_object[oid]);
10473 callbacks_for_degraded_object.erase(oid);
10474 for (list<Context*>::iterator i = contexts.begin();
10475 i != contexts.end();
10476 ++i) {
10477 (*i)->complete(0);
10478 }
10479 }
10480 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10481 oid.get_head());
10482 if (i != objects_blocked_on_degraded_snap.end() &&
10483 i->second == oid.snap)
10484 objects_blocked_on_degraded_snap.erase(i);
10485 }
10486
10487 void PrimaryLogPG::_committed_pushed_object(
10488 epoch_t epoch, eversion_t last_complete)
10489 {
10490 lock();
10491 if (!pg_has_reset_since(epoch)) {
10492 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10493 last_complete_ondisk = last_complete;
10494
10495 if (last_complete_ondisk == info.last_update) {
10496 if (!is_primary()) {
10497 // We are either a replica or a backfill target,
10498 // and we are fully up to date: tell the primary!
10499 osd->send_message_osd_cluster(
10500 get_primary().osd,
10501 new MOSDPGTrim(
10502 get_osdmap()->get_epoch(),
10503 spg_t(info.pgid.pgid, get_primary().shard),
10504 last_complete_ondisk),
10505 get_osdmap()->get_epoch());
10506 } else {
10507 calc_min_last_complete_ondisk();
10508 }
10509 }
10510
10511 } else {
10512 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10513 }
10514
10515 unlock();
10516 }
10517
10518 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10519 {
10520 lock();
10521 dout(20) << __func__ << dendl;
10522 if (obc) {
10523 dout(20) << "obc = " << *obc << dendl;
10524 }
10525 assert(active_pushes >= 1);
10526 --active_pushes;
10527
10528 // requeue an active chunky scrub waiting on recovery ops
10529 if (!deleting && active_pushes == 0
10530 && scrubber.is_chunky_scrub_active()) {
10531 if (ops_blocked_by_scrub()) {
10532 requeue_scrub(true);
10533 } else {
10534 requeue_scrub(false);
10535 }
10536 }
10537 unlock();
10538 }
10539
10540 void PrimaryLogPG::_applied_recovered_object_replica()
10541 {
10542 lock();
10543 dout(20) << __func__ << dendl;
10544 assert(active_pushes >= 1);
10545 --active_pushes;
10546
10547 // requeue an active chunky scrub waiting on recovery ops
10548 if (!deleting && active_pushes == 0 &&
10549 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10550 scrubber.active_rep_scrub->get_req())->chunky) {
10551 osd->enqueue_back(
10552 info.pgid,
10553 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10554 scrubber.active_rep_scrub = OpRequestRef();
10555 }
10556 unlock();
10557 }
10558
10559 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10560 {
10561 dout(10) << "got missing " << oid << " v " << v << dendl;
10562 pg_log.recover_got(oid, v, info);
10563 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10564 dout(10) << "last_complete now " << info.last_complete
10565 << " log.complete_to " << pg_log.get_log().complete_to->version
10566 << dendl;
10567 } else {
10568 dout(10) << "last_complete now " << info.last_complete
10569 << " log.complete_to at end" << dendl;
10570 // the assert below does not hold in the repair case:
10571 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10572 assert(info.last_complete == info.last_update);
10573 }
10574 }
10575
10576 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10577 {
10578 list<pg_shard_t> fl = { pg_whoami };
10579 failed_push(fl, soid);
10580 }
10581
10582 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10583 {
10584 dout(20) << __func__ << ": " << soid << dendl;
10585 assert(recovering.count(soid));
10586 auto obc = recovering[soid];
10587 if (obc) {
10588 list<OpRequestRef> blocked_ops;
10589 obc->drop_recovery_read(&blocked_ops);
10590 requeue_ops(blocked_ops);
10591 }
10592 recovering.erase(soid);
10593 for (auto&& i : from)
10594 missing_loc.remove_location(soid, i);
10595 dout(0) << __func__ << " " << soid << " from shard " << from
10596 << ", reps on " << missing_loc.get_locations(soid)
10597 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10598 finish_recovery_op(soid); // close out this attempt
10599 }
10600
10601 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10602 {
10603 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10604 assert(m->get_type() == MSG_OSD_SUBOP);
10605 dout(7) << "sub_op_remove " << m->poid << dendl;
10606
10607 op->mark_started();
10608
10609 ObjectStore::Transaction t;
10610 remove_snap_mapped_object(t, m->poid);
10611 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10612 assert(r == 0);
10613 }
10614
10615 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10616 {
10617 eversion_t v;
10618 pg_missing_item pmi;
10619 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10620 assert(is_missing);
10621 v = pmi.have;
10622 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10623
10624 assert(!actingbackfill.empty());
10625 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10626 i != actingbackfill.end();
10627 ++i) {
10628 if (*i == get_primary()) continue;
10629 pg_shard_t peer = *i;
10630 if (!peer_missing[peer].is_missing(oid)) {
10631 continue;
10632 }
10633 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10634 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10635 if (h > v)
10636 v = h;
10637 }
10638
10639 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10640 return v;
10641 }
10642
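// Worked example (illustrative): if the local have version for oid is
// 10'5 and two peers report have 10'7 and 10'3, this returns 10'7 --
// the newest version some shard still has, which is what LOST_REVERT
// reverts to.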
10643 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10644 {
10645 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10646 op->get_req());
10647 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10648 ObjectStore::Transaction t;
10649 append_log_entries_update_missing(m->entries, t);
10650
10651 Context *complete = new FunctionContext(
10652 [=](int) {
10653 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10654 op->get_req());
10655 lock();
10656 if (!pg_has_reset_since(msg->get_epoch())) {
10657 MOSDPGUpdateLogMissingReply *reply =
10658 new MOSDPGUpdateLogMissingReply(
10659 spg_t(info.pgid.pgid, primary_shard().shard),
10660 pg_whoami.shard,
10661 msg->get_epoch(),
10662 msg->min_epoch,
10663 msg->get_tid());
10664 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10665 msg->get_connection()->send_message(reply);
10666 }
10667 unlock();
10668 });
10669
10670 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10671 t.register_on_commit(complete);
10672 } else {
10673 /* Hack to work around the fact that ReplicatedBackend sends
10674 * ack+commit if commit happens first
10675 *
10676 * This behavior is no longer necessary, but we preserve it so old
10677 * primaries can keep their repops in order */
10678 if (pool.info.ec_pool()) {
10679 t.register_on_complete(complete);
10680 } else {
10681 t.register_on_commit(complete);
10682 }
10683 }
10684 t.register_on_applied(
10685 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10686 int tr = osd->store->queue_transaction(
10687 osr.get(),
10688 std::move(t),
10689 nullptr);
10690 assert(tr == 0);
10691 }
10692
10693 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10694 {
10695 const MOSDPGUpdateLogMissingReply *m =
10696 static_cast<const MOSDPGUpdateLogMissingReply*>(
10697 op->get_req());
10698 dout(20) << __func__ << " got reply from "
10699 << m->get_from() << dendl;
10700
10701 auto it = log_entry_update_waiting_on.find(m->get_tid());
10702 if (it != log_entry_update_waiting_on.end()) {
10703 if (it->second.waiting_on.count(m->get_from())) {
10704 it->second.waiting_on.erase(m->get_from());
10705 } else {
10706 osd->clog->error()
10707 << info.pgid << " got reply "
10708 << *m << " from shard we are not waiting for "
10709 << m->get_from();
10710 }
10711
10712 if (it->second.waiting_on.empty()) {
10713 repop_all_committed(it->second.repop.get());
10714 log_entry_update_waiting_on.erase(it);
10715 }
10716 } else {
10717 osd->clog->error()
10718 << info.pgid << " got reply "
10719 << *m << " on unknown tid " << m->get_tid();
10720 }
10721 }
10722
10723 /* Mark all unfound objects as lost.
10724 */
10725 void PrimaryLogPG::mark_all_unfound_lost(
10726 int what,
10727 ConnectionRef con,
10728 ceph_tid_t tid)
10729 {
10730 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10731 list<hobject_t> oids;
10732
10733 dout(30) << __func__ << ": log before:\n";
10734 pg_log.get_log().print(*_dout);
10735 *_dout << dendl;
10736
10737 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10738
10739 utime_t mtime = ceph_clock_now();
10740 map<hobject_t, pg_missing_item>::const_iterator m =
10741 missing_loc.get_needs_recovery().begin();
10742 map<hobject_t, pg_missing_item>::const_iterator mend =
10743 missing_loc.get_needs_recovery().end();
10744
10745 ObcLockManager manager;
10746 eversion_t v = get_next_version();
10747 v.epoch = get_osdmap()->get_epoch();
10748 uint64_t num_unfound = missing_loc.num_unfound();
10749 while (m != mend) {
10750 const hobject_t &oid(m->first);
10751 if (!missing_loc.is_unfound(oid)) {
10752 // We only care about unfound objects
10753 ++m;
10754 continue;
10755 }
10756
10757 ObjectContextRef obc;
10758 eversion_t prev;
10759
10760 switch (what) {
10761 case pg_log_entry_t::LOST_MARK:
10762 assert(0 == "actually, not implemented yet!");
10763 break;
10764
10765 case pg_log_entry_t::LOST_REVERT:
10766 prev = pick_newest_available(oid);
10767 if (prev > eversion_t()) {
10768 // log it
10769 pg_log_entry_t e(
10770 pg_log_entry_t::LOST_REVERT, oid, v,
10771 m->second.need, 0, osd_reqid_t(), mtime, 0);
10772 e.reverting_to = prev;
10773 e.mark_unrollbackable();
10774 log_entries.push_back(e);
10775 dout(10) << e << dendl;
10776
10777 // we are now missing the new version; recovery code will sort it out.
10778 ++v.version;
10779 ++m;
10780 break;
10781 }
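      // fall through: no previous version to revert to, so delete instead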
10782
10783 case pg_log_entry_t::LOST_DELETE:
10784 {
10785 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10786 0, osd_reqid_t(), mtime, 0);
10787 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10788 if (pool.info.require_rollback()) {
10789 e.mod_desc.try_rmobject(v.version);
10790 } else {
10791 e.mark_unrollbackable();
10792 }
10793 } // otherwise, just do what we used to do
10794 dout(10) << e << dendl;
10795 log_entries.push_back(e);
10796 oids.push_back(oid);
10797
10798 // If a context is found, mark the object as deleted to
10799 // guard against racing with a new creation. This can happen
10800 // if the object was lost and the primary saw EIO.
10801 obc = object_contexts.lookup(oid);
10802 if (obc)
10803 obc->obs.exists = false;
10804
10805 ++v.version;
10806 ++m;
10807 }
10808 break;
10809
10810 default:
10811 ceph_abort();
10812 }
10813 }
10814
10815 info.stats.stats_invalid = true;
10816
10817 submit_log_entries(
10818 log_entries,
10819 std::move(manager),
10820 boost::optional<std::function<void(void)> >(
10821 [this, oids, con, num_unfound, tid]() {
10822 if (perform_deletes_during_peering()) {
10823 for (auto oid : oids) {
10824 // clear old locations - merge_new_log_entries will have
10825 // handled rebuilding missing_loc for each of these
10826 // objects if we have the RECOVERY_DELETES flag
10827 missing_loc.recovered(oid);
10828 }
10829 }
10830
10831 if (is_recovery_unfound()) {
10832 queue_peering_event(
10833 CephPeeringEvtRef(
10834 std::make_shared<CephPeeringEvt>(
10835 get_osdmap()->get_epoch(),
10836 get_osdmap()->get_epoch(),
10837 DoRecovery())));
10838 } else if (is_backfill_unfound()) {
10839 queue_peering_event(
10840 CephPeeringEvtRef(
10841 std::make_shared<CephPeeringEvt>(
10842 get_osdmap()->get_epoch(),
10843 get_osdmap()->get_epoch(),
10844 RequestBackfill())));
10845 } else {
10846 queue_recovery();
10847 }
10848
10849 stringstream ss;
10850 ss << "pg has " << num_unfound
10851 << " objects unfound and apparently lost marking";
10852 string rs = ss.str();
10853 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10854 osd->clog->info() << rs;
10855 if (con) {
10856 MCommandReply *reply = new MCommandReply(0, rs);
10857 reply->set_tid(tid);
10858 con->send_message(reply);
10859 }
10860 }),
10861 OpRequestRef());
10862 }
10863
10864 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10865 {
10866 assert(repop_queue.empty());
10867 }
10868
10869 /*
10870 * pg status change notification
10871 */
10872
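/*
 * Abort every in-flight RepGather. With requeue set, the associated
 * client ops (plus any dup ops waiting in waiting_for_ondisk) are
 * collected in order and requeued; otherwise they are dropped.
 */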
10873 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10874 {
10875 list<OpRequestRef> rq;
10876
10877 // apply all repops
10878 while (!repop_queue.empty()) {
10879 RepGather *repop = repop_queue.front();
10880 repop_queue.pop_front();
10881 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10882 repop->rep_aborted = true;
10883 repop->on_applied.clear();
10884 repop->on_committed.clear();
10885 repop->on_success.clear();
10886
10887 if (requeue) {
10888 if (repop->op) {
10889 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10890 rq.push_back(repop->op);
10891 repop->op = OpRequestRef();
10892 }
10893
10894 // also requeue any dups, interleaved into position
10895 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10896 waiting_for_ondisk.find(repop->v);
10897 if (p != waiting_for_ondisk.end()) {
10898 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10899 for (list<pair<OpRequestRef, version_t> >::iterator i =
10900 p->second.begin();
10901 i != p->second.end();
10902 ++i) {
10903 rq.push_back(i->first);
10904 }
10905 waiting_for_ondisk.erase(p);
10906 }
10907 }
10908
10909 remove_repop(repop);
10910 }
10911
10912 assert(repop_queue.empty());
10913
10914 if (requeue) {
10915 requeue_ops(rq);
10916 if (!waiting_for_ondisk.empty()) {
10917 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10918 waiting_for_ondisk.begin();
10919 i != waiting_for_ondisk.end();
10920 ++i) {
10921 for (list<pair<OpRequestRef, version_t> >::iterator j =
10922 i->second.begin();
10923 j != i->second.end();
10924 ++j) {
10925 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10926 << i->first << dendl;
10927 }
10928 }
10929 assert(waiting_for_ondisk.empty());
10930 }
10931 }
10932
10933 waiting_for_ondisk.clear();
10934 }
10935
10936 void PrimaryLogPG::on_flushed()
10937 {
10938 assert(flushes_in_progress > 0);
10939 flushes_in_progress--;
10940 if (flushes_in_progress == 0) {
10941 requeue_ops(waiting_for_flush);
10942 }
10943 if (!is_peered() || !is_primary()) {
10944 pair<hobject_t, ObjectContextRef> i;
10945 while (object_contexts.get_next(i.first, &i)) {
10946 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10947 }
10948 assert(object_contexts.empty());
10949 }
10950 pgbackend->on_flushed();
10951 }
10952
10953 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10954 {
10955 dout(10) << "on_removal" << dendl;
10956
10957 // adjust info to backfill
10958 info.set_last_backfill(hobject_t());
10959 pg_log.reset_backfill();
10960 dirty_info = true;
10961
10962
10963 // clear log
10964 PGLogEntryHandler rollbacker{this, t};
10965 pg_log.roll_forward(&rollbacker);
10966
10967 write_if_dirty(*t);
10968
10969 if (!deleting)
10970 on_shutdown();
10971 }
10972
10973 void PrimaryLogPG::clear_async_reads()
10974 {
10975 dout(10) << __func__ << dendl;
10976 for(auto& i : in_progress_async_reads) {
10977 dout(10) << "clear ctx: "
10978 << "OpRequestRef " << i.first
10979 << " OpContext " << i.second
10980 << dendl;
10981 close_op_ctx(i.second);
10982 }
10983 }
10984
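/*
 * Final teardown prior to PG removal: dequeue from OSD work queues,
 * cancel scrub and copy/flush/proxy ops, abort repops and pending log
 * updates, drop backoffs and recovery reservations, and clear primary
 * and recovery state.
 */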
10985 void PrimaryLogPG::on_shutdown()
10986 {
10987 dout(10) << "on_shutdown" << dendl;
10988
10989 // remove from queues
10990 osd->pg_stat_queue_dequeue(this);
10991 osd->peering_wq.dequeue(this);
10992
10993 // handles queue races
10994 deleting = true;
10995
10996 if (recovery_queued) {
10997 recovery_queued = false;
10998 osd->clear_queued_recovery(this);
10999 }
11000
11001 clear_scrub_reserved();
11002 scrub_clear_state();
11003
11004 unreg_next_scrub();
11005 cancel_copy_ops(false);
11006 cancel_flush_ops(false);
11007 cancel_proxy_ops(false);
11008 apply_and_flush_repops(false);
11009 cancel_log_updates();
11010	  // we must remove PGRefs, so do this prior to release_backoffs() calls
11011 clear_backoffs();
11012 // clean up snap trim references
11013 snap_trimmer_machine.process_event(Reset());
11014
11015 pgbackend->on_change();
11016
11017 context_registry_on_change();
11018 object_contexts.clear();
11019
11020 clear_async_reads();
11021
11022 osd->remote_reserver.cancel_reservation(info.pgid);
11023 osd->local_reserver.cancel_reservation(info.pgid);
11024
11025 clear_primary_state();
11026 cancel_recovery();
11027 }
11028
11029 void PrimaryLogPG::on_activate()
11030 {
11031 // all clean?
11032 if (needs_recovery()) {
11033 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11034 queue_peering_event(
11035 CephPeeringEvtRef(
11036 std::make_shared<CephPeeringEvt>(
11037 get_osdmap()->get_epoch(),
11038 get_osdmap()->get_epoch(),
11039 DoRecovery())));
11040 } else if (needs_backfill()) {
11041 dout(10) << "activate queueing backfill" << dendl;
11042 queue_peering_event(
11043 CephPeeringEvtRef(
11044 std::make_shared<CephPeeringEvt>(
11045 get_osdmap()->get_epoch(),
11046 get_osdmap()->get_epoch(),
11047 RequestBackfill())));
11048 } else {
11049 dout(10) << "activate all replicas clean, no recovery" << dendl;
11050 eio_errors_to_process = false;
11051 queue_peering_event(
11052 CephPeeringEvtRef(
11053 std::make_shared<CephPeeringEvt>(
11054 get_osdmap()->get_epoch(),
11055 get_osdmap()->get_epoch(),
11056 AllReplicasRecovered())));
11057 }
11058
11059 publish_stats_to_osd();
11060
11061 if (!backfill_targets.empty()) {
11062 last_backfill_started = earliest_backfill();
11063 new_backfill = true;
11064 assert(!last_backfill_started.is_max());
11065 dout(5) << "on activate: bft=" << backfill_targets
11066 << " from " << last_backfill_started << dendl;
11067 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11068 i != backfill_targets.end();
11069 ++i) {
11070 dout(5) << "target shard " << *i
11071 << " from " << peer_info[*i].last_backfill
11072 << dendl;
11073 }
11074 }
11075
11076 hit_set_setup();
11077 agent_setup();
11078 }
11079
11080 void PrimaryLogPG::_on_new_interval()
11081 {
11082 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11083 if (!pg_log.get_missing().may_include_deletes &&
11084 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11085 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11086 }
11087 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11088 }
11089
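/*
 * Interval change: requeue waiting ops (or drop them if we are no
 * longer primary), cancel in-flight copy/flush/proxy and recovery
 * work, and reset per-interval state such as cached object contexts.
 */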
11090 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11091 {
11092 dout(10) << "on_change" << dendl;
11093
11094 if (hit_set && hit_set->insert_count() == 0) {
11095 dout(20) << " discarding empty hit_set" << dendl;
11096 hit_set_clear();
11097 }
11098
11099 if (recovery_queued) {
11100 recovery_queued = false;
11101 osd->clear_queued_recovery(this);
11102 }
11103
11104	  // requeue everything in the reverse of the order in which it
11105	  // should be reexamined.
11106 requeue_ops(waiting_for_peered);
11107 requeue_ops(waiting_for_flush);
11108 requeue_ops(waiting_for_active);
11109
11110 clear_scrub_reserved();
11111
11112 cancel_copy_ops(is_primary());
11113 cancel_flush_ops(is_primary());
11114 cancel_proxy_ops(is_primary());
11115
11116 // requeue object waiters
11117 for (auto& p : waiting_for_unreadable_object) {
11118 release_backoffs(p.first);
11119 }
11120 if (is_primary()) {
11121 requeue_object_waiters(waiting_for_unreadable_object);
11122 } else {
11123 waiting_for_unreadable_object.clear();
11124 }
11125 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11126 p != waiting_for_degraded_object.end();
11127 waiting_for_degraded_object.erase(p++)) {
11128 release_backoffs(p->first);
11129 if (is_primary())
11130 requeue_ops(p->second);
11131 else
11132 p->second.clear();
11133 finish_degraded_object(p->first);
11134 }
11135
11136 // requeues waiting_for_scrub
11137 scrub_clear_state();
11138
11139 for (auto p = waiting_for_blocked_object.begin();
11140 p != waiting_for_blocked_object.end();
11141 waiting_for_blocked_object.erase(p++)) {
11142 if (is_primary())
11143 requeue_ops(p->second);
11144 else
11145 p->second.clear();
11146 }
11147 for (auto i = callbacks_for_degraded_object.begin();
11148 i != callbacks_for_degraded_object.end();
11149 ) {
11150 finish_degraded_object((i++)->first);
11151 }
11152 assert(callbacks_for_degraded_object.empty());
11153
11154 if (is_primary()) {
11155 requeue_ops(waiting_for_cache_not_full);
11156 } else {
11157 waiting_for_cache_not_full.clear();
11158 }
11159 objects_blocked_on_cache_full.clear();
11160
11161 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11162 in_progress_async_reads.begin();
11163 i != in_progress_async_reads.end();
11164 in_progress_async_reads.erase(i++)) {
11165 close_op_ctx(i->second);
11166 if (is_primary())
11167 requeue_op(i->first);
11168 }
11169
11170 // this will requeue ops we were working on but didn't finish, and
11171 // any dups
11172 apply_and_flush_repops(is_primary());
11173 cancel_log_updates();
11174
11175 // do this *after* apply_and_flush_repops so that we catch any newly
11176 // registered watches.
11177 context_registry_on_change();
11178
11179 pgbackend->on_change_cleanup(t);
11180 scrubber.cleanup_store(t);
11181 pgbackend->on_change();
11182
11183 // clear snap_trimmer state
11184 snap_trimmer_machine.process_event(Reset());
11185
11186 debug_op_order.clear();
11187 unstable_stats.clear();
11188
11189 // we don't want to cache object_contexts through the interval change
11190 // NOTE: we actually assert that all currently live references are dead
11191 // by the time the flush for the next interval completes.
11192 object_contexts.clear();
11193
11194 // should have been cleared above by finishing all of the degraded objects
11195 assert(objects_blocked_on_degraded_snap.empty());
11196 }
11197
11198 void PrimaryLogPG::on_role_change()
11199 {
11200 dout(10) << "on_role_change" << dendl;
11201 if (get_role() != 0 && hit_set) {
11202 dout(10) << " clearing hit set" << dendl;
11203 hit_set_clear();
11204 }
11205 }
11206
11207 void PrimaryLogPG::on_pool_change()
11208 {
11209 dout(10) << __func__ << dendl;
11210 // requeue cache full waiters just in case the cache_mode is
11211 // changing away from writeback mode. note that if we are not
11212 // active the normal requeuing machinery is sufficient (and properly
11213 // ordered).
11214 if (is_active() &&
11215 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11216 !waiting_for_cache_not_full.empty()) {
11217 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11218 << dendl;
11219 requeue_ops(waiting_for_cache_not_full);
11220 objects_blocked_on_cache_full.clear();
11221 }
11222 hit_set_setup();
11223 agent_setup();
11224 }
11225
11226 // clear state. called on recovery completion AND cancellation.
11227 void PrimaryLogPG::_clear_recovery_state()
11228 {
11229 missing_loc.clear();
11230 #ifdef DEBUG_RECOVERY_OIDS
11231 recovering_oids.clear();
11232 #endif
11233 last_backfill_started = hobject_t();
11234 set<hobject_t>::iterator i = backfills_in_flight.begin();
11235 while (i != backfills_in_flight.end()) {
11236 assert(recovering.count(*i));
11237 backfills_in_flight.erase(i++);
11238 }
11239
11240 list<OpRequestRef> blocked_ops;
11241 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11242 i != recovering.end();
11243 recovering.erase(i++)) {
11244 if (i->second) {
11245 i->second->drop_recovery_read(&blocked_ops);
11246 requeue_ops(blocked_ops);
11247 }
11248 }
11249 assert(backfills_in_flight.empty());
11250 pending_backfill_updates.clear();
11251 assert(recovering.empty());
11252 pgbackend->clear_recovery_state();
11253 }
11254
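/*
 * Abort an in-flight pull of soid: release its recovery read lock,
 * requeue any ops blocked behind it (degraded/unreadable waiters),
 * and reset last_requested so recover_primary will start over.
 */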
11255 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11256 {
11257 dout(20) << __func__ << ": " << soid << dendl;
11258 assert(recovering.count(soid));
11259 ObjectContextRef obc = recovering[soid];
11260 if (obc) {
11261 list<OpRequestRef> blocked_ops;
11262 obc->drop_recovery_read(&blocked_ops);
11263 requeue_ops(blocked_ops);
11264 }
11265 recovering.erase(soid);
11266 finish_recovery_op(soid);
11267 release_backoffs(soid);
11268 if (waiting_for_degraded_object.count(soid)) {
11269 dout(20) << " kicking degraded waiters on " << soid << dendl;
11270 requeue_ops(waiting_for_degraded_object[soid]);
11271 waiting_for_degraded_object.erase(soid);
11272 }
11273 if (waiting_for_unreadable_object.count(soid)) {
11274 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11275 requeue_ops(waiting_for_unreadable_object[soid]);
11276 waiting_for_unreadable_object.erase(soid);
11277 }
11278 if (is_missing_object(soid))
11279 pg_log.set_last_requested(0); // get recover_primary to start over
11280 finish_degraded_object(soid);
11281 }
11282
11283 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11284 {
11285 /*
11286	   * check that any peers we are planning to pull (or are currently
11287	   * pulling) objects from are dealt with.
11288 */
11289 missing_loc.check_recovery_sources(osdmap);
11290 pgbackend->check_recovery_sources(osdmap);
11291
11292 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11293 i != peer_log_requested.end();
11294 ) {
11295 if (!osdmap->is_up(i->osd)) {
11296 dout(10) << "peer_log_requested removing " << *i << dendl;
11297 peer_log_requested.erase(i++);
11298 } else {
11299 ++i;
11300 }
11301 }
11302
11303 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11304 i != peer_missing_requested.end();
11305 ) {
11306 if (!osdmap->is_up(i->osd)) {
11307 dout(10) << "peer_missing_requested removing " << *i << dendl;
11308 peer_missing_requested.erase(i++);
11309 } else {
11310 ++i;
11311 }
11312 }
11313 }
11314
11315 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11316 {
11317 set<pg_shard_t> now_down;
11318 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11319 p != missing_loc_sources.end();
11320 ) {
11321 if (osdmap->is_up(p->osd)) {
11322 ++p;
11323 continue;
11324 }
11325 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11326 now_down.insert(*p);
11327 missing_loc_sources.erase(p++);
11328 }
11329
11330 if (now_down.empty()) {
11331 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11332 } else {
11333 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11334 << missing_loc_sources << dendl;
11335
11336 // filter missing_loc
11337 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11338 while (p != missing_loc.end()) {
11339 set<pg_shard_t>::iterator q = p->second.begin();
11340 while (q != p->second.end())
11341 if (now_down.count(*q)) {
11342 p->second.erase(q++);
11343 } else {
11344 ++q;
11345 }
11346 if (p->second.empty())
11347 missing_loc.erase(p++);
11348 else
11349 ++p;
11350 }
11351 }
11352 }
11353
11354
11355 bool PrimaryLogPG::start_recovery_ops(
11356 uint64_t max,
11357 ThreadPool::TPHandle &handle,
11358 uint64_t *ops_started)
11359 {
11360 uint64_t& started = *ops_started;
11361 started = 0;
11362 bool work_in_progress = false;
11363 assert(is_primary());
11364
11365 if (!state_test(PG_STATE_RECOVERING) &&
11366 !state_test(PG_STATE_BACKFILLING)) {
11367 /* TODO: I think this case is broken and will make do_recovery()
11368 * unhappy since we're returning false */
11369 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11370 return false;
11371 }
11372
11373 const auto &missing = pg_log.get_missing();
11374
11375 unsigned int num_missing = missing.num_missing();
11376 uint64_t num_unfound = get_num_unfound();
11377
11378 if (num_missing == 0) {
11379 info.last_complete = info.last_update;
11380 }
11381
11382 if (num_missing == num_unfound) {
11383 // All of the missing objects we have are unfound.
11384 // Recover the replicas.
11385 started = recover_replicas(max, handle);
11386 }
11387 if (!started) {
11388 // We still have missing objects that we should grab from replicas.
11389 started += recover_primary(max, handle);
11390 }
11391 if (!started && num_unfound != get_num_unfound()) {
11392	    // second chance to recover replicas
11393 started = recover_replicas(max, handle);
11394 }
11395
11396 if (started)
11397 work_in_progress = true;
11398
11399 bool deferred_backfill = false;
11400 if (recovering.empty() &&
11401 state_test(PG_STATE_BACKFILLING) &&
11402 !backfill_targets.empty() && started < max &&
11403 missing.num_missing() == 0 &&
11404 waiting_on_backfill.empty()) {
11405 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11406 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11407 deferred_backfill = true;
11408 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11409 !is_degraded()) {
11410 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11411 deferred_backfill = true;
11412 } else if (!backfill_reserved) {
11413 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11414 if (!backfill_reserving) {
11415 dout(10) << "queueing RequestBackfill" << dendl;
11416 backfill_reserving = true;
11417 queue_peering_event(
11418 CephPeeringEvtRef(
11419 std::make_shared<CephPeeringEvt>(
11420 get_osdmap()->get_epoch(),
11421 get_osdmap()->get_epoch(),
11422 RequestBackfill())));
11423 }
11424 deferred_backfill = true;
11425 } else {
11426 started += recover_backfill(max - started, handle, &work_in_progress);
11427 }
11428 }
11429
11430 dout(10) << " started " << started << dendl;
11431 osd->logger->inc(l_osd_rop, started);
11432
11433 if (!recovering.empty() ||
11434 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11435 return work_in_progress;
11436
11437 assert(recovering.empty());
11438 assert(recovery_ops_active == 0);
11439
11440 dout(10) << __func__ << " needs_recovery: "
11441 << missing_loc.get_needs_recovery()
11442 << dendl;
11443 dout(10) << __func__ << " missing_loc: "
11444 << missing_loc.get_missing_locs()
11445 << dendl;
11446 int unfound = get_num_unfound();
11447 if (unfound) {
11448 dout(10) << " still have " << unfound << " unfound" << dendl;
11449 return work_in_progress;
11450 }
11451
11452 if (missing.num_missing() > 0) {
11453 // this shouldn't happen!
11454 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11455	                       << missing.num_missing() << " missing objects: " << missing.get_items();
11456 return work_in_progress;
11457 }
11458
11459 if (needs_recovery()) {
11460 // this shouldn't happen!
11461 // We already checked num_missing() so we must have missing replicas
11462 osd->clog->error() << info.pgid
11463 << " Unexpected Error: recovery ending with missing replicas";
11464 return work_in_progress;
11465 }
11466
11467 if (state_test(PG_STATE_RECOVERING)) {
11468 state_clear(PG_STATE_RECOVERING);
11469 state_clear(PG_STATE_FORCED_RECOVERY);
11470 if (needs_backfill()) {
11471 dout(10) << "recovery done, queuing backfill" << dendl;
11472 queue_peering_event(
11473 CephPeeringEvtRef(
11474 std::make_shared<CephPeeringEvt>(
11475 get_osdmap()->get_epoch(),
11476 get_osdmap()->get_epoch(),
11477 RequestBackfill())));
11478 } else {
11479 dout(10) << "recovery done, no backfill" << dendl;
11480 eio_errors_to_process = false;
11481 state_clear(PG_STATE_FORCED_BACKFILL);
11482 queue_peering_event(
11483 CephPeeringEvtRef(
11484 std::make_shared<CephPeeringEvt>(
11485 get_osdmap()->get_epoch(),
11486 get_osdmap()->get_epoch(),
11487 AllReplicasRecovered())));
11488 }
11489 } else { // backfilling
11490 state_clear(PG_STATE_BACKFILLING);
11491 state_clear(PG_STATE_FORCED_BACKFILL);
11492 state_clear(PG_STATE_FORCED_RECOVERY);
11493 dout(10) << "recovery done, backfill done" << dendl;
11494 eio_errors_to_process = false;
11495 queue_peering_event(
11496 CephPeeringEvtRef(
11497 std::make_shared<CephPeeringEvt>(
11498 get_osdmap()->get_epoch(),
11499 get_osdmap()->get_epoch(),
11500 Backfilled())));
11501 }
11502
11503 return false;
11504 }
11505
11506 /**
11507	 * start up to max recovery ops on objects the primary is missing.
11508	 * return the number of ops started.
11509 */
11510 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11511 {
11512 assert(is_primary());
11513
11514 const auto &missing = pg_log.get_missing();
11515
11516 dout(10) << "recover_primary recovering " << recovering.size()
11517 << " in pg" << dendl;
11518 dout(10) << "recover_primary " << missing << dendl;
11519 dout(25) << "recover_primary " << missing.get_items() << dendl;
11520
11521 // look at log!
11522 pg_log_entry_t *latest = 0;
11523 unsigned started = 0;
11524 int skipped = 0;
11525
11526 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11527 map<version_t, hobject_t>::const_iterator p =
11528 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11529 while (p != missing.get_rmissing().end()) {
11530 handle.reset_tp_timeout();
11531 hobject_t soid;
11532 version_t v = p->first;
11533
11534 if (pg_log.get_log().objects.count(p->second)) {
11535 latest = pg_log.get_log().objects.find(p->second)->second;
11536 assert(latest->is_update() || latest->is_delete());
11537 soid = latest->soid;
11538 } else {
11539 latest = 0;
11540 soid = p->second;
11541 }
11542 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11543 ++p;
11544
11545 hobject_t head = soid.get_head();
11546
11547 eversion_t need = item.need;
11548
11549 dout(10) << "recover_primary "
11550 << soid << " " << item.need
11551 << (missing.is_missing(soid) ? " (missing)":"")
11552 << (missing.is_missing(head) ? " (missing head)":"")
11553 << (recovering.count(soid) ? " (recovering)":"")
11554 << (recovering.count(head) ? " (recovering head)":"")
11555 << dendl;
11556
11557 if (latest) {
11558 switch (latest->op) {
11559 case pg_log_entry_t::CLONE:
11560 /*
11561 * Handling for this special case removed for now, until we
11562 * can correctly construct an accurate SnapSet from the old
11563 * one.
11564 */
11565 break;
11566
11567 case pg_log_entry_t::LOST_REVERT:
11568 {
11569 if (item.have == latest->reverting_to) {
11570 ObjectContextRef obc = get_object_context(soid, true);
11571
11572 if (obc->obs.oi.version == latest->version) {
11573 // I'm already reverting
11574 dout(10) << " already reverting " << soid << dendl;
11575 } else {
11576 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11577 obc->ondisk_write_lock();
11578 obc->obs.oi.version = latest->version;
11579
11580 ObjectStore::Transaction t;
11581 bufferlist b2;
11582 obc->obs.oi.encode(
11583 b2,
11584 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11585 assert(!pool.info.require_rollback());
11586 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11587
11588 recover_got(soid, latest->version);
11589 missing_loc.add_location(soid, pg_whoami);
11590
11591 ++active_pushes;
11592
11593 osd->store->queue_transaction(osr.get(), std::move(t),
11594 new C_OSD_AppliedRecoveredObject(this, obc),
11595 new C_OSD_CommittedPushedObject(
11596 this,
11597 get_osdmap()->get_epoch(),
11598 info.last_complete),
11599 new C_OSD_OndiskWriteUnlock(obc));
11600 continue;
11601 }
11602 } else {
11603 /*
11604 * Pull the old version of the object. Update missing_loc here to have the location
11605 * of the version we want.
11606 *
11607 * This doesn't use the usual missing_loc paths, but that's okay:
11608 * - if we have it locally, we hit the case above, and go from there.
11609 * - if we don't, we always pass through this case during recovery and set up the location
11610 * properly.
11611 * - this way we don't need to mangle the missing code to be general about needing an old
11612 * version...
11613 */
11614 eversion_t alternate_need = latest->reverting_to;
11615 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11616
11617 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11618 p != peer_missing.end();
11619 ++p)
11620 if (p->second.is_missing(soid, need) &&
11621 p->second.get_items().at(soid).have == alternate_need) {
11622 missing_loc.add_location(soid, p->first);
11623 }
11624 dout(10) << " will pull " << alternate_need << " or " << need
11625 << " from one of " << missing_loc.get_locations(soid)
11626 << dendl;
11627 }
11628 }
11629 break;
11630 }
11631 }
11632
11633 if (!recovering.count(soid)) {
11634 if (recovering.count(head)) {
11635 ++skipped;
11636 } else {
11637 int r = recover_missing(
11638 soid, need, get_recovery_op_priority(), h);
11639 switch (r) {
11640 case PULL_YES:
11641 ++started;
11642 break;
11643 case PULL_OTHER:
11644 ++started;
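	  // fall through: a pull was started, but soid itself is still
	  // counted as skipped as well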
11645 case PULL_NONE:
11646 ++skipped;
11647 break;
11648 default:
11649 ceph_abort();
11650 }
11651 if (started >= max)
11652 break;
11653 }
11654 }
11655
11656 // only advance last_requested if we haven't skipped anything
11657 if (!skipped)
11658 pg_log.set_last_requested(v);
11659 }
11660
11661 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11662 return started;
11663 }
11664
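/*
 * Note that the primary's copy of soid is bad: add it to the local
 * missing set, drop the primary as a location, and re-add any replica
 * that still holds the needed version. Returns true if no usable copy
 * was found (the object is now unfound).
 */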
11665 bool PrimaryLogPG::primary_error(
11666 const hobject_t& soid, eversion_t v)
11667 {
11668 pg_log.missing_add(soid, v, eversion_t());
11669 pg_log.set_last_requested(0);
11670 missing_loc.remove_location(soid, pg_whoami);
11671 bool uhoh = true;
11672 assert(!actingbackfill.empty());
11673 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11674 i != actingbackfill.end();
11675 ++i) {
11676 if (*i == get_primary()) continue;
11677 pg_shard_t peer = *i;
11678 if (!peer_missing[peer].is_missing(soid, v)) {
11679 missing_loc.add_location(soid, peer);
11680 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11681 << ", there should be a copy on shard " << peer << dendl;
11682 uhoh = false;
11683 }
11684 }
11685 if (uhoh)
11686 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11687 else
11688 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11689 << ", will try copies on " << missing_loc.get_locations(soid);
11690 return uhoh;
11691 }
11692
11693 int PrimaryLogPG::prep_object_replica_deletes(
11694 const hobject_t& soid, eversion_t v,
11695 PGBackend::RecoveryHandle *h)
11696 {
11697 assert(is_primary());
11698 dout(10) << __func__ << ": on " << soid << dendl;
11699
11700 start_recovery_op(soid);
11701 assert(!recovering.count(soid));
11702 recovering.insert(make_pair(soid, ObjectContextRef()));
11703
11704 pgbackend->recover_delete_object(soid, v, h);
11705 return 1;
11706 }
11707
11708 int PrimaryLogPG::prep_object_replica_pushes(
11709 const hobject_t& soid, eversion_t v,
11710 PGBackend::RecoveryHandle *h)
11711 {
11712 assert(is_primary());
11713 dout(10) << __func__ << ": on " << soid << dendl;
11714
11715 // NOTE: we know we will get a valid oloc off of disk here.
11716 ObjectContextRef obc = get_object_context(soid, false);
11717 if (!obc) {
11718 primary_error(soid, v);
11719 return 0;
11720 }
11721
11722 if (!obc->get_recovery_read()) {
11723 dout(20) << "recovery delayed on " << soid
11724 << "; could not get rw_manager lock" << dendl;
11725 return 0;
11726 } else {
11727 dout(20) << "recovery got recovery read lock on " << soid
11728 << dendl;
11729 }
11730
11731 start_recovery_op(soid);
11732 assert(!recovering.count(soid));
11733 recovering.insert(make_pair(soid, obc));
11734
11735	  /* We need this in case there is an in-progress write on the object. In fact,
11736 * the only possible write is an update to the xattr due to a lost_revert --
11737 * a client write would be blocked since the object is degraded.
11738 * In almost all cases, therefore, this lock should be uncontended.
11739 */
11740 obc->ondisk_read_lock();
11741 int r = pgbackend->recover_object(
11742 soid,
11743 v,
11744 ObjectContextRef(),
11745 obc, // has snapset context
11746 h);
11747 obc->ondisk_read_unlock();
11748 if (r < 0) {
11749 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11750 primary_failed(soid);
11751 primary_error(soid, v);
11752 return 0;
11753 }
11754 return 1;
11755 }
11756
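/*
 * Start up to max pushes (or deletes) for objects that replicas are
 * missing, oldest version first; returns the number of ops started.
 */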
11757 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11758 {
11759 dout(10) << __func__ << "(" << max << ")" << dendl;
11760 uint64_t started = 0;
11761
11762 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11763
11764 // this is FAR from an optimal recovery order. pretty lame, really.
11765 assert(!actingbackfill.empty());
11766 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11767 i != actingbackfill.end();
11768 ++i) {
11769 if (*i == get_primary()) continue;
11770 pg_shard_t peer = *i;
11771 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11772 assert(pm != peer_missing.end());
11773 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11774 assert(pi != peer_info.end());
11775 size_t m_sz = pm->second.num_missing();
11776
11777 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11778 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11779
11780 // oldest first!
11781 const pg_missing_t &m(pm->second);
11782 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11783 p != m.get_rmissing().end() && started < max;
11784 ++p) {
11785 handle.reset_tp_timeout();
11786 const hobject_t soid(p->second);
11787
11788 if (missing_loc.is_unfound(soid)) {
11789 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11790 continue;
11791 }
11792
11793 if (soid > pi->second.last_backfill) {
11794 if (!recovering.count(soid)) {
11795 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11796 derr << __func__ << ": object added to missing set for backfill, but "
11797 << "is not in recovering, error!" << dendl;
11798 ceph_abort();
11799 }
11800 continue;
11801 }
11802
11803 if (recovering.count(soid)) {
11804 dout(10) << __func__ << ": already recovering " << soid << dendl;
11805 continue;
11806 }
11807
11808 if (missing_loc.is_deleted(soid)) {
11809 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11810 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11811 started += prep_object_replica_deletes(soid, r->second.need, h);
11812 continue;
11813 }
11814
11815 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11816 dout(10) << __func__ << ": " << soid.get_head()
11817 << " still missing on primary" << dendl;
11818 continue;
11819 }
11820
11821 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11822 dout(10) << __func__ << ": " << soid.get_snapdir()
11823 << " still missing on primary" << dendl;
11824 continue;
11825 }
11826
11827 if (pg_log.get_missing().is_missing(soid)) {
11828 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11829 continue;
11830 }
11831
11832 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11833 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11834 started += prep_object_replica_pushes(soid, r->second.need,
11835 h);
11836 }
11837 }
11838
11839 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11840 return started;
11841 }
11842
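// Return the lowest BackfillInterval begin across all backfill
// targets, i.e. the earliest object some peer still needs examined.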
11843 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11844 {
11845 hobject_t e = hobject_t::get_max();
11846 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11847 i != backfill_targets.end();
11848 ++i) {
11849 pg_shard_t peer = *i;
11850 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11851 peer_backfill_info.find(peer);
11852 assert(iter != peer_backfill_info.end());
11853 if (iter->second.begin < e)
11854 e = iter->second.begin;
11855 }
11856 return e;
11857 }
11858
11859 bool PrimaryLogPG::all_peer_done() const
11860 {
11861 // Primary hasn't got any more objects
11862 assert(backfill_info.empty());
11863
11864 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11865 i != backfill_targets.end();
11866 ++i) {
11867 pg_shard_t bt = *i;
11868 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11869 peer_backfill_info.find(bt);
11870 assert(piter != peer_backfill_info.end());
11871 const BackfillInterval& pbi = piter->second;
11872 // See if peer has more to process
11873 if (!pbi.extends_to_end() || !pbi.empty())
11874 return false;
11875 }
11876 return true;
11877 }
11878
11879 /**
11880 * recover_backfill
11881 *
11882 * Invariants:
11883 *
11884 * backfilled: fully pushed to replica or present in replica's missing set (both
11885 * our copy and theirs).
11886 *
11887 * All objects on a backfill_target in
11888 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11889 * objects have been actually deleted and all logically-valid objects are replicated.
11890 * There may be PG objects in this interval yet to be backfilled.
11891 *
11892 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11893 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11894 *
11895 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11896 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11897 * interval remain on the backfill target.
11898 *
11899 * For a backfill target, all objects <= peer_info[target].last_backfill
11900 * have been backfilled to target
11901 *
11902 * There *MAY* be missing/outdated objects between last_backfill_started and
11903 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11904 * io created objects since the last scan. For this reason, we call
11905 * update_range() again before continuing backfill.
11906 */
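/*
 * A rough sketch of the position bookkeeping (illustrative, not a
 * definitive restatement of the code below):
 *
 *   hobject_t backfill_pos =
 *     std::min(backfill_info.begin, earliest_peer_backfill());
 *
 * Everything strictly below backfill_pos has been handled on every
 * target; last_backfill is only advanced past a pending update once
 * the in-flight pushes below that point have completed.
 */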
11907 uint64_t PrimaryLogPG::recover_backfill(
11908 uint64_t max,
11909 ThreadPool::TPHandle &handle, bool *work_started)
11910 {
11911 dout(10) << "recover_backfill (" << max << ")"
11912 << " bft=" << backfill_targets
11913 << " last_backfill_started " << last_backfill_started
11914 << (new_backfill ? " new_backfill":"")
11915 << dendl;
11916 assert(!backfill_targets.empty());
11917
11918 // Initialize from prior backfill state
11919 if (new_backfill) {
11920 // on_activate() was called prior to getting here
11921 assert(last_backfill_started == earliest_backfill());
11922 new_backfill = false;
11923
11924 // initialize BackfillIntervals
11925 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11926 i != backfill_targets.end();
11927 ++i) {
11928 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11929 }
11930 backfill_info.reset(last_backfill_started);
11931
11932 backfills_in_flight.clear();
11933 pending_backfill_updates.clear();
11934 }
11935
11936 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11937 i != backfill_targets.end();
11938 ++i) {
11939 dout(10) << "peer osd." << *i
11940 << " info " << peer_info[*i]
11941 << " interval " << peer_backfill_info[*i].begin
11942 << "-" << peer_backfill_info[*i].end
11943 << " " << peer_backfill_info[*i].objects.size() << " objects"
11944 << dendl;
11945 }
11946
11947 // update our local interval to cope with recent changes
11948 backfill_info.begin = last_backfill_started;
11949 update_range(&backfill_info, handle);
11950
11951 unsigned ops = 0;
11952 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11953 set<hobject_t> add_to_stat;
11954
11955 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11956 i != backfill_targets.end();
11957 ++i) {
11958 peer_backfill_info[*i].trim_to(
11959 std::max(peer_info[*i].last_backfill, last_backfill_started));
11960 }
11961 backfill_info.trim_to(last_backfill_started);
11962
11963 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11964 while (ops < max) {
11965 if (backfill_info.begin <= earliest_peer_backfill() &&
11966 !backfill_info.extends_to_end() && backfill_info.empty()) {
11967 hobject_t next = backfill_info.end;
11968 backfill_info.reset(next);
11969 backfill_info.end = hobject_t::get_max();
11970 update_range(&backfill_info, handle);
11971 backfill_info.trim();
11972 }
11973
11974 dout(20) << " my backfill interval " << backfill_info << dendl;
11975
11976 bool sent_scan = false;
11977 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11978 i != backfill_targets.end();
11979 ++i) {
11980 pg_shard_t bt = *i;
11981 BackfillInterval& pbi = peer_backfill_info[bt];
11982
11983 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11984 if (pbi.begin <= backfill_info.begin &&
11985 !pbi.extends_to_end() && pbi.empty()) {
11986 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11987 epoch_t e = get_osdmap()->get_epoch();
11988 MOSDPGScan *m = new MOSDPGScan(
11989 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11990 spg_t(info.pgid.pgid, bt.shard),
11991 pbi.end, hobject_t());
11992 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11993 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11994 waiting_on_backfill.insert(bt);
11995 sent_scan = true;
11996 }
11997 }
11998
11999 // Count simultaneous scans as a single op and let those complete
12000 if (sent_scan) {
12001 ops++;
12002 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12003 break;
12004 }
12005
12006 if (backfill_info.empty() && all_peer_done()) {
12007 dout(10) << " reached end for both local and all peers" << dendl;
12008 break;
12009 }
12010
12011	    // Get the object within the set of peers to operate on and
12012	    // the set of targets to which that object applies.
12013 hobject_t check = earliest_peer_backfill();
12014
12015 if (check < backfill_info.begin) {
12016
12017 set<pg_shard_t> check_targets;
12018 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12019 i != backfill_targets.end();
12020 ++i) {
12021 pg_shard_t bt = *i;
12022 BackfillInterval& pbi = peer_backfill_info[bt];
12023 if (pbi.begin == check)
12024 check_targets.insert(bt);
12025 }
12026 assert(!check_targets.empty());
12027
12028 dout(20) << " BACKFILL removing " << check
12029 << " from peers " << check_targets << dendl;
12030 for (set<pg_shard_t>::iterator i = check_targets.begin();
12031 i != check_targets.end();
12032 ++i) {
12033 pg_shard_t bt = *i;
12034 BackfillInterval& pbi = peer_backfill_info[bt];
12035 assert(pbi.begin == check);
12036
12037 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12038 pbi.pop_front();
12039 }
12040
12041 /* This requires a bit of explanation. We compare head against
12042 * last_backfill to determine whether to send an operation
12043 * to the replica. A single write operation can touch up to three
12044 * objects: head, the snapdir, and a new clone which sorts closer to
12045 * head than any existing clone. If last_backfill points at a clone,
12046 * the transaction won't be sent and all 3 must lie on the right side
12047 * of the line (i.e., we'll backfill them later). If last_backfill
12048 * points at snapdir, it sorts greater than head, so we send the
12049 * transaction which is correct because all three must lie to the left
12050 * of the line.
12051 *
12052 * If it points at head, we have a bit of an issue. If head actually
12053 * exists, no problem, because any transaction which touches snapdir
12054 * must end up creating it (and deleting head), so sending the
12055 * operation won't pose a problem -- we'll end up having to scan it,
12056 * but it'll end up being the right version so we won't bother to
12057 * rebackfill it. However, if head doesn't exist, any write on head
12058 * will remove snapdir. For a replicated pool, this isn't a problem,
12059 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12060 * It only poses a problem for EC pools, because we never just delete
12061 * an object, we rename it into a rollback object. That operation
12062 * will end up crashing the osd with ENOENT. Tolerating the failure
12063 * wouldn't work either, even if snapdir exists, we'd be creating a
12064 * rollback object past the last_backfill line which wouldn't get
12065 * cleaned up (no rollback objects past the last_backfill line is an
12066 * existing important invariant). Thus, let's avoid the whole issue
12067 * by just not updating last_backfill_started here if head doesn't
12068 * exist and snapdir does. We aren't using up a recovery count here,
12069 * so we're going to recover snapdir immediately anyway. We'll only
12070 * fail "backward" if we fail to get the rw lock and that just means
12071 * we'll re-process this section of the hash space again.
12072 *
12073 * I'm choosing this hack here because the really "correct" answer is
12074 * going to be to unify snapdir and head into a single object (a
12075 * snapdir is really just a confusing way to talk about head existing
12076 * as a whiteout), but doing that is going to be a somewhat larger
12077 * undertaking.
12078 *
12079 * @see http://tracker.ceph.com/issues/17668
12080 */
12081 if (!(check.is_head() &&
12082 backfill_info.begin.is_snapdir() &&
12083 check == backfill_info.begin.get_head()))
12084 last_backfill_started = check;
12085
12086	      // Don't increment ops here: deletions are cheap and, unlike
12087	      // real recovery_ops, are not replied to, and we can't increment
12088	      // ops without requeueing ourselves
12089	      // for recovery.
12090 } else {
12091 eversion_t& obj_v = backfill_info.objects.begin()->second;
12092
12093 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12094 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12095 i != backfill_targets.end();
12096 ++i) {
12097 pg_shard_t bt = *i;
12098 BackfillInterval& pbi = peer_backfill_info[bt];
12099 // Find all check peers that have the wrong version
12100 if (check == backfill_info.begin && check == pbi.begin) {
12101 if (pbi.objects.begin()->second != obj_v) {
12102 need_ver_targs.push_back(bt);
12103 } else {
12104 keep_ver_targs.push_back(bt);
12105 }
12106 } else {
12107 pg_info_t& pinfo = peer_info[bt];
12108
12109	          // Only include peers whose backfill line we've caught up to;
12110	          // otherwise, they only appear to be missing this object
12111	          // because their pbi.begin > backfill_info.begin.
12112 if (backfill_info.begin > pinfo.last_backfill)
12113 missing_targs.push_back(bt);
12114 else
12115 skip_targs.push_back(bt);
12116 }
12117 }
12118
12119 if (!keep_ver_targs.empty()) {
12120 // These peers have version obj_v
12121 dout(20) << " BACKFILL keeping " << check
12122 << " with ver " << obj_v
12123 << " on peers " << keep_ver_targs << dendl;
12124 //assert(!waiting_for_degraded_object.count(check));
12125 }
12126 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12127 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12128 assert(obc);
12129 if (obc->get_recovery_read()) {
12130 if (!need_ver_targs.empty()) {
12131 dout(20) << " BACKFILL replacing " << check
12132 << " with ver " << obj_v
12133 << " to peers " << need_ver_targs << dendl;
12134 }
12135 if (!missing_targs.empty()) {
12136 dout(20) << " BACKFILL pushing " << backfill_info.begin
12137 << " with ver " << obj_v
12138 << " to peers " << missing_targs << dendl;
12139 }
12140 vector<pg_shard_t> all_push = need_ver_targs;
12141 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12142
12143 handle.reset_tp_timeout();
12144 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12145 if (r < 0) {
12146 *work_started = true;
12147 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12148 break;
12149 }
12150 ops++;
12151 } else {
12152 *work_started = true;
12153 dout(20) << "backfill blocking on " << backfill_info.begin
12154 << "; could not get rw_manager lock" << dendl;
12155 break;
12156 }
12157 }
12158 dout(20) << "need_ver_targs=" << need_ver_targs
12159 << " keep_ver_targs=" << keep_ver_targs << dendl;
12160 dout(20) << "backfill_targets=" << backfill_targets
12161 << " missing_targs=" << missing_targs
12162 << " skip_targs=" << skip_targs << dendl;
12163
12164 last_backfill_started = backfill_info.begin;
12165 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12166 backfill_info.pop_front();
12167 vector<pg_shard_t> check_targets = need_ver_targs;
12168 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12169 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12170 i != check_targets.end();
12171 ++i) {
12172 pg_shard_t bt = *i;
12173 BackfillInterval& pbi = peer_backfill_info[bt];
12174 pbi.pop_front();
12175 }
12176 }
12177 }
12178
12179 hobject_t backfill_pos =
12180 std::min(backfill_info.begin, earliest_peer_backfill());
12181
12182 for (set<hobject_t>::iterator i = add_to_stat.begin();
12183 i != add_to_stat.end();
12184 ++i) {
12185 ObjectContextRef obc = get_object_context(*i, false);
12186 assert(obc);
12187 pg_stat_t stat;
12188 add_object_context_to_pg_stat(obc, &stat);
12189 pending_backfill_updates[*i] = stat;
12190 }
12191 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12192 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12193 for (unsigned i = 0; i < to_remove.size(); ++i) {
12194 handle.reset_tp_timeout();
12195 const hobject_t& oid = to_remove[i].get<0>();
12196 eversion_t v = to_remove[i].get<1>();
12197 pg_shard_t peer = to_remove[i].get<2>();
12198 MOSDPGBackfillRemove *m;
12199 auto it = reqs.find(peer);
12200 if (it != reqs.end()) {
12201 m = it->second;
12202 } else {
12203 m = reqs[peer] = new MOSDPGBackfillRemove(
12204 spg_t(info.pgid.pgid, peer.shard),
12205 get_osdmap()->get_epoch());
12206 }
12207 m->ls.push_back(make_pair(oid, v));
12208
12209 if (oid <= last_backfill_started)
12210 pending_backfill_updates[oid]; // add empty stat!
12211 }
12212 for (auto p : reqs) {
12213 osd->send_message_osd_cluster(p.first.osd, p.second,
12214 get_osdmap()->get_epoch());
12215 }
12216 } else {
12217 // for jewel targets
12218 for (unsigned i = 0; i < to_remove.size(); ++i) {
12219 handle.reset_tp_timeout();
12220
12221 // ordered before any subsequent updates
12222 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12223 to_remove[i].get<2>());
12224
12225 if (to_remove[i].get<0>() <= last_backfill_started)
12226 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12227 }
12228 }
12229
12230 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12231
12232 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12233 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12234 i != backfills_in_flight.end();
12235 ++i) {
12236 dout(20) << *i << " is still in flight" << dendl;
12237 }
12238
12239 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12240 backfill_pos : *(backfills_in_flight.begin());
12241 hobject_t new_last_backfill = earliest_backfill();
12242 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12243 for (map<hobject_t, pg_stat_t>::iterator i =
12244 pending_backfill_updates.begin();
12245 i != pending_backfill_updates.end() &&
12246 i->first < next_backfill_to_complete;
12247 pending_backfill_updates.erase(i++)) {
12248 dout(20) << " pending_backfill_update " << i->first << dendl;
12249 assert(i->first > new_last_backfill);
12250 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12251 j != backfill_targets.end();
12252 ++j) {
12253 pg_shard_t bt = *j;
12254 pg_info_t& pinfo = peer_info[bt];
12255	      // Add stats to all peers that were missing the object
12256 if (i->first > pinfo.last_backfill)
12257 pinfo.stats.add(i->second);
12258 }
12259 new_last_backfill = i->first;
12260 }
12261 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12262
12263 assert(!pending_backfill_updates.empty() ||
12264 new_last_backfill == last_backfill_started);
12265 if (pending_backfill_updates.empty() &&
12266 backfill_pos.is_max()) {
12267 assert(backfills_in_flight.empty());
12268 new_last_backfill = backfill_pos;
12269 last_backfill_started = backfill_pos;
12270 }
12271 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12272
12273 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12274 // all the backfill targets. Otherwise, we will move last_backfill up on
12275	  // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12276 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12277 i != backfill_targets.end();
12278 ++i) {
12279 pg_shard_t bt = *i;
12280 pg_info_t& pinfo = peer_info[bt];
12281
12282 if (new_last_backfill > pinfo.last_backfill) {
12283 pinfo.set_last_backfill(new_last_backfill);
12284 epoch_t e = get_osdmap()->get_epoch();
12285 MOSDPGBackfill *m = NULL;
12286 if (pinfo.last_backfill.is_max()) {
12287 m = new MOSDPGBackfill(
12288 MOSDPGBackfill::OP_BACKFILL_FINISH,
12289 e,
12290 last_peering_reset,
12291 spg_t(info.pgid.pgid, bt.shard));
12292 // Use default priority here, must match sub_op priority
12293 /* pinfo.stats might be wrong if we did log-based recovery on the
12294 * backfilled portion in addition to continuing backfill.
12295 */
12296 pinfo.stats = info.stats;
12297 start_recovery_op(hobject_t::get_max());
12298 } else {
12299 m = new MOSDPGBackfill(
12300 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12301 e,
12302 last_peering_reset,
12303 spg_t(info.pgid.pgid, bt.shard));
12304 // Use default priority here, must match sub_op priority
12305 }
12306 m->last_backfill = pinfo.last_backfill;
12307 m->stats = pinfo.stats;
12308 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12309 dout(10) << " peer " << bt
12310 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12311 << " / " << info.stats.stats.sum.num_objects << dendl;
12312 }
12313 }
12314
12315 if (ops)
12316 *work_started = true;
12317 return ops;
12318 }
12319
12320 int PrimaryLogPG::prep_backfill_object_push(
12321 hobject_t oid, eversion_t v,
12322 ObjectContextRef obc,
12323 vector<pg_shard_t> peers,
12324 PGBackend::RecoveryHandle *h)
12325 {
12326 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12327 assert(!peers.empty());
12328
12329 backfills_in_flight.insert(oid);
12330 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12331 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12332 assert(bpm != peer_missing.end());
12333 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12334 }
12335
12336 assert(!recovering.count(oid));
12337
12338 start_recovery_op(oid);
12339 recovering.insert(make_pair(oid, obc));
12340
12341 // We need to take the read_lock here in order to flush in-progress writes
12342 obc->ondisk_read_lock();
12343 int r = pgbackend->recover_object(
12344 oid,
12345 v,
12346 ObjectContextRef(),
12347 obc,
12348 h);
12349 obc->ondisk_read_unlock();
12350 if (r < 0) {
12351 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12352 primary_failed(oid);
12353 primary_error(oid, v);
12354 backfills_in_flight.erase(oid);
12355 missing_loc.add_missing(oid, v, eversion_t());
12356 }
12357 return r;
12358 }
12359
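/*
 * Bring a BackfillInterval up to date: rescan the local store if bi
 * predates the log tail, then replay pg log and projected log entries
 * after bi->version over bi->objects, ending at projected_last_update.
 */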
12360 void PrimaryLogPG::update_range(
12361 BackfillInterval *bi,
12362 ThreadPool::TPHandle &handle)
12363 {
12364 int local_min = cct->_conf->osd_backfill_scan_min;
12365 int local_max = cct->_conf->osd_backfill_scan_max;
12366
12367 if (bi->version < info.log_tail) {
12368 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12369 << dendl;
12370 if (last_update_applied >= info.log_tail) {
12371 bi->version = last_update_applied;
12372 } else {
12373 osr->flush();
12374 bi->version = info.last_update;
12375 }
12376 scan_range(local_min, local_max, bi, handle);
12377 }
12378
12379 if (bi->version >= projected_last_update) {
12380 dout(10) << __func__<< ": bi is current " << dendl;
12381 assert(bi->version == projected_last_update);
12382 } else if (bi->version >= info.log_tail) {
12383 if (pg_log.get_log().empty() && projected_log.empty()) {
12384 /* Because we don't move log_tail on split, the log might be
12385 * empty even if log_tail != last_update. However, the only
12386 * way to get here with an empty log is if log_tail is actually
12387 * eversion_t(), because otherwise the entry which changed
12388 * last_update since the last scan would have to be present.
12389 */
12390 assert(bi->version == eversion_t());
12391 return;
12392 }
12393
12394 dout(10) << __func__<< ": bi is old, (" << bi->version
12395 << ") can be updated with log to projected_last_update "
12396 << projected_last_update << dendl;
12397
12398 auto func = [&](const pg_log_entry_t &e) {
12399 dout(10) << __func__ << ": updating from version " << e.version
12400 << dendl;
12401 const hobject_t &soid = e.soid;
12402 if (soid >= bi->begin &&
12403 soid < bi->end) {
12404 if (e.is_update()) {
12405 dout(10) << __func__ << ": " << e.soid << " updated to version "
12406 << e.version << dendl;
12407 bi->objects.erase(e.soid);
12408 bi->objects.insert(
12409 make_pair(
12410 e.soid,
12411 e.version));
12412 } else if (e.is_delete()) {
12413 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12414 bi->objects.erase(e.soid);
12415 }
12416 }
12417 };
12418 dout(10) << "scanning pg log first" << dendl;
12419 pg_log.get_log().scan_log_after(bi->version, func);
12420 dout(10) << "scanning projected log" << dendl;
12421 projected_log.scan_log_after(bi->version, func);
12422 bi->version = projected_last_update;
12423 } else {
12424 assert(0 == "scan_range should have raised bi->version past log_tail");
12425 }
12426 }
12427
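/*
 * List between min and max objects starting at bi->begin and record
 * their versions in bi->objects, setting bi->end past the last item
 * scanned; versions come from cached object contexts when available,
 * otherwise from the OI_ATTR xattr on disk.
 */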
12428 void PrimaryLogPG::scan_range(
12429 int min, int max, BackfillInterval *bi,
12430 ThreadPool::TPHandle &handle)
12431 {
12432 assert(is_locked());
12433 dout(10) << "scan_range from " << bi->begin << dendl;
12434 bi->clear_objects();
12435
12436 vector<hobject_t> ls;
12437 ls.reserve(max);
12438 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12439 assert(r >= 0);
12440 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12441 dout(20) << ls << dendl;
12442
12443 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12444 handle.reset_tp_timeout();
12445 ObjectContextRef obc;
12446 if (is_primary())
12447 obc = object_contexts.lookup(*p);
12448 if (obc) {
12449 bi->objects[*p] = obc->obs.oi.version;
12450 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12451 } else {
12452 bufferlist bl;
12453 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12454
12455 /* If the object does not exist here, it must have been removed
12456	       * between the objects_list_partial call and here. This can happen
12457 * for the first item in the range, which is usually last_backfill.
12458 */
12459 if (r == -ENOENT)
12460 continue;
12461
12462 assert(r >= 0);
12463 object_info_t oi(bl);
12464 bi->objects[*p] = oi.version;
12465 dout(20) << " " << *p << " " << oi.version << dendl;
12466 }
12467 }
12468 }
12469
12470
12471 /** check_local
12472 *
12473 * verifies that stray objects have been deleted
12474 */
12475 void PrimaryLogPG::check_local()
12476 {
12477 dout(10) << __func__ << dendl;
12478
12479 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12480
12481 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12482 return;
12483
12484 // just scan the log.
12485 set<hobject_t> did;
12486 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12487 p != pg_log.get_log().log.rend();
12488 ++p) {
12489 if (did.count(p->soid))
12490 continue;
12491 did.insert(p->soid);
12492
12493 if (p->is_delete() && !is_missing_object(p->soid)) {
12494 dout(10) << " checking " << p->soid
12495 << " at " << p->version << dendl;
12496 struct stat st;
12497 int r = osd->store->stat(
12498 ch,
12499 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12500 &st);
12501 if (r != -ENOENT) {
12502 derr << __func__ << " " << p->soid << " exists, but should have been "
12503 << "deleted" << dendl;
12504 assert(0 == "erroneously present object");
12505 }
12506 } else {
12507 // ignore old(+missing) objects
12508 }
12509 }
12510 }
12511
12512
12513
12514 // ===========================
12515 // hit sets
12516
12517 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12518 {
12519 ostringstream ss;
12520 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12521 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12522 info.pgid.ps(), info.pgid.pool(),
12523 cct->_conf->osd_hit_set_namespace);
12524 dout(20) << __func__ << " " << hoid << dendl;
12525 return hoid;
12526 }
12527
12528 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12529 utime_t end,
12530 bool using_gmt)
12531 {
12532 ostringstream ss;
12533 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12534 if (using_gmt) {
12535 start.gmtime(ss) << "_";
12536 end.gmtime(ss);
12537 } else {
12538 start.localtime(ss) << "_";
12539 end.localtime(ss);
12540 }
12541 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12542 info.pgid.ps(), info.pgid.pool(),
12543 cct->_conf->osd_hit_set_namespace);
12544 dout(20) << __func__ << " " << hoid << dendl;
12545 return hoid;
12546 }
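// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the archive-object naming scheme:
// "hit_set_<pgid>_archive_<start>_<end>" in the reserved hit set
// namespace. std::put_time stands in for utime_t's stream output, so
// the exact timestamp format differs; only the name shape is the point.
//
//   #include <ctime>
//   #include <iomanip>
//   #include <iostream>
//   #include <sstream>
//   #include <string>
//
//   static std::string archive_name(const std::string &pgid,
//                                   std::time_t start, std::time_t end)
//   {
//     std::ostringstream ss;
//     ss << "hit_set_" << pgid << "_archive_"
//        << std::put_time(std::gmtime(&start), "%F %T") << "_"
//        << std::put_time(std::gmtime(&end), "%F %T");
//     return ss.str();
//   }
//
//   int main() {
//     std::cout << archive_name("1.0", 0, 3600) << "\n";
//   }
// ---------------------------------------------------------------------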
12547
12548 void PrimaryLogPG::hit_set_clear()
12549 {
12550 dout(20) << __func__ << dendl;
12551 hit_set.reset();
12552 hit_set_start_stamp = utime_t();
12553 }
12554
12555 void PrimaryLogPG::hit_set_setup()
12556 {
12557 if (!is_active() ||
12558 !is_primary()) {
12559 hit_set_clear();
12560 return;
12561 }
12562
12563 if (is_active() && is_primary() &&
12564 (!pool.info.hit_set_count ||
12565 !pool.info.hit_set_period ||
12566 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12567 hit_set_clear();
12568
12569 // only primary is allowed to remove all the hit set objects
12570 hit_set_remove_all();
12571 return;
12572 }
12573
12574 // FIXME: discard any previous data for now
12575 hit_set_create();
12576
12577 // include any writes we know about from the pg log. this doesn't
12578 // capture reads, but it is better than nothing!
12579 hit_set_apply_log();
12580 }
12581
12582 void PrimaryLogPG::hit_set_remove_all()
12583 {
12584 // If any archives are degraded or blocked by scrub we skip this entirely
12585 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12586 p != info.hit_set.history.end();
12587 ++p) {
12588 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12589
12590 // Once we hit a degraded or blocked object just bail out
12591 if (is_degraded_or_backfilling_object(aoid))
12592 return;
12593 if (scrubber.write_blocked_by_scrub(aoid))
12594 return;
12595 }
12596
12597 if (!info.hit_set.history.empty()) {
12598 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12599 assert(p != info.hit_set.history.rend());
12600 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12601 assert(!is_degraded_or_backfilling_object(oid));
12602 ObjectContextRef obc = get_object_context(oid, false);
12603 assert(obc);
12604
12605 OpContextUPtr ctx = simple_opc_create(obc);
12606 ctx->at_version = get_next_version();
12607 ctx->updated_hset_history = info.hit_set;
12608 utime_t now = ceph_clock_now();
12609 ctx->mtime = now;
12610 hit_set_trim(ctx, 0);
12611 simple_opc_submit(std::move(ctx));
12612 }
12613
12614 info.hit_set = pg_hit_set_history_t();
12615 if (agent_state) {
12616 agent_state->discard_hit_sets();
12617 }
12618 }
12619
12620 void PrimaryLogPG::hit_set_create()
12621 {
12622 utime_t now = ceph_clock_now();
12623 // make a copy of the params to modify
12624 HitSet::Params params(pool.info.hit_set_params);
12625
12626 dout(20) << __func__ << " " << params << dendl;
12627 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12628 BloomHitSet::Params *p =
12629 static_cast<BloomHitSet::Params*>(params.impl.get());
12630
12631 // convert false positive rate so it holds up across the full period
12632 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12633 if (p->get_fpp() <= 0.0)
12634 p->set_fpp(.01); // fpp cannot be zero!
12635
12636 // if we don't have a specified size, estimate the target size based
12637 // on the previous bin!
12638 if (p->target_size == 0 && hit_set) {
12639 utime_t dur = now - hit_set_start_stamp;
12640 unsigned unique = hit_set->approx_unique_insert_count();
12641 dout(20) << __func__ << " previous set had approx " << unique
12642 << " unique items over " << dur << " seconds" << dendl;
12643 p->target_size = (double)unique * (double)pool.info.hit_set_period
12644 / (double)dur;
12645 }
12646 if (p->target_size <
12647 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12648 p->target_size = cct->_conf->osd_hit_set_min_size;
12649
12650 if (p->target_size
12651 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12652 p->target_size = cct->_conf->osd_hit_set_max_size;
12653
12654 p->seed = now.sec();
12655
12656 dout(10) << __func__ << " target_size " << p->target_size
12657 << " fpp " << p->get_fpp() << dendl;
12658 }
12659 hit_set.reset(new HitSet(params));
12660 hit_set_start_stamp = now;
12661 }
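// ---------------------------------------------------------------------
// [Editor's note] The bloom sizing logic above, reduced to a hedged
// standalone helper: estimate the next filter's target size from how
// many unique inserts the previous interval saw, scaled to the full
// hit_set_period, then clamp to the configured min/max.
// `estimate_target_size` is a hypothetical name.
//
//   #include <algorithm>
//   #include <cstdint>
//
//   static uint64_t estimate_target_size(uint64_t prev_unique,
//                                        uint64_t period_sec,
//                                        uint64_t prev_duration_sec,
//                                        uint64_t min_size,
//                                        uint64_t max_size)
//   {
//     uint64_t est = prev_duration_sec
//       ? prev_unique * period_sec / prev_duration_sec
//       : min_size;                       // no history: start at the floor
//     return std::min(std::max(est, min_size), max_size);
//   }
// ---------------------------------------------------------------------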
12662
12663 /**
12664 * apply log entries to set
12665 *
12666 * this would only happen after peering, to at least capture writes
12667 * during an interval that was potentially lost.
12668 */
12669 bool PrimaryLogPG::hit_set_apply_log()
12670 {
12671 if (!hit_set)
12672 return false;
12673
12674 eversion_t to = info.last_update;
12675 eversion_t from = info.hit_set.current_last_update;
12676 if (to <= from) {
12677 dout(20) << __func__ << " no update" << dendl;
12678 return false;
12679 }
12680
12681 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12682 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12683 while (p != pg_log.get_log().log.rend() && p->version > to)
12684 ++p;
12685 while (p != pg_log.get_log().log.rend() && p->version > from) {
12686 hit_set->insert(p->soid);
12687 ++p;
12688 }
12689
12690 return true;
12691 }
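// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the reverse log scan above:
// collect the objects written in the half-open version range (from, to],
// walking newest-to-oldest. `Entry` and `touched` are hypothetical;
// real entries are pg_log_entry_t and versions are eversion_t.
//
//   #include <list>
//   #include <set>
//   #include <string>
//   #include <utility>
//
//   using Entry = std::pair<unsigned, std::string>;  // (version, oid)
//
//   static std::set<std::string> touched(const std::list<Entry> &log,
//                                        unsigned from, unsigned to)
//   {
//     std::set<std::string> out;
//     auto p = log.rbegin();
//     while (p != log.rend() && p->first > to)
//       ++p;                              // skip entries newer than 'to'
//     while (p != log.rend() && p->first > from) {
//       out.insert(p->second);            // a write inside (from, to]
//       ++p;
//     }
//     return out;
//   }
// ---------------------------------------------------------------------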
12692
12693 void PrimaryLogPG::hit_set_persist()
12694 {
12695 dout(10) << __func__ << dendl;
12696 bufferlist bl;
12697 unsigned max = pool.info.hit_set_count;
12698
12699 utime_t now = ceph_clock_now();
12700 hobject_t oid;
12701
12702 // If any archives are degraded or blocked by scrub we skip this persist request
12703 // account for the additional entry being added below
12704 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12705 p != info.hit_set.history.end();
12706 ++p) {
12707 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12708
12709 // Once we hit a degraded or blocked object just skip further trim
12710 if (is_degraded_or_backfilling_object(aoid))
12711 return;
12712 if (scrubber.write_blocked_by_scrub(aoid))
12713 return;
12714 }
12715
12716 // If backfill is in progress and we could possibly overlap with the
12717 // hit_set_* objects, back off. Since these all have
12718 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12719 // look just at that. This is necessary because our transactions
12720 // may include a modify of the new hit_set *and* a delete of the
12721 // old one, and this may span the backfill boundary.
12722 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12723 p != backfill_targets.end();
12724 ++p) {
12725 assert(peer_info.count(*p));
12726 const pg_info_t& pi = peer_info[*p];
12727 if (pi.last_backfill == hobject_t() ||
12728 pi.last_backfill.get_hash() == info.pgid.ps()) {
12729 dout(10) << __func__ << " backfill target osd." << *p
12730 << " last_backfill has not progressed past pgid ps"
12731 << dendl;
12732 return;
12733 }
12734 }
12735
12736
12737 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12738 new_hset.begin = hit_set_start_stamp;
12739 new_hset.end = now;
12740 oid = get_hit_set_archive_object(
12741 new_hset.begin,
12742 new_hset.end,
12743 new_hset.using_gmt);
12744
12745 // If the current object is blocked by a scrub we skip this persist request
12746 if (scrubber.write_blocked_by_scrub(oid))
12747 return;
12748
12749 hit_set->seal();
12750 ::encode(*hit_set, bl);
12751 dout(20) << __func__ << " archive " << oid << dendl;
12752
12753 if (agent_state) {
12754 agent_state->add_hit_set(new_hset.begin, hit_set);
12755 uint32_t size = agent_state->hit_set_map.size();
12756 if (size >= pool.info.hit_set_count) {
12757 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12758 }
12759 hit_set_in_memory_trim(size);
12760 }
12761
12762 ObjectContextRef obc = get_object_context(oid, true);
12763 OpContextUPtr ctx = simple_opc_create(obc);
12764
12765 ctx->at_version = get_next_version();
12766 ctx->updated_hset_history = info.hit_set;
12767 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12768
12769 updated_hit_set_hist.current_last_update = info.last_update;
12770 new_hset.version = ctx->at_version;
12771
12772 updated_hit_set_hist.history.push_back(new_hset);
12773 hit_set_create();
12774
12775 // fabricate an object_info_t and SnapSet
12776 obc->obs.oi.version = ctx->at_version;
12777 obc->obs.oi.mtime = now;
12778 obc->obs.oi.size = bl.length();
12779 obc->obs.exists = true;
12780 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12781
12782 ctx->new_obs = obc->obs;
12783
12784 obc->ssc->snapset.head_exists = true;
12785 ctx->new_snapset = obc->ssc->snapset;
12786
12787 ctx->delta_stats.num_objects++;
12788 ctx->delta_stats.num_objects_hit_set_archive++;
12789 ctx->delta_stats.num_bytes += bl.length();
12790 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12791
12792 bufferlist bss;
12793 ::encode(ctx->new_snapset, bss);
12794 bufferlist boi(sizeof(ctx->new_obs.oi));
12795 ::encode(ctx->new_obs.oi, boi,
12796 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12797
12798 ctx->op_t->create(oid);
12799 if (bl.length()) {
12800 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12801 }
12802 map <string, bufferlist> attrs;
12803 attrs[OI_ATTR].claim(boi);
12804 attrs[SS_ATTR].claim(bss);
12805 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12806 ctx->log.push_back(
12807 pg_log_entry_t(
12808 pg_log_entry_t::MODIFY,
12809 oid,
12810 ctx->at_version,
12811 eversion_t(),
12812 0,
12813 osd_reqid_t(),
12814 ctx->mtime,
12815 0)
12816 );
12817
12818 hit_set_trim(ctx, max);
12819
12820 simple_opc_submit(std::move(ctx));
12821 }
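// ---------------------------------------------------------------------
// [Editor's note] The backfill-boundary guard above, as a hedged
// standalone predicate: every hit_set_* object hashes to the PG's ps()
// and that hash sorts first, so persisting is safe only once every
// backfill target has progressed past it. `Peer` models just the two
// facts the loop checks (backfill started; hash position reached).
//
//   #include <cstdint>
//   #include <vector>
//
//   struct Peer { bool started; uint32_t last_backfill_hash; };
//
//   static bool safe_to_write_hit_sets(const std::vector<Peer> &peers,
//                                      uint32_t pg_ps)
//   {
//     for (const Peer &p : peers) {
//       if (!p.started || p.last_backfill_hash == pg_ps)
//         return false;   // a target could still receive these objects
//     }
//     return true;
//   }
// ---------------------------------------------------------------------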
12822
12823 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12824 {
12825 assert(ctx->updated_hset_history);
12826 pg_hit_set_history_t &updated_hit_set_hist =
12827 *(ctx->updated_hset_history);
12828 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12829 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12830 assert(p != updated_hit_set_hist.history.end());
12831 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12832
12833 assert(!is_degraded_or_backfilling_object(oid));
12834
12835 dout(20) << __func__ << " removing " << oid << dendl;
12836 ++ctx->at_version.version;
12837 ctx->log.push_back(
12838 pg_log_entry_t(pg_log_entry_t::DELETE,
12839 oid,
12840 ctx->at_version,
12841 p->version,
12842 0,
12843 osd_reqid_t(),
12844 ctx->mtime,
12845 0));
12846
12847 ctx->op_t->remove(oid);
12848 updated_hit_set_hist.history.pop_front();
12849
12850 ObjectContextRef obc = get_object_context(oid, false);
12851 assert(obc);
12852 --ctx->delta_stats.num_objects;
12853 --ctx->delta_stats.num_objects_hit_set_archive;
12854 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12855 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12856 }
12857 }
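// ---------------------------------------------------------------------
// [Editor's note] hit_set_trim() above, minus the log entries and stat
// accounting, as a hedged sketch: drop the oldest history entries until
// at most 'max' remain, reporting what was removed. `trim_history` is a
// hypothetical helper.
//
//   #include <cstddef>
//   #include <list>
//   #include <string>
//
//   static std::list<std::string> trim_history(std::list<std::string> &hist,
//                                              std::size_t max)
//   {
//     std::list<std::string> removed;
//     while (hist.size() > max) {
//       removed.push_back(hist.front());  // oldest entries go first
//       hist.pop_front();
//     }
//     return removed;
//   }
// ---------------------------------------------------------------------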
12858
12859 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12860 {
12861 while (agent_state->hit_set_map.size() > max_in_memory) {
12862 agent_state->remove_oldest_hit_set();
12863 }
12864 }
12865
12866
12867 // =======================================
12868 // cache agent
12869
12870 void PrimaryLogPG::agent_setup()
12871 {
12872 assert(is_locked());
12873 if (!is_active() ||
12874 !is_primary() ||
12875 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12876 pool.info.tier_of < 0 ||
12877 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12878 agent_clear();
12879 return;
12880 }
12881 if (!agent_state) {
12882 agent_state.reset(new TierAgentState);
12883
12884 // choose random starting position
12885 agent_state->position = hobject_t();
12886 agent_state->position.pool = info.pgid.pool();
12887 agent_state->position.set_hash(pool.info.get_random_pg_position(
12888 info.pgid.pgid,
12889 rand()));
12890 agent_state->start = agent_state->position;
12891
12892 dout(10) << __func__ << " allocated new state, position "
12893 << agent_state->position << dendl;
12894 } else {
12895 dout(10) << __func__ << " keeping existing state" << dendl;
12896 }
12897
12898 if (info.stats.stats_invalid) {
12899 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12900 }
12901
12902 agent_choose_mode();
12903 }
12904
12905 void PrimaryLogPG::agent_clear()
12906 {
12907 agent_stop();
12908 agent_state.reset(NULL);
12909 }
12910
12911 // Return false if no objects were operated on since the start of the object hash space
12912 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12913 {
12914 lock();
12915 if (!agent_state) {
12916 dout(10) << __func__ << " no agent state, stopping" << dendl;
12917 unlock();
12918 return true;
12919 }
12920
12921 assert(!deleting);
12922
12923 if (agent_state->is_idle()) {
12924 dout(10) << __func__ << " idle, stopping" << dendl;
12925 unlock();
12926 return true;
12927 }
12928
12929 osd->logger->inc(l_osd_agent_wake);
12930
12931 dout(10) << __func__
12932 << " max " << start_max
12933 << ", flush " << agent_state->get_flush_mode_name()
12934 << ", evict " << agent_state->get_evict_mode_name()
12935 << ", pos " << agent_state->position
12936 << dendl;
12937 assert(is_primary());
12938 assert(is_active());
12939
12940 agent_load_hit_sets();
12941
12942 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12943 assert(base_pool);
12944
12945 int ls_min = 1;
12946 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12947
12948 // list some objects. this conveniently lists clones (oldest to
12949 // newest) before heads... the same order we want to flush in.
12950 //
12951 // NOTE: do not flush the Sequencer. we will assume that the
12952 // listing we get back is imprecise.
12953 vector<hobject_t> ls;
12954 hobject_t next;
12955 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12956 &ls, &next);
12957 assert(r >= 0);
12958 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12959 int started = 0;
12960 for (vector<hobject_t>::iterator p = ls.begin();
12961 p != ls.end();
12962 ++p) {
12963 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12964 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12965 osd->logger->inc(l_osd_agent_skip);
12966 continue;
12967 }
12968 if (is_degraded_or_backfilling_object(*p)) {
12969 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12970 osd->logger->inc(l_osd_agent_skip);
12971 continue;
12972 }
12973 if (is_missing_object(p->get_head())) {
12974 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12975 osd->logger->inc(l_osd_agent_skip);
12976 continue;
12977 }
12978 ObjectContextRef obc = get_object_context(*p, false, NULL);
12979 if (!obc) {
12980 // we didn't flush; we may miss something here.
12981 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12982 osd->logger->inc(l_osd_agent_skip);
12983 continue;
12984 }
12985 if (!obc->obs.exists) {
12986 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12987 osd->logger->inc(l_osd_agent_skip);
12988 continue;
12989 }
12990 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12991 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12992 osd->logger->inc(l_osd_agent_skip);
12993 continue;
12994 }
12995 if (obc->is_blocked()) {
12996 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12997 osd->logger->inc(l_osd_agent_skip);
12998 continue;
12999 }
13000 if (obc->is_request_pending()) {
13001 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13002 osd->logger->inc(l_osd_agent_skip);
13003 continue;
13004 }
13005
13006 // be careful flushing omap to an EC pool.
13007 if (!base_pool->supports_omap() &&
13008 obc->obs.oi.is_omap()) {
13009 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13010 osd->logger->inc(l_osd_agent_skip);
13011 continue;
13012 }
13013
13014 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13015 agent_maybe_evict(obc, false))
13016 ++started;
13017 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13018 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13019 ++started;
13020 --agent_flush_quota;
13021 }
13022 if (started >= start_max) {
13023 // If finishing early, set "next" to the next object
13024 if (++p != ls.end())
13025 next = *p;
13026 break;
13027 }
13028 }
13029
13030 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13031 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13032 agent_state->hist_age = 0;
13033 agent_state->temp_hist.decay();
13034 }
13035
13036 // Total objects operated on so far
13037 int total_started = agent_state->started + started;
13038 bool need_delay = false;
13039
13040 dout(20) << __func__ << " start pos " << agent_state->position
13041 << " next start pos " << next
13042 << " started " << total_started << dendl;
13043
13044 // See if we've made a full pass over the object hash space.
13045 // This might check at most ls_max objects a second time to notice that
13046 // we've checked every object at least once.
13047 if (agent_state->position < agent_state->start &&
13048 next >= agent_state->start) {
13049 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13050 if (total_started == 0)
13051 need_delay = true;
13052 else
13053 total_started = 0;
13054 agent_state->start = next;
13055 }
13056 agent_state->started = total_started;
13057
13058 // See if we are starting from the beginning
13059 if (next.is_max())
13060 agent_state->position = hobject_t();
13061 else
13062 agent_state->position = next;
13063
13064 // Discard old in memory HitSets
13065 hit_set_in_memory_trim(pool.info.hit_set_count);
13066
13067 if (need_delay) {
13068 assert(agent_state->delaying == false);
13069 agent_delay();
13070 unlock();
13071 return false;
13072 }
13073 agent_choose_mode();
13074 unlock();
13075 return true;
13076 }
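// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the wrap-around test agent_work()
// applies above: the agent sweeps a circular hash space, and a pass is
// complete when the cursor, having wrapped below its recorded start,
// advances back to or past it. If a whole pass started no work, the
// agent backs off instead of spinning. Function names are hypothetical.
//
//   #include <cstdint>
//
//   static bool completed_full_pass(uint32_t start, uint32_t position,
//                                   uint32_t next)
//   {
//     return position < start && next >= start;
//   }
//
//   static bool should_delay(uint32_t start, uint32_t position,
//                            uint32_t next, int total_started)
//   {
//     return completed_full_pass(start, position, next) &&
//            total_started == 0;          // full lap, nothing to do
//   }
// ---------------------------------------------------------------------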
13077
13078 void PrimaryLogPG::agent_load_hit_sets()
13079 {
13080 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13081 return;
13082 }
13083
13084 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13085 dout(10) << __func__ << dendl;
13086 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13087 p != info.hit_set.history.end(); ++p) {
13088 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13089 dout(10) << __func__ << " loading " << p->begin << "-"
13090 << p->end << dendl;
13091 if (!pool.info.is_replicated()) {
13092 // FIXME: EC not supported here yet
13093 derr << __func__ << " on non-replicated pool" << dendl;
13094 break;
13095 }
13096
13097 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13098 if (is_unreadable_object(oid)) {
13099 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13100 break;
13101 }
13102
13103 ObjectContextRef obc = get_object_context(oid, false);
13104 if (!obc) {
13105 derr << __func__ << ": could not load hitset " << oid << dendl;
13106 break;
13107 }
13108
13109 bufferlist bl;
13110 {
13111 obc->ondisk_read_lock();
13112 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13113 assert(r >= 0);
13114 obc->ondisk_read_unlock();
13115 }
13116 HitSetRef hs(new HitSet);
13117 bufferlist::iterator pbl = bl.begin();
13118 ::decode(*hs, pbl);
13119 agent_state->add_hit_set(p->begin.sec(), hs);
13120 }
13121 }
13122 }
13123 }
13124
13125 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13126 {
13127 if (!obc->obs.oi.is_dirty()) {
13128 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13129 osd->logger->inc(l_osd_agent_skip);
13130 return false;
13131 }
13132 if (obc->obs.oi.is_cache_pinned()) {
13133 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13134 osd->logger->inc(l_osd_agent_skip);
13135 return false;
13136 }
13137
13138 utime_t now = ceph_clock_now();
13139 utime_t ob_local_mtime;
13140 if (obc->obs.oi.local_mtime != utime_t()) {
13141 ob_local_mtime = obc->obs.oi.local_mtime;
13142 } else {
13143 ob_local_mtime = obc->obs.oi.mtime;
13144 }
13145 bool evict_mode_full =
13146 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13147 if (!evict_mode_full &&
13148 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13149 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13150 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13151 osd->logger->inc(l_osd_agent_skip);
13152 return false;
13153 }
13154
13155 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13156 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13157 osd->logger->inc(l_osd_agent_skip);
13158 return false;
13159 }
13160
13161 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13162
13163 // FIXME: flush anything dirty, regardless of what distribution of
13164 // ages we expect.
13165
13166 hobject_t oid = obc->obs.oi.soid;
13167 osd->agent_start_op(oid);
13168 // no need to capture a pg ref, can't outlive fop or ctx
13169 std::function<void()> on_flush = [this, oid]() {
13170 osd->agent_finish_op(oid);
13171 };
13172
13173 int result = start_flush(
13174 OpRequestRef(), obc, false, NULL,
13175 on_flush);
13176 if (result != -EINPROGRESS) {
13177 on_flush();
13178 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13179 << " with " << result << dendl;
13180 osd->logger->inc(l_osd_agent_skip);
13181 return false;
13182 }
13183
13184 osd->logger->inc(l_osd_agent_flush);
13185 return true;
13186 }
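// ---------------------------------------------------------------------
// [Editor's note] The flush age gate above, reduced to plain time_t
// arithmetic as a hedged sketch: a head object is "too young" to flush
// until cache_min_flush_age seconds have passed since its last local
// write; snapshots are immutable and never delayed, and full-evict mode
// overrides the gate. `too_young_to_flush` is a hypothetical name.
//
//   #include <ctime>
//
//   static bool too_young_to_flush(std::time_t now, std::time_t local_mtime,
//                                  long min_flush_age_sec,
//                                  bool is_snap, bool evict_mode_full)
//   {
//     if (evict_mode_full || is_snap)
//       return false;                     // gate does not apply
//     return local_mtime + min_flush_age_sec > now;
//   }
// ---------------------------------------------------------------------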
13187
13188 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13189 {
13190 const hobject_t& soid = obc->obs.oi.soid;
13191 if (!after_flush && obc->obs.oi.is_dirty()) {
13192 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13193 return false;
13194 }
13195 if (!obc->obs.oi.watchers.empty()) {
13196 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13197 return false;
13198 }
13199 if (obc->is_blocked()) {
13200 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13201 return false;
13202 }
13203 if (obc->obs.oi.is_cache_pinned()) {
13204 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13205 return false;
13206 }
13207
13208 if (soid.snap == CEPH_NOSNAP) {
13209 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13210 if (result < 0) {
13211 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13212 return false;
13213 }
13214 }
13215
13216 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13217 // is this object older than cache_min_evict_age?
13218 utime_t now = ceph_clock_now();
13219 utime_t ob_local_mtime;
13220 if (obc->obs.oi.local_mtime != utime_t()) {
13221 ob_local_mtime = obc->obs.oi.local_mtime;
13222 } else {
13223 ob_local_mtime = obc->obs.oi.mtime;
13224 }
13225 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13226 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13227 osd->logger->inc(l_osd_agent_skip);
13228 return false;
13229 }
13230 // is this object old and/or cold enough?
13231 int temp = 0;
13232 uint64_t temp_upper = 0, temp_lower = 0;
13233 if (hit_set)
13234 agent_estimate_temp(soid, &temp);
13235 agent_state->temp_hist.add(temp);
13236 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13237
13238 dout(20) << __func__
13239 << " temp " << temp
13240 << " pos " << temp_lower << "-" << temp_upper
13241 << ", evict_effort " << agent_state->evict_effort
13242 << dendl;
13243 dout(30) << "agent_state:\n";
13244 Formatter *f = Formatter::create("");
13245 f->open_object_section("agent_state");
13246 agent_state->dump(f);
13247 f->close_section();
13248 f->flush(*_dout);
13249 delete f;
13250 *_dout << dendl;
13251
13252 if (1000000 - temp_upper >= agent_state->evict_effort)
13253 return false;
13254 }
13255
13256 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13257 OpContextUPtr ctx = simple_opc_create(obc);
13258
13259 if (!ctx->lock_manager.get_lock_type(
13260 ObjectContext::RWState::RWWRITE,
13261 obc->obs.oi.soid,
13262 obc,
13263 OpRequestRef())) {
13264 close_op_ctx(ctx.release());
13265 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13266 return false;
13267 }
13268
13269 osd->agent_start_evict_op();
13270 ctx->register_on_finish(
13271 [this]() {
13272 osd->agent_finish_evict_op();
13273 });
13274
13275 ctx->at_version = get_next_version();
13276 assert(ctx->new_obs.exists);
13277 int r = _delete_oid(ctx.get(), true, false);
13278 if (obc->obs.oi.is_omap())
13279 ctx->delta_stats.num_objects_omap--;
13280 ctx->delta_stats.num_evict++;
13281 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13282 if (obc->obs.oi.is_dirty())
13283 --ctx->delta_stats.num_objects_dirty;
13284 assert(r == 0);
13285 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13286 simple_opc_submit(std::move(ctx));
13287 osd->logger->inc(l_osd_tier_evict);
13288 osd->logger->inc(l_osd_agent_evict);
13289 return true;
13290 }
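// ---------------------------------------------------------------------
// [Editor's note] The temperature test above, in isolation, as a hedged
// sketch: temp_upper is the object's position in the temperature
// histogram in parts-per-million (higher = hotter) and evict_effort is
// the agent's aggressiveness, also in ppm. Only objects in the coldest
// evict_effort-sized slice qualify for eviction.
//
//   #include <cstdint>
//
//   static bool cold_enough_to_evict(uint64_t temp_upper_ppm,
//                                    uint64_t evict_effort_ppm)
//   {
//     return 1000000 - temp_upper_ppm < evict_effort_ppm;
//   }
// ---------------------------------------------------------------------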
13291
13292 void PrimaryLogPG::agent_stop()
13293 {
13294 dout(20) << __func__ << dendl;
13295 if (agent_state && !agent_state->is_idle()) {
13296 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13297 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13298 osd->agent_disable_pg(this, agent_state->evict_effort);
13299 }
13300 }
13301
13302 void PrimaryLogPG::agent_delay()
13303 {
13304 dout(20) << __func__ << dendl;
13305 if (agent_state && !agent_state->is_idle()) {
13306 assert(agent_state->delaying == false);
13307 agent_state->delaying = true;
13308 osd->agent_disable_pg(this, agent_state->evict_effort);
13309 }
13310 }
13311
13312 void PrimaryLogPG::agent_choose_mode_restart()
13313 {
13314 dout(20) << __func__ << dendl;
13315 lock();
13316 if (agent_state && agent_state->delaying) {
13317 agent_state->delaying = false;
13318 agent_choose_mode(true);
13319 }
13320 unlock();
13321 }
13322
13323 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13324 {
13325 bool requeued = false;
13326 // Let delay play out
13327 if (agent_state->delaying) {
13328 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13329 return requeued;
13330 }
13331
13332 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13333 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13334 unsigned evict_effort = 0;
13335
13336 if (info.stats.stats_invalid) {
13337 // idle; stats can't be trusted until we scrub.
13338 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13339 goto skip_calc;
13340 }
13341
13342 {
13343 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13344 assert(divisor > 0);
13345
13346 // adjust (effective) user objects down based on the number
13347 // of HitSet objects, which should not count toward our total since
13348 // they cannot be flushed.
13349 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13350
13351 // also exclude omap objects if ec backing pool
13352 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13353 assert(base_pool);
13354 if (!base_pool->supports_omap())
13355 unflushable += info.stats.stats.sum.num_objects_omap;
13356
13357 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13358 if (num_user_objects > unflushable)
13359 num_user_objects -= unflushable;
13360 else
13361 num_user_objects = 0;
13362
13363 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13364 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13365 num_user_bytes -= unflushable_bytes;
13366 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13367 num_user_bytes += num_overhead_bytes;
13368
13369 // also reduce the num_dirty by num_objects_omap
13370 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13371 if (!base_pool->supports_omap()) {
13372 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13373 num_dirty -= info.stats.stats.sum.num_objects_omap;
13374 else
13375 num_dirty = 0;
13376 }
13377
13378 dout(10) << __func__
13379 << " flush_mode: "
13380 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13381 << " evict_mode: "
13382 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13383 << " num_objects: " << info.stats.stats.sum.num_objects
13384 << " num_bytes: " << info.stats.stats.sum.num_bytes
13385 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13386 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13387 << " num_dirty: " << num_dirty
13388 << " num_user_objects: " << num_user_objects
13389 << " num_user_bytes: " << num_user_bytes
13390 << " num_overhead_bytes: " << num_overhead_bytes
13391 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13392 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13393 << dendl;
13394
13395 // get dirty, full ratios
13396 uint64_t dirty_micro = 0;
13397 uint64_t full_micro = 0;
13398 if (pool.info.target_max_bytes && num_user_objects > 0) {
13399 uint64_t avg_size = num_user_bytes / num_user_objects;
13400 dirty_micro =
13401 num_dirty * avg_size * 1000000 /
13402 MAX(pool.info.target_max_bytes / divisor, 1);
13403 full_micro =
13404 num_user_objects * avg_size * 1000000 /
13405 MAX(pool.info.target_max_bytes / divisor, 1);
13406 }
13407 if (pool.info.target_max_objects > 0) {
13408 uint64_t dirty_objects_micro =
13409 num_dirty * 1000000 /
13410 MAX(pool.info.target_max_objects / divisor, 1);
13411 if (dirty_objects_micro > dirty_micro)
13412 dirty_micro = dirty_objects_micro;
13413 uint64_t full_objects_micro =
13414 num_user_objects * 1000000 /
13415 MAX(pool.info.target_max_objects / divisor, 1);
13416 if (full_objects_micro > full_micro)
13417 full_micro = full_objects_micro;
13418 }
13419 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13420 << " full " << ((float)full_micro / 1000000.0)
13421 << dendl;
13422
13423 // flush mode
13424 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13425 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13426 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13427 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13428 flush_target += flush_slop;
13429 flush_high_target += flush_slop;
13430 } else {
13431 flush_target -= MIN(flush_target, flush_slop);
13432 flush_high_target -= MIN(flush_high_target, flush_slop);
13433 }
13434
13435 if (dirty_micro > flush_high_target) {
13436 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13437 } else if (dirty_micro > flush_target) {
13438 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13439 }
13440
13441 // evict mode
13442 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13443 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13444 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13445 evict_target += evict_slop;
13446 else
13447 evict_target -= MIN(evict_target, evict_slop);
13448
13449 if (full_micro > 1000000) {
13450 // evict anything clean
13451 evict_mode = TierAgentState::EVICT_MODE_FULL;
13452 evict_effort = 1000000;
13453 } else if (full_micro > evict_target) {
13454 // set effort in [0..1] range based on where we are between evict_target and completely full
13455 evict_mode = TierAgentState::EVICT_MODE_SOME;
13456 uint64_t over = full_micro - evict_target;
13457 uint64_t span = 1000000 - evict_target;
13458 evict_effort = MAX(over * 1000000 / span,
13459 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13460
13461 // quantize effort to avoid too much reordering in the agent_queue.
13462 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13463 assert(inc > 0);
13464 uint64_t was = evict_effort;
13465 evict_effort -= evict_effort % inc;
13466 if (evict_effort < inc)
13467 evict_effort = inc;
13468 assert(evict_effort >= inc && evict_effort <= 1000000);
13469 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13470 }
13471 }
13472
13473 skip_calc:
13474 bool old_idle = agent_state->is_idle();
13475 if (flush_mode != agent_state->flush_mode) {
13476 dout(5) << __func__ << " flush_mode "
13477 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13478 << " -> "
13479 << TierAgentState::get_flush_mode_name(flush_mode)
13480 << dendl;
13481 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13482 osd->agent_inc_high_count();
13483 info.stats.stats.sum.num_flush_mode_high = 1;
13484 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13485 info.stats.stats.sum.num_flush_mode_low = 1;
13486 }
13487 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13488 osd->agent_dec_high_count();
13489 info.stats.stats.sum.num_flush_mode_high = 0;
13490 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13491 info.stats.stats.sum.num_flush_mode_low = 0;
13492 }
13493 agent_state->flush_mode = flush_mode;
13494 }
13495 if (evict_mode != agent_state->evict_mode) {
13496 dout(5) << __func__ << " evict_mode "
13497 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13498 << " -> "
13499 << TierAgentState::get_evict_mode_name(evict_mode)
13500 << dendl;
13501 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13502 is_active()) {
13503 if (op)
13504 requeue_op(op);
13505 requeue_ops(waiting_for_flush);
13506 requeue_ops(waiting_for_active);
13507 requeue_ops(waiting_for_scrub);
13508 requeue_ops(waiting_for_cache_not_full);
13509 objects_blocked_on_cache_full.clear();
13510 requeued = true;
13511 }
13512 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13513 info.stats.stats.sum.num_evict_mode_some = 1;
13514 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13515 info.stats.stats.sum.num_evict_mode_full = 1;
13516 }
13517 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13518 info.stats.stats.sum.num_evict_mode_some = 0;
13519 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13520 info.stats.stats.sum.num_evict_mode_full = 0;
13521 }
13522 agent_state->evict_mode = evict_mode;
13523 }
13524 uint64_t old_effort = agent_state->evict_effort;
13525 if (evict_effort != agent_state->evict_effort) {
13526 dout(5) << __func__ << " evict_effort "
13527 << ((float)agent_state->evict_effort / 1000000.0)
13528 << " -> "
13529 << ((float)evict_effort / 1000000.0)
13530 << dendl;
13531 agent_state->evict_effort = evict_effort;
13532 }
13533
13534 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13535 // (including flush). This is probably fine (they should be
13536 // correlated) but it is not precisely correct.
13537 if (agent_state->is_idle()) {
13538 if (!restart && !old_idle) {
13539 osd->agent_disable_pg(this, old_effort);
13540 }
13541 } else {
13542 if (restart || old_idle) {
13543 osd->agent_enable_pg(this, agent_state->evict_effort);
13544 } else if (old_effort != agent_state->evict_effort) {
13545 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13546 }
13547 }
13548 return requeued;
13549 }
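// ---------------------------------------------------------------------
// [Editor's note] Two pieces of agent_choose_mode() above, as hedged
// standalone helpers. First, the slop hysteresis: an idle mode's
// threshold is raised before it can engage and lowered once engaged, so
// the agent doesn't flap around the target ratio. Second, the effort
// quantization that keeps small recalculations from reordering the
// shared agent queue. Both names are hypothetical.
//
//   #include <algorithm>
//   #include <cstdint>
//
//   static uint64_t effective_target(uint64_t target_ppm, double slop,
//                                    bool currently_idle)
//   {
//     uint64_t slop_ppm = (uint64_t)(target_ppm * slop);
//     return currently_idle
//       ? target_ppm + slop_ppm                          // harder to engage
//       : target_ppm - std::min(target_ppm, slop_ppm);   // harder to stop
//   }
//
//   static uint64_t quantize_effort(uint64_t effort_ppm, uint64_t inc_ppm)
//   {
//     effort_ppm -= effort_ppm % inc_ppm;   // snap down to a multiple
//     return std::max(effort_ppm, inc_ppm); // but never below one step
//   }
// ---------------------------------------------------------------------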
13550
13551 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13552 {
13553 assert(hit_set);
13554 assert(temp);
13555 *temp = 0;
13556 if (hit_set->contains(oid))
13557 *temp = 1000000;
13558 unsigned i = 0;
13559 int last_n = pool.info.hit_set_search_last_n;
13560 for (map<time_t,HitSetRef>::reverse_iterator p =
13561 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13562 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13563 if (p->second->contains(oid)) {
13564 *temp += pool.info.get_grade(i);
13565 --last_n;
13566 }
13567 }
13568 }
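// ---------------------------------------------------------------------
// [Editor's note] agent_estimate_temp() above as a hedged sketch over
// plain containers: full heat for a hit in the current set, then a
// per-age grade for hits in archived sets, newest first, stopping after
// search_last_n hits. The grade table is hypothetical; the pool
// supplies the real one via get_grade().
//
//   #include <ctime>
//   #include <map>
//   #include <set>
//   #include <string>
//   #include <vector>
//
//   static int estimate_temp(
//     const std::string &oid,
//     const std::set<std::string> &current,
//     const std::map<std::time_t, std::set<std::string>> &archived,
//     const std::vector<int> &grades,
//     int search_last_n)
//   {
//     int temp = current.count(oid) ? 1000000 : 0;
//     unsigned i = 0;
//     for (auto p = archived.rbegin();
//          search_last_n > 0 && p != archived.rend() && i < grades.size();
//          ++p, ++i) {
//       if (p->second.count(oid)) {
//         temp += grades[i];              // older sets contribute less
//         --search_last_n;
//       }
//     }
//     return temp;
//   }
// ---------------------------------------------------------------------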
13569
13570 // Dup op detection
13571
13572 bool PrimaryLogPG::already_complete(eversion_t v)
13573 {
13574 dout(20) << __func__ << ": " << v << dendl;
13575 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13576 !i.end();
13577 ++i) {
13578 dout(20) << __func__ << ": " << **i << dendl;
13579 // skip copy from temp object ops
13580 if ((*i)->v == eversion_t()) {
13581 dout(20) << __func__ << ": " << **i
13582 << " version is empty" << dendl;
13583 continue;
13584 }
13585 if ((*i)->v > v) {
13586 dout(20) << __func__ << ": " << **i
13587 << " (*i)->v past v" << dendl;
13588 break;
13589 }
13590 if (!(*i)->all_committed) {
13591 dout(20) << __func__ << ": " << **i
13592 << " not committed, returning false"
13593 << dendl;
13594 return false;
13595 }
13596 }
13597 dout(20) << __func__ << ": returning true" << dendl;
13598 return true;
13599 }
13600
13601 bool PrimaryLogPG::already_ack(eversion_t v)
13602 {
13603 dout(20) << __func__ << ": " << v << dendl;
13604 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13605 !i.end();
13606 ++i) {
13607 // skip copy from temp object ops
13608 if ((*i)->v == eversion_t()) {
13609 dout(20) << __func__ << ": " << **i
13610 << " version is empty" << dendl;
13611 continue;
13612 }
13613 if ((*i)->v > v) {
13614 dout(20) << __func__ << ": " << **i
13615 << " (*i)->v past v" << dendl;
13616 break;
13617 }
13618 if (!(*i)->all_applied) {
13619 dout(20) << __func__ << ": " << **i
13620 << " not applied, returning false"
13621 << dendl;
13622 return false;
13623 }
13624 }
13625 dout(20) << __func__ << ": returning true" << dendl;
13626 return true;
13627 }
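// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the scan shared by
// already_complete()/already_ack() above: the repop queue is ordered by
// version, so a version v is settled iff every queued op at or below v
// is done, and the scan can stop at the first op past v. `Repop` is a
// stand-in; v == 0 models the versionless temp-object copies the real
// loops skip.
//
//   #include <vector>
//
//   struct Repop { unsigned v; bool done; };
//
//   static bool settled(const std::vector<Repop> &queue, unsigned v)
//   {
//     for (const Repop &r : queue) {
//       if (r.v == 0)
//         continue;                       // skip versionless entries
//       if (r.v > v)
//         break;                          // everything <= v checked out
//       if (!r.done)
//         return false;
//     }
//     return true;
//   }
// ---------------------------------------------------------------------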
13628
13629
13630 // ==========================================================================================
13631 // SCRUB
13632
13633
13634 bool PrimaryLogPG::_range_available_for_scrub(
13635 const hobject_t &begin, const hobject_t &end)
13636 {
13637 pair<hobject_t, ObjectContextRef> next;
13638 next.second = object_contexts.lookup(begin);
13639 next.first = begin;
13640 bool more = true;
13641 while (more && next.first < end) {
13642 if (next.second && next.second->is_blocked()) {
13643 next.second->requeue_scrub_on_unblock = true;
13644 dout(10) << __func__ << ": scrub delayed, "
13645 << next.first << " is blocked"
13646 << dendl;
13647 return false;
13648 }
13649 more = object_contexts.get_next(next.first, &next);
13650 }
13651 return true;
13652 }
13653
13654 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13655 const vector<snapid_t>::reverse_iterator &curclone) {
13656 return snapset && curclone != snapset.get().clones.rend();
13657 }
13658
13659 void PrimaryLogPG::log_missing(unsigned missing,
13660 const boost::optional<hobject_t> &head,
13661 LogChannelRef clog,
13662 const spg_t &pgid,
13663 const char *func,
13664 const char *mode,
13665 bool allow_incomplete_clones)
13666 {
13667 assert(head);
13668 if (allow_incomplete_clones) {
13669 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13670 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13671 } else {
13672 clog->info() << mode << " " << pgid << " " << head.get()
13673 << " " << missing << " missing clone(s)";
13674 }
13675 }
13676
13677 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13678 const boost::optional<SnapSet> &snapset,
13679 LogChannelRef clog,
13680 const spg_t &pgid,
13681 const char *mode,
13682 bool allow_incomplete_clones,
13683 boost::optional<snapid_t> target,
13684 vector<snapid_t>::reverse_iterator *curclone,
13685 inconsistent_snapset_wrapper &e)
13686 {
13687 assert(head);
13688 assert(snapset);
13689 unsigned missing = 0;
13690
13691 // NOTE: clones are in descending order, hence the **curclone > target test here
13692 hobject_t next_clone(head.get());
13693 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13694 ++missing;
13695 // it is okay to be missing one or more clones in a cache tier.
13696 // skip higher-numbered clones in the list.
13697 if (!allow_incomplete_clones) {
13698 next_clone.snap = **curclone;
13699 clog->error() << mode << " " << pgid << " " << head.get()
13700 << " expected clone " << next_clone << " " << missing
13701 << " missing";
13702 ++scrubber.shallow_errors;
13703 e.set_clone_missing(next_clone.snap);
13704 }
13705 // Clones are descending
13706 ++(*curclone);
13707 }
13708 return missing;
13709 }
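// ---------------------------------------------------------------------
// [Editor's note] The descending clone walk of process_clones_to()
// above, as a hedged sketch: catching up to a target snap id means
// stepping past (and counting as missing) every expected clone id
// strictly greater than it; a negative target consumes all remaining
// clones, like boost::none does in the real code.
//
//   #include <vector>
//
//   static unsigned clones_missing_to(
//     std::vector<int>::const_reverse_iterator &cur,   // descending walk
//     std::vector<int>::const_reverse_iterator end,
//     int target)
//   {
//     unsigned missing = 0;
//     while (cur != end && (target < 0 || *cur > target)) {
//       ++missing;                        // expected clone not on disk
//       ++cur;
//     }
//     return missing;
//   }
// ---------------------------------------------------------------------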
13710
13711 /*
13712 * Validate consistency of the object info and snap sets.
13713 *
13714 * We are essentially comparing two lists. The main loop is over
13715 * scrubmap.objects, but each object is compared against multiple
13716 * snapset.clones; between clone lists we expect a head or snapdir.
13717 *
13718 * Example
13719 *
13720 * objects expected
13721 * ======= =======
13722 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13723 * obj2 head head/snapdir, head ok
13724 * [SnapSet clones 6 4 2 1]
13725 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13726 * obj2 snap 6 obj2 snap 6, match
13727 * obj2 snap 4 obj2 snap 4, match
13728 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13729 * [Snapset clones 3 1]
13730 * obj3 snap 3 obj3 snap 3 match
13731 * obj3 snap 1 obj3 snap 1 match
13732 * obj4 snapdir head/snapdir, snapdir ok
13733 * [Snapset clones 4]
13734 * EOL obj4 snap 4, (expected)
13735 */
13736 void PrimaryLogPG::scrub_snapshot_metadata(
13737 ScrubMap &scrubmap,
13738 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13739 {
13740 dout(10) << __func__ << dendl;
13741
13742 coll_t c(info.pgid);
13743 bool repair = state_test(PG_STATE_REPAIR);
13744 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13745 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13746 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13747
13748 /// snapsets to repair
13749 map<hobject_t,SnapSet> snapset_to_repair;
13750
13751 // traverse in reverse order.
13752 boost::optional<hobject_t> head;
13753 boost::optional<SnapSet> snapset; // If initialized, so will be head (above)
13754 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13755 unsigned missing = 0;
13756 inconsistent_snapset_wrapper soid_error, head_error;
13757
13758 bufferlist last_data;
13759
13760 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13761 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13762 const hobject_t& soid = p->first;
13763 soid_error = inconsistent_snapset_wrapper{soid};
13764 object_stat_sum_t stat;
13765 boost::optional<object_info_t> oi;
13766
13767 if (!soid.is_snapdir())
13768 stat.num_objects++;
13769
13770 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13771 stat.num_objects_hit_set_archive++;
13772
13773 if (soid.is_snap()) {
13774 // it's a clone
13775 stat.num_object_clones++;
13776 }
13777
13778 // basic checks.
13779 if (p->second.attrs.count(OI_ATTR) == 0) {
13780 oi = boost::none;
13781 osd->clog->error() << mode << " " << info.pgid << " " << soid
13782 << " no '" << OI_ATTR << "' attr";
13783 ++scrubber.shallow_errors;
13784 soid_error.set_oi_attr_missing();
13785 } else {
13786 bufferlist bv;
13787 bv.push_back(p->second.attrs[OI_ATTR]);
13788 try {
13789 oi = object_info_t(); // Initialize optional<> before decode into it
13790 oi.get().decode(bv);
13791 } catch (buffer::error& e) {
13792 oi = boost::none;
13793 osd->clog->error() << mode << " " << info.pgid << " " << soid
13794 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13795 ++scrubber.shallow_errors;
13796 soid_error.set_oi_attr_corrupted();
13797 soid_error.set_oi_attr_missing(); // Not available too
13798 }
13799 }
13800
13801 if (oi) {
13802 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13803 osd->clog->error() << mode << " " << info.pgid << " " << soid
13804 << " on disk size (" << p->second.size
13805 << ") does not match object info size ("
13806 << oi->size << ") adjusted for ondisk to ("
13807 << pgbackend->be_get_ondisk_size(oi->size)
13808 << ")";
13809 soid_error.set_size_mismatch();
13810 ++scrubber.shallow_errors;
13811 }
13812
13813 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13814
13815 // A clone's num_bytes will be added later, once we have the snapset
13816 if (!soid.is_snap()) {
13817 stat.num_bytes += oi->size;
13818 }
13819 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13820 stat.num_bytes_hit_set_archive += oi->size;
13821
13822 if (!soid.is_snapdir()) {
13823 if (oi->is_dirty())
13824 ++stat.num_objects_dirty;
13825 if (oi->is_whiteout())
13826 ++stat.num_whiteouts;
13827 if (oi->is_omap())
13828 ++stat.num_objects_omap;
13829 if (oi->is_cache_pinned())
13830 ++stat.num_objects_pinned;
13831 }
13832 } else {
13833 // pessimistic assumption that this object might contain a
13834 // legacy SnapSet
13835 stat.num_legacy_snapsets++;
13836 }
13837
13838 // Check for any problems while processing clones
13839 if (doing_clones(snapset, curclone)) {
13840 boost::optional<snapid_t> target;
13841 // Expecting an object with snap for current head
13842 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13843
13844 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13845 << soid << " while processing " << head.get() << dendl;
13846
13847 target = all_clones;
13848 } else {
13849 assert(soid.is_snap());
13850 target = soid.snap;
13851 }
13852
13853 // Log any clones we were expecting to be there up to target
13854 // This will set missing, but will be a no-op if soid.snap == **curclone.
13855 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13856 pool.info.allow_incomplete_clones(), target, &curclone,
13857 head_error);
13858 }
13859 bool expected;
13860 // Check doing_clones() again in case we ran process_clones_to()
13861 if (doing_clones(snapset, curclone)) {
13862 // A head/snapdir would have processed all clones above
13863 // or all greater than *curclone.
13864 assert(soid.is_snap() && *curclone <= soid.snap);
13865
13866 // After the processing above, soid.snap should match the expected *curclone
13867 expected = (*curclone == soid.snap);
13868 } else {
13869 // If we aren't doing clones any longer, then we're expecting a head/snapdir
13870 expected = soid.has_snapset();
13871 }
13872 if (!expected) {
13873 // If we couldn't read the head's snapset, just ignore clones
13874 if (head && !snapset) {
13875 osd->clog->error() << mode << " " << info.pgid << " " << soid
13876 << " clone ignored due to missing snapset";
13877 } else {
13878 osd->clog->error() << mode << " " << info.pgid << " " << soid
13879 << " is an unexpected clone";
13880 }
13881 ++scrubber.shallow_errors;
13882 soid_error.set_headless();
13883 scrubber.store->add_snap_error(pool.id, soid_error);
13884 if (head && soid.get_head() == head->get_head())
13885 head_error.set_clone(soid.snap);
13886 continue;
13887 }
13888
13889 // new snapset?
13890 if (soid.has_snapset()) {
13891
13892 if (missing) {
13893 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13894 pool.info.allow_incomplete_clones());
13895 }
13896
13897 // Save previous head error information
13898 if (head && head_error.errors)
13899 scrubber.store->add_snap_error(pool.id, head_error);
13900 // Set this as a new head object
13901 head = soid;
13902 missing = 0;
13903 head_error = soid_error;
13904
13905 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13906
13907 if (p->second.attrs.count(SS_ATTR) == 0) {
13908 osd->clog->error() << mode << " " << info.pgid << " " << soid
13909 << " no '" << SS_ATTR << "' attr";
13910 ++scrubber.shallow_errors;
13911 snapset = boost::none;
13912 head_error.set_ss_attr_missing();
13913 } else {
13914 bufferlist bl;
13915 bl.push_back(p->second.attrs[SS_ATTR]);
13916 bufferlist::iterator blp = bl.begin();
13917 try {
13918 snapset = SnapSet(); // Initialize optional<> before decoding into it
13919 ::decode(snapset.get(), blp);
13920 } catch (buffer::error& e) {
13921 snapset = boost::none;
13922 osd->clog->error() << mode << " " << info.pgid << " " << soid
13923 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13924 ++scrubber.shallow_errors;
13925 head_error.set_ss_attr_corrupted();
13926 }
13927 }
13928
13929 if (snapset) {
13930 // what will be next?
13931 curclone = snapset->clones.rbegin();
13932
13933 if (!snapset->clones.empty()) {
13934 dout(20) << " snapset " << snapset.get() << dendl;
13935 if (snapset->seq == 0) {
13936 osd->clog->error() << mode << " " << info.pgid << " " << soid
13937 << " snaps.seq not set";
13938 ++scrubber.shallow_errors;
13939 head_error.set_snapset_mismatch();
13940 }
13941 }
13942
13943 if (soid.is_head() && !snapset->head_exists) {
13944 osd->clog->error() << mode << " " << info.pgid << " " << soid
13945 << " snapset.head_exists=false, but head exists";
13946 ++scrubber.shallow_errors;
13947 head_error.set_head_mismatch();
13948 // Fix head_exists locally so is_legacy() returns correctly
13949 snapset->head_exists = true;
13950 }
13951 if (soid.is_snapdir() && snapset->head_exists) {
13952 osd->clog->error() << mode << " " << info.pgid << " " << soid
13953 << " snapset.head_exists=true, but snapdir exists";
13954 ++scrubber.shallow_errors;
13955 head_error.set_head_mismatch();
13956 // For symmetry fix this too, but probably doesn't matter
13957 snapset->head_exists = false;
13958 }
13959
13960 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13961 if (soid.is_snapdir()) {
13962 dout(10) << " will move snapset to head from " << soid << dendl;
13963 snapset_to_repair[soid.get_head()] = *snapset;
13964 } else if (snapset->is_legacy()) {
13965 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13966 << dendl;
13967 snapset_to_repair[soid.get_head()] = *snapset;
13968 }
13969 } else {
13970 stat.num_legacy_snapsets++;
13971 }
13972 } else {
13973 // pessimistic assumption that this object might contain a
13974 // legacy SnapSet
13975 stat.num_legacy_snapsets++;
13976 }
13977 } else {
13978 assert(soid.is_snap());
13979 assert(head);
13980 assert(snapset);
13981 assert(soid.snap == *curclone);
13982
13983 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13984
13985 if (snapset->clone_size.count(soid.snap) == 0) {
13986 osd->clog->error() << mode << " " << info.pgid << " " << soid
13987 << " is missing in clone_size";
13988 ++scrubber.shallow_errors;
13989 soid_error.set_size_mismatch();
13990 } else {
13991 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13992 osd->clog->error() << mode << " " << info.pgid << " " << soid
13993 << " size " << oi->size << " != clone_size "
13994 << snapset->clone_size[*curclone];
13995 ++scrubber.shallow_errors;
13996 soid_error.set_size_mismatch();
13997 }
13998
13999 if (snapset->clone_overlap.count(soid.snap) == 0) {
14000 osd->clog->error() << mode << " " << info.pgid << " " << soid
14001 << " is missing in clone_overlap";
14002 ++scrubber.shallow_errors;
14003 soid_error.set_size_mismatch();
14004 } else {
14005 // This checking is based on get_clone_bytes(). The first 2 asserts
14006 // can't happen because we know we have a clone_size and
14007 // a clone_overlap. Now we check that the interval_set won't
14008 // cause the last assert.
14009 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14010 const interval_set<uint64_t> &overlap =
14011 snapset->clone_overlap.find(soid.snap)->second;
14012 bool bad_interval_set = false;
14013 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14014 i != overlap.end(); ++i) {
14015 if (size < i.get_len()) {
14016 bad_interval_set = true;
14017 break;
14018 }
14019 size -= i.get_len();
14020 }
14021
14022 if (bad_interval_set) {
14023 osd->clog->error() << mode << " " << info.pgid << " " << soid
14024 << " bad interval_set in clone_overlap";
14025 ++scrubber.shallow_errors;
14026 soid_error.set_size_mismatch();
14027 } else {
14028 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14029 }
14030 }
14031 }
14032
14033 // migrate legacy_snaps to snapset?
14034 auto p = snapset_to_repair.find(soid.get_head());
14035 if (p != snapset_to_repair.end()) {
14036 if (!oi || oi->legacy_snaps.empty()) {
14037 osd->clog->error() << mode << " " << info.pgid << " " << soid
14038 << " has no oi or legacy_snaps; cannot convert "
14039 << *snapset;
14040 ++scrubber.shallow_errors;
14041 } else {
14042 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14043 << " to snapset " << p->second << dendl;
14044 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14045 }
14046 }
14047
14048 // what's next?
14049 ++curclone;
14050 if (soid_error.errors)
14051 scrubber.store->add_snap_error(pool.id, soid_error);
14052 }
14053
14054 scrub_cstat.add(stat);
14055 }
14056
14057 if (doing_clones(snapset, curclone)) {
14058 dout(10) << __func__ << " " << mode << " " << info.pgid
14059 << " No more objects while processing " << head.get() << dendl;
14060
14061 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14062 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14063 head_error);
14064 }
14065 // There could be missing clones found by the test above, or even
14066 // from before we dropped out of the loop, for the last head.
14067 if (missing) {
14068 log_missing(missing, head, osd->clog, info.pgid, __func__,
14069 mode, pool.info.allow_incomplete_clones());
14070 }
14071 if (head && head_error.errors)
14072 scrubber.store->add_snap_error(pool.id, head_error);
14073
14074 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14075 missing_digest.begin();
14076 p != missing_digest.end();
14077 ++p) {
14078 if (p->first.is_snapdir())
14079 continue;
14080 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14081 ObjectContextRef obc = get_object_context(p->first, false);
14082 if (!obc) {
14083 osd->clog->error() << info.pgid << " " << mode
14084 << " cannot get object context for object "
14085 << p->first;
14086 continue;
14087 } else if (obc->obs.oi.soid != p->first) {
14088 osd->clog->error() << info.pgid << " " << mode
14089 << " object " << p->first
14090 << " has a valid oi attr with a mismatched name, "
14091 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14092 continue;
14093 }
14094 OpContextUPtr ctx = simple_opc_create(obc);
14095 ctx->at_version = get_next_version();
14096 ctx->mtime = utime_t(); // do not update mtime
14097 ctx->new_obs.oi.set_data_digest(p->second.first);
14098 ctx->new_obs.oi.set_omap_digest(p->second.second);
14099 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14100
14101 ctx->register_on_success(
14102 [this]() {
14103 dout(20) << "updating scrub digest" << dendl;
14104 if (--scrubber.num_digest_updates_pending == 0) {
14105 requeue_scrub();
14106 }
14107 });
14108
14109 simple_opc_submit(std::move(ctx));
14110 ++scrubber.num_digest_updates_pending;
14111 }
14112 for (auto& p : snapset_to_repair) {
14113 // cache pools may not have the clones, which means we won't know
14114 // what snaps they have. fake out the clone_snaps entries anyway (with
14115 // blank snap lists).
14116 p.second.head_exists = true;
14117 if (pool.info.allow_incomplete_clones()) {
14118 for (auto s : p.second.clones) {
14119 if (p.second.clone_snaps.count(s) == 0) {
14120 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14121 << s << dendl;
14122 p.second.clone_snaps[s];
14123 }
14124 }
14125 }
14126 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14127 p.second.is_legacy()) {
14128 // this happens if we encounter other errors above, like a missing
14129 // or extra clone.
14130 dout(10) << __func__ << " not writing snapset to " << p.first
14131 << " snapset " << p.second << " clones " << p.second.clones
14132 << "; didn't convert fully" << dendl;
14133 scrub_cstat.sum.num_legacy_snapsets++;
14134 continue;
14135 }
14136 dout(10) << __func__ << " writing snapset to " << p.first
14137 << " " << p.second << dendl;
14138 ObjectContextRef obc = get_object_context(p.first, true);
14139 if (!obc) {
14140 osd->clog->error() << info.pgid << " " << mode
14141 << " cannot get object context for object "
14142 << p.first;
14143 continue;
14144 } else if (obc->obs.oi.soid != p.first) {
14145 osd->clog->error() << info.pgid << " " << mode
14146 << " object " << p.first
14147 << " has a valid oi attr with a mismatched name, "
14148 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14149 continue;
14150 }
14151 ObjectContextRef snapset_obc;
14152 if (!obc->obs.exists) {
14153 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14154 if (!snapset_obc) {
14155 osd->clog->error() << info.pgid << " " << mode
14156 << " cannot get object context for "
14157 << p.first.get_snapdir();
14158 continue;
14159 }
14160 }
14161 OpContextUPtr ctx = simple_opc_create(obc);
14162 PGTransaction *t = ctx->op_t.get();
14163 ctx->snapset_obc = snapset_obc;
14164 ctx->at_version = get_next_version();
14165 ctx->mtime = utime_t(); // do not update mtime
14166 ctx->new_snapset = p.second;
14167 if (!ctx->new_obs.exists) {
14168 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14169 ctx->new_obs.exists = true;
14170 ctx->new_snapset.head_exists = true;
14171 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14172 ++ctx->delta_stats.num_whiteouts;
14173 ++ctx->delta_stats.num_objects;
14174 t->create(p.first);
14175 if (p.first < scrubber.start) {
14176 dout(20) << __func__ << " kludging around update outside of scrub range"
14177 << dendl;
14178 } else {
14179 scrub_cstat.add(ctx->delta_stats);
14180 }
14181 }
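// A "whiteout" head created above exists only to carry the SnapSet for its
// clones: FLAG_WHITEOUT effectively hides the object from client reads,
// while t->create() gives the snapset attr somewhere to live.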
14182 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14183 assert(!ctx->new_snapset.is_legacy());
14184 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14185 ctx->register_on_success(
14186 [this]() {
14187 dout(20) << "updating snapset" << dendl;
14188 if (--scrubber.num_digest_updates_pending == 0) {
14189 requeue_scrub();
14190 }
14191 });
14192
14193 simple_opc_submit(std::move(ctx));
14194 ++scrubber.num_digest_updates_pending;
14195 }
14196
14197 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14198 }
14199
14200 void PrimaryLogPG::_scrub_clear_state()
14201 {
14202 scrub_cstat = object_stat_collection_t();
14203 }
14204
14205 void PrimaryLogPG::_scrub_finish()
14206 {
14207 bool repair = state_test(PG_STATE_REPAIR);
14208 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14209 const char *mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
14210
14211 if (info.stats.stats_invalid) {
14212 info.stats.stats = scrub_cstat;
14213 info.stats.stats_invalid = false;
14214
14215 if (agent_state)
14216 agent_choose_mode();
14217 }
14218
14219 dout(10) << mode << " got "
14220 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14221 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14222 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14223 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14224 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14225 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14226 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14227 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14228 << dendl;
14229
14230 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14231 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14232 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14233 !info.stats.dirty_stats_invalid) ||
14234 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14235 !info.stats.omap_stats_invalid) ||
14236 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14237 !info.stats.pin_stats_invalid) ||
14238 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14239 !info.stats.hitset_stats_invalid) ||
14240 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14241 !info.stats.hitset_bytes_stats_invalid) ||
14242 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14243 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14244 osd->clog->error() << info.pgid << " " << mode
14245 << " stat mismatch, got "
14246 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14247 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14248 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14249 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14250 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14251 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14252 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14253 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14254 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14255 ++scrubber.shallow_errors;
14256
14257 if (repair) {
14258 ++scrubber.fixed;
14259 info.stats.stats = scrub_cstat;
14260 info.stats.dirty_stats_invalid = false;
14261 info.stats.omap_stats_invalid = false;
14262 info.stats.hitset_stats_invalid = false;
14263 info.stats.hitset_bytes_stats_invalid = false;
14264 publish_stats_to_osd();
14265 share_pg_info();
14266 }
14267 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14268 info.stats.stats.sum.num_legacy_snapsets) {
14269 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14270 << " from " << info.stats.stats.sum.num_legacy_snapsets
14271 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14272 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14273 publish_stats_to_osd();
14274 share_pg_info();
14275 }
14276 // Clear object context cache to get repair information
14277 if (repair)
14278 object_contexts.clear();
14279 }
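// The mismatch test in _scrub_finish() above only treats a per-category
// delta as an error when the corresponding *_stats_invalid flag is clear;
// categories whose recorded stats are already known to be stale are
// skipped.  A sketch of that guard, with hypothetical names:
//
//   bool stat_mismatch(int64_t scrubbed, int64_t recorded, bool invalid) {
//     return scrubbed != recorded && !invalid;  // stale stats never count
//   }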
14280
14281 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14282 {
14283 return osd->check_osdmap_full(missing_on);
14284 }
14285
14286 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14287 {
14288 // Only supports replicated pools
14289 assert(!pool.info.require_rollback());
14290 assert(is_primary());
14291
14292 dout(10) << __func__ << " " << soid
14293 << " peers osd.{" << actingbackfill << "}" << dendl;
14294
14295 if (!is_clean()) {
14296 block_for_clean(soid, op);
14297 return -EAGAIN;
14298 }
14299
14300 assert(!pg_log.get_missing().is_missing(soid));
14301 bufferlist bv;
14302 object_info_t oi;
14303 eversion_t v;
14304 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14305 if (r < 0) {
14306 // Getting the attr failed; leave v unset and try to repair without a version
14307 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14308 << soid << " error=" << r << dendl;
14309 } else try {
14310 bufferlist::iterator bliter = bv.begin();
14311 ::decode(oi, bliter);
14312 v = oi.version;
14313 } catch (...) {
14314 // Leave v as default constructed. This will fail when sent to older OSDs, but
14315 // not much worse than failing here.
14316 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14317 }
14318
14319 missing_loc.add_missing(soid, v, eversion_t());
14320 if (primary_error(soid, v)) {
14321 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14322 // XXX: If we knew that no down osd could possibly have this
14323 // object, it would be nice if we could return EIO here.
14324 // If a "never fail" flag were available, rbd could use it
14325 // to NOT return EIO until the object is marked lost.
14326
14327 // Drop through to save this op in case an osd comes up with the object.
14328 }
14329
14330 // Restart the op after object becomes readable again
14331 waiting_for_unreadable_object[soid].push_back(op);
14332 op->mark_delayed("waiting for missing object");
14333
14334 if (!eio_errors_to_process) {
14335 eio_errors_to_process = true;
14336 assert(is_clean());
14337 queue_peering_event(
14338 CephPeeringEvtRef(
14339 std::make_shared<CephPeeringEvt>(
14340 get_osdmap()->get_epoch(),
14341 get_osdmap()->get_epoch(),
14342 DoRecovery())));
14343 } else {
14344 // A prior error must have already cleared the clean state and queued
14345 // recovery, or a map change has triggered re-peering.
14346 // Deliberately not inlining the recovery here via maybe_kick_recovery(soid).
14347 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14348 }
14349
14350 return -EAGAIN;
14351 }
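// rep_repair_primary_object() always returns -EAGAIN: the op is parked on
// waiting_for_unreadable_object and replayed once the recovery queued via
// DoRecovery above (or already in flight) pulls a good copy from a peer.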
14352
14353 /*---SnapTrimmer Logging---*/
14354 #undef dout_prefix
14355 #define dout_prefix *_dout << pg->gen_prefix()
14356
14357 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14358 {
14359 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14360 }
14361
14362 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14363 {
14364 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14365 }
14366
14367 /*---SnapTrimmer states---*/
14368 #undef dout_prefix
14369 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14370 << "SnapTrimmer state<" << get_state_name() << ">: ")
14371
14372 /* NotTrimming */
14373 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14374 : my_base(ctx),
14375 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14376 {
14377 context< SnapTrimmer >().log_enter(state_name);
14378 }
14379
14380 void PrimaryLogPG::NotTrimming::exit()
14381 {
14382 context< SnapTrimmer >().log_exit(state_name, enter_time);
14383 }
14384
14385 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14386 {
14387 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14388 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14389
14390 if (!(pg->is_primary() && pg->is_active())) {
14391 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14392 return discard_event();
14393 }
14394 if (!pg->is_clean() ||
14395 pg->snap_trimq.empty()) {
14396 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14397 return discard_event();
14398 }
14399 if (pg->scrubber.active) {
14400 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14401 return transit< WaitScrub >();
14402 } else {
14403 return transit< Trimming >();
14404 }
14405 }
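// The reactions above follow the usual boost::statechart idioms:
// discard_event() consumes the event in the current state, while
// transit<T>() exits this state and enters T.  A minimal self-contained
// sketch of the same pattern (Machine/Idle/Busy/Go are hypothetical
// illustration types, not Ceph ones):
//
//   namespace sc = boost::statechart;
//   struct Go : sc::event<Go> {};
//   struct Idle;  // initial state, defined below
//   struct Machine : sc::state_machine<Machine, Idle> {};
//   struct Busy : sc::simple_state<Busy, Machine> {};
//   struct Idle : sc::simple_state<Idle, Machine> {
//     typedef sc::custom_reaction<Go> reactions;
//     sc::result react(const Go&) { return transit<Busy>(); }
//   };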
14406
14407 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14408 {
14409 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14410 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14411
14412 pending = nullptr;
14413 if (!context< SnapTrimmer >().can_trim()) {
14414 post_event(KickTrim());
14415 return transit< NotTrimming >();
14416 }
14417
14418 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14419 ldout(pg->cct, 10) << "NotTrimming: trimming "
14420 << pg->snap_trimq.range_start()
14421 << dendl;
14422 return transit< AwaitAsyncWork >();
14423 }
14424
14425 /* AwaitAsyncWork */
14426 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14427 : my_base(ctx),
14428 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14429 {
14430 auto *pg = context< SnapTrimmer >().pg;
14431 context< SnapTrimmer >().log_enter(state_name);
14432 pg->osd->queue_for_snap_trim(pg);
14433 pg->state_set(PG_STATE_SNAPTRIM);
14434 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14435 pg->publish_stats_to_osd();
14436 }
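// Entering AwaitAsyncWork queues the PG on the OSD's snap-trim work queue;
// the DoSnapWork event handled below is expected to be delivered when that
// queue gets around to this PG.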
14437
14438 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14439 {
14440 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14441 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14442 auto &in_flight = context<Trimming>().in_flight;
14443 assert(in_flight.empty());
14444
14445 assert(pg->is_primary() && pg->is_active());
14446 if (!context< SnapTrimmer >().can_trim()) {
14447 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14448 post_event(KickTrim());
14449 return transit< NotTrimming >();
14450 }
14451
14452 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14453
14454 vector<hobject_t> to_trim;
14455 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14456 to_trim.reserve(max);
14457 int r = pg->snap_mapper.get_next_objects_to_trim(
14458 snap_to_trim,
14459 max,
14460 &to_trim);
14461 if (r != 0 && r != -ENOENT) {
14462 lderr(pg->cct) << "get_next_objects_to_trim returned "
14463 << cpp_strerror(r) << dendl;
14464 assert(0 == "get_next_objects_to_trim returned an invalid code");
14465 } else if (r == -ENOENT) {
14466 // Done!
14467 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14468
14469 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14470 << " to purged_snaps"
14471 << dendl;
14472 pg->info.purged_snaps.insert(snap_to_trim);
14473 pg->snap_trimq.erase(snap_to_trim);
14474 ldout(pg->cct, 10) << "purged_snaps now "
14475 << pg->info.purged_snaps << ", snap_trimq now "
14476 << pg->snap_trimq << dendl;
14477
14478 ObjectStore::Transaction t;
14479 pg->dirty_big_info = true;
14480 pg->write_if_dirty(t);
14481 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14482 assert(tr == 0);
14483
14484 pg->share_pg_info();
14485 post_event(KickTrim());
14486 return transit< NotTrimming >();
14487 }
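// -ENOENT from the snap mapper means no objects remain for this snap: it
// moves from snap_trimq to purged_snaps, the updated pg info is persisted
// (dirty_big_info) and shared with peers, and KickTrim starts the cycle
// over for the next snap in the queue.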
14488 assert(!to_trim.empty());
14489
14490 for (auto &&object: to_trim) {
14491 // Get next
14492 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14493 OpContextUPtr ctx;
14494 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14495 if (error) {
14496 if (error == -ENOLCK) {
14497 ldout(pg->cct, 10) << "could not get write lock on obj "
14498 << object << dendl;
14499 } else {
14500 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14501 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14502 }
14503 if (!in_flight.empty()) {
14504 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14505 return transit< WaitRepops >();
14506 }
14507 if (error == -ENOLCK) {
14508 ldout(pg->cct, 10) << "waiting for it to clear"
14509 << dendl;
14510 return transit< WaitRWLock >();
14511 } else {
14512 return transit< NotTrimming >();
14513 }
14514 }
14515
14516 in_flight.insert(object);
14517 ctx->register_on_success(
14518 [pg, object, &in_flight]() {
14519 assert(in_flight.find(object) != in_flight.end());
14520 in_flight.erase(object);
14521 if (in_flight.empty()) {
14522 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14523 pg->snap_trimmer_machine.process_event(Reset());
14524 } else {
14525 pg->snap_trimmer_machine.process_event(RepopsComplete());
14526 }
14527 }
14528 });
14529
14530 pg->simple_opc_submit(std::move(ctx));
14531 }
14532
14533 return transit< WaitRepops >();
14534 }
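// Error ladder in the trim loop above: -ENOLCK with nothing in flight
// waits for the object's rwlock to clear (WaitRWLock); any error with
// repops already in flight lets those drain first (WaitRepops); other
// errors fall back to NotTrimming, and PG_STATE_SNAPTRIM_ERROR makes the
// final completion callback post Reset() instead of RepopsComplete().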
14535
14536 void PrimaryLogPG::setattr_maybe_cache(
14537 ObjectContextRef obc,
14538 OpContext *op,
14539 PGTransaction *t,
14540 const string &key,
14541 bufferlist &val)
14542 {
14543 t->setattr(obc->obs.oi.soid, key, val);
14544 }
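// Despite the _maybe_cache suffix, the attr setters here currently write
// straight through to the PGTransaction; only the getters further below
// consult obc->attr_cache, which is maintained elsewhere for
// erasure-coded (require_rollback) pools.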
14545
14546 void PrimaryLogPG::setattrs_maybe_cache(
14547 ObjectContextRef obc,
14548 OpContext *op,
14549 PGTransaction *t,
14550 map<string, bufferlist> &attrs)
14551 {
14552 t->setattrs(obc->obs.oi.soid, attrs);
14553 }
14554
14555 void PrimaryLogPG::rmattr_maybe_cache(
14556 ObjectContextRef obc,
14557 OpContext *op,
14558 PGTransaction *t,
14559 const string &key)
14560 {
14561 t->rmattr(obc->obs.oi.soid, key);
14562 }
14563
14564 int PrimaryLogPG::getattr_maybe_cache(
14565 ObjectContextRef obc,
14566 const string &key,
14567 bufferlist *val)
14568 {
14569 if (pool.info.require_rollback()) {
14570 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14571 if (i != obc->attr_cache.end()) {
14572 if (val)
14573 *val = i->second;
14574 return 0;
14575 } else {
14576 return -ENODATA;
14577 }
14578 }
14579 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14580 }
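// For erasure-coded pools (require_rollback() == true) a single attr is
// answered purely from obc->attr_cache, with a miss reported as -ENODATA;
// replicated pools read through the PG backend instead.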
14581
14582 int PrimaryLogPG::getattrs_maybe_cache(
14583 ObjectContextRef obc,
14584 map<string, bufferlist> *out)
14585 {
14586 int r = 0;
14587 assert(out);
14588 if (pool.info.require_rollback()) {
14589 *out = obc->attr_cache;
14590 } else {
14591 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14592 }
14593 map<string, bufferlist> tmp;
14594 for (map<string, bufferlist>::iterator i = out->begin();
14595 i != out->end();
14596 ++i) {
14597 if (i->first.size() > 1 && i->first[0] == '_')
14598 tmp[i->first.substr(1)].claim(i->second);
14599 }
14600 tmp.swap(*out);
14601 return r;
14602 }
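// The '_' filter above surfaces a stored xattr key "_foo" to the caller
// as "foo" and drops non-prefixed (internal) keys entirely.  Equivalent
// standalone filtering over a plain std::map, for illustration only:
//
//   std::map<std::string, std::string> raw = {{"_foo", "a"}, {"snapset", "b"}};
//   std::map<std::string, std::string> user;
//   for (const auto& kv : raw)
//     if (kv.first.size() > 1 && kv.first[0] == '_')
//       user[kv.first.substr(1)] = kv.second;  // user == {{"foo", "a"}}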
14603
14604 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14605 return osd->check_failsafe_full(ss);
14606 }
14607
14608 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14609 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14610
14611 #ifdef PG_DEBUG_REFS
14612 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14613 void put_with_id(PrimaryLogPG *pg, uint64_t id) { pg->put_with_id(id); }
14614 #endif
14615
14616 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14617 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }