// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  ~CopyCallback() override {}
};
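
/*
 * Minimal sketch of a concrete CopyCallback (ExampleCopyCallback is
 * hypothetical); it follows the contract documented above: finish()
 * receives a (return code, CopyResults*) tuple, and the callee is
 * responsible for deleting the results.
 */
#if 0
class ExampleCopyCallback : public PrimaryLogPG::CopyCallback {
  void finish(PrimaryLogPG::CopyCallbackResults results) override {
    int r = results.get<0>();                          // 0, -ECANCELED, -errno
    PrimaryLogPG::CopyResults *res = results.get<1>();
    if (r == 0) {
      // consume res->object_size, res->user_version, ...
    }
    delete res;  // per the interface comment above, we own the results
  }
};
#endif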

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}
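
/*
 * Usage sketch (C_SomeCompletion is hypothetical): "blessing" a context
 * routes it through the wrapper above, so its completion runs under
 * pg->lock() and is silently dropped if the PG was reset after the
 * wrapping epoch.
 */
#if 0
Context *fin = new C_SomeCompletion(this);
t->register_on_applied(bless_context(fin));  // fin->complete(r) under PG lock,
                                             // or dropped after a PG reset
#endif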

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
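
/*
 * Sketch of how a read gets queued on an OpContext (offsets and the
 * completion type are hypothetical): entries pushed onto
 * pending_async_reads match the element type swapped into
 * objects_read_async() above; when the last read completes,
 * finish_read() restarts execute_ctx() on this context.
 */
#if 0
ctx->pending_async_reads.push_back(
  make_pair(
    boost::make_tuple(0 /* offset */, 4096 /* length */, 0 /* op flags */),
    make_pair(&osd_op.outdata,
              new C_ExtentDone() /* hypothetical per-read completion */)));
// the execute path then calls ctx->start_async_reads(pg)
#endif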

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t)
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
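
/*
 * Worked example for the threshold in maybe_force_recovery() (values are
 * illustrative): with osd_max_pg_log_entries = 10000 and
 * osd_force_recovery_pg_log_entries_factor = 1.3, the oldest missing
 * object is only force-recovered once the PG log exceeds
 * 10000 * 1.3 = 13000 entries.
 */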

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
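
/*
 * Example of the filter type strings accepted above ("mycls.myfilter" is
 * hypothetical): "parent" and "plain" select the built-in filters, while
 * a dotted name loads object class "mycls" and looks up its registered
 * filter "myfilter"; whatever follows the type string in the bufferlist
 * is consumed by filter->init().
 */
#if 0
bufferlist params;
::encode(string("mycls.myfilter"), params);  // hypothetical class.filter pair
// ... filter-specific arguments follow, decoded by the filter's init() ...
#endif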


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}
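
/*
 * These handlers back the per-PG admin commands reachable from the CLI,
 * e.g. (pgid is a placeholder):
 *
 *   ceph pg <pgid> query
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *   ceph pg <pgid> list_missing [<offset>]
 */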

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}
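
/*
 * Pagination sketch for PGNLS/PGLS (schematic, client side): each request
 * carries the handle from the previous reply as its lower bound; a result
 * of 1 means the listing is done with this PG, and for PGNLS the returned
 * handle then points at the start of the next PG in object sort order.
 */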

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL;  // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
                             cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
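
/*
 * Worked example for calc_trim_to() (illustrative config values): with
 * osd_min_pg_log_entries = 3000, osd_pg_log_trim_min = 100 and
 * osd_pg_log_trim_max = 10000, a clean PG whose log holds 3050 entries
 * would trim 3050 - 3000 = 50 entries; since 50 < osd_pg_log_trim_min,
 * it returns and waits until at least 100 entries can be trimmed at once.
 */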

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}
1843
1844 /** do_op - do an op
1845 * pg lock will be held (if multithreaded)
1846 * osd_lock NOT held.
1847 */
1848 void PrimaryLogPG::do_op(OpRequestRef& op)
1849 {
1850 FUNCTRACE();
1851 // NOTE: take a non-const pointer here; we must be careful not to
1852 // change anything that will break other reads on m (operator<<).
1853 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1854 assert(m->get_type() == CEPH_MSG_OSD_OP);
1855 if (m->finish_decode()) {
1856 op->reset_desc(); // for TrackedOp
1857 m->clear_payload();
1858 }
1859
1860 dout(20) << __func__ << ": op " << *m << dendl;
1861
1862 hobject_t head = m->get_hobj();
1863 head.snap = CEPH_NOSNAP;
1864
1865 if (!info.pgid.pgid.contains(
1866 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1867 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1868 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1869 << std::hex << head.get_hash() << std::dec << dendl;
1870 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1871 << " op " << *m;
1872 assert(!cct->_conf->osd_debug_misdirected_ops);
1873 return;
1874 }
1875
1876 bool can_backoff =
1877 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1878 SessionRef session;
1879 if (can_backoff) {
1880 session = static_cast<Session*>(m->get_connection()->get_priv());
1881 if (!session.get()) {
1882 dout(10) << __func__ << " no session" << dendl;
1883 return;
1884 }
1885 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1886
1887 if (session->check_backoff(cct, info.pgid, head, m)) {
1888 return;
1889 }
1890 }
1891
1892 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1893 // not implemented.
1894 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1895 osd->reply_op_error(op, -EINVAL);
1896 return;
1897 }
1898
1899 if (op->rmw_flags == 0) {
1900 int r = osd->osd->init_op_flags(op);
1901 if (r) {
1902 osd->reply_op_error(op, r);
1903 return;
1904 }
1905 }
1906
1907 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1908 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1909 op->may_read() &&
1910 !(op->may_write() || op->may_cache())) {
1911 // balanced reads; any replica will do
1912 if (!(is_primary() || is_replica())) {
1913 osd->handle_misdirected_op(this, op);
1914 return;
1915 }
1916 } else {
1917 // normal case; must be primary
1918 if (!is_primary()) {
1919 osd->handle_misdirected_op(this, op);
1920 return;
1921 }
1922 }
1923
1924 if (!op_has_sufficient_caps(op)) {
1925 osd->reply_op_error(op, -EPERM);
1926 return;
1927 }
1928
1929 if (op->includes_pg_op()) {
1930 return do_pg_op(op);
1931 }
1932
1933 // object name too long?
1934 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1935 dout(4) << "do_op name is longer than "
1936 << cct->_conf->osd_max_object_name_len
1937 << " bytes" << dendl;
1938 osd->reply_op_error(op, -ENAMETOOLONG);
1939 return;
1940 }
1941 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1942 dout(4) << "do_op locator is longer than "
1943 << cct->_conf->osd_max_object_name_len
1944 << " bytes" << dendl;
1945 osd->reply_op_error(op, -ENAMETOOLONG);
1946 return;
1947 }
1948 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1949 dout(4) << "do_op namespace is longer than "
1950 << cct->_conf->osd_max_object_namespace_len
1951 << " bytes" << dendl;
1952 osd->reply_op_error(op, -ENAMETOOLONG);
1953 return;
1954 }
1955
1956 if (int r = osd->store->validate_hobject_key(head)) {
1957 dout(4) << "do_op object " << head << " invalid for backing store: "
1958 << r << dendl;
1959 osd->reply_op_error(op, r);
1960 return;
1961 }
1962
1963 // blacklisted?
1964 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1965 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1966 osd->reply_op_error(op, -EBLACKLISTED);
1967 return;
1968 }
1969
1970 // order this op as a write?
1971 bool write_ordered = op->rwordered();
1972
1973 // discard due to cluster full transition? (we discard any op that
1974 // originates before the cluster or pool is marked full; the client
1975 // will resend after the full flag is removed or if they expect the
1976 // op to succeed despite being full). The exceptions are FULL_FORCE and
1977 // FULL_TRY ops, which there is no reason to discard because they
1978 // bypass all full checks anyway. If this op isn't write-ordered,
1979 // we skip this check.
1980 // FIXME: we exclude mds writes for now.
1981 if (write_ordered && !(m->get_source().is_mds() ||
1982 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1983 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1984 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1985 dout(10) << __func__ << " discarding op sent before full " << m << " "
1986 << *m << dendl;
1987 return;
1988 }
1989 // The mds should have stopped writing before this point.
1990 // We can't allow the OSD to become non-startable even if the mds
1991 // could still be writing as part of file removals.
1992 ostringstream ss;
1993 if (write_ordered && osd->check_failsafe_full(ss)) {
1994 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1995 << ss.str()
1996 << dendl;
1997 return;
1998 }
1999 int64_t poolid = get_pgid().pool();
2000 if (op->may_write()) {
2001
2002 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2003 if (!pi) {
2004 return;
2005 }
2006
2007 // invalid?
2008 if (m->get_snapid() != CEPH_NOSNAP) {
2009 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2010 osd->reply_op_error(op, -EINVAL);
2011 return;
2012 }
2013
2014 // too big?
2015 if (cct->_conf->osd_max_write_size &&
2016 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2017 // journal can't hold commit!
2018 derr << "do_op msg data len " << m->get_data_len()
2019 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2020 << " on " << *m << dendl;
2021 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2022 return;
2023 }
2024 }
2025
2026 dout(10) << "do_op " << *m
2027 << (op->may_write() ? " may_write" : "")
2028 << (op->may_read() ? " may_read" : "")
2029 << (op->may_cache() ? " may_cache" : "")
2030 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2031 << " flags " << ceph_osd_flag_string(m->get_flags())
2032 << dendl;
2033
2034 // missing object?
2035 if (is_unreadable_object(head)) {
2036 if (!is_primary()) {
2037 osd->reply_op_error(op, -EAGAIN);
2038 return;
2039 }
2040 if (can_backoff &&
2041 (g_conf->osd_backoff_on_degraded ||
2042 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2043 add_backoff(session, head, head);
2044 maybe_kick_recovery(head);
2045 } else {
2046 wait_for_unreadable_object(head, op);
2047 }
2048 return;
2049 }
2050
2051 // degraded object?
2052 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2053 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2054 add_backoff(session, head, head);
2055 maybe_kick_recovery(head);
2056 } else {
2057 wait_for_degraded_object(head, op);
2058 }
2059 return;
2060 }
2061
2062 if (write_ordered &&
2063 scrubber.write_blocked_by_scrub(head)) {
2064 dout(20) << __func__ << ": waiting for scrub" << dendl;
2065 waiting_for_scrub.push_back(op);
2066 op->mark_delayed("waiting for scrub");
2067 return;
2068 }
2069
2070 // blocked on snap?
2071 map<hobject_t, snapid_t>::iterator blocked_iter =
2072 objects_blocked_on_degraded_snap.find(head);
2073 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2074 hobject_t to_wait_on(head);
2075 to_wait_on.snap = blocked_iter->second;
2076 wait_for_degraded_object(to_wait_on, op);
2077 return;
2078 }
2079 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2080 objects_blocked_on_snap_promotion.find(head);
2081 if (write_ordered &&
2082 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2083 wait_for_blocked_object(
2084 blocked_snap_promote_iter->second->obs.oi.soid,
2085 op);
2086 return;
2087 }
2088 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2089 block_write_on_full_cache(head, op);
2090 return;
2091 }
2092
2093 // missing snapdir?
2094 hobject_t snapdir = head.get_snapdir();
2095
2096 if (is_unreadable_object(snapdir)) {
2097 wait_for_unreadable_object(snapdir, op);
2098 return;
2099 }
2100
2101 // degraded object?
2102 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2103 wait_for_degraded_object(snapdir, op);
2104 return;
2105 }
2106
2107 // dup/resent?
2108 if (op->may_write() || op->may_cache()) {
2109 // warning: we will get back *a* request for this reqid, but not
2110 // necessarily the most recent. this happens with flush and
2111 // promote ops, but we can't possibly have both in our log while
2112 // the original request is still not stable on disk, so for our
2113 // purposes here it doesn't matter which one we get.
2114 eversion_t version;
2115 version_t user_version;
2116 int return_code = 0;
2117 bool got = check_in_progress_op(
2118 m->get_reqid(), &version, &user_version, &return_code);
2119 if (got) {
2120 dout(3) << __func__ << " dup " << m->get_reqid()
2121 << " version " << version << dendl;
2122 if (already_complete(version)) {
2123 osd->reply_op_error(op, return_code, version, user_version);
2124 } else {
2125 dout(10) << " waiting for " << version << " to commit" << dendl;
2126 // always queue ondisk waiters, so that we can requeue if needed
2127 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2128 op->mark_delayed("waiting for ondisk");
2129 }
2130 return;
2131 }
2132 }
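// NOTE: the dup check above consults the pg log via
// check_in_progress_op(); if the reqid is already recorded we reply
// with the stored result (waiting for it to commit if necessary)
// rather than re-executing the write.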
2133
2134 ObjectContextRef obc;
2135 bool can_create = op->may_write() || op->may_cache();
2136 hobject_t missing_oid;
2137 const hobject_t& oid = m->get_hobj();
2138
2139 // io blocked on obc?
2140 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2141 maybe_await_blocked_snapset(oid, op)) {
2142 return;
2143 }
2144
2145 int r = find_object_context(
2146 oid, &obc, can_create,
2147 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2148 &missing_oid);
2149
2150 if (r == -EAGAIN) {
2151 // If we're not the primary for this PG, we let -EAGAIN fall through
2152 // to the error path below. Otherwise, we have to wait for the object.
2153 if (is_primary()) {
2154 // missing the specific snap we need; requeue and wait.
2155 assert(!op->may_write()); // only happens on a read/cache
2156 wait_for_unreadable_object(missing_oid, op);
2157 return;
2158 }
2159 } else if (r == 0) {
2160 if (is_unreadable_object(obc->obs.oi.soid)) {
2161 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2162 << " is unreadable, waiting" << dendl;
2163 wait_for_unreadable_object(obc->obs.oi.soid, op);
2164 return;
2165 }
2166
2167 // degraded object? (the check above was for head; this could be a clone)
2168 if (write_ordered &&
2169 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2170 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2171 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2172 << " is degraded, waiting" << dendl;
2173 wait_for_degraded_object(obc->obs.oi.soid, op);
2174 return;
2175 }
2176 }
2177
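// Record this access in the HitSet, which the tiering agent uses to
// gauge object temperature. We note whether the object was already in
// the current hit set before this op, insert it, and persist the hit
// set once it is full or its period has elapsed.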
2178 bool in_hit_set = false;
2179 if (hit_set) {
2180 if (obc.get()) {
2181 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2182 in_hit_set = true;
2183 } else {
2184 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2185 in_hit_set = true;
2186 }
2187 if (!op->hitset_inserted) {
2188 hit_set->insert(oid);
2189 op->hitset_inserted = true;
2190 if (hit_set->is_full() ||
2191 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2192 hit_set_persist();
2193 }
2194 }
2195 }
2196
2197 if (agent_state) {
2198 if (agent_choose_mode(false, op))
2199 return;
2200 }
2201
2202 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2203 if (maybe_handle_manifest(op,
2204 write_ordered,
2205 obc))
2206 return;
2207 }
2208
2209 if (maybe_handle_cache(op,
2210 write_ordered,
2211 obc,
2212 r,
2213 missing_oid,
2214 false,
2215 in_hit_set))
2216 return;
2217
2218 if (r && (r != -ENOENT || !obc)) {
2219 // copy the reqids for copy get on ENOENT
2220 if (r == -ENOENT &&
2221 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2222 fill_in_copy_get_noent(op, oid, m->ops[0]);
2223 return;
2224 }
2225 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2226 if (op->may_write() &&
2227 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2228 record_write_error(op, oid, nullptr, r);
2229 } else {
2230 osd->reply_op_error(op, r);
2231 }
2232 return;
2233 }
2234
2235 // make sure locator is consistent
2236 object_locator_t oloc(obc->obs.oi.soid);
2237 if (m->get_object_locator() != oloc) {
2238 dout(10) << " provided locator " << m->get_object_locator()
2239 << " != object's " << obc->obs.oi.soid << dendl;
2240 osd->clog->warn() << "bad locator " << m->get_object_locator()
2241 << " on object " << oloc
2242 << " op " << *m;
2243 }
2244
2245 // io blocked on obc?
2246 if (obc->is_blocked() &&
2247 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2248 wait_for_blocked_object(obc->obs.oi.soid, op);
2249 return;
2250 }
2251
2252 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2253
2254 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2255 OSDOp& osd_op = *p;
2256
2257 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2258 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2259 m->get_snapid() != CEPH_SNAPDIR) {
2260 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2261 osd->reply_op_error(op, -EINVAL);
2262 return;
2263 }
2264 }
2265
2266 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2267
2268 if (!obc->obs.exists)
2269 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2270
2271 /* Due to obc caching, we might have a cached non-existent snapset_obc
2272 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2273 * do_op pipeline make decisions based on whether snapset_obc is
2274 * populated.
2275 */
2276 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2277 ctx->snapset_obc = ObjectContextRef();
2278
2279 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2280 dout(20) << __func__ << ": skipping rw locks" << dendl;
2281 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2282 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2283
2284 // verify there is in fact a flush in progress
2285 // FIXME: we could make this a stronger test.
2286 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2287 if (p == flush_ops.end()) {
2288 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2289 reply_ctx(ctx, -EINVAL);
2290 return;
2291 }
2292 } else if (!get_rw_locks(write_ordered, ctx)) {
2293 dout(20) << __func__ << " waiting for rw locks " << dendl;
2294 op->mark_delayed("waiting for rw locks");
2295 close_op_ctx(ctx);
2296 return;
2297 }
2298 dout(20) << __func__ << " obc " << *obc << dendl;
2299
2300 if (r) {
2301 dout(20) << __func__ << " returned an error: " << r << dendl;
2302 close_op_ctx(ctx);
2303 if (op->may_write() &&
2304 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2305 record_write_error(op, oid, nullptr, r);
2306 } else {
2307 osd->reply_op_error(op, r);
2308 }
2309 return;
2310 }
2311
2312 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2313 ctx->ignore_cache = true;
2314 }
2315
2316 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2317 // This object is lost. Reading from it returns an error.
2318 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2319 << " is lost" << dendl;
2320 reply_ctx(ctx, -ENFILE);
2321 return;
2322 }
2323 if (!op->may_write() &&
2324 !op->may_cache() &&
2325 (!obc->obs.exists ||
2326 ((m->get_snapid() != CEPH_SNAPDIR) &&
2327 obc->obs.oi.is_whiteout()))) {
2328 // copy the reqids for copy get on ENOENT
2329 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2330 fill_in_copy_get_noent(op, oid, m->ops[0]);
2331 close_op_ctx(ctx);
2332 return;
2333 }
2334 reply_ctx(ctx, -ENOENT);
2335 return;
2336 }
2337
2338 op->mark_started();
2339
2340 execute_ctx(ctx);
2341 utime_t prepare_latency = ceph_clock_now();
2342 prepare_latency -= op->get_dequeued_time();
2343 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2344 if (op->may_read() && op->may_write()) {
2345 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2346 } else if (op->may_read()) {
2347 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2348 } else if (op->may_write() || op->may_cache()) {
2349 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2350 }
2351
2352 // force recovery of the oldest missing object if too many logs
2353 maybe_force_recovery();
2354 }
2355
2356 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2357 OpRequestRef op,
2358 bool write_ordered,
2359 ObjectContextRef obc)
2360 {
2361 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2362 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2363 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2364 return cache_result_t::NOOP;
2365 }
2366
2367 if (obc)
2368 dout(10) << __func__ << " " << obc->obs.oi << " "
2369 << (obc->obs.exists ? "exists" : "DNE")
2370 << dendl;
2371
2372 // if it is write-ordered and blocked, stop now
2373 if (obc.get() && obc->is_blocked() && write_ordered) {
2374 // we're already doing something with this object
2375 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2376 return cache_result_t::NOOP;
2377 }
2378
2379 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2380 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2381 OSDOp& osd_op = *p;
2382 ceph_osd_op& op = osd_op.op;
2383 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2384 return cache_result_t::NOOP;
2385 }
2386 }
2387
2388 switch (obc->obs.oi.manifest.type) {
2389 case object_manifest_t::TYPE_REDIRECT:
2390 if (op->may_write() || write_ordered) {
2391 do_proxy_write(op, obc->obs.oi.soid, obc);
2392 } else {
2393 do_proxy_read(op, obc);
2394 }
2395 return cache_result_t::HANDLED_PROXY;
2396 case object_manifest_t::TYPE_CHUNKED:
2397 default:
2398 assert(0 == "unrecognized manifest type");
2399 }
2400
2401 return cache_result_t::NOOP;
2402 }
2403
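// Record a failed write as an ERROR entry in the pg log so that a
// resent request with the same reqid hits the dup detection in do_op()
// and receives the same result; the reply itself is sent only once the
// log entry commits.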
2404 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2405 MOSDOpReply *orig_reply, int r)
2406 {
2407 dout(20) << __func__ << " r=" << r << dendl;
2408 assert(op->may_write());
2409 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2410 mempool::osd_pglog::list<pg_log_entry_t> entries;
2411 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2412 get_next_version(), eversion_t(), 0,
2413 reqid, utime_t(), r));
2414
2415 struct OnComplete {
2416 PrimaryLogPG *pg;
2417 OpRequestRef op;
2418 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2419 int r;
2420 OnComplete(
2421 PrimaryLogPG *pg,
2422 OpRequestRef op,
2423 MOSDOpReply *orig_reply,
2424 int r)
2425 : pg(pg), op(op),
2426 orig_reply(orig_reply, false /* take over ref */), r(r)
2427 {}
2428 void operator()() {
2429 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2430 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2431 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2432 MOSDOpReply *reply = orig_reply.detach();
2433 if (reply == nullptr) {
2434 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2435 flags, true);
2436 }
2437 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2438 pg->osd->send_message_osd_client(reply, m->get_connection());
2439 }
2440 };
2441
2442 ObcLockManager lock_manager;
2443 submit_log_entries(
2444 entries,
2445 std::move(lock_manager),
2446 boost::optional<std::function<void(void)> >(
2447 OnComplete(this, op, orig_reply, r)),
2448 op,
2449 r);
2450 }
2451
2452 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2453 OpRequestRef op,
2454 bool write_ordered,
2455 ObjectContextRef obc,
2456 int r, hobject_t missing_oid,
2457 bool must_promote,
2458 bool in_hit_set,
2459 ObjectContextRef *promote_obc)
2460 {
2461 // return quickly if caching is not enabled
2462 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2463 return cache_result_t::NOOP;
2464
2465 if (op &&
2466 op->get_req() &&
2467 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2468 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2469 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2470 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2471 return cache_result_t::NOOP;
2472 }
2473
2474 must_promote = must_promote || op->need_promote();
2475
2476 if (obc)
2477 dout(25) << __func__ << " " << obc->obs.oi << " "
2478 << (obc->obs.exists ? "exists" : "DNE")
2479 << " missing_oid " << missing_oid
2480 << " must_promote " << (int)must_promote
2481 << " in_hit_set " << (int)in_hit_set
2482 << dendl;
2483 else
2484 dout(25) << __func__ << " (no obc)"
2485 << " missing_oid " << missing_oid
2486 << " must_promote " << (int)must_promote
2487 << " in_hit_set " << (int)in_hit_set
2488 << dendl;
2489
2490 // if it is write-ordered and blocked, stop now
2491 if (obc.get() && obc->is_blocked() && write_ordered) {
2492 // we're already doing something with this object
2493 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2494 return cache_result_t::NOOP;
2495 }
2496
2497 if (r == -ENOENT && missing_oid == hobject_t()) {
2498 // we know this object is logically absent (e.g., an undefined clone)
2499 return cache_result_t::NOOP;
2500 }
2501
2502 if (obc.get() && obc->obs.exists) {
2503 osd->logger->inc(l_osd_op_cache_hit);
2504 return cache_result_t::NOOP;
2505 }
2506 if (!is_primary()) {
2507 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2508 osd->reply_op_error(op, -EAGAIN);
2509 return cache_result_t::REPLIED_WITH_EAGAIN;
2510 }
2511
2512 if (missing_oid == hobject_t() && obc.get()) {
2513 missing_oid = obc->obs.oi.soid;
2514 }
2515
2516 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2517 const object_locator_t oloc = m->get_object_locator();
2518
2519 if (op->need_skip_handle_cache()) {
2520 return cache_result_t::NOOP;
2521 }
2522
2523 // older versions do not proxy the feature bits.
2524 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2525 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2526 OpRequestRef promote_op;
2527
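// Dispatch on the pool's cache mode. Roughly: WRITEBACK promotes or
// proxies as needed; FORWARD redirects everything to the base tier;
// READONLY promotes on read miss and redirects writes; READFORWARD and
// READPROXY promote writes but redirect or proxy reads, respectively;
// PROXY proxies both directions unless a promotion is forced.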
2528 switch (pool.info.cache_mode) {
2529 case pg_pool_t::CACHEMODE_WRITEBACK:
2530 if (agent_state &&
2531 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2532 if (!op->may_write() && !op->may_cache() &&
2533 !write_ordered && !must_promote) {
2534 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2535 do_proxy_read(op);
2536 return cache_result_t::HANDLED_PROXY;
2537 }
2538 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2539 block_write_on_full_cache(missing_oid, op);
2540 return cache_result_t::BLOCKED_FULL;
2541 }
2542
2543 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2544 promote_object(obc, missing_oid, oloc, op, promote_obc);
2545 return cache_result_t::BLOCKED_PROMOTE;
2546 }
2547
2548 if (op->may_write() || op->may_cache()) {
2549 if (can_proxy_write) {
2550 do_proxy_write(op, missing_oid);
2551 } else {
2552 // promote if can't proxy the write
2553 promote_object(obc, missing_oid, oloc, op, promote_obc);
2554 return cache_result_t::BLOCKED_PROMOTE;
2555 }
2556
2557 // Promote too?
2558 if (!op->need_skip_promote() &&
2559 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2560 pool.info.min_write_recency_for_promote,
2561 OpRequestRef(),
2562 promote_obc)) {
2563 return cache_result_t::BLOCKED_PROMOTE;
2564 }
2565 return cache_result_t::HANDLED_PROXY;
2566 } else {
2567 do_proxy_read(op);
2568
2569 // Avoid duplicate promotion
2570 if (obc.get() && obc->is_blocked()) {
2571 if (promote_obc)
2572 *promote_obc = obc;
2573 return cache_result_t::BLOCKED_PROMOTE;
2574 }
2575
2576 // Promote too?
2577 if (!op->need_skip_promote()) {
2578 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2579 pool.info.min_read_recency_for_promote,
2580 promote_op, promote_obc);
2581 }
2582
2583 return cache_result_t::HANDLED_PROXY;
2584 }
2585 assert(0 == "unreachable");
2586 return cache_result_t::NOOP;
2587
2588 case pg_pool_t::CACHEMODE_FORWARD:
2589 // FIXME: this mode allows requests to be reordered.
2590 do_cache_redirect(op);
2591 return cache_result_t::HANDLED_REDIRECT;
2592
2593 case pg_pool_t::CACHEMODE_READONLY:
2594 // TODO: clean this case up
2595 if (!obc.get() && r == -ENOENT) {
2596 // we don't have the object and op's a read
2597 promote_object(obc, missing_oid, oloc, op, promote_obc);
2598 return cache_result_t::BLOCKED_PROMOTE;
2599 }
2600 if (!r) { // it must be a write
2601 do_cache_redirect(op);
2602 return cache_result_t::HANDLED_REDIRECT;
2603 }
2604 // crap, there was a failure of some kind
2605 return cache_result_t::NOOP;
2606
2607 case pg_pool_t::CACHEMODE_READFORWARD:
2608 // Do writeback to the cache tier for writes
2609 if (op->may_write() || write_ordered || must_promote) {
2610 if (agent_state &&
2611 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2612 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2613 block_write_on_full_cache(missing_oid, op);
2614 return cache_result_t::BLOCKED_FULL;
2615 }
2616 promote_object(obc, missing_oid, oloc, op, promote_obc);
2617 return cache_result_t::BLOCKED_PROMOTE;
2618 }
2619
2620 // It is a read; forward (redirect) it to the base tier.
2621 do_cache_redirect(op);
2622 return cache_result_t::HANDLED_REDIRECT;
2623
2624 case pg_pool_t::CACHEMODE_PROXY:
2625 if (!must_promote) {
2626 if (op->may_write() || op->may_cache() || write_ordered) {
2627 if (can_proxy_write) {
2628 do_proxy_write(op, missing_oid);
2629 return cache_result_t::HANDLED_PROXY;
2630 }
2631 } else {
2632 do_proxy_read(op);
2633 return cache_result_t::HANDLED_PROXY;
2634 }
2635 }
2636 // ugh, we're forced to promote.
2637 if (agent_state &&
2638 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2639 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2640 block_write_on_full_cache(missing_oid, op);
2641 return cache_result_t::BLOCKED_FULL;
2642 }
2643 promote_object(obc, missing_oid, oloc, op, promote_obc);
2644 return cache_result_t::BLOCKED_PROMOTE;
2645
2646 case pg_pool_t::CACHEMODE_READPROXY:
2647 // Do writeback to the cache tier for writes
2648 if (op->may_write() || write_ordered || must_promote) {
2649 if (agent_state &&
2650 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2651 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2652 block_write_on_full_cache(missing_oid, op);
2653 return cache_result_t::BLOCKED_FULL;
2654 }
2655 promote_object(obc, missing_oid, oloc, op, promote_obc);
2656 return cache_result_t::BLOCKED_PROMOTE;
2657 }
2658
2659 // It is a read; proxy it to the base tier.
2660 do_proxy_read(op);
2661 return cache_result_t::HANDLED_PROXY;
2662
2663 default:
2664 assert(0 == "unrecognized cache_mode");
2665 }
2666 return cache_result_t::NOOP;
2667 }
2668
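// Decide whether an object is hot enough to promote. A recency of N
// requires the object to appear in the current hit set and in the N-1
// most recent archived hit sets consecutively; recency 0 always
// promotes, subject only to the promote throttle.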
2669 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2670 const hobject_t& missing_oid,
2671 const object_locator_t& oloc,
2672 bool in_hit_set,
2673 uint32_t recency,
2674 OpRequestRef promote_op,
2675 ObjectContextRef *promote_obc)
2676 {
2677 dout(20) << __func__ << " missing_oid " << missing_oid
2678 << " in_hit_set " << in_hit_set << dendl;
2679
2680 switch (recency) {
2681 case 0:
2682 break;
2683 case 1:
2684 // Check if in the current hit set
2685 if (in_hit_set) {
2686 break;
2687 } else {
2688 // not promoting
2689 return false;
2690 }
2691 break;
2692 default:
2693 {
2694 unsigned count = (int)in_hit_set;
2695 if (count) {
2696 // Check if in other hit sets
2697 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2698 for (map<time_t,HitSetRef>::reverse_iterator itor =
2699 agent_state->hit_set_map.rbegin();
2700 itor != agent_state->hit_set_map.rend();
2701 ++itor) {
2702 if (!itor->second->contains(oid)) {
2703 break;
2704 }
2705 ++count;
2706 if (count >= recency) {
2707 break;
2708 }
2709 }
2710 }
2711 if (count >= recency) {
2712 break;
2713 }
2714 return false; // not promoting
2715 }
2716 break;
2717 }
2718
2719 if (osd->promote_throttle()) {
2720 dout(10) << __func__ << " promote throttled" << dendl;
2721 return false;
2722 }
2723 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2724 return true;
2725 }
2726
2727 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2728 {
2729 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2730 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2731 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2732 get_osdmap()->get_epoch(), flags, false);
2733 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2734 reply->set_redirect(redir);
2735 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2736 << op << dendl;
2737 m->get_connection()->send_message(reply);
2738 return;
2739 }
2740
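// Completion for a proxied read. This fires on the objecter finisher
// thread, so it must retake the pg lock and re-check both cancellation
// and last_peering_reset before touching any pg state.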
2741 struct C_ProxyRead : public Context {
2742 PrimaryLogPGRef pg;
2743 hobject_t oid;
2744 epoch_t last_peering_reset;
2745 ceph_tid_t tid;
2746 PrimaryLogPG::ProxyReadOpRef prdop;
2747 utime_t start;
2748 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2749 const PrimaryLogPG::ProxyReadOpRef& prd)
2750 : pg(p), oid(o), last_peering_reset(lpr),
2751 tid(0), prdop(prd), start(ceph_clock_now())
2752 {}
2753 void finish(int r) override {
2754 if (prdop->canceled)
2755 return;
2756 pg->lock();
2757 if (prdop->canceled) {
2758 pg->unlock();
2759 return;
2760 }
2761 if (last_peering_reset == pg->get_last_peering_reset()) {
2762 pg->finish_proxy_read(oid, tid, r);
2763 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2764 }
2765 pg->unlock();
2766 }
2767 };
2768
2769 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2770 {
2771 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2772 // stash the result in the request's OSDOp vector
2773 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2774 object_locator_t oloc;
2775 hobject_t soid;
2776 /* extensible tier */
2777 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2778 switch (obc->obs.oi.manifest.type) {
2779 case object_manifest_t::TYPE_REDIRECT:
2780 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2781 soid = obc->obs.oi.manifest.redirect_target;
2782 break;
2783 case object_manifest_t::TYPE_CHUNKED:
2784 default:
2785 assert(0 == "unrecognized manifest type");
2786 }
2787 } else {
2788 /* proxy */
2789 soid = m->get_hobj();
2790 oloc = object_locator_t(m->get_object_locator());
2791 oloc.pool = pool.info.tier_of;
2792 }
2793 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2794
2795 // pass through some original flags that make sense.
2796 // - leave out redirection and balancing flags since we are
2797 // already proxying through the primary
2798 // - leave off read/write/exec flags that are derived from the op
2799 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2800 CEPH_OSD_FLAG_ORDERSNAP |
2801 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2802 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2803
2804 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2805
2806 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2807
2808 ObjectOperation obj_op;
2809 obj_op.dup(prdop->ops);
2810
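// For writeback pools not in full-eviction mode, bias the proxied
// reads toward FADVISE_SEQUENTIAL and strip DONTNEED/NOCACHE,
// presumably so the base tier keeps the data warm for a likely
// follow-up promotion.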
2811 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2812 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2813 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2814 ceph_osd_op op = obj_op.ops[i].op;
2815 switch (op.op) {
2816 case CEPH_OSD_OP_READ:
2817 case CEPH_OSD_OP_SYNC_READ:
2818 case CEPH_OSD_OP_SPARSE_READ:
2819 case CEPH_OSD_OP_CHECKSUM:
2820 case CEPH_OSD_OP_CMPEXT:
2821 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2822 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2823 }
2824 }
2825 }
2826
2827 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2828 prdop);
2829 ceph_tid_t tid = osd->objecter->read(
2830 soid.oid, oloc, obj_op,
2831 m->get_snapid(), NULL,
2832 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2833 &prdop->user_version,
2834 &prdop->data_offset,
2835 m->get_features());
2836 fin->tid = tid;
2837 prdop->objecter_tid = tid;
2838 proxyread_ops[tid] = prdop;
2839 in_progress_proxy_ops[soid].push_back(op);
2840 }
2841
2842 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2843 {
2844 dout(10) << __func__ << " " << oid << " tid " << tid
2845 << " " << cpp_strerror(r) << dendl;
2846
2847 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2848 if (p == proxyread_ops.end()) {
2849 dout(10) << __func__ << " no proxyread_op found" << dendl;
2850 return;
2851 }
2852 ProxyReadOpRef prdop = p->second;
2853 if (tid != prdop->objecter_tid) {
2854 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2855 << " tid " << prdop->objecter_tid << dendl;
2856 return;
2857 }
2858 if (oid != prdop->soid) {
2859 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2860 << " soid " << prdop->soid << dendl;
2861 return;
2862 }
2863 proxyread_ops.erase(tid);
2864
2865 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2866 if (q == in_progress_proxy_ops.end()) {
2867 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2868 return;
2869 }
2870 assert(q->second.size());
2871 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2872 q->second.end(),
2873 prdop->op);
2874 assert(it != q->second.end());
2875 OpRequestRef op = *it;
2876 q->second.erase(it);
2877 if (q->second.size() == 0) {
2878 in_progress_proxy_ops.erase(oid);
2879 }
2880
2881 osd->logger->inc(l_osd_tier_proxy_read);
2882
2883 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2884 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2885 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2886 ctx->user_at_version = prdop->user_version;
2887 ctx->data_off = prdop->data_offset;
2888 ctx->ignore_log_op_stats = true;
2889 complete_read_ctx(r, ctx);
2890 }
2891
2892 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2893 {
2894 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2895 if (p == in_progress_proxy_ops.end())
2896 return;
2897
2898 list<OpRequestRef>& ls = p->second;
2899 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2900 requeue_ops(ls);
2901 in_progress_proxy_ops.erase(p);
2902 }
2903
2904 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
2905 vector<ceph_tid_t> *tids)
2906 {
2907 dout(10) << __func__ << " " << prdop->soid << dendl;
2908 prdop->canceled = true;
2909
2910 // cancel objecter op, if we can
2911 if (prdop->objecter_tid) {
2912 tids->push_back(prdop->objecter_tid);
2913 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2914 prdop->ops[i].outdata.clear();
2915 }
2916 proxyread_ops.erase(prdop->objecter_tid);
2917 prdop->objecter_tid = 0;
2918 }
2919 }
2920
2921 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
2922 {
2923 dout(10) << __func__ << dendl;
2924
2925 // cancel proxy reads
2926 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2927 while (p != proxyread_ops.end()) {
2928 cancel_proxy_read((p++)->second, tids);
2929 }
2930
2931 // cancel proxy writes
2932 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2933 while (q != proxywrite_ops.end()) {
2934 cancel_proxy_write((q++)->second, tids);
2935 }
2936
2937 if (requeue) {
2938 map<hobject_t, list<OpRequestRef>>::iterator p =
2939 in_progress_proxy_ops.begin();
2940 while (p != in_progress_proxy_ops.end()) {
2941 list<OpRequestRef>& ls = p->second;
2942 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2943 << " requests" << dendl;
2944 requeue_ops(ls);
2945 in_progress_proxy_ops.erase(p++);
2946 }
2947 } else {
2948 in_progress_proxy_ops.clear();
2949 }
2950 }
2951
2952 struct C_ProxyWrite_Commit : public Context {
2953 PrimaryLogPGRef pg;
2954 hobject_t oid;
2955 epoch_t last_peering_reset;
2956 ceph_tid_t tid;
2957 PrimaryLogPG::ProxyWriteOpRef pwop;
2958 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2959 const PrimaryLogPG::ProxyWriteOpRef& pw)
2960 : pg(p), oid(o), last_peering_reset(lpr),
2961 tid(0), pwop(pw)
2962 {}
2963 void finish(int r) override {
2964 if (pwop->canceled)
2965 return;
2966 pg->lock();
2967 if (pwop->canceled) {
2968 pg->unlock();
2969 return;
2970 }
2971 if (last_peering_reset == pg->get_last_peering_reset()) {
2972 pg->finish_proxy_write(oid, tid, r);
2973 }
2974 pg->unlock();
2975 }
2976 };
2977
2978 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2979 {
2980 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2981 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2982 object_locator_t oloc;
2983 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2984 hobject_t soid;
2985 /* extensible tier */
2986 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2987 switch (obc->obs.oi.manifest.type) {
2988 case object_manifest_t::TYPE_REDIRECT:
2989 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2990 soid = obc->obs.oi.manifest.redirect_target;
2991 break;
2992 case object_manifest_t::TYPE_CHUNKED:
2993 default:
2994 assert(0 == "unrecognized manifest type");
2995 }
2996 } else {
2997 /* proxy */
2998 soid = m->get_hobj();
2999 oloc = object_locator_t(m->get_object_locator());
3000 oloc.pool = pool.info.tier_of;
3001 }
3002
3003 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3004 if (!(op->may_write() || op->may_cache())) {
3005 flags |= CEPH_OSD_FLAG_RWORDERED;
3006 }
3007 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3008
3009 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3010 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3011 pwop->mtime = m->get_mtime();
3012
3013 ObjectOperation obj_op;
3014 obj_op.dup(pwop->ops);
3015
3016 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3017 this, soid, get_last_peering_reset(), pwop);
3018 ceph_tid_t tid = osd->objecter->mutate(
3019 soid.oid, oloc, obj_op, snapc,
3020 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3021 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3022 &pwop->user_version, pwop->reqid);
3023 fin->tid = tid;
3024 pwop->objecter_tid = tid;
3025 proxywrite_ops[tid] = pwop;
3026 in_progress_proxy_ops[soid].push_back(op);
3027 }
3028
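// Complete a proxied write. Proxied writes only ever reply on commit
// (ACK and ONDISK together); there is no separate ack phase.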
3029 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3030 {
3031 dout(10) << __func__ << " " << oid << " tid " << tid
3032 << " " << cpp_strerror(r) << dendl;
3033
3034 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3035 if (p == proxywrite_ops.end()) {
3036 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3037 return;
3038 }
3039 ProxyWriteOpRef pwop = p->second;
3040 assert(tid == pwop->objecter_tid);
3041 assert(oid == pwop->soid);
3042
3043 proxywrite_ops.erase(tid);
3044
3045 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3046 if (q == in_progress_proxy_ops.end()) {
3047 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3048 delete pwop->ctx;
3049 pwop->ctx = NULL;
3050 return;
3051 }
3052 list<OpRequestRef>& in_progress_op = q->second;
3053 assert(in_progress_op.size());
3054 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3055 in_progress_op.end(),
3056 pwop->op);
3057 assert(it != in_progress_op.end());
3058 in_progress_op.erase(it);
3059 if (in_progress_op.size() == 0) {
3060 in_progress_proxy_ops.erase(oid);
3061 }
3062
3063 osd->logger->inc(l_osd_tier_proxy_write);
3064
3065 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3066 assert(m != NULL);
3067
3068 if (!pwop->sent_reply) {
3069 // send commit.
3070 MOSDOpReply *reply = pwop->ctx->reply;
3071 if (reply)
3072 pwop->ctx->reply = NULL;
3073 else {
3074 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3075 reply->set_reply_versions(eversion_t(), pwop->user_version);
3076 }
3077 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3078 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3079 osd->send_message_osd_client(reply, m->get_connection());
3080 pwop->sent_reply = true;
3081 pwop->ctx->op->mark_commit_sent();
3082 }
3083
3084 delete pwop->ctx;
3085 pwop->ctx = NULL;
3086 }
3087
3088 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3089 vector<ceph_tid_t> *tids)
3090 {
3091 dout(10) << __func__ << " " << pwop->soid << dendl;
3092 pwop->canceled = true;
3093
3094 // cancel objecter op, if we can
3095 if (pwop->objecter_tid) {
3096 tids->push_back(pwop->objecter_tid);
3097 delete pwop->ctx;
3098 pwop->ctx = NULL;
3099 proxywrite_ops.erase(pwop->objecter_tid);
3100 pwop->objecter_tid = 0;
3101 }
3102 }
3103
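// Promotion reuses the copy-from machinery: start_copy() pulls the
// object in from the base tier, and PromoteCallback::finish() hands
// the result to finish_promote() once the copy completes.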
3104 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3105 ObjectContextRef obc;
3106 PrimaryLogPG *pg;
3107 utime_t start;
3108 public:
3109 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3110 : obc(obc_),
3111 pg(pg_),
3112 start(ceph_clock_now()) {}
3113
3114 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3115 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3116 int r = results.get<0>();
3117 pg->finish_promote(r, results_data, obc);
3118 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3119 }
3120 };
3121
3122 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3123 const hobject_t& missing_oid,
3124 const object_locator_t& oloc,
3125 OpRequestRef op,
3126 ObjectContextRef *promote_obc)
3127 {
3128 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3129 assert(hoid != hobject_t());
3130 if (scrubber.write_blocked_by_scrub(hoid)) {
3131 dout(10) << __func__ << " " << hoid
3132 << " blocked by scrub" << dendl;
3133 if (op) {
3134 waiting_for_scrub.push_back(op);
3135 op->mark_delayed("waiting for scrub");
3136 dout(10) << __func__ << " " << hoid
3137 << " placing op in waiting_for_scrub" << dendl;
3138 } else {
3139 dout(10) << __func__ << " " << hoid
3140 << " no op, dropping on the floor" << dendl;
3141 }
3142 return;
3143 }
3144 if (!obc) { // we need to create an ObjectContext
3145 assert(missing_oid != hobject_t());
3146 obc = get_object_context(missing_oid, true);
3147 }
3148 if (promote_obc)
3149 *promote_obc = obc;
3150
3151 /*
3152 * If there are proxy reads in flight for this object while the
3153 * promote is underway, skip DONTNEED: the source data is still in use.
3154 */
3155 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3156 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3157 if (q == in_progress_proxy_ops.end()) {
3158 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3159 }
3160
3161 PromoteCallback *cb = new PromoteCallback(obc, this);
3162 object_locator_t my_oloc = oloc;
3163 my_oloc.pool = pool.info.tier_of;
3164
3165 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3166 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3167 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3168 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3169 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3170 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3171 src_fadvise_flags, 0);
3172
3173 assert(obc->is_blocked());
3174
3175 if (op)
3176 wait_for_blocked_object(obc->obs.oi.soid, op);
3177 info.stats.stats.sum.num_promote++;
3178 }
3179
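// execute_ctx() is the main op pipeline: build a PGTransaction, run
// prepare_transaction() over the client ops, reply directly for reads
// and errors, and otherwise submit a RepGather to replicate the write.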
3180 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3181 {
3182 FUNCTRACE();
3183 dout(10) << __func__ << " " << ctx << dendl;
3184 ctx->reset_obs(ctx->obc);
3185 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3186 OpRequestRef op = ctx->op;
3187 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3188 ObjectContextRef obc = ctx->obc;
3189 const hobject_t& soid = obc->obs.oi.soid;
3190
3191 // this method must be idempotent since we may call it several times
3192 // before we finally apply the resulting transaction.
3193 ctx->op_t.reset(new PGTransaction);
3194
3195 if (op->may_write() || op->may_cache()) {
3196 // snap
3197 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3198 pool.info.is_pool_snaps_mode()) {
3199 // use pool's snapc
3200 ctx->snapc = pool.snapc;
3201 } else {
3202 // client specified snapc
3203 ctx->snapc.seq = m->get_snap_seq();
3204 ctx->snapc.snaps = m->get_snaps();
3205 filter_snapc(ctx->snapc.snaps);
3206 }
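// ORDERSNAP: reject the write if the client's snap context is older
// than the object's snapset seq; the client is expected to refresh
// its snapc and resend when it sees -EOLDSNAPC.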
3207 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3208 ctx->snapc.seq < obc->ssc->snapset.seq) {
3209 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3210 << " < snapset seq " << obc->ssc->snapset.seq
3211 << " on " << obc->obs.oi.soid << dendl;
3212 reply_ctx(ctx, -EOLDSNAPC);
3213 return;
3214 }
3215
3216 // version
3217 ctx->at_version = get_next_version();
3218 ctx->mtime = m->get_mtime();
3219
3220 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3221 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3222 << " snapc " << ctx->snapc
3223 << " snapset " << obc->ssc->snapset
3224 << dendl;
3225 } else {
3226 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3227 << " ov " << obc->obs.oi.version
3228 << dendl;
3229 }
3230
3231 if (!ctx->user_at_version)
3232 ctx->user_at_version = obc->obs.oi.user_version;
3233 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3234
3235 if (op->may_read()) {
3236 dout(10) << " taking ondisk_read_lock" << dendl;
3237 obc->ondisk_read_lock();
3238 }
3239
3240 {
3241 #ifdef WITH_LTTNG
3242 osd_reqid_t reqid = ctx->op->get_reqid();
3243 #endif
3244 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3245 reqid.name._num, reqid.tid, reqid.inc);
3246 }
3247
3248 int result = prepare_transaction(ctx);
3249
3250 {
3251 #ifdef WITH_LTTNG
3252 osd_reqid_t reqid = ctx->op->get_reqid();
3253 #endif
3254 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3255 reqid.name._num, reqid.tid, reqid.inc);
3256 }
3257
3258 if (op->may_read()) {
3259 dout(10) << " dropping ondisk_read_lock" << dendl;
3260 obc->ondisk_read_unlock();
3261 }
3262
3263 bool pending_async_reads = !ctx->pending_async_reads.empty();
3264 if (result == -EINPROGRESS || pending_async_reads) {
3265 // come back later.
3266 if (pending_async_reads) {
3267 in_progress_async_reads.push_back(make_pair(op, ctx));
3268 ctx->start_async_reads(this);
3269 }
3270 return;
3271 }
3272
3273 if (result == -EAGAIN) {
3274 // clean up after the ctx
3275 close_op_ctx(ctx);
3276 return;
3277 }
3278
3279 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3280 // prepare the reply
3281 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3282 successful_write);
3283
3284 // Write operations aren't allowed to return a data payload because
3285 // we can't do so reliably. If the client has to resend the request
3286 // and it has already been applied, we will return 0 with no
3287 // payload. Non-deterministic behavior is no good. However, it is
3288 // possible to construct an operation that does a read, does a guard
3289 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3290 // with the write, or return a CMPXATTR and the read value.
3291 if (successful_write) {
3292 // write. normalize the result code.
3293 dout(20) << " zeroing write result code " << result << dendl;
3294 result = 0;
3295 }
3296 ctx->reply->set_result(result);
3297
3298 // read or error?
3299 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3300 // finish side-effects
3301 if (result >= 0)
3302 do_osd_op_effects(ctx, m->get_connection());
3303
3304 complete_read_ctx(result, ctx);
3305 return;
3306 }
3307
3308 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3309
3310 assert(op->may_write() || op->may_cache());
3311
3312 // trim log?
3313 calc_trim_to();
3314
3315 // verify that we are doing this in order?
3316 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3317 !pool.info.is_tier() && !pool.info.has_tiers()) {
3318 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3319 ceph_tid_t t = m->get_tid();
3320 client_t n = m->get_source().num();
3321 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3322 if (p == cm.end()) {
3323 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3324 cm[n] = t;
3325 } else {
3326 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3327 if (p->second > t) {
3328 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3329 assert(0 == "out of order op");
3330 }
3331 p->second = t;
3332 }
3333 }
3334
3335 if (ctx->update_log_only) {
3336 if (result >= 0)
3337 do_osd_op_effects(ctx, m->get_connection());
3338
3339 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3340 // save just what we need from ctx
3341 MOSDOpReply *reply = ctx->reply;
3342 ctx->reply = nullptr;
3343 reply->claim_op_out_data(*ctx->ops);
3344 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3345 close_op_ctx(ctx);
3346
3347 if (result == -ENOENT) {
3348 reply->set_enoent_reply_versions(info.last_update,
3349 info.last_user_version);
3350 }
3351 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3352 // append to pg log for dup detection - don't save buffers for now
3353 record_write_error(op, soid, reply, result);
3354 return;
3355 }
3356
3357 // no need to capture PG ref, repop cancel will handle that
3358 // Can capture the ctx by pointer, it's owned by the repop
3359 ctx->register_on_commit(
3360 [m, ctx, this](){
3361 if (ctx->op)
3362 log_op_stats(
3363 ctx);
3364
3365 if (m && !ctx->sent_reply) {
3366 MOSDOpReply *reply = ctx->reply;
3367 if (reply)
3368 ctx->reply = nullptr;
3369 else {
3370 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3371 reply->set_reply_versions(ctx->at_version,
3372 ctx->user_at_version);
3373 }
3374 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3375 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3376 osd->send_message_osd_client(reply, m->get_connection());
3377 ctx->sent_reply = true;
3378 ctx->op->mark_commit_sent();
3379 }
3380 });
3381 ctx->register_on_success(
3382 [ctx, this]() {
3383 do_osd_op_effects(
3384 ctx,
3385 ctx->op ? ctx->op->get_req()->get_connection() :
3386 ConnectionRef());
3387 });
3388 ctx->register_on_finish(
3389 [ctx, this]() {
3390 delete ctx;
3391 });
3392
3393 // issue replica writes
3394 ceph_tid_t rep_tid = osd->get_tid();
3395
3396 RepGather *repop = new_repop(ctx, obc, rep_tid);
3397
3398 issue_repop(repop, ctx);
3399 eval_repop(repop);
3400 repop->put();
3401 }
3402
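// Tear down an OpContext that will not be submitted: drop its object
// locks, discard the pending transaction, and run any registered
// on_finish callbacks before deleting it.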
3403 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3404 release_object_locks(ctx->lock_manager);
3405
3406 ctx->op_t.reset();
3407
3408 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3409 ctx->on_finish.erase(p++)) {
3410 (*p)();
3411 }
3412 delete ctx;
3413 }
3414
3415 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3416 {
3417 if (ctx->op)
3418 osd->reply_op_error(ctx->op, r);
3419 close_op_ctx(ctx);
3420 }
3421
3422 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3423 {
3424 if (ctx->op)
3425 osd->reply_op_error(ctx->op, r, v, uv);
3426 close_op_ctx(ctx);
3427 }
3428
3429 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3430 {
3431 OpRequestRef op = ctx->op;
3432 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3433
3434 utime_t now = ceph_clock_now();
3435 utime_t latency = now;
3436 latency -= ctx->op->get_req()->get_recv_stamp();
3437 utime_t process_latency = now;
3438 process_latency -= ctx->op->get_dequeued_time();
3439
3440 uint64_t inb = ctx->bytes_written;
3441 uint64_t outb = ctx->bytes_read;
3442
3443 osd->logger->inc(l_osd_op);
3444
3445 osd->logger->inc(l_osd_op_outb, outb);
3446 osd->logger->inc(l_osd_op_inb, inb);
3447 osd->logger->tinc(l_osd_op_lat, latency);
3448 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3449
3450 if (op->may_read() && op->may_write()) {
3451 osd->logger->inc(l_osd_op_rw);
3452 osd->logger->inc(l_osd_op_rw_inb, inb);
3453 osd->logger->inc(l_osd_op_rw_outb, outb);
3454 osd->logger->tinc(l_osd_op_rw_lat, latency);
3455 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3456 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3457 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3458 } else if (op->may_read()) {
3459 osd->logger->inc(l_osd_op_r);
3460 osd->logger->inc(l_osd_op_r_outb, outb);
3461 osd->logger->tinc(l_osd_op_r_lat, latency);
3462 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3463 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3464 } else if (op->may_write() || op->may_cache()) {
3465 osd->logger->inc(l_osd_op_w);
3466 osd->logger->inc(l_osd_op_w_inb, inb);
3467 osd->logger->tinc(l_osd_op_w_lat, latency);
3468 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3469 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3470 } else
3471 ceph_abort();
3472
3473 dout(15) << "log_op_stats " << *m
3474 << " inb " << inb
3475 << " outb " << outb
3476 << " lat " << latency << dendl;
3477 }
3478
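// Legacy MOSDSubOp handling: in this release sub ops only carry a few
// scrub and delete control messages (likely retained for compatibility
// with older peers); replicated writes go through the PGBackend paths
// instead.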
3479 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3480 {
3481 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3482 assert(have_same_or_newer_map(m->map_epoch));
3483 assert(m->get_type() == MSG_OSD_SUBOP);
3484 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3485
3486 if (!is_peered()) {
3487 waiting_for_peered.push_back(op);
3488 op->mark_delayed("waiting for active");
3489 return;
3490 }
3491
3492 const OSDOp *first = NULL;
3493 if (m->ops.size() >= 1) {
3494 first = &m->ops[0];
3495 }
3496
3497 if (first) {
3498 switch (first->op.op) {
3499 case CEPH_OSD_OP_DELETE:
3500 sub_op_remove(op);
3501 return;
3502 case CEPH_OSD_OP_SCRUB_RESERVE:
3503 handle_scrub_reserve_request(op);
3504 return;
3505 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3506 handle_scrub_reserve_release(op);
3507 return;
3508 case CEPH_OSD_OP_SCRUB_MAP:
3509 sub_op_scrub_map(op);
3510 return;
3511 }
3512 }
3513 }
3514
3515 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3516 {
3517 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3518 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3519 if (r->ops.size() >= 1) {
3520 const OSDOp& first = r->ops[0];
3521 switch (first.op.op) {
3522 case CEPH_OSD_OP_SCRUB_RESERVE:
3523 {
3524 pg_shard_t from = r->from;
3525 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3526 bool reserved;
3527 ::decode(reserved, p);
3528 if (reserved) {
3529 handle_scrub_reserve_grant(op, from);
3530 } else {
3531 handle_scrub_reserve_reject(op, from);
3532 }
3533 }
3534 return;
3535 }
3536 }
3537 }
3538
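// Backfill scan protocol: the primary sends OP_SCAN_GET_DIGEST to a
// backfill target, which scans a range of objects and returns them via
// OP_SCAN_DIGEST; the primary stores the result in peer_backfill_info
// for the backfill state machine to consume.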
3539 void PrimaryLogPG::do_scan(
3540 OpRequestRef op,
3541 ThreadPool::TPHandle &handle)
3542 {
3543 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3544 assert(m->get_type() == MSG_OSD_PG_SCAN);
3545 dout(10) << "do_scan " << *m << dendl;
3546
3547 op->mark_started();
3548
3549 switch (m->op) {
3550 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3551 {
3552 ostringstream ss;
3553 if (osd->check_backfill_full(ss)) {
3554 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3555 queue_peering_event(
3556 CephPeeringEvtRef(
3557 std::make_shared<CephPeeringEvt>(
3558 get_osdmap()->get_epoch(),
3559 get_osdmap()->get_epoch(),
3560 BackfillTooFull())));
3561 return;
3562 }
3563
3564 BackfillInterval bi;
3565 bi.begin = m->begin;
3566 // No need to flush; there won't be any in-progress writes occurring
3567 // past m->begin
3568 scan_range(
3569 cct->_conf->osd_backfill_scan_min,
3570 cct->_conf->osd_backfill_scan_max,
3571 &bi,
3572 handle);
3573 MOSDPGScan *reply = new MOSDPGScan(
3574 MOSDPGScan::OP_SCAN_DIGEST,
3575 pg_whoami,
3576 get_osdmap()->get_epoch(), m->query_epoch,
3577 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3578 ::encode(bi.objects, reply->get_data());
3579 osd->send_message_osd_cluster(reply, m->get_connection());
3580 }
3581 break;
3582
3583 case MOSDPGScan::OP_SCAN_DIGEST:
3584 {
3585 pg_shard_t from = m->from;
3586
3587 // Check that from is in backfill_targets vector
3588 assert(is_backfill_targets(from));
3589
3590 BackfillInterval& bi = peer_backfill_info[from];
3591 bi.begin = m->begin;
3592 bi.end = m->end;
3593 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3594
3595 // take care to preserve ordering!
3596 bi.clear_objects();
3597 ::decode_noclear(bi.objects, p);
3598
3599 if (waiting_on_backfill.erase(from)) {
3600 if (waiting_on_backfill.empty()) {
3601 assert(peer_backfill_info.size() == backfill_targets.size());
3602 finish_recovery_op(hobject_t::get_max());
3603 }
3604 } else {
3605 // we canceled backfill for a while due to a too-full condition,
3606 // and this is an extra response from a non-too-full peer
3607 }
3608 }
3609 break;
3610 }
3611 }
3612
3613 void PrimaryLogPG::do_backfill(OpRequestRef op)
3614 {
3615 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3616 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3617 dout(10) << "do_backfill " << *m << dendl;
3618
3619 op->mark_started();
3620
3621 switch (m->op) {
3622 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3623 {
3624 assert(cct->_conf->osd_kill_backfill_at != 1);
3625
3626 MOSDPGBackfill *reply = new MOSDPGBackfill(
3627 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3628 get_osdmap()->get_epoch(),
3629 m->query_epoch,
3630 spg_t(info.pgid.pgid, get_primary().shard));
3631 reply->set_priority(get_recovery_op_priority());
3632 osd->send_message_osd_cluster(reply, m->get_connection());
3633 queue_peering_event(
3634 CephPeeringEvtRef(
3635 std::make_shared<CephPeeringEvt>(
3636 get_osdmap()->get_epoch(),
3637 get_osdmap()->get_epoch(),
3638 RecoveryDone())));
3639 }
3640 // fall-thru
3641
3642 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3643 {
3644 assert(cct->_conf->osd_kill_backfill_at != 2);
3645
3646 info.set_last_backfill(m->last_backfill);
3647 info.stats = m->stats;
3648
3649 ObjectStore::Transaction t;
3650 dirty_info = true;
3651 write_if_dirty(t);
3652 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3653 assert(tr == 0);
3654 }
3655 break;
3656
3657 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3658 {
3659 assert(is_primary());
3660 assert(cct->_conf->osd_kill_backfill_at != 3);
3661 finish_recovery_op(hobject_t::get_max());
3662 }
3663 break;
3664 }
3665 }
3666
3667 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3668 {
3669 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3670 op->get_req());
3671 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3672 dout(7) << __func__ << " " << m->ls << dendl;
3673
3674 op->mark_started();
3675
3676 ObjectStore::Transaction t;
3677 for (auto& p : m->ls) {
3678 remove_snap_mapped_object(t, p.first);
3679 }
3680 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3681 assert(r == 0);
3682 }
3683
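// trim_object: build a transaction that removes one trimmed snap's
// state from a clone. A sketch of the logic implemented below:
//
//   new_snaps = { s in old_snaps : !pool.info.is_removed_snap(s) }
//   if new_snaps is empty:
//     delete the clone, merge its overlap into the next older clone,
//     and erase it from snapset.clones / clone_{overlap,size,snaps}
//   else:
//     rewrite the clone's snaps (legacy oi.legacy_snaps or
//     snapset.clone_snaps) and re-encode its object_info
//   finally, rewrite the head/snapdir SnapSet, or remove the snapdir
//   object when no clones remain and the head is absent or a plain
//   whiteout
//
// Returns 0 with *ctxp filled in, -ENOENT on inconsistent snap
// metadata, or -ENOLCK if the snaptrimmer write locks are unavailable.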
3684 int PrimaryLogPG::trim_object(
3685 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3686 {
3687 *ctxp = NULL;
3688 // load clone info
3689 bufferlist bl;
3690 ObjectContextRef obc = get_object_context(coid, false, NULL);
3691 if (!obc || !obc->ssc || !obc->ssc->exists) {
3692 osd->clog->error() << __func__ << ": Cannot trim " << coid
3693 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3694 return -ENOENT;
3695 }
3696
3697 hobject_t snapoid(
3698 coid.oid, coid.get_key(),
3699 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3700 info.pgid.pool(), coid.get_namespace());
3701 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3702 if (!snapset_obc) {
3703 osd->clog->error() << __func__ << ": Cannot trim " << coid
3704 << " repair needed, no snapset obc for " << snapoid;
3705 return -ENOENT;
3706 }
3707
3708 SnapSet& snapset = obc->ssc->snapset;
3709
3710 bool legacy = snapset.is_legacy() ||
3711 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3712
3713 object_info_t &coi = obc->obs.oi;
3714 set<snapid_t> old_snaps;
3715 if (legacy) {
3716 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3717 } else {
3718 auto p = snapset.clone_snaps.find(coid.snap);
3719 if (p == snapset.clone_snaps.end()) {
3720 osd->clog->error() << "No clone_snaps in snapset " << snapset
3721 << " for object " << coid << "\n";
3722 return -ENOENT;
3723 }
3724 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3725 snapset.clone_snaps[coid.snap].end());
3726 }
3727 if (old_snaps.empty()) {
3728 osd->clog->error() << "No object info snaps for object " << coid;
3729 return -ENOENT;
3730 }
3731
3732 dout(10) << coid << " old_snaps " << old_snaps
3733 << " old snapset " << snapset << dendl;
3734 if (snapset.seq == 0) {
3735 osd->clog->error() << "No snapset.seq for object " << coid;
3736 return -ENOENT;
3737 }
3738
3739 set<snapid_t> new_snaps;
3740 for (set<snapid_t>::iterator i = old_snaps.begin();
3741 i != old_snaps.end();
3742 ++i) {
3743 if (!pool.info.is_removed_snap(*i))
3744 new_snaps.insert(*i);
3745 }
3746
3747 vector<snapid_t>::iterator p = snapset.clones.end();
3748
3749 if (new_snaps.empty()) {
3750 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3751 if (p == snapset.clones.end()) {
3752 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3753 return -ENOENT;
3754 }
3755 }
3756
3757 OpContextUPtr ctx = simple_opc_create(obc);
3758 ctx->snapset_obc = snapset_obc;
3759
3760 if (!ctx->lock_manager.get_snaptrimmer_write(
3761 coid,
3762 obc,
3763 first)) {
3764 close_op_ctx(ctx.release());
3765 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3766 return -ENOLCK;
3767 }
3768
3769 if (!ctx->lock_manager.get_snaptrimmer_write(
3770 snapoid,
3771 snapset_obc,
3772 first)) {
3773 close_op_ctx(ctx.release());
3774 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3775 return -ENOLCK;
3776 }
3777
3778 ctx->at_version = get_next_version();
3779
3780 PGTransaction *t = ctx->op_t.get();
3781
3782 if (new_snaps.empty()) {
3783 // remove clone
3784 dout(10) << coid << " snaps " << old_snaps << " -> "
3785 << new_snaps << " ... deleting" << dendl;
3786
3787 // ...from snapset
3788 assert(p != snapset.clones.end());
3789
3790 snapid_t last = coid.snap;
3791 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3792
3793 if (p != snapset.clones.begin()) {
3794 // not the oldest... merge overlap into next older clone
3795 vector<snapid_t>::iterator n = p - 1;
3796 hobject_t prev_coid = coid;
3797 prev_coid.snap = *n;
3798 bool adjust_prev_bytes = is_present_clone(prev_coid);
3799
3800 if (adjust_prev_bytes)
3801 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3802
3803 snapset.clone_overlap[*n].intersection_of(
3804 snapset.clone_overlap[*p]);
3805
3806 if (adjust_prev_bytes)
3807 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3808 }
3809 ctx->delta_stats.num_objects--;
3810 if (coi.is_dirty())
3811 ctx->delta_stats.num_objects_dirty--;
3812 if (coi.is_omap())
3813 ctx->delta_stats.num_objects_omap--;
3814 if (coi.is_whiteout()) {
3815 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3816 ctx->delta_stats.num_whiteouts--;
3817 }
3818 ctx->delta_stats.num_object_clones--;
3819 if (coi.is_cache_pinned())
3820 ctx->delta_stats.num_objects_pinned--;
3821 obc->obs.exists = false;
3822
3823 snapset.clones.erase(p);
3824 snapset.clone_overlap.erase(last);
3825 snapset.clone_size.erase(last);
3826 snapset.clone_snaps.erase(last);
3827
3828 ctx->log.push_back(
3829 pg_log_entry_t(
3830 pg_log_entry_t::DELETE,
3831 coid,
3832 ctx->at_version,
3833 ctx->obs->oi.version,
3834 0,
3835 osd_reqid_t(),
3836 ctx->mtime,
3837 0)
3838 );
3839 t->remove(coid);
3840 t->update_snaps(
3841 coid,
3842 old_snaps,
3843 new_snaps);
3844
3845 coi = object_info_t(coid);
3846
3847 ctx->at_version.version++;
3848 } else {
3849 // save adjusted snaps for this object
3850 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3851 if (legacy) {
3852 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3853 } else {
3854 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3855 new_snaps.rend());
3856 // we still do a 'modify' event on this object just to trigger a
3857 // snapmapper.update ... :(
3858 }
3859
3860 coi.prior_version = coi.version;
3861 coi.version = ctx->at_version;
3862 bl.clear();
3863 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3864 t->setattr(coid, OI_ATTR, bl);
3865
3866 ctx->log.push_back(
3867 pg_log_entry_t(
3868 pg_log_entry_t::MODIFY,
3869 coid,
3870 coi.version,
3871 coi.prior_version,
3872 0,
3873 osd_reqid_t(),
3874 ctx->mtime,
3875 0)
3876 );
3877 ctx->at_version.version++;
3878
3879 t->update_snaps(
3880 coid,
3881 old_snaps,
3882 new_snaps);
3883 }
3884
3885 // save head snapset
3886 dout(10) << coid << " new snapset " << snapset << " on "
3887 << snapset_obc->obs.oi << dendl;
3888 if (snapset.clones.empty() &&
3889 (!snapset.head_exists ||
3890 (snapset_obc->obs.oi.is_whiteout() &&
3891 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3892 !snapset_obc->obs.oi.is_cache_pinned()))) {
3893 // NOTE: this arguably constitutes minor interference with the
3894 // tiering agent if this is a cache tier since a snap trim event
3895 // is effectively evicting a whiteout we might otherwise want to
3896 // keep around.
3897 dout(10) << coid << " removing " << snapoid << dendl;
3898 ctx->log.push_back(
3899 pg_log_entry_t(
3900 pg_log_entry_t::DELETE,
3901 snapoid,
3902 ctx->at_version,
3903 ctx->snapset_obc->obs.oi.version,
3904 0,
3905 osd_reqid_t(),
3906 ctx->mtime,
3907 0)
3908 );
3909 if (snapoid.is_head()) {
3910 derr << "removing snap head" << dendl;
3911 object_info_t& oi = ctx->snapset_obc->obs.oi;
3912 ctx->delta_stats.num_objects--;
3913 if (oi.is_dirty()) {
3914 ctx->delta_stats.num_objects_dirty--;
3915 }
3916 if (oi.is_omap())
3917 ctx->delta_stats.num_objects_omap--;
3918 if (oi.is_whiteout()) {
3919 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3920 ctx->delta_stats.num_whiteouts--;
3921 }
3922 if (oi.is_cache_pinned()) {
3923 ctx->delta_stats.num_objects_pinned--;
3924 }
3925 }
3926 ctx->snapset_obc->obs.exists = false;
3927 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3928 t->remove(snapoid);
3929 } else {
3930 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3931 snapset.filter(pool.info);
3932 dout(10) << coid << " writing updated snapset on " << snapoid
3933 << ", snapset is " << snapset << dendl;
3934 ctx->log.push_back(
3935 pg_log_entry_t(
3936 pg_log_entry_t::MODIFY,
3937 snapoid,
3938 ctx->at_version,
3939 ctx->snapset_obc->obs.oi.version,
3940 0,
3941 osd_reqid_t(),
3942 ctx->mtime,
3943 0)
3944 );
3945
3946 ctx->snapset_obc->obs.oi.prior_version =
3947 ctx->snapset_obc->obs.oi.version;
3948 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3949
3950 map <string, bufferlist> attrs;
3951 bl.clear();
3952 ::encode(snapset, bl);
3953 attrs[SS_ATTR].claim(bl);
3954
3955 bl.clear();
3956 ::encode(ctx->snapset_obc->obs.oi, bl,
3957 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3958 attrs[OI_ATTR].claim(bl);
3959 t->setattrs(snapoid, attrs);
3960 }
3961
3962 *ctxp = std::move(ctx);
3963 return 0;
3964 }
3965
3966 void PrimaryLogPG::kick_snap_trim()
3967 {
3968 assert(is_active());
3969 assert(is_primary());
3970 if (is_clean() && !snap_trimq.empty()) {
3971 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3972 snap_trimmer_machine.process_event(KickTrim());
3973 }
3974 }
3975
3976 void PrimaryLogPG::snap_trimmer_scrub_complete()
3977 {
3978 if (is_primary() && is_active() && is_clean()) {
3979 assert(!snap_trimq.empty());
3980 snap_trimmer_machine.process_event(ScrubComplete());
3981 }
3982 }
3983
3984 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3985 {
3986 if (deleting || pg_has_reset_since(queued)) {
3987 return;
3988 }
3989
3990 assert(is_primary());
3991
3992 dout(10) << "snap_trimmer posting" << dendl;
3993 snap_trimmer_machine.process_event(DoSnapWork());
3994 dout(10) << "snap_trimmer complete" << dendl;
3995 return;
3996 }
3997
3998 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3999 {
4000 __u64 v2;
4001
4002 string v2s(xattr.c_str(), xattr.length());
4003 if (v2s.length())
4004 v2 = strtoull(v2s.c_str(), NULL, 10);
4005 else
4006 v2 = 0;
4007
4008 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4009
4010 switch (op) {
4011 case CEPH_OSD_CMPXATTR_OP_EQ:
4012 return (v1 == v2);
4013 case CEPH_OSD_CMPXATTR_OP_NE:
4014 return (v1 != v2);
4015 case CEPH_OSD_CMPXATTR_OP_GT:
4016 return (v1 > v2);
4017 case CEPH_OSD_CMPXATTR_OP_GTE:
4018 return (v1 >= v2);
4019 case CEPH_OSD_CMPXATTR_OP_LT:
4020 return (v1 < v2);
4021 case CEPH_OSD_CMPXATTR_OP_LTE:
4022 return (v1 <= v2);
4023 default:
4024 return -EINVAL;
4025 }
4026 }
4027
4028 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4029 {
4030 string v2s(xattr.c_str(), xattr.length());
4031
4032 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4033
4034 switch (op) {
4035 case CEPH_OSD_CMPXATTR_OP_EQ:
4036 return (v1s.compare(v2s) == 0);
4037 case CEPH_OSD_CMPXATTR_OP_NE:
4038 return (v1s.compare(v2s) != 0);
4039 case CEPH_OSD_CMPXATTR_OP_GT:
4040 return (v1s.compare(v2s) > 0);
4041 case CEPH_OSD_CMPXATTR_OP_GTE:
4042 return (v1s.compare(v2s) >= 0);
4043 case CEPH_OSD_CMPXATTR_OP_LT:
4044 return (v1s.compare(v2s) < 0);
4045 case CEPH_OSD_CMPXATTR_OP_LTE:
4046 return (v1s.compare(v2s) <= 0);
4047 default:
4048 return -EINVAL;
4049 }
4050 }
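// The two comparators above implement CEPH_OSD_OP_CMPXATTR: they return
// 1 when the comparison holds, 0 when it does not (the dispatcher maps
// 0 to -ECANCELED), or -EINVAL for an unknown mode. A client-side
// sketch, assuming the librados C++ API (oid/new_val are illustrative):
//
//   librados::ObjectWriteOperation wrop;
//   bufferlist expected;
//   expected.append("v1");
//   // apply the setxattr only if xattr "version" currently equals "v1"
//   wrop.cmpxattr("version", LIBRADOS_CMPXATTR_OP_EQ, expected);
//   wrop.setxattr("version", new_val);
//   int r = ioctx.operate(oid, &wrop);  // -ECANCELED if compare failed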
4051
4052 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4053 {
4054 ceph_osd_op& op = osd_op.op;
4055 vector<OSDOp> write_ops(1);
4056 OSDOp& write_op = write_ops[0];
4057 uint64_t write_length = op.writesame.length;
4058 int result = 0;
4059
4060 if (!write_length)
4061 return 0;
4062
4063 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4064 return -EINVAL;
4065
4066 if (op.writesame.data_length != osd_op.indata.length()) {
4067 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4068 return -EINVAL;
4069 }
4070
4071 while (write_length) {
4072 write_op.indata.append(osd_op.indata);
4073 write_length -= op.writesame.data_length;
4074 }
4075
4076 write_op.op.op = CEPH_OSD_OP_WRITE;
4077 write_op.op.extent.offset = op.writesame.offset;
4078 write_op.op.extent.length = op.writesame.length;
4079 result = do_osd_ops(ctx, write_ops);
4080 if (result < 0)
4081 derr << "do_writesame do_osd_ops failed " << result << dendl;
4082
4083 return result;
4084 }
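// Worked example for the WRITESAME expansion above: offset=0,
// length=4096, data_length=512 with 512 bytes of indata becomes one
// CEPH_OSD_OP_WRITE of 4096 bytes whose payload is the 512-byte pattern
// appended 8 times. length must be a non-zero multiple of data_length,
// and data_length must equal the indata size, else -EINVAL.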
4085
4086 // ========================================================================
4087 // low level osd ops
4088
4089 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4090 {
4091 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4092 bufferlist header, vals;
4093 int r = _get_tmap(ctx, &header, &vals);
4094 if (r < 0) {
4095 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4096 r = 0;
4097 return r;
4098 }
4099
4100 vector<OSDOp> ops(3);
4101
4102 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4103 ops[0].op.extent.offset = 0;
4104 ops[0].op.extent.length = 0;
4105
4106 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4107 ops[1].indata.claim(header);
4108
4109 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4110 ops[2].indata.claim(vals);
4111
4112 return do_osd_ops(ctx, ops);
4113 }
4114
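// TMAP objects keep their entire content in the object data, encoded as
//   bufferlist header; map<string, bufferlist> keys;  // sorted by key
// and a TMAPUP request carries a stream of updates, each encoded as
//   __u8 op; string key; [bufferlist value]
// where op is one of the CEPH_OSD_TMAP_* codes. The slow path below
// decodes the whole map, applies the updates, and rewrites the object
// with WRITEFULL.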
4115 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4116 bufferlist& bl)
4117 {
4118 // decode
4119 bufferlist header;
4120 map<string, bufferlist> m;
4121 if (bl.length()) {
4122 bufferlist::iterator p = bl.begin();
4123 ::decode(header, p);
4124 ::decode(m, p);
4125 assert(p.end());
4126 }
4127
4128 // do the update(s)
4129 while (!bp.end()) {
4130 __u8 op;
4131 string key;
4132 ::decode(op, bp);
4133
4134 switch (op) {
4135 case CEPH_OSD_TMAP_SET: // insert key
4136 {
4137 ::decode(key, bp);
4138 bufferlist data;
4139 ::decode(data, bp);
4140 m[key] = data;
4141 }
4142 break;
4143 case CEPH_OSD_TMAP_RM: // remove key
4144 ::decode(key, bp);
4145 if (!m.count(key)) {
4146 return -ENOENT;
4147 }
4148 m.erase(key);
4149 break;
4150 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4151 ::decode(key, bp);
4152 m.erase(key);
4153 break;
4154 case CEPH_OSD_TMAP_HDR: // update header
4155 {
4156 ::decode(header, bp);
4157 }
4158 break;
4159 default:
4160 return -EINVAL;
4161 }
4162 }
4163
4164 // reencode
4165 bufferlist obl;
4166 ::encode(header, obl);
4167 ::encode(m, obl);
4168
4169 // write it out
4170 vector<OSDOp> nops(1);
4171 OSDOp& newop = nops[0];
4172 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4173 newop.op.extent.offset = 0;
4174 newop.op.extent.length = obl.length();
4175 newop.indata = obl;
4176 do_osd_ops(ctx, nops);
4177 osd_op.outdata.claim(newop.outdata);
4178 return 0;
4179 }
4180
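// Fast path for TMAPUP: a single merge pass over the existing (sorted)
// key data and the (expected-sorted) update stream, copying untouched
// keys, applying SET/CREATE/RM/RMSLOPPY in order, and tracking nkeys.
// If the updates turn out not to be sorted, rewind and fall back to
// do_tmapup_slow() above.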
4181 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4182 {
4183 bufferlist::iterator orig_bp = bp;
4184 int result = 0;
4185 if (bp.end()) {
4186 dout(10) << "tmapup is a no-op" << dendl;
4187 } else {
4188 // read the whole object
4189 vector<OSDOp> nops(1);
4190 OSDOp& newop = nops[0];
4191 newop.op.op = CEPH_OSD_OP_READ;
4192 newop.op.extent.offset = 0;
4193 newop.op.extent.length = 0;
4194 result = do_osd_ops(ctx, nops);
4195
4196 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4197
4198 dout(30) << " starting is \n";
4199 newop.outdata.hexdump(*_dout);
4200 *_dout << dendl;
4201
4202 bufferlist::iterator ip = newop.outdata.begin();
4203 bufferlist obl;
4204
4205 dout(30) << "the update command is: \n";
4206 osd_op.indata.hexdump(*_dout);
4207 *_dout << dendl;
4208
4209 // header
4210 bufferlist header;
4211 __u32 nkeys = 0;
4212 if (newop.outdata.length()) {
4213 ::decode(header, ip);
4214 ::decode(nkeys, ip);
4215 }
4216 dout(10) << "tmapup header " << header.length() << dendl;
4217
4218 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4219 ++bp;
4220 ::decode(header, bp);
4221 dout(10) << "tmapup new header " << header.length() << dendl;
4222 }
4223
4224 ::encode(header, obl);
4225
4226 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4227
4228 // update keys
4229 bufferlist newkeydata;
4230 string nextkey, last_in_key;
4231 bufferlist nextval;
4232 bool have_next = false;
4233 if (!ip.end()) {
4234 have_next = true;
4235 ::decode(nextkey, ip);
4236 ::decode(nextval, ip);
4237 }
4238 while (!bp.end() && !result) {
4239 __u8 op;
4240 string key;
4241 try {
4242 ::decode(op, bp);
4243 ::decode(key, bp);
4244 }
4245 catch (buffer::error& e) {
4246 return -EINVAL;
4247 }
4248 if (key < last_in_key) {
4249 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4250 << "', falling back to an inefficient (unsorted) update" << dendl;
4251 bp = orig_bp;
4252 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4253 }
4254 last_in_key = key;
4255
4256 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4257
4258 // skip existing intervening keys
4259 bool key_exists = false;
4260 while (have_next && !key_exists) {
4261 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4262 if (nextkey > key)
4263 break;
4264 if (nextkey < key) {
4265 // copy untouched.
4266 ::encode(nextkey, newkeydata);
4267 ::encode(nextval, newkeydata);
4268 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4269 } else {
4270 // don't copy; discard old value. and stop.
4271 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4272 key_exists = true;
4273 nkeys--;
4274 }
4275 if (!ip.end()) {
4276 ::decode(nextkey, ip);
4277 ::decode(nextval, ip);
4278 } else {
4279 have_next = false;
4280 }
4281 }
4282
4283 if (op == CEPH_OSD_TMAP_SET) {
4284 bufferlist val;
4285 try {
4286 ::decode(val, bp);
4287 }
4288 catch (buffer::error& e) {
4289 return -EINVAL;
4290 }
4291 ::encode(key, newkeydata);
4292 ::encode(val, newkeydata);
4293 dout(20) << " set " << key << " " << val.length() << dendl;
4294 nkeys++;
4295 } else if (op == CEPH_OSD_TMAP_CREATE) {
4296 if (key_exists) {
4297 return -EEXIST;
4298 }
4299 bufferlist val;
4300 try {
4301 ::decode(val, bp);
4302 }
4303 catch (buffer::error& e) {
4304 return -EINVAL;
4305 }
4306 ::encode(key, newkeydata);
4307 ::encode(val, newkeydata);
4308 dout(20) << " create " << key << " " << val.length() << dendl;
4309 nkeys++;
4310 } else if (op == CEPH_OSD_TMAP_RM) {
4311 // do nothing.
4312 if (!key_exists) {
4313 return -ENOENT;
4314 }
4315 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4316 // do nothing
4317 } else {
4318 dout(10) << " invalid tmap op " << (int)op << dendl;
4319 return -EINVAL;
4320 }
4321 }
4322
4323 // copy remaining
4324 if (have_next) {
4325 ::encode(nextkey, newkeydata);
4326 ::encode(nextval, newkeydata);
4327 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4328 }
4329 if (!ip.end()) {
4330 bufferlist rest;
4331 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4332 dout(20) << " keep trailing " << rest.length()
4333 << " at " << newkeydata.length() << dendl;
4334 newkeydata.claim_append(rest);
4335 }
4336
4337 // encode final key count + key data
4338 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4339 ::encode(nkeys, obl);
4340 obl.claim_append(newkeydata);
4341
4342 if (0) {
4343 dout(30) << " final is \n";
4344 obl.hexdump(*_dout);
4345 *_dout << dendl;
4346
4347 // sanity check
4348 bufferlist::iterator tp = obl.begin();
4349 bufferlist h;
4350 ::decode(h, tp);
4351 map<string,bufferlist> d;
4352 ::decode(d, tp);
4353 assert(tp.end());
4354 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4355 }
4356
4357 // write it out
4358 if (!result) {
4359 dout(20) << "tmapput write " << obl.length() << dendl;
4360 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4361 newop.op.extent.offset = 0;
4362 newop.op.extent.length = obl.length();
4363 newop.indata = obl;
4364 do_osd_ops(ctx, nops);
4365 osd_op.outdata.claim(newop.outdata);
4366 }
4367 }
4368 return result;
4369 }
4370
4371 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4372 {
4373 if (offset >= max ||
4374 length > max ||
4375 offset + length > max)
4376 return -EFBIG;
4377
4378 return 0;
4379 }
4380
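// FillInVerifyExtent completes an async read: it records the returned
// length, and when the whole object was read and a full-object digest
// is known it recomputes crc32c over the data, turning a mismatch into
// -EIO unless CEPH_OSD_OP_FLAG_FAILOK is set.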
4381 struct FillInVerifyExtent : public Context {
4382 ceph_le64 *r;
4383 int32_t *rval;
4384 bufferlist *outdatap;
4385 boost::optional<uint32_t> maybe_crc;
4386 uint64_t size;
4387 OSDService *osd;
4388 hobject_t soid;
4389 __le32 flags;
4390 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4391 boost::optional<uint32_t> mc, uint64_t size,
4392 OSDService *osd, hobject_t soid, __le32 flags) :
4393 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4394 size(size), osd(osd), soid(soid), flags(flags) {}
4395 void finish(int len) override {
4396 *r = len;
4397 if (len < 0) {
4398 *rval = len;
4399 return;
4400 }
4401 *rval = 0;
4402
4403 // whole object? can we verify the checksum?
4404 if (maybe_crc && *r == size) {
4405 uint32_t crc = outdatap->crc32c(-1);
4406 if (maybe_crc != crc) {
4407 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4408 << " != expected 0x" << *maybe_crc
4409 << std::dec << " on " << soid;
4410 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4411 *rval = -EIO;
4412 *r = 0;
4413 }
4414 }
4415 }
4416 }
4417 };
4418
4419 struct ToSparseReadResult : public Context {
4420 int* result;
4421 bufferlist* data_bl;
4422 uint64_t data_offset;
4423 ceph_le64* len;
4424 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4425 ceph_le64* len)
4426 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4427 void finish(int r) override {
4428 if (r < 0) {
4429 *result = r;
4430 return;
4431 }
4432 *result = 0;
4433 *len = r;
4434 bufferlist outdata;
4435 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4436 ::encode(extents, outdata);
4437 ::encode_destructively(*data_bl, outdata);
4438 data_bl->swap(outdata);
4439 }
4440 };
4441
4442 template<typename V>
4443 static string list_keys(const map<string, V>& m) {
4444 string s;
4445 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4446 if (!s.empty()) {
4447 s.push_back(',');
4448 }
4449 s.append(itr->first);
4450 }
4451 return s;
4452 }
4453
4454 template<typename T>
4455 static string list_entries(const T& m) {
4456 string s;
4457 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4458 if (!s.empty()) {
4459 s.push_back(',');
4460 }
4461 s.append(*itr);
4462 }
4463 return s;
4464 }
4465
4466 void PrimaryLogPG::maybe_create_new_object(
4467 OpContext *ctx,
4468 bool ignore_transaction)
4469 {
4470 ObjectState& obs = ctx->new_obs;
4471 if (!obs.exists) {
4472 ctx->delta_stats.num_objects++;
4473 obs.exists = true;
4474 assert(!obs.oi.is_whiteout());
4475 obs.oi.new_object();
4476 if (!ignore_transaction)
4477 ctx->op_t->create(obs.oi.soid);
4478 } else if (obs.oi.is_whiteout()) {
4479 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4480 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4481 --ctx->delta_stats.num_whiteouts;
4482 }
4483 }
4484
4485 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4486 OSDOp& osd_op;
4487
4488 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4489 }
4490
4491 int execute() override {
4492 return osd_op.rval;
4493 }
4494 };
4495
4496 struct C_ChecksumRead : public Context {
4497 PrimaryLogPG *primary_log_pg;
4498 OSDOp &osd_op;
4499 Checksummer::CSumType csum_type;
4500 bufferlist init_value_bl;
4501 ceph_le64 read_length;
4502 bufferlist read_bl;
4503 Context *fill_extent_ctx;
4504
4505 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4506 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4507 boost::optional<uint32_t> maybe_crc, uint64_t size,
4508 OSDService *osd, hobject_t soid, __le32 flags)
4509 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4510 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4511 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4512 &read_bl, maybe_crc, size,
4513 osd, soid, flags)) {
4514 }
4515 ~C_ChecksumRead() override {
4516 delete fill_extent_ctx;
4517 }
4518
4519 void finish(int r) override {
4520 fill_extent_ctx->complete(r);
4521 fill_extent_ctx = nullptr;
4522
4523 if (osd_op.rval >= 0) {
4524 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4525 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4526 &init_value_bl_it, read_bl);
4527 }
4528 }
4529 };
4530
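// CEPH_OSD_OP_CHECKSUM: the request carries a checksum type
// (crc32c/xxhash32/xxhash64), an extent (offset/length, with 0+0
// meaning the whole object), an optional chunk_size, and a type-sized
// init value in the op's indata. The reply encodes a __u32 chunk count
// followed by the per-chunk checksum values (see finish_checksum()
// below). A client-side sketch, assuming the librados C++ API:
//
//   librados::ObjectReadOperation rdop;
//   bufferlist init, out;
//   int rval;
//   ::encode((uint32_t)-1, init);  // crc32c seed
//   rdop.checksum(LIBRADOS_CHECKSUM_TYPE_CRC32C, init,
//                 0 /*off*/, 0 /*len: whole object*/, 0 /*chunk_size*/,
//                 &out, &rval);
//   ioctx.operate(oid, &rdop, nullptr);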
4531 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4532 bufferlist::iterator *bl_it)
4533 {
4534 dout(20) << __func__ << dendl;
4535
4536 auto& op = osd_op.op;
4537 if (op.checksum.chunk_size > 0) {
4538 if (op.checksum.length == 0) {
4539 dout(10) << __func__ << ": length required when chunk size provided"
4540 << dendl;
4541 return -EINVAL;
4542 }
4543 if (op.checksum.length % op.checksum.chunk_size != 0) {
4544 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4545 return -EINVAL;
4546 }
4547 }
4548
4549 auto& oi = ctx->new_obs.oi;
4550 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4551 // zeroed offset+length implies checksum whole object
4552 op.checksum.length = oi.size;
4553 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4554 return -EOVERFLOW;
4555 }
4556
4557 Checksummer::CSumType csum_type;
4558 switch (op.checksum.type) {
4559 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4560 csum_type = Checksummer::CSUM_XXHASH32;
4561 break;
4562 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4563 csum_type = Checksummer::CSUM_XXHASH64;
4564 break;
4565 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4566 csum_type = Checksummer::CSUM_CRC32C;
4567 break;
4568 default:
4569 dout(10) << __func__ << ": unknown crc type ("
4570 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4571 return -EINVAL;
4572 }
4573
4574 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4575 if (bl_it->get_remaining() < csum_init_value_size) {
4576 dout(10) << __func__ << ": init value not provided" << dendl;
4577 return -EINVAL;
4578 }
4579
4580 bufferlist init_value_bl;
4581 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4582 csum_init_value_size);
4583 bl_it->advance(csum_init_value_size);
4584
4585 if (pool.info.require_rollback() && op.checksum.length > 0) {
4586 // If there is a data digest and it is possible we are reading
4587 // entire object, pass the digest.
4588 boost::optional<uint32_t> maybe_crc;
4589 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4590 op.checksum.length >= oi.size) {
4591 maybe_crc = oi.data_digest;
4592 }
4593
4594 // async read
4595 auto& soid = oi.soid;
4596 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4597 std::move(init_value_bl), maybe_crc,
4598 oi.size, osd, soid, op.flags);
4599
4600 ctx->pending_async_reads.push_back({
4601 {op.checksum.offset, op.checksum.length, op.flags},
4602 {&checksum_ctx->read_bl, checksum_ctx}});
4603
4604 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4605 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4606 new ReadFinisher(osd_op));
4607 return -EINPROGRESS;
4608 }
4609
4610 // sync read
4611 std::vector<OSDOp> read_ops(1);
4612 auto& read_op = read_ops[0];
4613 if (op.checksum.length > 0) {
4614 read_op.op.op = CEPH_OSD_OP_READ;
4615 read_op.op.flags = op.flags;
4616 read_op.op.extent.offset = op.checksum.offset;
4617 read_op.op.extent.length = op.checksum.length;
4618 read_op.op.extent.truncate_size = 0;
4619 read_op.op.extent.truncate_seq = 0;
4620
4621 int r = do_osd_ops(ctx, read_ops);
4622 if (r < 0) {
4623 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4624 return r;
4625 }
4626 }
4627
4628 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4629 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4630 read_op.outdata);
4631 }
4632
4633 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4634 Checksummer::CSumType csum_type,
4635 bufferlist::iterator *init_value_bl_it,
4636 const bufferlist &read_bl) {
4637 dout(20) << __func__ << dendl;
4638
4639 auto& op = osd_op.op;
4640
4641 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4642 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4643 << op.checksum.length << dendl;
4644 return -EINVAL;
4645 }
4646
4647 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4648 op.checksum.chunk_size : read_bl.length());
4649 uint32_t csum_count = (csum_chunk_size > 0 ?
4650 read_bl.length() / csum_chunk_size : 0);
4651
4652 bufferlist csum;
4653 bufferptr csum_data;
4654 if (csum_count > 0) {
4655 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4656 csum_data = buffer::create(csum_value_size * csum_count);
4657 csum_data.zero();
4658 csum.append(csum_data);
4659
4660 switch (csum_type) {
4661 case Checksummer::CSUM_XXHASH32:
4662 {
4663 Checksummer::xxhash32::init_value_t init_value;
4664 ::decode(init_value, *init_value_bl_it);
4665 Checksummer::calculate<Checksummer::xxhash32>(
4666 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4667 &csum_data);
4668 }
4669 break;
4670 case Checksummer::CSUM_XXHASH64:
4671 {
4672 Checksummer::xxhash64::init_value_t init_value;
4673 ::decode(init_value, *init_value_bl_it);
4674 Checksummer::calculate<Checksummer::xxhash64>(
4675 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4676 &csum_data);
4677 }
4678 break;
4679 case Checksummer::CSUM_CRC32C:
4680 {
4681 Checksummer::crc32c::init_value_t init_value;
4682 ::decode(init_value, *init_value_bl_it);
4683 Checksummer::calculate<Checksummer::crc32c>(
4684 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4685 &csum_data);
4686 }
4687 break;
4688 default:
4689 break;
4690 }
4691 }
4692
4693 ::encode(csum_count, osd_op.outdata);
4694 osd_op.outdata.claim_append(csum);
4695 return 0;
4696 }
4697
4698 struct C_ExtentCmpRead : public Context {
4699 PrimaryLogPG *primary_log_pg;
4700 OSDOp &osd_op;
4701 ceph_le64 read_length;
4702 bufferlist read_bl;
4703 Context *fill_extent_ctx;
4704
4705 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4706 boost::optional<uint32_t> maybe_crc, uint64_t size,
4707 OSDService *osd, hobject_t soid, __le32 flags)
4708 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4709 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4710 &read_bl, maybe_crc, size,
4711 osd, soid, flags)) {
4712 }
4713 ~C_ExtentCmpRead() override {
4714 delete fill_extent_ctx;
4715 }
4716
4717 void finish(int r) override {
4718 if (r == -ENOENT) {
4719 osd_op.rval = 0;
4720 read_bl.clear();
4721 delete fill_extent_ctx;
4722 } else {
4723 fill_extent_ctx->complete(r);
4724 }
4725 fill_extent_ctx = nullptr;
4726
4727 if (osd_op.rval >= 0) {
4728 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4729 }
4730 }
4731 };
4732
4733 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4734 {
4735 dout(20) << __func__ << dendl;
4736 ceph_osd_op& op = osd_op.op;
4737
4738 auto& oi = ctx->new_obs.oi;
4739 uint64_t size = oi.size;
4740 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4741 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4742 size = op.extent.truncate_size;
4743 }
4744
4745 if (op.extent.offset >= size) {
4746 op.extent.length = 0;
4747 } else if (op.extent.offset + op.extent.length > size) {
4748 op.extent.length = size - op.extent.offset;
4749 }
4750
4751 if (op.extent.length == 0) {
4752 dout(20) << __func__ << " zero length extent" << dendl;
4753 return finish_extent_cmp(osd_op, bufferlist{});
4754 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4755 dout(20) << __func__ << " object DNE" << dendl;
4756 return finish_extent_cmp(osd_op, {});
4757 } else if (pool.info.require_rollback()) {
4758 // If there is a data digest and it is possible we are reading
4759 // entire object, pass the digest.
4760 boost::optional<uint32_t> maybe_crc;
4761 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4762 op.checksum.length >= oi.size) {
4763 maybe_crc = oi.data_digest;
4764 }
4765
4766 // async read
4767 auto& soid = oi.soid;
4768 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4769 osd, soid, op.flags);
4770 ctx->pending_async_reads.push_back({
4771 {op.extent.offset, op.extent.length, op.flags},
4772 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4773
4774 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4775
4776 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4777 new ReadFinisher(osd_op));
4778 return -EINPROGRESS;
4779 }
4780
4781 // sync read
4782 vector<OSDOp> read_ops(1);
4783 OSDOp& read_op = read_ops[0];
4784
4785 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4786 read_op.op.extent.offset = op.extent.offset;
4787 read_op.op.extent.length = op.extent.length;
4788 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4789 read_op.op.extent.truncate_size = op.extent.truncate_size;
4790
4791 int result = do_osd_ops(ctx, read_ops);
4792 if (result < 0) {
4793 derr << __func__ << " failed " << result << dendl;
4794 return result;
4795 }
4796 return finish_extent_cmp(osd_op, read_op.outdata);
4797 }
4798
4799 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4800 {
4801 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4802 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4803 if (osd_op.indata[idx] != read_byte) {
4804 return (-MAX_ERRNO - idx);
4805 }
4806 }
4807
4808 return 0;
4809 }
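// CMPEXT result encoding: the first mismatching byte at offset idx
// fails the op with -MAX_ERRNO - idx, so a client can recover the
// mismatch offset as (-rval - MAX_ERRNO); e.g. a mismatch at byte 5
// yields rval == -MAX_ERRNO - 5. Bytes past the read data compare as 0.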
4810
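// do_read: clamp the requested extent against the (possibly truncated)
// object size. Worked example: with oi.size=8192 and oi.truncate_seq=1,
// a read of 0~8192 carrying truncate_seq=2/truncate_size=4096 is served
// as if the object were 4096 bytes, so op.extent.length is trimmed to
// 4096.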
4811 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4812 dout(20) << __func__ << dendl;
4813 auto& op = osd_op.op;
4814 auto& oi = ctx->new_obs.oi;
4815 auto& soid = oi.soid;
4816 __u32 seq = oi.truncate_seq;
4817 uint64_t size = oi.size;
4818 bool trimmed_read = false;
4819
4820 // are we beyond truncate_size?
4821 if ( (seq < op.extent.truncate_seq) &&
4822 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4823 size = op.extent.truncate_size;
4824
4825 if (op.extent.length == 0) // a length of zero means read the whole object
4826 op.extent.length = size;
4827
4828 if (op.extent.offset >= size) {
4829 op.extent.length = 0;
4830 trimmed_read = true;
4831 } else if (op.extent.offset + op.extent.length > size) {
4832 op.extent.length = size - op.extent.offset;
4833 trimmed_read = true;
4834 }
4835
4836 // read into a buffer
4837 int result = 0;
4838 if (trimmed_read && op.extent.length == 0) {
4839 // the read was trimmed to zero and is expected to do nothing;
4840 // a requested read of 0 bytes does *not* do nothing (it reads the
4841 // whole object), which is why the trimmed_read flag is needed
4842 } else if (pool.info.require_rollback()) {
4843 boost::optional<uint32_t> maybe_crc;
4844 // If there is a data digest and it is possible we are reading
4845 // the entire object, pass the digest. FillInVerifyExtent will
4846 // check oi.size again.
4847 if (oi.is_data_digest() && op.extent.offset == 0 &&
4848 op.extent.length >= oi.size)
4849 maybe_crc = oi.data_digest;
4850 ctx->pending_async_reads.push_back(
4851 make_pair(
4852 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4853 make_pair(&osd_op.outdata,
4854 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4855 &osd_op.outdata, maybe_crc, oi.size,
4856 osd, soid, op.flags))));
4857 dout(10) << " async_read noted for " << soid << dendl;
4858
4859 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4860 new ReadFinisher(osd_op));
4861 } else {
4862 int r = pgbackend->objects_read_sync(
4863 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4864 if (r == -EIO) {
4865 r = rep_repair_primary_object(soid, ctx->op);
4866 }
4867 if (r >= 0)
4868 op.extent.length = r;
4869 else {
4870 result = r;
4871 op.extent.length = 0;
4872 }
4873 dout(10) << " read got " << r << " / " << op.extent.length
4874 << " bytes from obj " << soid << dendl;
4875
4876 // whole object? can we verify the checksum?
4877 if (op.extent.length == oi.size && oi.is_data_digest()) {
4878 uint32_t crc = osd_op.outdata.crc32c(-1);
4879 if (oi.data_digest != crc) {
4880 osd->clog->error() << info.pgid << std::hex
4881 << " full-object read crc 0x" << crc
4882 << " != expected 0x" << oi.data_digest
4883 << std::dec << " on " << soid;
4884 // FIXME fall back to replica or something?
4885 result = -EIO;
4886 }
4887 }
4888 }
4889
4890 // XXX the op.extent.length is the requested length for async read
4891 // On error this length is changed to 0 after the error comes back.
4892 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4893 ctx->delta_stats.num_rd++;
4894 return result;
4895 }
4896
4897 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4898 dout(20) << __func__ << dendl;
4899 auto& op = osd_op.op;
4900 auto& oi = ctx->new_obs.oi;
4901 auto& soid = oi.soid;
4902
4903 if (op.extent.truncate_seq) {
4904 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4905 return -EINVAL;
4906 }
4907
4908 ++ctx->num_read;
4909 if (pool.info.ec_pool()) {
4910 // translate sparse read to a normal one if not supported
4911 uint64_t offset = op.extent.offset;
4912 uint64_t length = op.extent.length;
4913 if (offset > oi.size) {
4914 length = 0;
4915 } else if (offset + length > oi.size) {
4916 length = oi.size - offset;
4917 }
4918
4919 if (length > 0) {
4920 ctx->pending_async_reads.push_back(
4921 make_pair(
4922 boost::make_tuple(offset, length, op.flags),
4923 make_pair(
4924 &osd_op.outdata,
4925 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4926 &op.extent.length))));
4927 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4928
4929 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4930 new ReadFinisher(osd_op));
4931 } else {
4932 dout(10) << " sparse read ended up empty for " << soid << dendl;
4933 map<uint64_t, uint64_t> extents;
4934 ::encode(extents, osd_op.outdata);
4935 }
4936 } else {
4937 // read into a buffer
4938 map<uint64_t, uint64_t> m;
4939 uint32_t total_read = 0;
4940 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4941 info.pgid.shard),
4942 op.extent.offset, op.extent.length, m);
4943 if (r < 0) {
4944 return r;
4945 }
4946
4947 map<uint64_t, uint64_t>::iterator miter;
4948 bufferlist data_bl;
4949 uint64_t last = op.extent.offset;
4950 for (miter = m.begin(); miter != m.end(); ++miter) {
4951 // verify hole?
4952 if (cct->_conf->osd_verify_sparse_read_holes &&
4953 last < miter->first) {
4954 bufferlist t;
4955 uint64_t len = miter->first - last;
4956 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4957 if (r < 0) {
4958 osd->clog->error() << coll << " " << soid
4959 << " sparse-read failed to read: "
4960 << r;
4961 } else if (!t.is_zero()) {
4962 osd->clog->error() << coll << " " << soid
4963 << " sparse-read found data in hole "
4964 << last << "~" << len;
4965 }
4966 }
4967
4968 bufferlist tmpbl;
4969 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4970 op.flags, &tmpbl);
4971 if (r == -EIO) {
4972 r = rep_repair_primary_object(soid, ctx->op);
4973 }
4974 if (r < 0) {
4975 return r;
4976 }
4977
4978 // this usually happens when we get an extent that exceeds the
4979 // actual file size
4980 if (r < (int)miter->second)
4981 miter->second = r;
4982 total_read += r;
4983 dout(10) << "sparse-read " << miter->first << "@" << miter->second
4984 << dendl;
4985 data_bl.claim_append(tmpbl);
4986 last = miter->first + r;
4987 }
4988
4989 if (r < 0) {
4990 return r;
4991 }
4992
4993 // verify trailing hole?
4994 if (cct->_conf->osd_verify_sparse_read_holes) {
4995 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4996 if (last < end) {
4997 bufferlist t;
4998 uint64_t len = end - last;
4999 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5000 if (r < 0) {
5001 osd->clog->error() << coll << " " << soid
5002 << " sparse-read failed to read: " << r;
5003 } else if (!t.is_zero()) {
5004 osd->clog->error() << coll << " " << soid
5005 << " sparse-read found data in hole "
5006 << last << "~" << len;
5007 }
5008 }
5009 }
5010
5011 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read;
5012 // at first there may be few whole objects, but with continued use more
5013 // and more objects become fully written, so verifying the full-object
5014 // digest on sparse-read makes sense.
5015 if (total_read == oi.size && oi.is_data_digest()) {
5016 uint32_t crc = data_bl.crc32c(-1);
5017 if (oi.data_digest != crc) {
5018 osd->clog->error() << info.pgid << std::hex
5019 << " full-object read crc 0x" << crc
5020 << " != expected 0x" << oi.data_digest
5021 << std::dec << " on " << soid;
5022 // FIXME fall back to replica or something?
5023 return -EIO;
5024 }
5025 }
5026
5027 op.extent.length = total_read;
5028
5029 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5030 ::encode_destructively(data_bl, osd_op.outdata);
5031
5032 dout(10) << " sparse_read got " << total_read << " bytes from object "
5033 << soid << dendl;
5034 }
5035
5036 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5037 ctx->delta_stats.num_rd++;
5038 return 0;
5039 }
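// SPARSE_READ reply encoding: a map<uint64_t,uint64_t> of
// {offset -> length} extents followed by the concatenated extent data.
// On EC pools, which cannot fiemap, the op is translated into a single
// async read of the clamped extent and re-encoded by ToSparseReadResult.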
5040
5041 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5042 {
5043 int result = 0;
5044 SnapSetContext *ssc = ctx->obc->ssc;
5045 ObjectState& obs = ctx->new_obs;
5046 object_info_t& oi = obs.oi;
5047 const hobject_t& soid = oi.soid;
5048
5049 PGTransaction* t = ctx->op_t.get();
5050
5051 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5052
5053 ctx->current_osd_subop_num = 0;
5054 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5055 OSDOp& osd_op = *p;
5056 ceph_osd_op& op = osd_op.op;
5057
5058 OpFinisher* op_finisher = nullptr;
5059 {
5060 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5061 if (op_finisher_it != ctx->op_finishers.end()) {
5062 op_finisher = op_finisher_it->second.get();
5063 }
5064 }
5065
5066 // TODO: check endianness (__le32 vs uint32_t, etc.)
5067 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5068 // but the code in this function seems to treat them as native-endian. What should the
5069 // tracepoints do?
5070 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5071
5072 dout(10) << "do_osd_op " << osd_op << dendl;
5073
5074 bufferlist::iterator bp = osd_op.indata.begin();
5075
5076 // user-visible modification?
5077 switch (op.op) {
5078 // non user-visible modifications
5079 case CEPH_OSD_OP_WATCH:
5080 case CEPH_OSD_OP_CACHE_EVICT:
5081 case CEPH_OSD_OP_CACHE_FLUSH:
5082 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5083 case CEPH_OSD_OP_UNDIRTY:
5084 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5085 case CEPH_OSD_OP_CACHE_PIN:
5086 case CEPH_OSD_OP_CACHE_UNPIN:
5087 case CEPH_OSD_OP_SET_REDIRECT:
5088 break;
5089 default:
5090 if (op.op & CEPH_OSD_OP_MODE_WR)
5091 ctx->user_modify = true;
5092 }
5093
5094 // munge -1 truncate to 0 truncate
5095 if (ceph_osd_op_uses_extent(op.op) &&
5096 op.extent.truncate_seq == 1 &&
5097 op.extent.truncate_size == (-1ULL)) {
5098 op.extent.truncate_size = 0;
5099 op.extent.truncate_seq = 0;
5100 }
5101
5102 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
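// e.g. with oi.size=1000, ZERO 500~600 covers the tail (500+600 >= 1000)
// and becomes TRUNCATE at offset 500; a ZERO entirely past the end
// (offset >= oi.size) is a no-op.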
5103 if (op.op == CEPH_OSD_OP_ZERO &&
5104 obs.exists &&
5105 op.extent.offset < cct->_conf->osd_max_object_size &&
5106 op.extent.length >= 1 &&
5107 op.extent.length <= cct->_conf->osd_max_object_size &&
5108 op.extent.offset + op.extent.length >= oi.size) {
5109 if (op.extent.offset >= oi.size) {
5110 // no-op
5111 goto fail;
5112 }
5113 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5114 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5115 op.op = CEPH_OSD_OP_TRUNCATE;
5116 }
5117
5118 switch (op.op) {
5119
5120 // --- READS ---
5121
5122 case CEPH_OSD_OP_CMPEXT:
5123 ++ctx->num_read;
5124 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5125 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5126 op.extent.length, op.extent.truncate_size,
5127 op.extent.truncate_seq);
5128
5129 if (op_finisher == nullptr) {
5130 result = do_extent_cmp(ctx, osd_op);
5131 } else {
5132 result = op_finisher->execute();
5133 }
5134 break;
5135
5136 case CEPH_OSD_OP_SYNC_READ:
5137 if (pool.info.require_rollback()) {
5138 result = -EOPNOTSUPP;
5139 break;
5140 }
5141 // fall through
5142 case CEPH_OSD_OP_READ:
5143 ++ctx->num_read;
5144 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5145 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5146 op.extent.length, op.extent.truncate_size,
5147 op.extent.truncate_seq);
5148 if (op_finisher == nullptr) {
5149 if (!ctx->data_off) {
5150 ctx->data_off = op.extent.offset;
5151 }
5152 result = do_read(ctx, osd_op);
5153 } else {
5154 result = op_finisher->execute();
5155 }
5156 break;
5157
5158 case CEPH_OSD_OP_CHECKSUM:
5159 ++ctx->num_read;
5160 {
5161 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5162 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5163 op.checksum.offset, op.checksum.length,
5164 op.checksum.chunk_size);
5165
5166 if (op_finisher == nullptr) {
5167 result = do_checksum(ctx, osd_op, &bp);
5168 } else {
5169 result = op_finisher->execute();
5170 }
5171 }
5172 break;
5173
5174 /* map extents */
5175 case CEPH_OSD_OP_MAPEXT:
5176 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5177 if (pool.info.require_rollback()) {
5178 result = -EOPNOTSUPP;
5179 break;
5180 }
5181 ++ctx->num_read;
5182 {
5183 // read into a buffer
5184 bufferlist bl;
5185 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5186 info.pgid.shard),
5187 op.extent.offset, op.extent.length, bl);
5188 osd_op.outdata.claim(bl);
5189 if (r < 0)
5190 result = r;
5191 else
5192 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5193 ctx->delta_stats.num_rd++;
5194 dout(10) << " map_extents done on object " << soid << dendl;
5195 }
5196 break;
5197
5198 /* map extents */
5199 case CEPH_OSD_OP_SPARSE_READ:
5200 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5201 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5202 op.extent.length, op.extent.truncate_size,
5203 op.extent.truncate_seq);
5204 if (op_finisher == nullptr) {
5205 result = do_sparse_read(ctx, osd_op);
5206 } else {
5207 result = op_finisher->execute();
5208 }
5209 break;
5210
5211 case CEPH_OSD_OP_CALL:
5212 {
5213 string cname, mname;
5214 bufferlist indata;
5215 try {
5216 bp.copy(op.cls.class_len, cname);
5217 bp.copy(op.cls.method_len, mname);
5218 bp.copy(op.cls.indata_len, indata);
5219 } catch (buffer::error& e) {
5220 dout(10) << "call unable to decode class + method + indata" << dendl;
5221 dout(30) << "in dump: ";
5222 osd_op.indata.hexdump(*_dout);
5223 *_dout << dendl;
5224 result = -EINVAL;
5225 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5226 break;
5227 }
5228 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5229
5230 ClassHandler::ClassData *cls;
5231 result = osd->class_handler->open_class(cname, &cls);
5232 assert(result == 0); // init_op_flags() already verified this works.
5233
5234 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5235 if (!method) {
5236 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5237 result = -EOPNOTSUPP;
5238 break;
5239 }
5240
5241 int flags = method->get_flags();
5242 if (flags & CLS_METHOD_WR)
5243 ctx->user_modify = true;
5244
5245 bufferlist outdata;
5246 dout(10) << "call method " << cname << "." << mname << dendl;
5247 int prev_rd = ctx->num_read;
5248 int prev_wr = ctx->num_write;
5249 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5250
5251 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5252 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5253 result = -EIO;
5254 break;
5255 }
5256 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5257 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5258 result = -EIO;
5259 break;
5260 }
5261
5262 dout(10) << "method called response length=" << outdata.length() << dendl;
5263 op.extent.length = outdata.length();
5264 osd_op.outdata.claim_append(outdata);
5265 dout(30) << "out dump: ";
5266 osd_op.outdata.hexdump(*_dout);
5267 *_dout << dendl;
5268 }
5269 break;
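// Clients reach this path via librados exec; a sketch, assuming the
// C++ API (class/method names are hypothetical):
//
//   bufferlist in, out;
//   in.append(args);
//   int r = ioctx.exec(oid, "mycls", "mymethod", in, out);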
5270
5271 case CEPH_OSD_OP_STAT:
5272 // note: stat does not require RD
5273 {
5274 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5275
5276 if (obs.exists && !oi.is_whiteout()) {
5277 ::encode(oi.size, osd_op.outdata);
5278 ::encode(oi.mtime, osd_op.outdata);
5279 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5280 } else {
5281 result = -ENOENT;
5282 dout(10) << "stat oi object does not exist" << dendl;
5283 }
5284
5285 ctx->delta_stats.num_rd++;
5286 }
5287 break;
5288
5289 case CEPH_OSD_OP_ISDIRTY:
5290 ++ctx->num_read;
5291 {
5292 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5293 bool is_dirty = obs.oi.is_dirty();
5294 ::encode(is_dirty, osd_op.outdata);
5295 ctx->delta_stats.num_rd++;
5296 result = 0;
5297 }
5298 break;
5299
5300 case CEPH_OSD_OP_UNDIRTY:
5301 ++ctx->num_write;
5302 {
5303 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5304 if (oi.is_dirty()) {
5305 ctx->undirty = true; // see make_writeable()
5306 ctx->modify = true;
5307 ctx->delta_stats.num_wr++;
5308 }
5309 result = 0;
5310 }
5311 break;
5312
5313 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5314 ++ctx->num_write;
5315 {
5316 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5317 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5318 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5319 result = -EINVAL;
5320 break;
5321 }
5322 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5323 result = -EINVAL;
5324 break;
5325 }
5326 if (!obs.exists) {
5327 result = 0;
5328 break;
5329 }
5330 if (oi.is_cache_pinned()) {
5331 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5332 result = -EPERM;
5333 break;
5334 }
5335 if (oi.is_dirty()) {
5336 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5337 if (result == -EINPROGRESS)
5338 result = -EAGAIN;
5339 } else {
5340 result = 0;
5341 }
5342 }
5343 break;
5344
5345 case CEPH_OSD_OP_CACHE_FLUSH:
5346 ++ctx->num_write;
5347 {
5348 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5349 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5350 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5351 result = -EINVAL;
5352 break;
5353 }
5354 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5355 result = -EINVAL;
5356 break;
5357 }
5358 if (!obs.exists) {
5359 result = 0;
5360 break;
5361 }
5362 if (oi.is_cache_pinned()) {
5363 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5364 result = -EPERM;
5365 break;
5366 }
5367 hobject_t missing;
5368 if (oi.is_dirty()) {
5369 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5370 if (result == -EINPROGRESS)
5371 result = -EAGAIN;
5372 } else {
5373 result = 0;
5374 }
5375 // Check special return value which has set missing_return
5376 if (result == -ENOENT) {
5377 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5378 assert(!missing.is_min());
5379 wait_for_unreadable_object(missing, ctx->op);
5380 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5381 result = -EAGAIN;
5382 }
5383 }
5384 break;
5385
5386 case CEPH_OSD_OP_CACHE_EVICT:
5387 ++ctx->num_write;
5388 {
5389 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5390 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5391 result = -EINVAL;
5392 break;
5393 }
5394 if (!obs.exists) {
5395 result = 0;
5396 break;
5397 }
5398 if (oi.is_cache_pinned()) {
5399 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5400 result = -EPERM;
5401 break;
5402 }
5403 if (oi.is_dirty()) {
5404 result = -EBUSY;
5405 break;
5406 }
5407 if (!oi.watchers.empty()) {
5408 result = -EBUSY;
5409 break;
5410 }
5411 if (soid.snap == CEPH_NOSNAP) {
5412 result = _verify_no_head_clones(soid, ssc->snapset);
5413 if (result < 0)
5414 break;
5415 }
5416 result = _delete_oid(ctx, true, false);
5417 if (result >= 0) {
5418 // mark that this is a cache eviction to avoid triggering normal
5419 // make_writeable() clone or snapdir object creation in finish_ctx()
5420 ctx->cache_evict = true;
5421 }
5422 osd->logger->inc(l_osd_tier_evict);
5423 }
5424 break;
5425
5426 case CEPH_OSD_OP_GETXATTR:
5427 ++ctx->num_read;
5428 {
5429 string aname;
5430 bp.copy(op.xattr.name_len, aname);
5431 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5432 string name = "_" + aname;
5433 int r = getattr_maybe_cache(
5434 ctx->obc,
5435 name,
5436 &(osd_op.outdata));
5437 if (r >= 0) {
5438 op.xattr.value_len = osd_op.outdata.length();
5439 result = 0;
5440 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5441 } else
5442 result = r;
5443
5444 ctx->delta_stats.num_rd++;
5445 }
5446 break;
5447
5448 case CEPH_OSD_OP_GETXATTRS:
5449 ++ctx->num_read;
5450 {
5451 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5452 map<string, bufferlist> out;
5453 result = getattrs_maybe_cache(
5454 ctx->obc,
5455 &out);
5456
5457 bufferlist bl;
5458 ::encode(out, bl);
5459 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5460 ctx->delta_stats.num_rd++;
5461 osd_op.outdata.claim_append(bl);
5462 }
5463 break;
5464
5465 case CEPH_OSD_OP_CMPXATTR:
5466 ++ctx->num_read;
5467 {
5468 string aname;
5469 bp.copy(op.xattr.name_len, aname);
5470 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5471 string name = "_" + aname;
5472 name[op.xattr.name_len + 1] = 0;
5473
5474 bufferlist xattr;
5475 result = getattr_maybe_cache(
5476 ctx->obc,
5477 name,
5478 &xattr);
5479 if (result < 0 && result != -EEXIST && result != -ENODATA)
5480 break;
5481
5482 ctx->delta_stats.num_rd++;
5483 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5484
5485 switch (op.xattr.cmp_mode) {
5486 case CEPH_OSD_CMPXATTR_MODE_STRING:
5487 {
5488 string val;
5489 bp.copy(op.xattr.value_len, val);
5490 val[op.xattr.value_len] = 0;
5491 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5492 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5493 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5494 }
5495 break;
5496
5497 case CEPH_OSD_CMPXATTR_MODE_U64:
5498 {
5499 uint64_t u64val;
5500 try {
5501 ::decode(u64val, bp);
5502 }
5503 catch (buffer::error& e) {
5504 result = -EINVAL;
5505 goto fail;
5506 }
5507 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5508 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5509 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5510 }
5511 break;
5512
5513 default:
5514 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5515 result = -EINVAL;
5516 }
5517
5518 if (!result) {
5519 dout(10) << "comparison returned false" << dendl;
5520 result = -ECANCELED;
5521 break;
5522 }
5523 if (result < 0) {
5524 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5525 break;
5526 }
5527
5528 dout(10) << "comparison returned true" << dendl;
5529 }
5530 break;
5531
5532 case CEPH_OSD_OP_ASSERT_VER:
5533 ++ctx->num_read;
5534 {
5535 uint64_t ver = op.assert_ver.ver;
5536 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5537 if (!ver)
5538 result = -EINVAL;
5539 else if (ver < oi.user_version)
5540 result = -ERANGE;
5541 else if (ver > oi.user_version)
5542 result = -EOVERFLOW;
5543 }
5544 break;
5545
5546 case CEPH_OSD_OP_LIST_WATCHERS:
5547 ++ctx->num_read;
5548 {
5549 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5550 obj_list_watch_response_t resp;
5551
5552 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5553 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5554 ++oi_iter) {
5555 dout(20) << "key cookie=" << oi_iter->first.first
5556 << " entity=" << oi_iter->first.second << " "
5557 << oi_iter->second << dendl;
5558 assert(oi_iter->first.first == oi_iter->second.cookie);
5559 assert(oi_iter->first.second.is_client());
5560
5561 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5562 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5563 resp.entries.push_back(wi);
5564 }
5565
5566 resp.encode(osd_op.outdata, ctx->get_features());
5567 result = 0;
5568
5569 ctx->delta_stats.num_rd++;
5570 break;
5571 }
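    /* Editor's note: the matching client call, as a sketch (luminous
     * librados; io/oid assumed). list_watchers() returns the same entries
     * this case encodes from oi.watchers.
     *
     *   std::list<obj_watch_t> watchers;
     *   int rval = 0;
     *   librados::ObjectReadOperation rop;
     *   rop.list_watchers(&watchers, &rval);
     *   int r = io.operate(oid, &rop, nullptr);
     *   for (auto& w : watchers)
     *     std::cout << w.watcher_id << " cookie=" << w.cookie
     *               << " timeout=" << w.timeout_seconds << "s\n";
     */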
5572
5573 case CEPH_OSD_OP_LIST_SNAPS:
5574 ++ctx->num_read;
5575 {
5576 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5577 obj_list_snap_response_t resp;
5578
5579 if (!ssc) {
5580 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5581 }
5582 assert(ssc);
5583
5584 int clonecount = ssc->snapset.clones.size();
5585 if (ssc->snapset.head_exists)
5586 clonecount++;
5587 resp.clones.reserve(clonecount);
5588 for (auto clone_iter = ssc->snapset.clones.begin();
5589 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5590 clone_info ci;
5591 ci.cloneid = *clone_iter;
5592
5593 hobject_t clone_oid = soid;
5594 clone_oid.snap = *clone_iter;
5595
5596 if (!ssc->snapset.is_legacy()) {
5597 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5598 if (p == ssc->snapset.clone_snaps.end()) {
5599 osd->clog->error() << "osd." << osd->whoami
5600 << ": inconsistent clone_snaps found for oid "
5601 << soid << " clone " << *clone_iter
5602 << " snapset " << ssc->snapset;
5603 result = -EINVAL;
5604 break;
5605 }
5606 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5607 ci.snaps.push_back(*q);
5608 }
5609 } else {
5610 /* No need to take a lock here. We are only inspecting state cached
5611 * in the ObjectContext, so we aren't performing an actual read unless
5612 * the clone obc is not already loaded (in which case, it cannot have
5613 * an in progress write). We also do not risk exposing uncommitted
5614 * state since we do have a read lock on the head object or snapdir,
5615 * which we would have to write lock in order to make user visible
5616 * modifications to the snapshot state (snap trim related mutations
5617 * are not user visible).
5618 */
5619 if (is_missing_object(clone_oid)) {
5620 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5621 wait_for_unreadable_object(clone_oid, ctx->op);
5622 result = -EAGAIN;
5623 break;
5624 }
5625
5626 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5627 if (!clone_obc) {
5628 if (maybe_handle_cache(
5629 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5630 // promoting the clone
5631 result = -EAGAIN;
5632 } else {
5633 osd->clog->error() << "osd." << osd->whoami
5634 << ": missing clone " << clone_oid
5635 << " for oid "
5636 << soid;
5637 // should not happen
5638 result = -ENOENT;
5639 }
5640 break;
5641 }
5642 for (vector<snapid_t>::reverse_iterator p =
5643 clone_obc->obs.oi.legacy_snaps.rbegin();
5644 p != clone_obc->obs.oi.legacy_snaps.rend();
5645 ++p) {
5646 ci.snaps.push_back(*p);
5647 }
5648 }
5649
5650 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5651
5652 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5653 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5654 if (coi == ssc->snapset.clone_overlap.end()) {
5655 osd->clog->error() << "osd." << osd->whoami
5656 << ": inconsistent clone_overlap found for oid "
5657 << soid << " clone " << *clone_iter;
5658 result = -EINVAL;
5659 break;
5660 }
5661 const interval_set<uint64_t> &o = coi->second;
5662 ci.overlap.reserve(o.num_intervals());
5663 for (interval_set<uint64_t>::const_iterator r = o.begin();
5664 r != o.end(); ++r) {
5665 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5666 r.get_len()));
5667 }
5668
5669 map<snapid_t, uint64_t>::const_iterator si;
5670 si = ssc->snapset.clone_size.find(ci.cloneid);
5671 if (si == ssc->snapset.clone_size.end()) {
5672 osd->clog->error() << "osd." << osd->whoami
5673 << ": inconsistent clone_size found for oid "
5674 << soid << " clone " << *clone_iter;
5675 result = -EINVAL;
5676 break;
5677 }
5678 ci.size = si->second;
5679
5680 resp.clones.push_back(ci);
5681 }
5682 if (result < 0) {
5683 break;
5684 }
5685 if (ssc->snapset.head_exists &&
5686 !ctx->obc->obs.oi.is_whiteout()) {
5687 assert(obs.exists);
5688 clone_info ci;
5689 ci.cloneid = CEPH_NOSNAP;
5690
5691 //Size for HEAD is oi.size
5692 ci.size = oi.size;
5693
5694 resp.clones.push_back(ci);
5695 }
5696 resp.seq = ssc->snapset.seq;
5697
5698 resp.encode(osd_op.outdata);
5699 result = 0;
5700
5701 ctx->delta_stats.num_rd++;
5702 break;
5703 }
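    /* Editor's note: the client-facing view of this case, as a sketch
     * (luminous librados; io/oid assumed). snap_set_t mirrors the clone,
     * overlap, and size fields assembled above; like the rados CLI's
     * listsnaps, the read is issued against the snapshot directory.
     *
     *   io.snap_set_read(librados::SNAP_DIR);
     *   librados::snap_set_t ss;
     *   int rval = 0;
     *   librados::ObjectReadOperation rop;
     *   rop.list_snaps(&ss, &rval);
     *   int r = io.operate(oid, &rop, nullptr);
     *   for (auto& ci : ss.clones)
     *     std::cout << "clone " << ci.cloneid << " size " << ci.size << "\n";
     */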
5704
5705 case CEPH_OSD_OP_NOTIFY:
5706 ++ctx->num_read;
5707 {
5708 uint32_t timeout;
5709 bufferlist bl;
5710
5711 try {
5712 uint32_t ver; // obsolete
5713 ::decode(ver, bp);
5714 ::decode(timeout, bp);
5715 ::decode(bl, bp);
5716 } catch (const buffer::error &e) {
5717 timeout = 0;
5718 }
5719 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5720 if (!timeout)
5721 timeout = cct->_conf->osd_default_notify_timeout;
5722
5723 notify_info_t n;
5724 n.timeout = timeout;
5725 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5726 n.cookie = op.watch.cookie;
5727 n.bl = bl;
5728 ctx->notifies.push_back(n);
5729
5730 // return our unique notify id to the client
5731 ::encode(n.notify_id, osd_op.outdata);
5732 }
5733 break;
5734
5735 case CEPH_OSD_OP_NOTIFY_ACK:
5736 ++ctx->num_read;
5737 {
5738 try {
5739 uint64_t notify_id = 0;
5740 uint64_t watch_cookie = 0;
5741 ::decode(notify_id, bp);
5742 ::decode(watch_cookie, bp);
5743 bufferlist reply_bl;
5744 if (!bp.end()) {
5745 ::decode(reply_bl, bp);
5746 }
5747 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5748 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5749 ctx->notify_acks.push_back(ack);
5750 } catch (const buffer::error &e) {
5751 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5752 OpContext::NotifyAck ack(
5753 // op.watch.cookie is actually the notify_id for historical reasons
5754 op.watch.cookie
5755 );
5756 ctx->notify_acks.push_back(ack);
5757 }
5758 }
5759 break;
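    /* Editor's note: NOTIFY queues the broadcast and NOTIFY_ACK is what
     * linger clients send back; a hedged sketch with luminous librados
     * (io/oid assumed). notify2() blocks until every watcher acks or the
     * timeout fires.
     *
     *   librados::bufferlist payload, reply;
     *   payload.append("reload");
     *   // timeout in ms; 0 falls back to osd_default_notify_timeout
     *   int r = io.notify2(oid, payload, 10000, &reply);
     *   // reply encodes per-watcher ack payloads plus any timed-out watchers
     */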
5760
5761 case CEPH_OSD_OP_SETALLOCHINT:
5762 ++ctx->num_write;
5763 {
5764 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5765 maybe_create_new_object(ctx);
5766 oi.expected_object_size = op.alloc_hint.expected_object_size;
5767 oi.expected_write_size = op.alloc_hint.expected_write_size;
5768 oi.alloc_hint_flags = op.alloc_hint.flags;
5769 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5770 op.alloc_hint.expected_write_size,
5771 op.alloc_hint.flags);
5772 ctx->delta_stats.num_wr++;
5773 result = 0;
5774 }
5775 break;
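    /* Editor's note: the hint lands both in object_info_t and in the
     * backing ObjectStore, as seen above; a client sketch assuming the
     * luminous set_alloc_hint2() variant that also carries flags
     * (io/oid assumed).
     *
     *   librados::ObjectWriteOperation wop;
     *   wop.set_alloc_hint2(4 << 20,   // expected_object_size
     *                       4 << 20,   // expected_write_size
     *                       LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE);
     *   int r = io.operate(oid, &wop); // cheap, but still counts as one write
     */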
5776
5777
5778 // --- WRITES ---
5779
5780 // -- object data --
5781
5782 case CEPH_OSD_OP_WRITE:
5783 ++ctx->num_write;
5784 { // write
5785 __u32 seq = oi.truncate_seq;
5786 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5787 if (op.extent.length != osd_op.indata.length()) {
5788 result = -EINVAL;
5789 break;
5790 }
5791
5792 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5793 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5794
5795 if (pool.info.requires_aligned_append() &&
5796 (op.extent.offset % pool.info.required_alignment() != 0)) {
5797 result = -EOPNOTSUPP;
5798 break;
5799 }
5800
5801 if (!obs.exists) {
5802 if (pool.info.requires_aligned_append() && op.extent.offset) {
5803 result = -EOPNOTSUPP;
5804 break;
5805 }
5806 } else if (op.extent.offset != oi.size &&
5807 pool.info.requires_aligned_append()) {
5808 result = -EOPNOTSUPP;
5809 break;
5810 }
5811
5812 if (seq && (seq > op.extent.truncate_seq) &&
5813 (op.extent.offset + op.extent.length > oi.size)) {
5814 // old write, arrived after trimtrunc
5815 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5816 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5817 << ", adjusting write length to " << op.extent.length << dendl;
5818 bufferlist t;
5819 t.substr_of(osd_op.indata, 0, op.extent.length);
5820 osd_op.indata.swap(t);
5821 }
5822 if (op.extent.truncate_seq > seq) {
5823 // write arrives before trimtrunc
5824 if (obs.exists && !oi.is_whiteout()) {
5825 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5826 << ", truncating to " << op.extent.truncate_size << dendl;
5827 t->truncate(soid, op.extent.truncate_size);
5828 oi.truncate_seq = op.extent.truncate_seq;
5829 oi.truncate_size = op.extent.truncate_size;
5830 if (op.extent.truncate_size != oi.size) {
5831 ctx->delta_stats.num_bytes -= oi.size;
5832 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5833 oi.size = op.extent.truncate_size;
5834 }
5835 } else {
5836 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5837 << ", but object is new" << dendl;
5838 oi.truncate_seq = op.extent.truncate_seq;
5839 oi.truncate_size = op.extent.truncate_size;
5840 }
5841 }
5842 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5843 if (result < 0)
5844 break;
5845
5846 maybe_create_new_object(ctx);
5847
5848 if (op.extent.length == 0) {
5849 if (op.extent.offset > oi.size) {
5850 t->truncate(
5851 soid, op.extent.offset);
5852 } else {
5853 t->nop(soid);
5854 }
5855 } else {
5856 t->write(
5857 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5858 }
5859
5860 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5861 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5862 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5863 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5864 else
5865 obs.oi.clear_data_digest();
5866 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5867 op.extent.offset, op.extent.length);
5868
5869 }
5870 break;
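    /* Editor's note: the digest bookkeeping above follows three rules,
     * illustrated here with plain librados calls (io/oid/bl assumed):
     *
     *   io.write_full(oid, bl);            // full overwrite: digest recomputed
     *   io.append(oid, bl, bl.length());   // append at oi.size: digest extended
     *                                      // incrementally via crc32c chaining
     *   io.write(oid, bl, bl.length(), 1); // overlapping overwrite: the digest
     *                                      // can no longer be maintained, so it
     *                                      // is cleared until rewritten whole
     */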
5871
5872 case CEPH_OSD_OP_WRITEFULL:
5873 ++ctx->num_write;
5874 { // write full object
5875 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5876
5877 if (op.extent.length != osd_op.indata.length()) {
5878 result = -EINVAL;
5879 break;
5880 }
5881 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5882 if (result < 0)
5883 break;
5884
5885 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5886 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5887
5888 maybe_create_new_object(ctx);
5889 if (pool.info.require_rollback()) {
5890 t->truncate(soid, 0);
5891 } else if (obs.exists && op.extent.length < oi.size) {
5892 t->truncate(soid, op.extent.length);
5893 }
5894 if (op.extent.length) {
5895 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5896 }
5897 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5898
5899 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5900 0, op.extent.length, true);
5901 }
5902 break;
5903
5904 case CEPH_OSD_OP_WRITESAME:
5905 ++ctx->num_write;
5906 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5907 result = do_writesame(ctx, osd_op);
5908 break;
5909
5910 case CEPH_OSD_OP_ROLLBACK :
5911 ++ctx->num_write;
5912 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5913 result = _rollback_to(ctx, op);
5914 break;
5915
5916 case CEPH_OSD_OP_ZERO:
5917 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5918 if (pool.info.requires_aligned_append()) {
5919 result = -EOPNOTSUPP;
5920 break;
5921 }
5922 ++ctx->num_write;
5923 { // zero
5924 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5925 if (result < 0)
5926 break;
5927 assert(op.extent.length);
5928 if (obs.exists && !oi.is_whiteout()) {
5929 t->zero(soid, op.extent.offset, op.extent.length);
5930 interval_set<uint64_t> ch;
5931 ch.insert(op.extent.offset, op.extent.length);
5932 ctx->modified_ranges.union_of(ch);
5933 ctx->delta_stats.num_wr++;
5934 oi.clear_data_digest();
5935 } else {
5936 // no-op
5937 }
5938 }
5939 break;
5940 case CEPH_OSD_OP_CREATE:
5941 ++ctx->num_write;
5942 {
5943 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5944 int flags = le32_to_cpu(op.flags);
5945 if (obs.exists && !oi.is_whiteout() &&
5946 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5947 result = -EEXIST; /* this is an exclusive create */
5948 } else {
5949 if (osd_op.indata.length()) {
5950 bufferlist::iterator p = osd_op.indata.begin();
5951 string category;
5952 try {
5953 ::decode(category, p);
5954 }
5955 catch (buffer::error& e) {
5956 result = -EINVAL;
5957 goto fail;
5958 }
5959 // category is no longer implemented.
5960 }
5961 if (result >= 0) {
5962 maybe_create_new_object(ctx);
5963 t->nop(soid);
5964 }
5965 }
5966 }
5967 break;
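    /* Editor's note: the EXCL flag checked above is what
     * ObjectWriteOperation::create() maps to; a minimal sketch
     * (io/oid assumed):
     *
     *   librados::ObjectWriteOperation wop;
     *   wop.create(true);               // exclusive: fail if object exists
     *   int r = io.operate(oid, &wop);  // -EEXIST when it already exists
     */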
5968
5969 case CEPH_OSD_OP_TRIMTRUNC:
5970 op.extent.offset = op.extent.truncate_size;
5971 // falling through
5972
5973 case CEPH_OSD_OP_TRUNCATE:
5974 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5975 if (pool.info.requires_aligned_append()) {
5976 result = -EOPNOTSUPP;
5977 break;
5978 }
5979 ++ctx->num_write;
5980 {
5981 // truncate
5982 if (!obs.exists || oi.is_whiteout()) {
5983 dout(10) << " object dne, truncate is a no-op" << dendl;
5984 break;
5985 }
5986
5987 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5988 result = -EFBIG;
5989 break;
5990 }
5991
5992 if (op.extent.truncate_seq) {
5993 assert(op.extent.offset == op.extent.truncate_size);
5994 if (op.extent.truncate_seq <= oi.truncate_seq) {
5995 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5996 << ", no-op" << dendl;
5997 break; // old
5998 }
5999 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6000 << ", truncating" << dendl;
6001 oi.truncate_seq = op.extent.truncate_seq;
6002 oi.truncate_size = op.extent.truncate_size;
6003 }
6004
6005 maybe_create_new_object(ctx);
6006 t->truncate(soid, op.extent.offset);
6007 if (oi.size > op.extent.offset) {
6008 interval_set<uint64_t> trim;
6009 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6010 ctx->modified_ranges.union_of(trim);
6011 }
6012 if (op.extent.offset != oi.size) {
6013 ctx->delta_stats.num_bytes -= oi.size;
6014 ctx->delta_stats.num_bytes += op.extent.offset;
6015 oi.size = op.extent.offset;
6016 }
6017 ctx->delta_stats.num_wr++;
6018 // do not set exists, or we will break the DELETE -> TRUNCATE munging above.
6019
6020 oi.clear_data_digest();
6021 }
6022 break;
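    /* Editor's note: TRIMTRUNC is the truncate_seq-guarded form used
     * internally by CephFS clients; ordinary librados users reach the
     * TRUNCATE path via IoCtx::trunc() (io/oid assumed):
     *
     *   int r = io.trunc(oid, 1024);  // shrinks, or logically zero-fills
     *                                 // when growing, to 1 KiB
     */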
6023
6024 case CEPH_OSD_OP_DELETE:
6025 ++ctx->num_write;
6026 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6027 {
6028 result = _delete_oid(ctx, false, ctx->ignore_cache);
6029 }
6030 break;
6031
6032 case CEPH_OSD_OP_WATCH:
6033 ++ctx->num_write;
6034 {
6035 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6036 op.watch.cookie, op.watch.op);
6037 if (!obs.exists) {
6038 result = -ENOENT;
6039 break;
6040 }
6041 uint64_t cookie = op.watch.cookie;
6042 entity_name_t entity = ctx->reqid.name;
6043 ObjectContextRef obc = ctx->obc;
6044
6045 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6046 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6047 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6048 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6049 dout(10) << "watch: peer_addr="
6050 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6051
6052 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6053 if (op.watch.timeout != 0) {
6054 timeout = op.watch.timeout;
6055 }
6056
6057 watch_info_t w(cookie, timeout,
6058 ctx->op->get_req()->get_connection()->get_peer_addr());
6059 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6060 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6061 if (oi.watchers.count(make_pair(cookie, entity))) {
6062 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6063 } else {
6064 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6065 oi.watchers[make_pair(cookie, entity)] = w;
6066 t->nop(soid); // make sure we update the object_info on disk!
6067 }
6068 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6069 ctx->watch_connects.push_back(make_pair(w, will_ping));
6070 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6071 if (!oi.watchers.count(make_pair(cookie, entity))) {
6072 result = -ENOTCONN;
6073 break;
6074 }
6075 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6076 ctx->watch_connects.push_back(make_pair(w, true));
6077 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6078 /* Note: WATCH with PING doesn't cause may_write() to return true,
6079 * so if there is nothing else in the transaction, this is going
6080 * to run do_osd_op_effects, but not write out a log entry */
6081 if (!oi.watchers.count(make_pair(cookie, entity))) {
6082 result = -ENOTCONN;
6083 break;
6084 }
6085 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6086 obc->watchers.find(make_pair(cookie, entity));
6087 if (p == obc->watchers.end() ||
6088 !p->second->is_connected()) {
6089 // client needs to reconnect
6090 result = -ETIMEDOUT;
6091 break;
6092 }
6093 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6094 p->second->got_ping(ceph_clock_now());
6095 result = 0;
6096 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6097 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6098 oi.watchers.find(make_pair(cookie, entity));
6099 if (oi_iter != oi.watchers.end()) {
6100 dout(10) << " removed watch " << oi_iter->second << " by "
6101 << entity << dendl;
6102 oi.watchers.erase(oi_iter);
6103 t->nop(soid); // update oi on disk
6104 ctx->watch_disconnects.push_back(
6105 watch_disconnect_t(cookie, entity, false));
6106 } else {
6107 dout(10) << " can't remove: no watch by " << entity << dendl;
6108 }
6109 }
6110 }
6111 break;
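    /* Editor's note: the WATCH/RECONNECT/PING/UNWATCH subops above map onto
     * the watch2 client API; a hedged sketch assuming luminous librados
     * (io/oid assumed, and the Ctx wiring is left to the caller).
     *
     *   struct Ctx : librados::WatchCtx2 {
     *     librados::IoCtx *io; std::string oid;
     *     void handle_notify(uint64_t notify_id, uint64_t cookie,
     *                        uint64_t notifier_id,
     *                        librados::bufferlist& bl) override {
     *       librados::bufferlist ack;
     *       io->notify_ack(oid, notify_id, cookie, ack); // -> NOTIFY_ACK above
     *     }
     *     void handle_error(uint64_t cookie, int err) override {
     *       // -ENOTCONN / -ETIMEDOUT as above: re-establish with watch2()
     *     }
     *   };
     *
     *   uint64_t cookie;
     *   Ctx ctx;
     *   int r = io.watch2(oid, &cookie, &ctx); // librados pings for us
     *   // ... later:                          // (CEPH_OSD_WATCH_OP_PING)
     *   io.unwatch2(cookie);
     */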
6112
6113 case CEPH_OSD_OP_CACHE_PIN:
6114 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6115 if ((!pool.info.is_tier() ||
6116 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6117 result = -EINVAL;
6118 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6119 break;
6120 }
6121 ++ctx->num_write;
6122 {
6123 if (!obs.exists || oi.is_whiteout()) {
6124 result = -ENOENT;
6125 break;
6126 }
6127
6128 if (!oi.is_cache_pinned()) {
6129 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6130 ctx->modify = true;
6131 ctx->delta_stats.num_objects_pinned++;
6132 ctx->delta_stats.num_wr++;
6133 }
6134 result = 0;
6135 }
6136 break;
6137
6138 case CEPH_OSD_OP_CACHE_UNPIN:
6139 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6140 if ((!pool.info.is_tier() ||
6141 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6142 result = -EINVAL;
6143 dout(10) << " unpin object is only allowed on the cache tier " << dendl;
6144 break;
6145 }
6146 ++ctx->num_write;
6147 {
6148 if (!obs.exists || oi.is_whiteout()) {
6149 result = -ENOENT;
6150 break;
6151 }
6152
6153 if (oi.is_cache_pinned()) {
6154 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6155 ctx->modify = true;
6156 ctx->delta_stats.num_objects_pinned--;
6157 ctx->delta_stats.num_wr++;
6158 }
6159 result = 0;
6160 }
6161 break;
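    /* Editor's note: pinning is only meaningful against the cache pool
     * itself, as both checks above enforce; a hedged sketch (jewel-or-later
     * librados assumed; cache_io is an IoCtx on the cache tier pool,
     * hypothetical):
     *
     *   librados::ObjectWriteOperation wop;
     *   wop.cache_pin();                     // or wop.cache_unpin()
     *   int r = cache_io.operate(oid, &wop); // -EINVAL on a non-tier pool
     */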
6162
6163 case CEPH_OSD_OP_SET_REDIRECT:
6164 ++ctx->num_write;
6165 {
6166 if (pool.info.is_tier()) {
6167 result = -EINVAL;
6168 break;
6169 }
6170 if (!obs.exists) {
6171 result = -ENOENT;
6172 break;
6173 }
6174 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6175 result = -EOPNOTSUPP;
6176 break;
6177 }
6178
6179 object_t target_name;
6180 object_locator_t target_oloc;
6181 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6182 version_t target_version = op.copy_from.src_version;
6183 try {
6184 ::decode(target_name, bp);
6185 ::decode(target_oloc, bp);
6186 }
6187 catch (buffer::error& e) {
6188 result = -EINVAL;
6189 goto fail;
6190 }
6191 pg_t raw_pg;
6192 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6193 hobject_t target(target_name, target_oloc.key, target_snapid,
6194 raw_pg.ps(), raw_pg.pool(),
6195 target_oloc.nspace);
6196 if (target == soid) {
6197 dout(20) << " set-redirect self is invalid" << dendl;
6198 result = -EINVAL;
6199 break;
6200 }
6201 oi.set_flag(object_info_t::FLAG_MANIFEST);
6202 oi.manifest.redirect_target = target;
6203 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6204 t->truncate(soid, 0);
6205 if (oi.is_omap() && pool.info.supports_omap()) {
6206 t->omap_clear(soid);
6207 obs.oi.clear_omap_digest();
6208 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6209 }
6210 ctx->delta_stats.num_bytes -= oi.size;
6211 oi.size = 0;
6212 oi.new_object();
6213 oi.user_version = target_version;
6214 ctx->user_at_version = target_version;
6215 /* rm_attrs */
6216 map<string,bufferlist> rmattrs;
6217 result = getattrs_maybe_cache(ctx->obc,
6218 &rmattrs);
6219 if (result < 0) {
6220 return result;
6221 }
6222 map<string, bufferlist>::iterator iter;
6223 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6224 const string& name = iter->first;
6225 t->rmattr(soid, name);
6226 }
6227 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6228 }
6229
6230 break;
6231
6232 // -- object attrs --
6233
6234 case CEPH_OSD_OP_SETXATTR:
6235 ++ctx->num_write;
6236 {
6237 if (cct->_conf->osd_max_attr_size > 0 &&
6238 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6239 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6240 result = -EFBIG;
6241 break;
6242 }
6243 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6244 cct->_conf->osd_max_attr_name_len);
6245 if (op.xattr.name_len > max_name_len) {
6246 result = -ENAMETOOLONG;
6247 break;
6248 }
6249 maybe_create_new_object(ctx);
6250 string aname;
6251 bp.copy(op.xattr.name_len, aname);
6252 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6253 string name = "_" + aname;
6254 bufferlist bl;
6255 bp.copy(op.xattr.value_len, bl);
6256 t->setattr(soid, name, bl);
6257 ctx->delta_stats.num_wr++;
6258 }
6259 break;
6260
6261 case CEPH_OSD_OP_RMXATTR:
6262 ++ctx->num_write;
6263 {
6264 string aname;
6265 bp.copy(op.xattr.name_len, aname);
6266 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6267 if (!obs.exists || oi.is_whiteout()) {
6268 result = -ENOENT;
6269 break;
6270 }
6271 string name = "_" + aname;
6272 t->rmattr(soid, name);
6273 ctx->delta_stats.num_wr++;
6274 }
6275 break;
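    /* Editor's note: the "_" prefix above is how user xattrs are namespaced
     * inside the object store; clients never see it. A minimal sketch
     * (io/oid assumed):
     *
     *   librados::bufferlist v;
     *   v.append("v1");
     *   int r = io.setxattr(oid, "version", v); // stored as "_version"
     *   r = io.rmxattr(oid, "version");
     */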
6276
6277
6278 // -- fancy writers --
6279 case CEPH_OSD_OP_APPEND:
6280 {
6281 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6282 // just do it inline; this works because we are happy to execute
6283 // fancy ops on replicas as well.
6284 vector<OSDOp> nops(1);
6285 OSDOp& newop = nops[0];
6286 newop.op.op = CEPH_OSD_OP_WRITE;
6287 newop.op.extent.offset = oi.size;
6288 newop.op.extent.length = op.extent.length;
6289 newop.op.extent.truncate_seq = oi.truncate_seq;
6290 newop.indata = osd_op.indata;
6291 result = do_osd_ops(ctx, nops);
6292 osd_op.outdata.claim(newop.outdata);
6293 }
6294 break;
6295
6296 case CEPH_OSD_OP_STARTSYNC:
6297 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6298 t->nop(soid);
6299 break;
6300
6301
6302 // -- trivial map --
6303 case CEPH_OSD_OP_TMAPGET:
6304 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6305 if (pool.info.require_rollback()) {
6306 result = -EOPNOTSUPP;
6307 break;
6308 }
6309 {
6310 vector<OSDOp> nops(1);
6311 OSDOp& newop = nops[0];
6312 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6313 newop.op.extent.offset = 0;
6314 newop.op.extent.length = 0;
6315 do_osd_ops(ctx, nops);
6316 osd_op.outdata.claim(newop.outdata);
6317 }
6318 break;
6319
6320 case CEPH_OSD_OP_TMAPPUT:
6321 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6322 if (pool.info.require_rollback()) {
6323 result = -EOPNOTSUPP;
6324 break;
6325 }
6326 {
6327 //_dout_lock.Lock();
6328 //osd_op.data.hexdump(*_dout);
6329 //_dout_lock.Unlock();
6330
6331 // verify sort order
6332 bool unsorted = false;
6333 if (true) {
6334 bufferlist header;
6335 ::decode(header, bp);
6336 uint32_t n;
6337 ::decode(n, bp);
6338 string last_key;
6339 while (n--) {
6340 string key;
6341 ::decode(key, bp);
6342 dout(10) << "tmapput key " << key << dendl;
6343 bufferlist val;
6344 ::decode(val, bp);
6345 if (key < last_key) {
6346 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6347 unsorted = true;
6348 break;
6349 }
6350 last_key = key;
6351 }
6352 }
6353
6354 // write it
6355 vector<OSDOp> nops(1);
6356 OSDOp& newop = nops[0];
6357 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6358 newop.op.extent.offset = 0;
6359 newop.op.extent.length = osd_op.indata.length();
6360 newop.indata = osd_op.indata;
6361
6362 if (unsorted) {
6363 bp = osd_op.indata.begin();
6364 bufferlist header;
6365 map<string, bufferlist> m;
6366 ::decode(header, bp);
6367 ::decode(m, bp);
6368 assert(bp.end());
6369 bufferlist newbl;
6370 ::encode(header, newbl);
6371 ::encode(m, newbl);
6372 newop.indata = newbl;
6373 }
6374 result = do_osd_ops(ctx, nops);
6375 assert(result == 0);
6376 }
6377 break;
6378
6379 case CEPH_OSD_OP_TMAPUP:
6380 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6381 if (pool.info.require_rollback()) {
6382 result = -EOPNOTSUPP;
6383 break;
6384 }
6385 ++ctx->num_write;
6386 result = do_tmapup(ctx, bp, osd_op);
6387 break;
6388
6389 case CEPH_OSD_OP_TMAP2OMAP:
6390 ++ctx->num_write;
6391 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6392 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6393 break;
6394
6395 // OMAP Read ops
6396 case CEPH_OSD_OP_OMAPGETKEYS:
6397 ++ctx->num_read;
6398 {
6399 string start_after;
6400 uint64_t max_return;
6401 try {
6402 ::decode(start_after, bp);
6403 ::decode(max_return, bp);
6404 }
6405 catch (buffer::error& e) {
6406 result = -EINVAL;
6407 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6408 goto fail;
6409 }
6410 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6411 max_return = cct->_conf->osd_max_omap_entries_per_request;
6412 }
6413 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6414
6415 bufferlist bl;
6416 uint32_t num = 0;
6417 bool truncated = false;
6418 if (oi.is_omap()) {
6419 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6420 coll, ghobject_t(soid)
6421 );
6422 assert(iter);
6423 iter->upper_bound(start_after);
6424 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6425 if (num >= max_return ||
6426 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6427 truncated = true;
6428 break;
6429 }
6430 ::encode(iter->key(), bl);
6431 }
6432 } // else return empty out_set
6433 ::encode(num, osd_op.outdata);
6434 osd_op.outdata.claim_append(bl);
6435 ::encode(truncated, osd_op.outdata);
6436 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6437 ctx->delta_stats.num_rd++;
6438 }
6439 break;
6440
6441 case CEPH_OSD_OP_OMAPGETVALS:
6442 ++ctx->num_read;
6443 {
6444 string start_after;
6445 uint64_t max_return;
6446 string filter_prefix;
6447 try {
6448 ::decode(start_after, bp);
6449 ::decode(max_return, bp);
6450 ::decode(filter_prefix, bp);
6451 }
6452 catch (buffer::error& e) {
6453 result = -EINVAL;
6454 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6455 goto fail;
6456 }
6457 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6458 max_return = cct->_conf->osd_max_omap_entries_per_request;
6459 }
6460 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6461
6462 uint32_t num = 0;
6463 bool truncated = false;
6464 bufferlist bl;
6465 if (oi.is_omap()) {
6466 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6467 coll, ghobject_t(soid)
6468 );
6469 if (!iter) {
6470 result = -ENOENT;
6471 goto fail;
6472 }
6473 iter->upper_bound(start_after);
6474 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6475 for (num = 0;
6476 iter->valid() &&
6477 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6478 ++num, iter->next(false)) {
6479 dout(20) << "Found key " << iter->key() << dendl;
6480 if (num >= max_return ||
6481 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6482 truncated = true;
6483 break;
6484 }
6485 ::encode(iter->key(), bl);
6486 ::encode(iter->value(), bl);
6487 }
6488 } // else return empty out_set
6489 ::encode(num, osd_op.outdata);
6490 osd_op.outdata.claim_append(bl);
6491 ::encode(truncated, osd_op.outdata);
6492 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6493 ctx->delta_stats.num_rd++;
6494 }
6495 break;
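    /* Editor's note: both omap getters are capped by
     * osd_max_omap_entries_per_request / osd_max_omap_bytes_per_request, so
     * clients must page on the truncated flag; a hedged sketch assuming the
     * luminous omap_get_vals2() variant, which exposes that flag as pmore
     * (io/oid assumed):
     *
     *   std::string after;               // "" = start from the beginning
     *   bool more = true;
     *   while (more) {
     *     std::map<std::string, librados::bufferlist> vals;
     *     int rval = 0;
     *     librados::ObjectReadOperation rop;
     *     rop.omap_get_vals2(after, 512, &vals, &more, &rval);
     *     int r = io.operate(oid, &rop, nullptr);
     *     if (r < 0 || vals.empty())
     *       break;
     *     after = vals.rbegin()->first;  // resume after the last key seen
     *   }
     */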
6496
6497 case CEPH_OSD_OP_OMAPGETHEADER:
6498 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6499 if (!oi.is_omap()) {
6500 // return empty header
6501 break;
6502 }
6503 ++ctx->num_read;
6504 {
6505 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6506 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6507 ctx->delta_stats.num_rd++;
6508 }
6509 break;
6510
6511 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6512 ++ctx->num_read;
6513 {
6514 set<string> keys_to_get;
6515 try {
6516 ::decode(keys_to_get, bp);
6517 }
6518 catch (buffer::error& e) {
6519 result = -EINVAL;
6520 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6521 goto fail;
6522 }
6523 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6524 map<string, bufferlist> out;
6525 if (oi.is_omap()) {
6526 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6527 } // else return empty omap entries
6528 ::encode(out, osd_op.outdata);
6529 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6530 ctx->delta_stats.num_rd++;
6531 }
6532 break;
6533
6534 case CEPH_OSD_OP_OMAP_CMP:
6535 ++ctx->num_read;
6536 {
6537 if (!obs.exists || oi.is_whiteout()) {
6538 result = -ENOENT;
6539 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6540 break;
6541 }
6542 map<string, pair<bufferlist, int> > assertions;
6543 try {
6544 ::decode(assertions, bp);
6545 }
6546 catch (buffer::error& e) {
6547 result = -EINVAL;
6548 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6549 goto fail;
6550 }
6551 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6552
6553 map<string, bufferlist> out;
6554
6555 if (oi.is_omap()) {
6556 set<string> to_get;
6557 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6558 i != assertions.end();
6559 ++i)
6560 to_get.insert(i->first);
6561 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6562 to_get, &out);
6563 if (r < 0) {
6564 result = r;
6565 break;
6566 }
6567 } // else leave out empty
6568
6569 // Should set num_rd_kb based on the encoded length of the map
6570 ctx->delta_stats.num_rd++;
6571
6572 int r = 0;
6573 bufferlist empty;
6574 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6575 i != assertions.end();
6576 ++i) {
6577 auto out_entry = out.find(i->first);
6578 bufferlist &bl = (out_entry != out.end()) ?
6579 out_entry->second : empty;
6580 switch (i->second.second) {
6581 case CEPH_OSD_CMPXATTR_OP_EQ:
6582 if (!(bl == i->second.first)) {
6583 r = -ECANCELED;
6584 }
6585 break;
6586 case CEPH_OSD_CMPXATTR_OP_LT:
6587 if (!(bl < i->second.first)) {
6588 r = -ECANCELED;
6589 }
6590 break;
6591 case CEPH_OSD_CMPXATTR_OP_GT:
6592 if (!(bl > i->second.first)) {
6593 r = -ECANCELED;
6594 }
6595 break;
6596 default:
6597 r = -EINVAL;
6598 break;
6599 }
6600 if (r < 0)
6601 break;
6602 }
6603 if (r < 0) {
6604 result = r;
6605 }
6606 }
6607 break;
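    /* Editor's note: a hedged client sketch of the assertion map decoded
     * above (luminous librados assumed; io/oid hypothetical); a failed
     * guard cancels the whole compound op:
     *
     *   librados::bufferlist expect;
     *   expect.append("ready");
     *   std::map<std::string, std::pair<librados::bufferlist, int>> asserts;
     *   asserts["state"] = {expect, LIBRADOS_CMPXATTR_OP_EQ};
     *   int cmp_rval = 0;
     *   librados::ObjectWriteOperation wop;
     *   wop.omap_cmp(asserts, &cmp_rval);
     *   wop.omap_set({{"state", expect}}); // applied only if the guard holds
     *   int r = io.operate(oid, &wop);     // -ECANCELED on mismatch
     */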
6608
6609 // OMAP Write ops
6610 case CEPH_OSD_OP_OMAPSETVALS:
6611 if (!pool.info.supports_omap()) {
6612 result = -EOPNOTSUPP;
6613 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6614 break;
6615 }
6616 ++ctx->num_write;
6617 {
6618 maybe_create_new_object(ctx);
6619 bufferlist to_set_bl;
6620 try {
6621 decode_str_str_map_to_bl(bp, &to_set_bl);
6622 }
6623 catch (buffer::error& e) {
6624 result = -EINVAL;
6625 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6626 goto fail;
6627 }
6628 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6629 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6630 dout(20) << "setting vals: " << dendl;
6631 map<string,bufferlist> to_set;
6632 bufferlist::iterator pt = to_set_bl.begin();
6633 ::decode(to_set, pt);
6634 for (map<string, bufferlist>::iterator i = to_set.begin();
6635 i != to_set.end();
6636 ++i) {
6637 dout(20) << "\t" << i->first << dendl;
6638 }
6639 }
6640 t->omap_setkeys(soid, to_set_bl);
6641 ctx->delta_stats.num_wr++;
6642 }
6643 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6644 obs.oi.clear_omap_digest();
6645 break;
6646
6647 case CEPH_OSD_OP_OMAPSETHEADER:
6648 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6649 if (!pool.info.supports_omap()) {
6650 result = -EOPNOTSUPP;
6651 break;
6652 }
6653 ++ctx->num_write;
6654 {
6655 maybe_create_new_object(ctx);
6656 t->omap_setheader(soid, osd_op.indata);
6657 ctx->delta_stats.num_wr++;
6658 }
6659 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6660 obs.oi.clear_omap_digest();
6661 break;
6662
6663 case CEPH_OSD_OP_OMAPCLEAR:
6664 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6665 if (!pool.info.supports_omap()) {
6666 result = -EOPNOTSUPP;
6667 break;
6668 }
6669 ++ctx->num_write;
6670 {
6671 if (!obs.exists || oi.is_whiteout()) {
6672 result = -ENOENT;
6673 break;
6674 }
6675 if (oi.is_omap()) {
6676 t->omap_clear(soid);
6677 ctx->delta_stats.num_wr++;
6678 obs.oi.clear_omap_digest();
6679 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6680 }
6681 }
6682 break;
6683
6684 case CEPH_OSD_OP_OMAPRMKEYS:
6685 if (!pool.info.supports_omap()) {
6686 result = -EOPNOTSUPP;
6687 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6688 break;
6689 }
6690 ++ctx->num_write;
6691 {
6692 if (!obs.exists || oi.is_whiteout()) {
6693 result = -ENOENT;
6694 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6695 break;
6696 }
6697 bufferlist to_rm_bl;
6698 try {
6699 decode_str_set_to_bl(bp, &to_rm_bl);
6700 }
6701 catch (buffer::error& e) {
6702 result = -EINVAL;
6703 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6704 goto fail;
6705 }
6706 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6707 t->omap_rmkeys(soid, to_rm_bl);
6708 ctx->delta_stats.num_wr++;
6709 }
6710 obs.oi.clear_omap_digest();
6711 break;
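    /* Editor's note: the four omap writers above compose in a single
     * transaction; a minimal sketch (io/oid assumed):
     *
     *   librados::bufferlist v, hdr;
     *   v.append("1");
     *   hdr.append("hdr-v2");
     *   librados::ObjectWriteOperation wop;
     *   wop.omap_set_header(hdr);
     *   wop.omap_set({{"a", v}, {"b", v}}); // sets FLAG_OMAP, clears digest
     *   wop.omap_rm_keys(std::set<std::string>{"stale"});
     *   int r = io.operate(oid, &wop);
     */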
6712
6713 case CEPH_OSD_OP_COPY_GET:
6714 ++ctx->num_read;
6715 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6716 soid.snap.val);
6717 if (op_finisher == nullptr) {
6718 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6719 } else {
6720 result = op_finisher->execute();
6721 }
6722 break;
6723
6724 case CEPH_OSD_OP_COPY_FROM:
6725 ++ctx->num_write;
6726 {
6727 object_t src_name;
6728 object_locator_t src_oloc;
6729 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6730 version_t src_version = op.copy_from.src_version;
6731 try {
6732 ::decode(src_name, bp);
6733 ::decode(src_oloc, bp);
6734 }
6735 catch (buffer::error& e) {
6736 result = -EINVAL;
6737 tracepoint(osd,
6738 do_osd_op_pre_copy_from,
6739 soid.oid.name.c_str(),
6740 soid.snap.val,
6741 "???",
6742 0,
6743 "???",
6744 "???",
6745 0,
6746 src_snapid,
6747 src_version);
6748 goto fail;
6749 }
6750 tracepoint(osd,
6751 do_osd_op_pre_copy_from,
6752 soid.oid.name.c_str(),
6753 soid.snap.val,
6754 src_name.name.c_str(),
6755 src_oloc.pool,
6756 src_oloc.key.c_str(),
6757 src_oloc.nspace.c_str(),
6758 src_oloc.hash,
6759 src_snapid,
6760 src_version);
6761 if (op_finisher == nullptr) {
6762 // start
6763 pg_t raw_pg;
6764 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6765 hobject_t src(src_name, src_oloc.key, src_snapid,
6766 raw_pg.ps(), raw_pg.pool(),
6767 src_oloc.nspace);
6768 if (src == soid) {
6769 dout(20) << " copy from self is invalid" << dendl;
6770 result = -EINVAL;
6771 break;
6772 }
6773 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6774 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6775 new CopyFromFinisher(cb));
6776 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6777 op.copy_from.flags,
6778 false,
6779 op.copy_from.src_fadvise_flags,
6780 op.flags);
6781 result = -EINPROGRESS;
6782 } else {
6783 // finish
6784 result = op_finisher->execute();
6785 assert(result == 0);
6786
6787 // COPY_FROM cannot be executed multiple times -- it must restart
6788 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6789 }
6790 }
6791 break;
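    /* Editor's note: COPY_FROM is two-phase on the OSD (-EINPROGRESS above,
     * finished later by the op finisher) but looks synchronous to clients;
     * a hedged sketch assuming the luminous copy_from() signature
     * (src_io/dst_io are IoCtx handles, hypothetical):
     *
     *   // src_version 0 means "whatever is current"; a nonzero version
     *   // makes the copy conditional on the source not having changed
     *   librados::ObjectWriteOperation wop;
     *   wop.copy_from("src_obj", src_io, 0);
     *   int r = dst_io.operate("dst_obj", &wop);
     */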
6792
6793 default:
6794 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6795 dout(1) << "unrecognized osd op " << op.op
6796 << " " << ceph_osd_op_name(op.op)
6797 << dendl;
6798 result = -EOPNOTSUPP;
6799 }
6800
6801 fail:
6802 osd_op.rval = result;
6803 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6804 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6805 result = 0;
6806
6807 if (result < 0)
6808 break;
6809 }
6810 return result;
6811 }
6812
6813 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6814 {
6815 if (ctx->new_obs.oi.size == 0) {
6816 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6817 return -ENODATA;
6818 }
6819 vector<OSDOp> nops(1);
6820 OSDOp &newop = nops[0];
6821 newop.op.op = CEPH_OSD_OP_TMAPGET;
6822 do_osd_ops(ctx, nops);
6823 try {
6824 bufferlist::iterator i = newop.outdata.begin();
6825 ::decode(*header, i);
6826 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6827 } catch (...) {
6828 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6829 << dendl;
6830 return -EINVAL;
6831 }
6832 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6833 << dendl;
6834 return 0;
6835 }
6836
6837 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6838 const SnapSet& ss)
6839 {
6840 // verify that all clones have been evicted
6841 dout(20) << __func__ << " verifying clones are absent "
6842 << ss << dendl;
6843 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6844 p != ss.clones.end();
6845 ++p) {
6846 hobject_t clone_oid = soid;
6847 clone_oid.snap = *p;
6848 if (is_missing_object(clone_oid))
6849 return -EBUSY;
6850 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6851 if (clone_obc && clone_obc->obs.exists) {
6852 dout(10) << __func__ << " cannot evict head before clone "
6853 << clone_oid << dendl;
6854 return -EBUSY;
6855 }
6856 if (copy_ops.count(clone_oid)) {
6857 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6858 << clone_oid << dendl;
6859 return -EBUSY;
6860 }
6861 }
6862 return 0;
6863 }
6864
6865 inline int PrimaryLogPG::_delete_oid(
6866 OpContext *ctx,
6867 bool no_whiteout, // no whiteouts, no matter what.
6868 bool try_no_whiteout) // try not to whiteout
6869 {
6870 SnapSet& snapset = ctx->new_snapset;
6871 ObjectState& obs = ctx->new_obs;
6872 object_info_t& oi = obs.oi;
6873 const hobject_t& soid = oi.soid;
6874 PGTransaction* t = ctx->op_t.get();
6875
6876 // cache: set whiteout on delete?
6877 bool whiteout = false;
6878 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6879 && !no_whiteout
6880 && !try_no_whiteout) {
6881 whiteout = true;
6882 }
6883 bool legacy;
6884 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6885 legacy = false;
6886 // in luminous or later, we can't delete the head if there are
6887 // clones. we trust the caller passing no_whiteout has already
6888 // verified they don't exist.
6889 if (!snapset.clones.empty() ||
6890 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6891 if (no_whiteout) {
6892 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6893 << dendl;
6894 } else {
6895 dout(20) << __func__ << " has or will have clones; will whiteout"
6896 << dendl;
6897 whiteout = true;
6898 }
6899 }
6900 } else {
6901 legacy = true;
6902 }
6903 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6904 << " no_whiteout=" << (int)no_whiteout
6905 << " try_no_whiteout=" << (int)try_no_whiteout
6906 << dendl;
6907 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6908 return -ENOENT;
6909
6910 t->remove(soid);
6911
6912 if (oi.size > 0) {
6913 interval_set<uint64_t> ch;
6914 ch.insert(0, oi.size);
6915 ctx->modified_ranges.union_of(ch);
6916 }
6917
6918 ctx->delta_stats.num_wr++;
6919 if (soid.is_snap()) {
6920 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6921 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6922 } else {
6923 ctx->delta_stats.num_bytes -= oi.size;
6924 }
6925 oi.size = 0;
6926 oi.new_object();
6927
6928 // disconnect all watchers
6929 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6930 oi.watchers.begin();
6931 p != oi.watchers.end();
6932 ++p) {
6933 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6934 ctx->watch_disconnects.push_back(
6935 watch_disconnect_t(p->first.first, p->first.second, true));
6936 }
6937 oi.watchers.clear();
6938
6939 if (whiteout) {
6940 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6941 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6942 ctx->delta_stats.num_whiteouts++;
6943 t->create(soid);
6944 osd->logger->inc(l_osd_tier_whiteout);
6945 return 0;
6946 }
6947
6948 // delete the head
6949 ctx->delta_stats.num_objects--;
6950 if (soid.is_snap())
6951 ctx->delta_stats.num_object_clones--;
6952 if (oi.is_whiteout()) {
6953 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6954 ctx->delta_stats.num_whiteouts--;
6955 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6956 }
6957 if (oi.is_cache_pinned()) {
6958 ctx->delta_stats.num_objects_pinned--;
6959 }
6960 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6961 snapset.head_exists = false;
6962 }
6963 obs.exists = false;
6964 return 0;
6965 }
6966
6967 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6968 {
6969 SnapSet& snapset = ctx->new_snapset;
6970 ObjectState& obs = ctx->new_obs;
6971 object_info_t& oi = obs.oi;
6972 const hobject_t& soid = oi.soid;
6973 PGTransaction* t = ctx->op_t.get();
6974 snapid_t snapid = (uint64_t)op.snap.snapid;
6975 hobject_t missing_oid;
6976
6977 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6978
6979 ObjectContextRef rollback_to;
6980 int ret = find_object_context(
6981 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6982 soid.get_namespace()),
6983 &rollback_to, false, false, &missing_oid);
6984 if (ret == -EAGAIN) {
6985 /* clone must be missing */
6986 assert(is_degraded_or_backfilling_object(missing_oid));
6987 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6988 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6989 block_write_on_degraded_snap(missing_oid, ctx->op);
6990 return ret;
6991 }
6992 {
6993 ObjectContextRef promote_obc;
6994 cache_result_t tier_mode_result;
6995 if (obs.exists && obs.oi.has_manifest()) {
6996 tier_mode_result =
6997 maybe_handle_manifest_detail(
6998 ctx->op,
6999 true,
7000 rollback_to);
7001 } else {
7002 tier_mode_result =
7003 maybe_handle_cache_detail(
7004 ctx->op,
7005 true,
7006 rollback_to,
7007 ret,
7008 missing_oid,
7009 true,
7010 false,
7011 &promote_obc);
7012 }
7013 switch (tier_mode_result) {
7014 case cache_result_t::NOOP:
7015 break;
7016 case cache_result_t::BLOCKED_PROMOTE:
7017 assert(promote_obc);
7018 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7019 return -EAGAIN;
7020 case cache_result_t::BLOCKED_FULL:
7021 block_write_on_full_cache(soid, ctx->op);
7022 return -EAGAIN;
7023 case cache_result_t::REPLIED_WITH_EAGAIN:
7024 assert(0 == "this can't happen, no rollback on replica");
7025 default:
7026 assert(0 == "must promote was set, other values are not valid");
7027 return -EAGAIN;
7028 }
7029 }
7030
7031 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7032 // there's no snapshot here, or there's no object.
7033 // if there's no snapshot, we delete the object; otherwise, do nothing.
7034 dout(20) << "_rollback_to deleting head on " << soid.oid
7035 << " because got ENOENT|whiteout on find_object_context" << dendl;
7036 if (ctx->obc->obs.oi.watchers.size()) {
7037 // Cannot delete an object with watchers
7038 ret = -EBUSY;
7039 } else {
7040 _delete_oid(ctx, false, false);
7041 ret = 0;
7042 }
7043 } else if (ret) {
7044 // ummm....huh? It *can't* return anything else at time of writing.
7045 assert(0 == "unexpected error code in _rollback_to");
7046 } else { //we got our context, let's use it to do the rollback!
7047 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7048 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7049 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7050 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7051 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7052 ret = -EAGAIN;
7053 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7054 // rolling back to the head; we just need to clone it.
7055 ctx->modify = true;
7056 } else {
7057 /* 1) Delete current head
7058 * 2) Clone correct snapshot into head
7059 * 3) Calculate clone_overlaps by following overlaps
7060 * forward from rollback snapshot */
7061 dout(10) << "_rollback_to deleting " << soid.oid
7062 << " and rolling back to old snap" << dendl;
7063
7064 if (obs.exists) {
7065 t->remove(soid);
7066 }
7067 t->clone(soid, rollback_to_sobject);
7068 snapset.head_exists = true;
7069 t->add_obc(rollback_to);
7070
7071 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7072 snapset.clone_overlap.lower_bound(snapid);
7073 assert(iter != snapset.clone_overlap.end()); // must check before dereferencing
7074 interval_set<uint64_t> overlaps = iter->second;
7075 for ( ;
7076 iter != snapset.clone_overlap.end();
7077 ++iter)
7078 overlaps.intersection_of(iter->second);
7079
7080 if (obs.oi.size > 0) {
7081 interval_set<uint64_t> modified;
7082 modified.insert(0, obs.oi.size);
7083 overlaps.intersection_of(modified);
7084 modified.subtract(overlaps);
7085 ctx->modified_ranges.union_of(modified);
7086 }
7087
7088 // Adjust the cached objectcontext
7089 maybe_create_new_object(ctx, true);
7090 ctx->delta_stats.num_bytes -= obs.oi.size;
7091 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7092 obs.oi.size = rollback_to->obs.oi.size;
7093 if (rollback_to->obs.oi.is_data_digest())
7094 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7095 else
7096 obs.oi.clear_data_digest();
7097 if (rollback_to->obs.oi.is_omap_digest())
7098 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7099 else
7100 obs.oi.clear_omap_digest();
7101
7102 if (rollback_to->obs.oi.is_omap()) {
7103 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7104 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7105 } else {
7106 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7107 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7108 }
7109
7110 snapset.head_exists = true;
7111 }
7112 }
7113 return ret;
7114 }
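/* Editor's note: _rollback_to is reached from client-side rollback calls;
 * a minimal sketch of both entry points (io/oid assumed, snap_id
 * hypothetical):
 *
 *   // pool snapshots, by name:
 *   int r = io.snap_rollback(oid, "before-upgrade");
 *   // self-managed snapshots, by id:
 *   r = io.selfmanaged_snap_rollback(oid, snap_id);
 */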
7115
7116 void PrimaryLogPG::_make_clone(
7117 OpContext *ctx,
7118 PGTransaction* t,
7119 ObjectContextRef obc,
7120 const hobject_t& head, const hobject_t& coid,
7121 object_info_t *poi)
7122 {
7123 bufferlist bv;
7124 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7125
7126 t->clone(coid, head);
7127 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7128 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7129 }
7130
7131 void PrimaryLogPG::make_writeable(OpContext *ctx)
7132 {
7133 const hobject_t& soid = ctx->obs->oi.soid;
7134 SnapContext& snapc = ctx->snapc;
7135
7136 // clone?
7137 assert(soid.snap == CEPH_NOSNAP);
7138 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7139 << " snapc=" << snapc << dendl;
7140
7141 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7142 if (ctx->new_obs.exists) {
7143 // we will mark the object dirty
7144 if (ctx->undirty && was_dirty) {
7145 dout(20) << " clearing DIRTY flag" << dendl;
7146 assert(ctx->new_obs.oi.is_dirty());
7147 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7148 --ctx->delta_stats.num_objects_dirty;
7149 osd->logger->inc(l_osd_tier_clean);
7150 } else if (!was_dirty && !ctx->undirty) {
7151 dout(20) << " setting DIRTY flag" << dendl;
7152 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7153 ++ctx->delta_stats.num_objects_dirty;
7154 osd->logger->inc(l_osd_tier_dirty);
7155 }
7156 } else {
7157 if (was_dirty) {
7158 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7159 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7160 --ctx->delta_stats.num_objects_dirty;
7161 }
7162 }
7163
7164 if ((ctx->new_obs.exists &&
7165 ctx->new_obs.oi.is_omap()) &&
7166 (!ctx->obc->obs.exists ||
7167 !ctx->obc->obs.oi.is_omap())) {
7168 ++ctx->delta_stats.num_objects_omap;
7169 }
7170 if ((!ctx->new_obs.exists ||
7171 !ctx->new_obs.oi.is_omap()) &&
7172 (ctx->obc->obs.exists &&
7173 ctx->obc->obs.oi.is_omap())) {
7174 --ctx->delta_stats.num_objects_omap;
7175 }
7176
7177 // use newer snapc?
7178 if (ctx->new_snapset.seq > snapc.seq) {
7179 snapc.seq = ctx->new_snapset.seq;
7180 snapc.snaps = ctx->new_snapset.snaps;
7181 filter_snapc(snapc.snaps);
7182 dout(10) << " using newer snapc " << snapc << dendl;
7183 }
7184
7185 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7186 snapc.snaps.size() && // there are snaps
7187 !ctx->cache_evict &&
7188 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7189 // clone
7190 hobject_t coid = soid;
7191 coid.snap = snapc.seq;
7192
7193 unsigned l;
7194 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7195
7196 vector<snapid_t> snaps(l);
7197 for (unsigned i=0; i<l; i++)
7198 snaps[i] = snapc.snaps[i];
7199
7200 // prepare clone
7201 object_info_t static_snap_oi(coid);
7202 object_info_t *snap_oi;
7203 if (is_primary()) {
7204 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7205 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7206 ctx->clone_obc->obs.oi = static_snap_oi;
7207 ctx->clone_obc->obs.exists = true;
7208 ctx->clone_obc->ssc = ctx->obc->ssc;
7209 ctx->clone_obc->ssc->ref++;
7210 if (pool.info.require_rollback())
7211 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7212 snap_oi = &ctx->clone_obc->obs.oi;
7213 bool got = ctx->lock_manager.get_write_greedy(
7214 coid,
7215 ctx->clone_obc,
7216 ctx->op);
7217 assert(got);
7218 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7219 } else {
7220 snap_oi = &static_snap_oi;
7221 }
7222 snap_oi->version = ctx->at_version;
7223 snap_oi->prior_version = ctx->obs->oi.version;
7224 snap_oi->copy_user_bits(ctx->obs->oi);
7225
7226 bool legacy = ctx->new_snapset.is_legacy() ||
7227 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7228 if (legacy) {
7229 snap_oi->legacy_snaps = snaps;
7230 }
7231
7232 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7233
7234 ctx->delta_stats.num_objects++;
7235 if (snap_oi->is_dirty()) {
7236 ctx->delta_stats.num_objects_dirty++;
7237 osd->logger->inc(l_osd_tier_dirty);
7238 }
7239 if (snap_oi->is_omap())
7240 ctx->delta_stats.num_objects_omap++;
7241 if (snap_oi->is_cache_pinned())
7242 ctx->delta_stats.num_objects_pinned++;
7243 ctx->delta_stats.num_object_clones++;
7244 ctx->new_snapset.clones.push_back(coid.snap);
7245 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7246 if (!legacy) {
7247 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7248 }
7249
7250 // clone_overlap should contain an entry for each clone
7251 // (an empty interval_set if there is no overlap)
7252 ctx->new_snapset.clone_overlap[coid.snap];
7253 if (ctx->obs->oi.size)
7254 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7255
7256 // log clone
7257 dout(10) << " cloning v " << ctx->obs->oi.version
7258 << " to " << coid << " v " << ctx->at_version
7259 << " snaps=" << snaps
7260 << " snapset=" << ctx->new_snapset << dendl;
7261 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7262 ctx->obs->oi.version,
7263 ctx->obs->oi.user_version,
7264 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7265 ::encode(snaps, ctx->log.back().snaps);
7266
7267 ctx->at_version.version++;
7268 }
7269
7270 // update most recent clone_overlap and usage stats
7271 if (ctx->new_snapset.clones.size() > 0) {
7272 /* we need to check whether the most recent clone exists; if it has been
7273 * evicted, it's not included in the stats */
7274 hobject_t last_clone_oid = soid;
7275 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7276 if (is_present_clone(last_clone_oid)) {
7277 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7278 ctx->modified_ranges.intersection_of(newest_overlap);
7279 // modified_ranges is still in use by the clone
7280 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7281 newest_overlap.subtract(ctx->modified_ranges);
7282 }
7283 }
7284
7285 // update snapset with latest snap context
7286 ctx->new_snapset.seq = snapc.seq;
7287 ctx->new_snapset.snaps = snapc.snaps;
7288 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7289 // pessimistic assumption that this is a net-new legacy SnapSet
7290 ctx->delta_stats.num_legacy_snapsets++;
7291 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7292 } else if (ctx->new_snapset.is_legacy()) {
7293 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7294 }
7295 dout(20) << "make_writeable " << soid
7296 << " done, snapset=" << ctx->new_snapset << dendl;
7297 }
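/* Editor's note: make_writeable() only clones when the incoming SnapContext
 * is newer than the stored snapset; with self-managed snaps the client owns
 * that context. A hedged sketch (io/oid/bl assumed):
 *
 *   uint64_t snap_id;
 *   io.selfmanaged_snap_create(&snap_id);          // allocate a snapid
 *   std::vector<librados::snap_t> snaps{snap_id};  // newest first
 *   io.selfmanaged_snap_set_write_ctx(snap_id, snaps);
 *   // the next write to an object older than snap_id triggers the clone
 *   // and clone_overlap bookkeeping implemented above
 *   io.write_full(oid, bl);
 */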
7298
7299
7300 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7301 interval_set<uint64_t>& modified, uint64_t offset,
7302 uint64_t length, bool write_full)
7303 {
7304 interval_set<uint64_t> ch;
7305 if (write_full) {
7306 if (oi.size)
7307 ch.insert(0, oi.size);
7308 } else if (length)
7309 ch.insert(offset, length);
7310 modified.union_of(ch);
7311 if (write_full || offset + length > oi.size) {
7312 uint64_t new_size = offset + length;
7313 delta_stats.num_bytes -= oi.size;
7314 delta_stats.num_bytes += new_size;
7315 oi.size = new_size;
7316 }
7317 delta_stats.num_wr++;
7318 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7319 }
7320
7321 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7322 {
7323 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7324 delta_stats.num_bytes += p.get_len();
7325 }
7326 }
7327
7328 void PrimaryLogPG::complete_disconnect_watches(
7329 ObjectContextRef obc,
7330 const list<watch_disconnect_t> &to_disconnect)
7331 {
7332 for (list<watch_disconnect_t>::const_iterator i =
7333 to_disconnect.begin();
7334 i != to_disconnect.end();
7335 ++i) {
7336 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7337 auto watchers_entry = obc->watchers.find(watcher);
7338 if (watchers_entry != obc->watchers.end()) {
7339 WatchRef watch = watchers_entry->second;
7340 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7341 obc->watchers.erase(watcher);
7342 watch->remove(i->send_disconnect);
7343 } else {
7344 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7345 << watcher << dendl;
7346 }
7347 }
7348 }
7349
7350 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7351 {
7352 entity_name_t entity = ctx->reqid.name;
7353 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7354
7355 // disconnects first
7356 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7357
7358 assert(conn);
7359
7360 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7361 if (!session.get())
7362 return;
7363 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7364
7365 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7366 i != ctx->watch_connects.end();
7367 ++i) {
7368 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7369 dout(15) << "do_osd_op_effects applying watch connect on session "
7370 << session.get() << " watcher " << watcher << dendl;
7371 WatchRef watch;
7372 if (ctx->obc->watchers.count(watcher)) {
7373 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7374 << dendl;
7375 watch = ctx->obc->watchers[watcher];
7376 } else {
7377 dout(15) << "do_osd_op_effects new watcher " << watcher
7378 << dendl;
7379 watch = Watch::makeWatchRef(
7380 this, osd, ctx->obc, i->first.timeout_seconds,
7381 i->first.cookie, entity, conn->get_peer_addr());
7382 ctx->obc->watchers.insert(
7383 make_pair(
7384 watcher,
7385 watch));
7386 }
7387 watch->connect(conn, i->second);
7388 }
7389
7390 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7391 p != ctx->notifies.end();
7392 ++p) {
7393 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7394 ConnectionRef conn(ctx->op->get_req()->get_connection());
7395 NotifyRef notif(
7396 Notify::makeNotifyRef(
7397 conn,
7398 ctx->reqid.name.num(),
7399 p->bl,
7400 p->timeout,
7401 p->cookie,
7402 p->notify_id,
7403 ctx->obc->obs.oi.user_version,
7404 osd));
7405 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7406 ctx->obc->watchers.begin();
7407 i != ctx->obc->watchers.end();
7408 ++i) {
7409 dout(10) << "starting notify on watch " << i->first << dendl;
7410 i->second->start_notify(notif);
7411 }
7412 notif->init();
7413 }
7414
7415 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7416 p != ctx->notify_acks.end();
7417 ++p) {
7418 if (p->watch_cookie)
7419 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7420 else
7421 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7422 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7423 ctx->obc->watchers.begin();
7424 i != ctx->obc->watchers.end();
7425 ++i) {
7426 if (i->first.second != entity) continue;
7427 if (p->watch_cookie &&
7428 p->watch_cookie.get() != i->first.first) continue;
7429 dout(10) << "acking notify on watch " << i->first << dendl;
7430 i->second->notify_ack(p->notify_id, p->reply_bl);
7431 }
7432 }
7433 }
7434
7435 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7436 {
7437 ostringstream ss;
7438 ss << "temp_" << info.pgid << "_" << get_role()
7439 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7440 hobject_t hoid = target.make_temp_hobject(ss.str());
7441 dout(20) << __func__ << " " << hoid << dendl;
7442 return hoid;
7443 }
7444
7445 hobject_t PrimaryLogPG::get_temp_recovery_object(
7446 const hobject_t& target,
7447 eversion_t version)
7448 {
7449 ostringstream ss;
7450 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7451 << "_" << version
7452 << "_" << info.history.same_interval_since
7453 << "_" << target.snap;
7454 // pgid + version + interval + snapid is unique, and short
7455 hobject_t hoid = target.make_temp_hobject(ss.str());
7456 dout(20) << __func__ << " " << hoid << dendl;
7457 return hoid;
7458 }
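// Hypothetical example of a generated name (exact formatting of the
// embedded types may differ): pg 1.4 shard 1, version 254'12, interval 42,
// head snap => something like "temp_recovering_1.4s1_254'12_42_head".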
7459
7460 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7461 {
7462 assert(!ctx->ops->empty());
7463
7464 const hobject_t& soid = ctx->obs->oi.soid;
7465
7466 // valid snap context?
7467 if (!ctx->snapc.is_valid()) {
7468 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7469 return -EINVAL;
7470 }
7471
7472 // prepare the actual mutation
7473 int result = do_osd_ops(ctx, *ctx->ops);
7474 if (result < 0) {
7475 if (ctx->op->may_write() &&
7476 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7477 // need to save the error code in the pg log, to detect dup ops,
7478 // but do nothing else
7479 ctx->update_log_only = true;
7480 }
7481 return result;
7482 }
7483
7484 // read-op? write-op noop? done?
7485 if (ctx->op_t->empty() && !ctx->modify) {
7486 unstable_stats.add(ctx->delta_stats);
7487 if (ctx->op->may_write() &&
7488 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7489 ctx->update_log_only = true;
7490 }
7491 return result;
7492 }
7493
7494 // check for full
7495 if ((ctx->delta_stats.num_bytes > 0 ||
7496 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7497 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7498 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7499 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7500 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7501 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7502 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7503 << dendl;
7504 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7505 // they tried, they failed.
7506 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7507 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7508 } else {
7509 // drop request
7510 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7511 return -EAGAIN;
7512 }
7513 }
7514
7515 // clone, if necessary
7516 if (soid.snap == CEPH_NOSNAP)
7517 make_writeable(ctx);
7518
7519 finish_ctx(ctx,
7520 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7521 pg_log_entry_t::DELETE);
7522
7523 return result;
7524 }
7525
7526 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7527 {
7528 const hobject_t& soid = ctx->obs->oi.soid;
7529 dout(20) << __func__ << " " << soid << " " << ctx
7530 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7531 << dendl;
7532 utime_t now = ceph_clock_now();
7533
7534 // snapset
7535 bufferlist bss;
7536
7537 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7538 ::encode(ctx->new_snapset, bss);
7539 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7540 !ctx->new_snapset.is_legacy());
7541
7542 if (ctx->new_obs.exists) {
7543 if (!ctx->obs->exists) {
7544 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7545 hobject_t snapoid = soid.get_snapdir();
7546 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7547 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7548 ctx->at_version,
7549 ctx->snapset_obc->obs.oi.version,
7550 0, osd_reqid_t(), ctx->mtime, 0));
7551 ctx->op_t->remove(snapoid);
7552
7553 ctx->at_version.version++;
7554
7555 ctx->snapset_obc->obs.exists = false;
7556 }
7557 }
7558 } else if (!ctx->new_snapset.clones.empty() &&
7559 !ctx->cache_evict &&
7560 !ctx->new_snapset.head_exists &&
7561 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7562 // save snapset on _snap
7563 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7564 info.pgid.pool(), soid.get_namespace());
7565 dout(10) << " final snapset " << ctx->new_snapset
7566 << " in " << snapoid << dendl;
7567 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7568 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7569 ctx->at_version,
7570 eversion_t(),
7571 0, osd_reqid_t(), ctx->mtime, 0));
7572
7573 if (!ctx->snapset_obc)
7574 ctx->snapset_obc = get_object_context(snapoid, true);
7575 bool got = false;
7576 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7577 got = ctx->lock_manager.get_write_greedy(
7578 snapoid,
7579 ctx->snapset_obc,
7580 ctx->op);
7581 } else {
7582 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7583 got = ctx->lock_manager.get_lock_type(
7584 ObjectContext::RWState::RWEXCL,
7585 snapoid,
7586 ctx->snapset_obc,
7587 ctx->op);
7588 }
7589 assert(got);
7590 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7591 ctx->snapset_obc->obs.exists = true;
7592 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7593 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7594 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7595 ctx->snapset_obc->obs.oi.local_mtime = now;
7596
7597 map<string, bufferlist> attrs;
7598 bufferlist bv(sizeof(ctx->new_obs.oi));
7599 ::encode(ctx->snapset_obc->obs.oi, bv,
7600 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7601 ctx->op_t->create(snapoid);
7602 attrs[OI_ATTR].claim(bv);
7603 attrs[SS_ATTR].claim(bss);
7604 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7605 ctx->at_version.version++;
7606 }
7607 }
7608
7609 // finish and log the op.
7610 if (ctx->user_modify) {
7611 // update the user_version for any modify ops, except for the watch op
7612 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7613 /* In order for new clients and old clients to interoperate properly
7614 * when exchanging versions, we need to lower bound the user_version
7615 * (which our new clients pay proper attention to)
7616 * by the at_version (which is all the old clients can ever see). */
7617 if (ctx->at_version.version > ctx->user_at_version)
7618 ctx->user_at_version = ctx->at_version.version;
7619 ctx->new_obs.oi.user_version = ctx->user_at_version;
7620 }
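// Illustrative arithmetic: with info.last_user_version = 2500 and
// oi.user_version = 2400, user_at_version starts at MAX(2500,2400)+1 = 2501;
// if at_version.version is 3000 it is then raised to 3000, so old clients
// (which only see at_version) never observe a newer version than new
// clients do.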
7621 ctx->bytes_written = ctx->op_t->get_bytes_written();
7622
7623 if (ctx->new_obs.exists) {
7624 // on the head object
7625 ctx->new_obs.oi.version = ctx->at_version;
7626 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7627 ctx->new_obs.oi.last_reqid = ctx->reqid;
7628 if (ctx->mtime != utime_t()) {
7629 ctx->new_obs.oi.mtime = ctx->mtime;
7630 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7631 ctx->new_obs.oi.local_mtime = now;
7632 } else {
7633 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7634 }
7635
7636 map <string, bufferlist> attrs;
7637 bufferlist bv(sizeof(ctx->new_obs.oi));
7638 ::encode(ctx->new_obs.oi, bv,
7639 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7640 attrs[OI_ATTR].claim(bv);
7641
7642 if (soid.snap == CEPH_NOSNAP) {
7643 dout(10) << " final snapset " << ctx->new_snapset
7644 << " in " << soid << dendl;
7645 attrs[SS_ATTR].claim(bss);
7646 } else {
7647 dout(10) << " no snapset (this is a clone)" << dendl;
7648 }
7649 ctx->op_t->setattrs(soid, attrs);
7650 } else {
7651 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7652 }
7653
7654 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7655 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7656
7657 // append to log
7658 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7659 ctx->obs->oi.version,
7660 ctx->user_at_version, ctx->reqid,
7661 ctx->mtime, 0));
7662 if (soid.snap < CEPH_NOSNAP) {
7663 switch (log_op_type) {
7664 case pg_log_entry_t::MODIFY:
7665 case pg_log_entry_t::PROMOTE:
7666 case pg_log_entry_t::CLEAN:
7667 if (legacy_snapset) {
7668 dout(20) << __func__ << " encoding legacy_snaps "
7669 << ctx->new_obs.oi.legacy_snaps
7670 << dendl;
7671 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7672 } else {
7673 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7674 << dendl;
7675 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7676 }
7677 break;
7678 default:
7679 break;
7680 }
7681 }
7682
7683 if (!ctx->extra_reqids.empty()) {
7684 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7685 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7686 }
7687
7688 // apply new object state.
7689 ctx->obc->obs = ctx->new_obs;
7690
7691 if (soid.is_head() && !ctx->obc->obs.exists &&
7692 (!maintain_ssc || ctx->cache_evict)) {
7693 ctx->obc->ssc->exists = false;
7694 ctx->obc->ssc->snapset = SnapSet();
7695 } else {
7696 ctx->obc->ssc->exists = true;
7697 ctx->obc->ssc->snapset = ctx->new_snapset;
7698 }
7699 }
7700
7701 void PrimaryLogPG::apply_stats(
7702 const hobject_t &soid,
7703 const object_stat_sum_t &delta_stats) {
7704
7705 info.stats.stats.add(delta_stats);
7706
7707 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7708 i != backfill_targets.end();
7709 ++i) {
7710 pg_shard_t bt = *i;
7711 pg_info_t& pinfo = peer_info[bt];
7712 if (soid <= pinfo.last_backfill)
7713 pinfo.stats.stats.add(delta_stats);
7714 else if (soid <= last_backfill_started)
7715 pending_backfill_updates[soid].stats.add(delta_stats);
7716 }
7717
7718 if (is_primary() && scrubber.active) {
7719 if (soid < scrubber.start) {
7720 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7721 << "," << scrubber.end << ")" << dendl;
7722 scrub_cstat.add(delta_stats);
7723 } else {
7724 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7725 << "," << scrubber.end << ")" << dendl;
7726 }
7727 }
7728 }
7729
7730 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7731 {
7732 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7733 assert(ctx->async_reads_complete());
7734
7735 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7736 p != ctx->ops->end() && result >= 0; ++p) {
7737 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7738 result = p->rval;
7739 break;
7740 }
7741 ctx->bytes_read += p->outdata.length();
7742 }
7743 ctx->reply->claim_op_out_data(*ctx->ops);
7744 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7745
7746 MOSDOpReply *reply = ctx->reply;
7747 ctx->reply = nullptr;
7748
7749 if (result >= 0) {
7750 if (!ctx->ignore_log_op_stats) {
7751 log_op_stats(ctx);
7752 publish_stats_to_osd();
7753 }
7754
7755 // on read, return the current object version
7756 if (ctx->obs) {
7757 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7758 } else {
7759 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7760 }
7761 } else if (result == -ENOENT) {
7762 // on ENOENT, set a floor for what the next user version will be.
7763 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7764 }
7765
7766 reply->set_result(result);
7767 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7768 osd->send_message_osd_client(reply, m->get_connection());
7769 close_op_ctx(ctx);
7770 }
7771
7772 // ========================================================================
7773 // copyfrom
7774
7775 struct C_Copyfrom : public Context {
7776 PrimaryLogPGRef pg;
7777 hobject_t oid;
7778 epoch_t last_peering_reset;
7779 ceph_tid_t tid;
7780 PrimaryLogPG::CopyOpRef cop;
7781 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7782 const PrimaryLogPG::CopyOpRef& c)
7783 : pg(p), oid(o), last_peering_reset(lpr),
7784 tid(0), cop(c)
7785 {}
7786 void finish(int r) override {
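// the pg lock is not held when a Context fires; take it and verify that
// no new peering interval has started before touching pg state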
7787 if (r == -ECANCELED)
7788 return;
7789 pg->lock();
7790 if (last_peering_reset == pg->get_last_peering_reset()) {
7791 pg->process_copy_chunk(oid, tid, r);
7792 }
7793 pg->unlock();
7794 }
7795 };
7796
7797 struct C_CopyFrom_AsyncReadCb : public Context {
7798 OSDOp *osd_op;
7799 object_copy_data_t reply_obj;
7800 uint64_t features;
7801 size_t len;
7802 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7803 osd_op(osd_op), features(features), len(0) {}
7804 void finish(int r) override {
7805 osd_op->rval = r;
7806 if (r < 0) {
7807 return;
7808 }
7809
7810 assert(len > 0);
7811 assert(len <= reply_obj.data.length());
7812 bufferlist bl;
7813 bl.substr_of(reply_obj.data, 0, len);
7814 reply_obj.data.swap(bl);
7815 ::encode(reply_obj, osd_op->outdata, features);
7816 }
7817 };
7818
7819 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7820 OSDOp& osd_op, ObjectContextRef &obc)
7821 {
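// A copy-get walks the object in cursor order -- attrs, then data, then
// omap -- returning at most out_max bytes per call; the caller re-issues
// the op with the returned cursor until cursor.is_complete().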
7822 object_info_t& oi = obc->obs.oi;
7823 hobject_t& soid = oi.soid;
7824 int result = 0;
7825 object_copy_cursor_t cursor;
7826 uint64_t out_max;
7827 try {
7828 ::decode(cursor, bp);
7829 ::decode(out_max, bp);
7830 }
7831 catch (buffer::error& e) {
7832 result = -EINVAL;
7833 return result;
7834 }
7835
7836 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7837 uint64_t features = op->get_features();
7838
7839 bool async_read_started = false;
7840 object_copy_data_t _reply_obj;
7841 C_CopyFrom_AsyncReadCb *cb = NULL;
7842 if (pool.info.require_rollback()) {
7843 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7844 }
7845 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7846 // size, mtime
7847 reply_obj.size = oi.size;
7848 reply_obj.mtime = oi.mtime;
7849 assert(obc->ssc);
7850 if (soid.snap < CEPH_NOSNAP) {
7851 if (obc->ssc->snapset.is_legacy()) {
7852 reply_obj.snaps = oi.legacy_snaps;
7853 } else {
7854 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7855 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7856 reply_obj.snaps = p->second;
7857 }
7858 } else {
7859 reply_obj.snap_seq = obc->ssc->snapset.seq;
7860 }
7861 if (oi.is_data_digest()) {
7862 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7863 reply_obj.data_digest = oi.data_digest;
7864 }
7865 if (oi.is_omap_digest()) {
7866 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7867 reply_obj.omap_digest = oi.omap_digest;
7868 }
7869 reply_obj.truncate_seq = oi.truncate_seq;
7870 reply_obj.truncate_size = oi.truncate_size;
7871
7872 // attrs
7873 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7874 if (!cursor.attr_complete) {
7875 result = getattrs_maybe_cache(
7876 ctx->obc,
7877 &out_attrs);
7878 if (result < 0) {
7879 if (cb) {
7880 delete cb;
7881 }
7882 return result;
7883 }
7884 cursor.attr_complete = true;
7885 dout(20) << " got attrs" << dendl;
7886 }
7887
7888 int64_t left = out_max - osd_op.outdata.length();
7889
7890 // data
7891 bufferlist& bl = reply_obj.data;
7892 if (left > 0 && !cursor.data_complete) {
7893 if (cursor.data_offset < oi.size) {
7894 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7895 if (cb) {
7896 async_read_started = true;
7897 ctx->pending_async_reads.push_back(
7898 make_pair(
7899 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7900 make_pair(&bl, cb)));
7901 cb->len = max_read;
7902
7903 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7904 new ReadFinisher(osd_op));
7905 result = -EINPROGRESS;
7906
7907 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7908 } else {
7909 result = pgbackend->objects_read_sync(
7910 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7911 if (result < 0)
7912 return result;
7913 }
7914 left -= max_read;
7915 cursor.data_offset += max_read;
7916 }
7917 if (cursor.data_offset == oi.size) {
7918 cursor.data_complete = true;
7919 dout(20) << " got data" << dendl;
7920 }
7921 assert(cursor.data_offset <= oi.size);
7922 }
7923
7924 // omap
7925 uint32_t omap_keys = 0;
7926 if (!pool.info.supports_omap() || !oi.is_omap()) {
7927 cursor.omap_complete = true;
7928 } else {
7929 if (left > 0 && !cursor.omap_complete) {
7930 assert(cursor.data_complete);
7931 if (cursor.omap_offset.empty()) {
7932 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7933 &reply_obj.omap_header);
7934 }
7935 bufferlist omap_data;
7936 ObjectMap::ObjectMapIterator iter =
7937 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7938 assert(iter);
7939 iter->upper_bound(cursor.omap_offset);
7940 for (; iter->valid(); iter->next(false)) {
7941 ++omap_keys;
7942 ::encode(iter->key(), omap_data);
7943 ::encode(iter->value(), omap_data);
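// each encoded key and value is preceded by a 4-byte length header,
// hence the two +4s in the accounting below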
7944 left -= iter->key().length() + 4 + iter->value().length() + 4;
7945 if (left <= 0)
7946 break;
7947 }
7948 if (omap_keys) {
7949 ::encode(omap_keys, reply_obj.omap_data);
7950 reply_obj.omap_data.claim_append(omap_data);
7951 }
7952 if (iter->valid()) {
7953 cursor.omap_offset = iter->key();
7954 } else {
7955 cursor.omap_complete = true;
7956 dout(20) << " got omap" << dendl;
7957 }
7958 }
7959 }
7960
7961 if (cursor.is_complete()) {
7962 // include reqids only in the final step. this is a bit fragile
7963 // but it works...
7964 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7965 dout(20) << " got reqids" << dendl;
7966 }
7967
7968 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7969 << " " << out_attrs.size() << " attrs"
7970 << " " << bl.length() << " bytes"
7971 << " " << reply_obj.omap_header.length() << " omap header bytes"
7972 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7973 << omap_keys << " keys"
7974 << " " << reply_obj.reqids.size() << " reqids"
7975 << dendl;
7976 reply_obj.cursor = cursor;
7977 if (!async_read_started) {
7978 ::encode(reply_obj, osd_op.outdata, features);
7979 }
7980 if (cb && !async_read_started) {
7981 delete cb;
7982 }
7983
7984 if (result > 0) {
7985 result = 0;
7986 }
7987 return result;
7988 }
7989
7990 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7991 OSDOp& osd_op)
7992 {
7993 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7994 // be careful not to modify anything else that will upset a racing
7995 // operator<<
7996 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7997 uint64_t features = m->get_features();
7998 object_copy_data_t reply_obj;
7999
8000 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
8001 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8002 ::encode(reply_obj, osd_op.outdata, features);
8003 osd_op.rval = -ENOENT;
8004 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8005 reply->claim_op_out_data(m->ops);
8006 reply->set_result(-ENOENT);
8007 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8008 osd->send_message_osd_client(reply, m->get_connection());
8009 }
8010
8011 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8012 hobject_t src, object_locator_t oloc,
8013 version_t version, unsigned flags,
8014 bool mirror_snapset,
8015 unsigned src_obj_fadvise_flags,
8016 unsigned dest_obj_fadvise_flags)
8017 {
8018 const hobject_t& dest = obc->obs.oi.soid;
8019 dout(10) << __func__ << " " << dest
8020 << " from " << src << " " << oloc << " v" << version
8021 << " flags " << flags
8022 << (mirror_snapset ? " mirror_snapset" : "")
8023 << dendl;
8024
8025 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8026 src.snap == CEPH_SNAPDIR));
8027
8028 // cancel a previous in-progress copy?
8029 if (copy_ops.count(dest)) {
8030 // FIXME: if the src etc match, we could avoid restarting from the
8031 // beginning.
8032 CopyOpRef cop = copy_ops[dest];
8033 vector<ceph_tid_t> tids;
8034 cancel_copy(cop, false, &tids);
8035 osd->objecter->op_cancel(tids, -ECANCELED);
8036 }
8037
8038 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8039 mirror_snapset, src_obj_fadvise_flags,
8040 dest_obj_fadvise_flags));
8041 copy_ops[dest] = cop;
8042 obc->start_block();
8043
8044 _copy_some(obc, cop);
8045 }
8046
8047 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8048 {
8049 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8050
8051 unsigned flags = 0;
8052 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8053 flags |= CEPH_OSD_FLAG_FLUSH;
8054 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8055 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8056 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8057 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8058 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8059 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8060 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8061 flags |= CEPH_OSD_FLAG_RWORDERED;
8062
8063 C_GatherBuilder gather(cct);
8064
8065 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8066 // list snaps too.
8067 assert(cop->src.snap == CEPH_NOSNAP);
8068 ObjectOperation op;
8069 op.list_snaps(&cop->results.snapset, NULL);
8070 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8071 CEPH_SNAPDIR, NULL,
8072 flags, gather.new_sub(), NULL);
8073 cop->objecter_tid2 = tid;
8074 }
8075
8076 ObjectOperation op;
8077 if (cop->results.user_version) {
8078 op.assert_version(cop->results.user_version);
8079 } else {
8080 // we should learn the version after the first chunk, if we didn't know
8081 // it already!
8082 assert(cop->cursor.is_initial());
8083 }
8084 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8085 &cop->results.object_size, &cop->results.mtime,
8086 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8087 &cop->results.snaps, &cop->results.snap_seq,
8088 &cop->results.flags,
8089 &cop->results.source_data_digest,
8090 &cop->results.source_omap_digest,
8091 &cop->results.reqids,
8092 &cop->results.truncate_seq,
8093 &cop->results.truncate_size,
8094 &cop->rval);
8095 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8096
8097 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8098 get_last_peering_reset(), cop);
8099 gather.set_finisher(new C_OnFinisher(fin,
8100 &osd->objecter_finisher));
8101
8102 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8103 cop->src.snap, NULL,
8104 flags,
8105 gather.new_sub(),
8106 // discover the object version if we don't know it yet
8107 cop->results.user_version ? NULL : &cop->results.user_version);
8108 fin->tid = tid;
8109 cop->objecter_tid = tid;
8110 gather.activate();
8111 }
8112
8113 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8114 {
8115 vector<ceph_tid_t> tids;
8116 dout(10) << __func__ << " " << oid << " tid " << tid
8117 << " " << cpp_strerror(r) << dendl;
8118 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8119 if (p == copy_ops.end()) {
8120 dout(10) << __func__ << " no copy_op found" << dendl;
8121 return;
8122 }
8123 CopyOpRef cop = p->second;
8124 if (tid != cop->objecter_tid) {
8125 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8126 << " tid " << cop->objecter_tid << dendl;
8127 return;
8128 }
8129
8130 if (cop->omap_data.length() || cop->omap_header.length())
8131 cop->results.has_omap = true;
8132
8133 if (r >= 0 && !pool.info.supports_omap() &&
8134 (cop->omap_data.length() || cop->omap_header.length())) {
8135 r = -EOPNOTSUPP;
8136 }
8137 cop->objecter_tid = 0;
8138 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
8139 ObjectContextRef& cobc = cop->obc;
8140
8141 if (r < 0)
8142 goto out;
8143
8144 assert(cop->rval >= 0);
8145
8146 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8147 // verify snap hasn't been deleted
8148 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8149 while (p != cop->results.snaps.end()) {
8150 if (pool.info.is_removed_snap(*p)) {
8151 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8152 << dendl;
8153 for (vector<snapid_t>::iterator q = p + 1;
8154 q != cop->results.snaps.end();
8155 ++q)
8156 *(q - 1) = *q;
8157 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8158 } else {
8159 ++p;
8160 }
8161 }
8162 if (cop->results.snaps.empty()) {
8163 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8164 r = -ENOENT;
8165 goto out;
8166 }
8167 }
8168
8169 assert(cop->rval >= 0);
8170
8171 if (!cop->temp_cursor.data_complete) {
8172 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8173 }
8174 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8175 if (cop->omap_header.length()) {
8176 cop->results.omap_digest =
8177 cop->omap_header.crc32c(cop->results.omap_digest);
8178 }
8179 if (cop->omap_data.length()) {
8180 bufferlist keys;
8181 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8182 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8183 }
8184 }
8185
8186 if (!cop->temp_cursor.attr_complete) {
8187 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8188 p != cop->attrs.end();
8189 ++p) {
8190 cop->results.attrs[string("_") + p->first] = p->second;
8191 }
8192 cop->attrs.clear();
8193 }
8194
8195 if (!cop->cursor.is_complete()) {
8196 // write out what we have so far
8197 if (cop->temp_cursor.is_initial()) {
8198 assert(!cop->results.started_temp_obj);
8199 cop->results.started_temp_obj = true;
8200 cop->results.temp_oid = generate_temp_object(oid);
8201 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8202 }
8203 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8204 OpContextUPtr ctx = simple_opc_create(tempobc);
8205 if (cop->temp_cursor.is_initial()) {
8206 ctx->new_temp_oid = cop->results.temp_oid;
8207 }
8208 _write_copy_chunk(cop, ctx->op_t.get());
8209 simple_opc_submit(std::move(ctx));
8210 dout(10) << __func__ << " fetching more" << dendl;
8211 _copy_some(cobc, cop);
8212 return;
8213 }
8214
8215 // verify digests?
8216 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8217 dout(20) << __func__ << std::hex
8218 << " got digest: rx data 0x" << cop->results.data_digest
8219 << " omap 0x" << cop->results.omap_digest
8220 << ", source: data 0x" << cop->results.source_data_digest
8221 << " omap 0x" << cop->results.source_omap_digest
8222 << std::dec
8223 << " flags " << cop->results.flags
8224 << dendl;
8225 }
8226 if (cop->results.is_data_digest() &&
8227 cop->results.data_digest != cop->results.source_data_digest) {
8228 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8229 << " != source 0x" << cop->results.source_data_digest << std::dec
8230 << dendl;
8231 osd->clog->error() << info.pgid << " copy from " << cop->src
8232 << " to " << cop->obc->obs.oi.soid << std::hex
8233 << " data digest 0x" << cop->results.data_digest
8234 << " != source 0x" << cop->results.source_data_digest
8235 << std::dec;
8236 r = -EIO;
8237 goto out;
8238 }
8239 if (cop->results.is_omap_digest() &&
8240 cop->results.omap_digest != cop->results.source_omap_digest) {
8241 derr << __func__ << std::hex
8242 << " omap digest 0x" << cop->results.omap_digest
8243 << " != source 0x" << cop->results.source_omap_digest
8244 << std::dec << dendl;
8245 osd->clog->error() << info.pgid << " copy from " << cop->src
8246 << " to " << cop->obc->obs.oi.soid << std::hex
8247 << " omap digest 0x" << cop->results.omap_digest
8248 << " != source 0x" << cop->results.source_omap_digest
8249 << std::dec;
8250 r = -EIO;
8251 goto out;
8252 }
8253 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8254 derr << __func__ << " injecting copyfrom failure" << dendl;
8255 r = -EIO;
8256 goto out;
8257 }
8258
8259 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8260 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8261 ObjectState& obs = cop->obc->obs;
8262 if (cop->temp_cursor.is_initial()) {
8263 dout(20) << "fill_in_final_tx: writing "
8264 << "directly to final object" << dendl;
8265 // write directly to final object
8266 cop->results.temp_oid = obs.oi.soid;
8267 _write_copy_chunk(cop, t);
8268 } else {
8269 // finish writing to temp object, then move into place
8270 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8271 _write_copy_chunk(cop, t);
8272 t->rename(obs.oi.soid, cop->results.temp_oid);
8273 }
8274 t->setattrs(obs.oi.soid, cop->results.attrs);
8275 });
8276
8277 dout(20) << __func__ << " success; committing" << dendl;
8278
8279 out:
8280 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8281 CopyCallbackResults results(r, &cop->results);
8282 cop->cb->complete(results);
8283
8284 copy_ops.erase(cobc->obs.oi.soid);
8285 cobc->stop_block();
8286
8287 if (r < 0 && cop->results.started_temp_obj) {
8288 dout(10) << __func__ << " deleting partial temp object "
8289 << cop->results.temp_oid << dendl;
8290 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8291 OpContextUPtr ctx = simple_opc_create(tempobc);
8292 ctx->op_t->remove(cop->results.temp_oid);
8293 ctx->discard_temp_oid = cop->results.temp_oid;
8294 simple_opc_submit(std::move(ctx));
8295 }
8296
8297 // cancel and requeue proxy ops on this object
8298 if (!r) {
8299 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8300 it != proxyread_ops.end();) {
8301 if (it->second->soid == cobc->obs.oi.soid) {
8302 cancel_proxy_read((it++)->second, &tids);
8303 } else {
8304 ++it;
8305 }
8306 }
8307 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8308 it != proxywrite_ops.end();) {
8309 if (it->second->soid == cobc->obs.oi.soid) {
8310 cancel_proxy_write((it++)->second, &tids);
8311 } else {
8312 ++it;
8313 }
8314 }
8315 osd->objecter->op_cancel(tids, -ECANCELED);
8316 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8317 }
8318
8319 kick_object_context_blocked(cobc);
8320 }
8321
8322 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
8323 vector<ceph_tid_t> tids;
8324 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8325 it != proxyread_ops.end();) {
8326 if (it->second->soid == oid) {
8327 cancel_proxy_read((it++)->second, &tids);
8328 } else {
8329 ++it;
8330 }
8331 }
8332 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8333 it != proxywrite_ops.end();) {
8334 if (it->second->soid == oid) {
8335 cancel_proxy_write((it++)->second, &tids);
8336 } else {
8337 ++it;
8338 }
8339 }
8340 osd->objecter->op_cancel(tids, -ECANCELED);
8341 kick_proxy_ops_blocked(oid);
8342 }
8343
8344 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8345 {
8346 dout(20) << __func__ << " " << cop
8347 << " " << cop->attrs.size() << " attrs"
8348 << " " << cop->data.length() << " bytes"
8349 << " " << cop->omap_header.length() << " omap header bytes"
8350 << " " << cop->omap_data.length() << " omap data bytes"
8351 << dendl;
8352 if (!cop->temp_cursor.attr_complete) {
8353 t->create(cop->results.temp_oid);
8354 }
8355 if (!cop->temp_cursor.data_complete) {
8356 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8357 cop->cursor.data_offset);
8358 if (pool.info.requires_aligned_append() &&
8359 !cop->cursor.data_complete) {
8360 /**
8361 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
8362 * to pick it up on the next pass.
8363 */
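// e.g. with a 4096-byte required_alignment and 10000 bytes buffered,
// to_trim is 10000 % 4096 = 1808, so 8192 bytes get written and
// cursor.data_offset backs up by 1808 to be re-read on the next pass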
8364 assert(cop->temp_cursor.data_offset %
8365 pool.info.required_alignment() == 0);
8366 if (cop->data.length() % pool.info.required_alignment() != 0) {
8367 uint64_t to_trim =
8368 cop->data.length() % pool.info.required_alignment();
8369 bufferlist bl;
8370 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8371 cop->data.swap(bl);
8372 cop->cursor.data_offset -= to_trim;
8373 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8374 cop->cursor.data_offset);
8375 }
8376 }
8377 if (cop->data.length()) {
8378 t->write(
8379 cop->results.temp_oid,
8380 cop->temp_cursor.data_offset,
8381 cop->data.length(),
8382 cop->data,
8383 cop->dest_obj_fadvise_flags);
8384 }
8385 cop->data.clear();
8386 }
8387 if (pool.info.supports_omap()) {
8388 if (!cop->temp_cursor.omap_complete) {
8389 if (cop->omap_header.length()) {
8390 t->omap_setheader(
8391 cop->results.temp_oid,
8392 cop->omap_header);
8393 cop->omap_header.clear();
8394 }
8395 if (cop->omap_data.length()) {
8396 map<string,bufferlist> omap;
8397 bufferlist::iterator p = cop->omap_data.begin();
8398 ::decode(omap, p);
8399 t->omap_setkeys(cop->results.temp_oid, omap);
8400 cop->omap_data.clear();
8401 }
8402 }
8403 } else {
8404 assert(cop->omap_header.length() == 0);
8405 assert(cop->omap_data.length() == 0);
8406 }
8407 cop->temp_cursor = cop->cursor;
8408 }
8409
8410 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8411 {
8412 OpContext *ctx = cb->ctx;
8413 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8414
8415 ObjectState& obs = ctx->new_obs;
8416 if (obs.exists) {
8417 dout(20) << __func__ << ": exists, removing" << dendl;
8418 ctx->op_t->remove(obs.oi.soid);
8419 } else {
8420 ctx->delta_stats.num_objects++;
8421 obs.exists = true;
8422 }
8423 if (cb->is_temp_obj_used()) {
8424 ctx->discard_temp_oid = cb->results->temp_oid;
8425 }
8426 cb->results->fill_in_final_tx(ctx->op_t.get());
8427
8428 // CopyFromCallback fills this in for us
8429 obs.oi.user_version = ctx->user_at_version;
8430
8431 obs.oi.set_data_digest(cb->results->data_digest);
8432 obs.oi.set_omap_digest(cb->results->omap_digest);
8433
8434 obs.oi.truncate_seq = cb->results->truncate_seq;
8435 obs.oi.truncate_size = cb->results->truncate_size;
8436
8437 ctx->extra_reqids = cb->results->reqids;
8438
8439 // cache: clear whiteout?
8440 if (obs.oi.is_whiteout()) {
8441 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8442 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8443 --ctx->delta_stats.num_whiteouts;
8444 }
8445
8446 if (cb->results->has_omap) {
8447 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8448 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8449 } else {
8450 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8451 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8452 }
8453
8454 interval_set<uint64_t> ch;
8455 if (obs.oi.size > 0)
8456 ch.insert(0, obs.oi.size);
8457 ctx->modified_ranges.union_of(ch);
8458
8459 if (cb->get_data_size() != obs.oi.size) {
8460 ctx->delta_stats.num_bytes -= obs.oi.size;
8461 obs.oi.size = cb->get_data_size();
8462 ctx->delta_stats.num_bytes += obs.oi.size;
8463 }
8464 ctx->delta_stats.num_wr++;
8465 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8466
8467 osd->logger->inc(l_osd_copyfrom);
8468 }
8469
8470 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8471 ObjectContextRef obc)
8472 {
8473 const hobject_t& soid = obc->obs.oi.soid;
8474 dout(10) << __func__ << " " << soid << " r=" << r
8475 << " uv" << results->user_version << dendl;
8476
8477 if (r == -ECANCELED) {
8478 return;
8479 }
8480
8481 if (r != -ENOENT && soid.is_snap()) {
8482 if (results->snaps.empty()) {
8483 // we must have read "snap" content from the head object in
8484 // the base pool. use snap_seq to construct what snaps should
8485 // be for this clone (what it was before we evicted the clean
8486 // clone from this pool, and what it will be when we flush and
8487 // the clone eventually happens in the base pool).
8488 SnapSet& snapset = obc->ssc->snapset;
8489 vector<snapid_t>::iterator p = snapset.snaps.begin();
8490 while (p != snapset.snaps.end() && *p > soid.snap)
8491 ++p;
8492 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8493 results->snaps.push_back(*p);
8494 ++p;
8495 }
8496 }
8497
8498 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8499 filter_snapc(results->snaps);
8500
8501 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8502 if (results->snaps.empty()) {
8503 dout(20) << __func__
8504 << " snaps are empty, clone is invalid,"
8505 << " setting r to ENOENT" << dendl;
8506 r = -ENOENT;
8507 }
8508 }
8509
8510 if (r < 0 && results->started_temp_obj) {
8511 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8512 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8513 assert(tempobc);
8514 OpContextUPtr ctx = simple_opc_create(tempobc);
8515 ctx->op_t->remove(results->temp_oid);
8516 simple_opc_submit(std::move(ctx));
8517 results->started_temp_obj = false;
8518 }
8519
8520 if (r == -ENOENT && soid.is_snap()) {
8521 dout(10) << __func__
8522 << ": enoent while trying to promote clone, " << soid
8523 << " must have been trimmed, removing from snapset"
8524 << dendl;
8525 hobject_t head(soid.get_head());
8526 ObjectContextRef obc = get_object_context(head, false);
8527 assert(obc);
8528
8529 OpContextUPtr tctx = simple_opc_create(obc);
8530 tctx->at_version = get_next_version();
8531 filter_snapc(tctx->new_snapset.snaps);
8532 vector<snapid_t> new_clones;
8533 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8534 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8535 i != tctx->new_snapset.clones.end();
8536 ++i) {
8537 if (*i != soid.snap) {
8538 new_clones.push_back(*i);
8539 auto p = tctx->new_snapset.clone_snaps.find(*i);
8540 if (p != tctx->new_snapset.clone_snaps.end()) {
8541 new_clone_snaps[*i] = p->second;
8542 }
8543 }
8544 }
8545 tctx->new_snapset.clones.swap(new_clones);
8546 tctx->new_snapset.clone_overlap.erase(soid.snap);
8547 tctx->new_snapset.clone_size.erase(soid.snap);
8548 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8549
8550 // take RWWRITE lock for duration of our local write. ignore starvation.
8551 if (!tctx->lock_manager.take_write_lock(
8552 head,
8553 obc)) {
8554 assert(0 == "problem!");
8555 }
8556 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8557
8558 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8559
8560 simple_opc_submit(std::move(tctx));
8561 return;
8562 }
8563
8564 bool whiteout = false;
8565 if (r == -ENOENT) {
8566 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8567 dout(10) << __func__ << " whiteout " << soid << dendl;
8568 whiteout = true;
8569 }
8570
8571 if (r < 0 && !whiteout) {
8572 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8573 // pass error to everyone blocked on this object
8574 // FIXME: this is pretty sloppy, but at this point we got
8575 // something unexpected and don't have many other options.
8576 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8577 waiting_for_blocked_object.find(soid);
8578 if (blocked_iter != waiting_for_blocked_object.end()) {
8579 while (!blocked_iter->second.empty()) {
8580 osd->reply_op_error(blocked_iter->second.front(), r);
8581 blocked_iter->second.pop_front();
8582 }
8583 waiting_for_blocked_object.erase(blocked_iter);
8584 }
8585 return;
8586 }
8587
8588 osd->promote_finish(results->object_size);
8589
8590 OpContextUPtr tctx = simple_opc_create(obc);
8591 tctx->at_version = get_next_version();
8592
8593 ++tctx->delta_stats.num_objects;
8594 if (soid.snap < CEPH_NOSNAP)
8595 ++tctx->delta_stats.num_object_clones;
8596 tctx->new_obs.exists = true;
8597
8598 tctx->extra_reqids = results->reqids;
8599
8600 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8601 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8602
8603 if (whiteout) {
8604 // create a whiteout
8605 tctx->op_t->create(soid);
8606 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8607 ++tctx->delta_stats.num_whiteouts;
8608 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8609 osd->logger->inc(l_osd_tier_whiteout);
8610 } else {
8611 if (results->has_omap) {
8612 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8613 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8614 ++tctx->delta_stats.num_objects_omap;
8615 }
8616
8617 results->fill_in_final_tx(tctx->op_t.get());
8618 if (results->started_temp_obj) {
8619 tctx->discard_temp_oid = results->temp_oid;
8620 }
8621 tctx->new_obs.oi.size = results->object_size;
8622 tctx->new_obs.oi.user_version = results->user_version;
8623 // we don't care whether the source object has data or omap digests
8624 if (results->object_size)
8625 tctx->new_obs.oi.set_data_digest(results->data_digest);
8626 if (results->has_omap)
8627 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8628 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8629 tctx->new_obs.oi.truncate_size = results->truncate_size;
8630
8631 if (soid.snap != CEPH_NOSNAP) {
8632 if (legacy_snapset) {
8633 tctx->new_obs.oi.legacy_snaps = results->snaps;
8634 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8635 } else {
8636 // it's already in the snapset
8637 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8638 }
8639 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8640 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8641 results->object_size);
8642 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8643
8644 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8645 } else {
8646 tctx->delta_stats.num_bytes += results->object_size;
8647 }
8648 }
8649
8650 if (results->mirror_snapset) {
8651 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8652 tctx->new_snapset.from_snap_set(
8653 results->snapset,
8654 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8655 }
8656 tctx->new_snapset.head_exists = true;
8657 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8658
8659 // take RWWRITE lock for duration of our local write. ignore starvation.
8660 if (!tctx->lock_manager.take_write_lock(
8661 obc->obs.oi.soid,
8662 obc)) {
8663 assert(0 == "problem!");
8664 }
8665 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8666
8667 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8668
8669 simple_opc_submit(std::move(tctx));
8670
8671 osd->logger->inc(l_osd_tier_promote);
8672
8673 if (agent_state &&
8674 agent_state->is_idle())
8675 agent_choose_mode();
8676 }
8677
8678 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
8679 vector<ceph_tid_t> *tids)
8680 {
8681 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8682 << " from " << cop->src << " " << cop->oloc
8683 << " v" << cop->results.user_version << dendl;
8684
8685 // cancel objecter op, if we can
8686 if (cop->objecter_tid) {
8687 tids->push_back(cop->objecter_tid);
8688 cop->objecter_tid = 0;
8689 if (cop->objecter_tid2) {
8690 tids->push_back(cop->objecter_tid2);
8691 cop->objecter_tid2 = 0;
8692 }
8693 }
8694
8695 copy_ops.erase(cop->obc->obs.oi.soid);
8696 cop->obc->stop_block();
8697
8698 kick_object_context_blocked(cop->obc);
8699 cop->results.should_requeue = requeue;
8700 CopyCallbackResults result(-ECANCELED, &cop->results);
8701 cop->cb->complete(result);
8702
8703 // There may still be an objecter callback referencing this copy op.
8704 // That callback will not need the obc since it's been canceled, and
8705 // we need the obc reference to go away prior to flush.
8706 cop->obc = ObjectContextRef();
8707 }
8708
8709 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
8710 {
8711 dout(10) << __func__ << dendl;
8712 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8713 while (p != copy_ops.end()) {
8714 // requeue this op? can I queue up all of them?
8715 cancel_copy((p++)->second, requeue, tids);
8716 }
8717 }
8718
8719
8720 // ========================================================================
8721 // flush
8722 //
8723 // Flush a dirty object in the cache tier by writing it back to the
8724 // base tier. The sequence looks like:
8725 //
8726 // * send a copy-from operation to the base tier to copy the current
8727 // version of the object
8728 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8729 // * on completion, we check if the object has been modified. if so,
8730 // just reply with -EAGAIN.
8731 // * try to take a write lock so we can clear the dirty flag. if this
8732 // fails, wait and retry
8733 // * start a repop that clears the bit.
8734 //
8735 // If we have to wait, we will retry by coming back through the
8736 // start_flush method. We check if a flush is already in progress
8737 // and, if so, try to finish it by rechecking the version and trying
8738 // to clear the dirty bit.
8739 //
8740 // In order for the cache-flush (a write op) to not block the copy-get
8741 // from reading the object, the client *must* set the SKIPRWLOCKS
8742 // flag.
8743 //
8744 // NOTE: normally writes are strictly ordered for the client, but
8745 // flushes are special in that they can be reordered with respect to
8746 // other writes. In particular, we can't have a flush request block
8747 // an update to the cache pool object!
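//
// For reference, a client-side sketch of the non-blocking variant
// (illustrative only -- this uses the public librados C++ API and is
// not part of this file):
//
//   librados::ObjectWriteOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c = rados.aio_create_completion();
//   ioctx.aio_operate("myobject", c, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_IGNORE_OVERLAY |
//                     librados::OPERATION_SKIPRWLOCKS);
//   c->wait_for_complete();
//   int r = c->get_return_value(); // e.g. -EBUSY if the object is busy
//
// SKIPRWLOCKS is what allows the copy-get from the base tier to read
// the object while this flush (a write op) is still in flight.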
8748
8749 struct C_Flush : public Context {
8750 PrimaryLogPGRef pg;
8751 hobject_t oid;
8752 epoch_t last_peering_reset;
8753 ceph_tid_t tid;
8754 utime_t start;
8755 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8756 : pg(p), oid(o), last_peering_reset(lpr),
8757 tid(0), start(ceph_clock_now())
8758 {}
8759 void finish(int r) override {
8760 if (r == -ECANCELED)
8761 return;
8762 pg->lock();
8763 if (last_peering_reset == pg->get_last_peering_reset()) {
8764 pg->finish_flush(oid, tid, r);
8765 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8766 }
8767 pg->unlock();
8768 }
8769 };
8770
8771 int PrimaryLogPG::start_flush(
8772 OpRequestRef op, ObjectContextRef obc,
8773 bool blocking, hobject_t *pmissing,
8774 boost::optional<std::function<void()>> &&on_flush)
8775 {
8776 const object_info_t& oi = obc->obs.oi;
8777 const hobject_t& soid = oi.soid;
8778 dout(10) << __func__ << " " << soid
8779 << " v" << oi.version
8780 << " uv" << oi.user_version
8781 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8782 << dendl;
8783
8784 // get a filtered snapset; we need to drop snaps that have been removed
8785 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8786
8787 // verify there are no older dirty clones
8788 {
8789 dout(20) << " snapset " << snapset << dendl;
8790 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8791 while (p != snapset.clones.rend() && *p >= soid.snap)
8792 ++p;
8793 if (p != snapset.clones.rend()) {
8794 hobject_t next = soid;
8795 next.snap = *p;
8796 assert(next.snap < soid.snap);
8797 if (pg_log.get_missing().is_missing(next)) {
8798 dout(10) << __func__ << " missing clone is " << next << dendl;
8799 if (pmissing)
8800 *pmissing = next;
8801 return -ENOENT;
8802 }
8803 ObjectContextRef older_obc = get_object_context(next, false);
8804 if (older_obc) {
8805 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8806 << dendl;
8807 if (older_obc->obs.oi.is_dirty()) {
8808 dout(10) << __func__ << " next oldest clone is dirty: "
8809 << older_obc->obs.oi << dendl;
8810 return -EBUSY;
8811 }
8812 } else {
8813 dout(20) << __func__ << " next oldest clone " << next
8814 << " is not present; implicitly clean" << dendl;
8815 }
8816 } else {
8817 dout(20) << __func__ << " no older clones" << dendl;
8818 }
8819 }
8820
8821 if (blocking)
8822 obc->start_block();
8823
8824 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8825 if (p != flush_ops.end()) {
8826 FlushOpRef fop = p->second;
8827 if (fop->op == op) {
8828 // we couldn't take the write lock on a cache-try-flush before;
8829 // now we are trying again for the lock.
8830 return try_flush_mark_clean(fop);
8831 }
8832 if (fop->flushed_version == obc->obs.oi.user_version &&
8833 (fop->blocking || !blocking)) {
8834 // nonblocking can join anything
8835 // blocking can only join a blocking flush
8836 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8837 if (op)
8838 fop->dup_ops.push_back(op);
8839 return -EAGAIN; // clean up this ctx; op will retry later
8840 }
8841
8842 // cancel current flush since it will fail anyway, or because we
8843 // are blocking and the existing flush is nonblocking.
8844 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8845 if (fop->op)
8846 osd->reply_op_error(fop->op, -EBUSY);
8847 while (!fop->dup_ops.empty()) {
8848 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8849 fop->dup_ops.pop_front();
8850 }
8851 vector<ceph_tid_t> tids;
8852 cancel_flush(fop, false, &tids);
8853 osd->objecter->op_cancel(tids, -ECANCELED);
8854 }
8855
8856 /**
8857 * In general, we need to send a delete and a copyfrom.
8858 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8859 * where 4 is marked as clean. To flush 10, we have to:
8860 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8861 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8862 *
8863 * There is a complicating case. Suppose there had been a clone 7
8864 * for snaps [7, 6], since trimmed because those snaps no longer exist.
8865 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8866 * the delete, the snap will be promoted to 5, and the head will become
8867 * a snapdir. When the copy-from goes through, we'll end up with
8868 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8869 *
8870 * Another complication is the case where there is an interval change
8871 * after doing the delete and the flush but before marking the object
8872 * clean. We'll happily delete head and then recreate it at the same
8873 * sequence number, which works out ok.
8874 */
8875
8876 SnapContext snapc, dsnapc;
8877 if (snapset.seq != 0) {
8878 if (soid.snap == CEPH_NOSNAP) {
8879 snapc.seq = snapset.seq;
8880 snapc.snaps = snapset.snaps;
8881 } else {
8882 snapid_t min_included_snap;
8883 if (snapset.is_legacy()) {
8884 min_included_snap = oi.legacy_snaps.back();
8885 } else {
8886 auto p = snapset.clone_snaps.find(soid.snap);
8887 assert(p != snapset.clone_snaps.end());
8888 min_included_snap = p->second.back();
8889 }
8890 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8891 }
8892
8893 snapid_t prev_snapc = 0;
8894 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8895 citer != snapset.clones.rend();
8896 ++citer) {
8897 if (*citer < soid.snap) {
8898 prev_snapc = *citer;
8899 break;
8900 }
8901 }
8902
8903 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8904 }
8905
8906 object_locator_t base_oloc(soid);
8907 base_oloc.pool = pool.info.tier_of;
8908
8909 if (dsnapc.seq < snapc.seq) {
8910 ObjectOperation o;
8911 o.remove();
8912 osd->objecter->mutate(
8913 soid.oid,
8914 base_oloc,
8915 o,
8916 dsnapc,
8917 ceph::real_clock::from_ceph_timespec(oi.mtime),
8918 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8919 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8920 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8921 }
8922
8923 FlushOpRef fop(std::make_shared<FlushOp>());
8924 fop->obc = obc;
8925 fop->flushed_version = oi.user_version;
8926 fop->blocking = blocking;
8927 fop->on_flush = std::move(on_flush);
8928 fop->op = op;
8929
8930 ObjectOperation o;
8931 if (oi.is_whiteout()) {
8932 fop->removal = true;
8933 o.remove();
8934 } else {
8935 object_locator_t oloc(soid);
8936 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8937 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8938 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8939 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8940 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8941 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8942
8943 // hint to the base tier that it need not cache this data
8944 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8945 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8946 }
8947 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8948
8949 ceph_tid_t tid = osd->objecter->mutate(
8950 soid.oid, base_oloc, o, snapc,
8951 ceph::real_clock::from_ceph_timespec(oi.mtime),
8952 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8953 new C_OnFinisher(fin,
8954 &osd->objecter_finisher));
8955 /* we're under the pg lock and fin->finish() is grabbing that */
8956 fin->tid = tid;
8957 fop->objecter_tid = tid;
8958
8959 flush_ops[soid] = fop;
8960 info.stats.stats.sum.num_flush++;
8961 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8962 return -EINPROGRESS;
8963 }
8964
8965 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8966 {
8967 dout(10) << __func__ << " " << oid << " tid " << tid
8968 << " " << cpp_strerror(r) << dendl;
8969 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8970 if (p == flush_ops.end()) {
8971 dout(10) << __func__ << " no flush_op found" << dendl;
8972 return;
8973 }
8974 FlushOpRef fop = p->second;
8975 if (tid != fop->objecter_tid) {
8976 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8977 << " tid " << fop->objecter_tid << dendl;
8978 return;
8979 }
8980 ObjectContextRef obc = fop->obc;
8981 fop->objecter_tid = 0;
8982
8983 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8984 if (fop->op)
8985 osd->reply_op_error(fop->op, -EBUSY);
8986 if (fop->blocking) {
8987 obc->stop_block();
8988 kick_object_context_blocked(obc);
8989 }
8990
8991 if (!fop->dup_ops.empty()) {
8992 dout(20) << __func__ << " requeueing dups" << dendl;
8993 requeue_ops(fop->dup_ops);
8994 }
8995 if (fop->on_flush) {
8996 (*(fop->on_flush))();
8997 fop->on_flush = boost::none;
8998 }
8999 flush_ops.erase(oid);
9000 return;
9001 }
9002
9003 r = try_flush_mark_clean(fop);
9004 if (r == -EBUSY && fop->op) {
9005 osd->reply_op_error(fop->op, r);
9006 }
9007 }
9008
9009 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
9010 {
9011 ObjectContextRef obc = fop->obc;
9012 const hobject_t& oid = obc->obs.oi.soid;
9013
9014 if (fop->blocking) {
9015 obc->stop_block();
9016 kick_object_context_blocked(obc);
9017 }
9018
9019 if (fop->flushed_version != obc->obs.oi.user_version ||
9020 !obc->obs.exists) {
9021 if (obc->obs.exists)
9022 dout(10) << __func__ << " flushed_version " << fop->flushed_version
9023 << " != current " << obc->obs.oi.user_version
9024 << dendl;
9025 else
9026 dout(10) << __func__ << " object no longer exists" << dendl;
9027
9028 if (!fop->dup_ops.empty()) {
9029 dout(20) << __func__ << " requeueing dups" << dendl;
9030 requeue_ops(fop->dup_ops);
9031 }
9032 if (fop->on_flush) {
9033 (*(fop->on_flush))();
9034 fop->on_flush = boost::none;
9035 }
9036 flush_ops.erase(oid);
9037 if (fop->blocking)
9038 osd->logger->inc(l_osd_tier_flush_fail);
9039 else
9040 osd->logger->inc(l_osd_tier_try_flush_fail);
9041 return -EBUSY;
9042 }
9043
9044 if (!fop->blocking &&
9045 scrubber.write_blocked_by_scrub(oid)) {
9046 if (fop->op) {
9047 dout(10) << __func__ << " blocked by scrub" << dendl;
9048 requeue_op(fop->op);
9049 requeue_ops(fop->dup_ops);
9050 return -EAGAIN; // will retry
9051 } else {
9052 osd->logger->inc(l_osd_tier_try_flush_fail);
9053 vector<ceph_tid_t> tids;
9054 cancel_flush(fop, false, &tids);
9055 osd->objecter->op_cancel(tids, -ECANCELED);
9056 return -ECANCELED;
9057 }
9058 }
9059
9060 // successfully flushed, can we evict this object?
9061 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9062 agent_maybe_evict(obc, true)) {
9063 osd->logger->inc(l_osd_tier_clean);
9064 if (fop->on_flush) {
9065 (*(fop->on_flush))();
9066 fop->on_flush = boost::none;
9067 }
9068 flush_ops.erase(oid);
9069 return 0;
9070 }
9071
9072 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9073 OpContextUPtr ctx = simple_opc_create(fop->obc);
9074
9075   // successfully flushed; can we clear the dirty bit?
9076   // take the write lock manually, since this ctx came from
9077   // simple_opc_create() rather than from client op processing.
9078 if (ctx->lock_manager.get_lock_type(
9079 ObjectContext::RWState::RWWRITE,
9080 oid,
9081 obc,
9082 fop->op)) {
9083 dout(20) << __func__ << " took write lock" << dendl;
9084 } else if (fop->op) {
9085 dout(10) << __func__ << " waiting on write lock" << dendl;
9086 close_op_ctx(ctx.release());
9087 requeue_op(fop->op);
9088 requeue_ops(fop->dup_ops);
9089 return -EAGAIN; // will retry
9090 } else {
9091 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9092 close_op_ctx(ctx.release());
9093 osd->logger->inc(l_osd_tier_try_flush_fail);
9094 vector<ceph_tid_t> tids;
9095 cancel_flush(fop, false, &tids);
9096 osd->objecter->op_cancel(tids, -ECANCELED);
9097 return -ECANCELED;
9098 }
9099
9100 if (fop->on_flush) {
9101 ctx->register_on_finish(*(fop->on_flush));
9102 fop->on_flush = boost::none;
9103 }
9104
9105 ctx->at_version = get_next_version();
9106
9107 ctx->new_obs = obc->obs;
9108 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9109 --ctx->delta_stats.num_objects_dirty;
9110
9111 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9112
9113 osd->logger->inc(l_osd_tier_clean);
9114
9115 if (!fop->dup_ops.empty() || fop->op) {
9116 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9117 list<OpRequestRef> ls;
9118 if (fop->op)
9119 ls.push_back(fop->op);
9120 ls.splice(ls.end(), fop->dup_ops);
9121 requeue_ops(ls);
9122 }
9123
9124 simple_opc_submit(std::move(ctx));
9125
9126 flush_ops.erase(oid);
9127
9128 if (fop->blocking)
9129 osd->logger->inc(l_osd_tier_flush);
9130 else
9131 osd->logger->inc(l_osd_tier_try_flush);
9132
9133 return -EINPROGRESS;
9134 }
9135
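/**
 * cancel_flush - abandon an in-progress flush.
 *
 * Outstanding objecter tids are appended to *tids for the caller to
 * cancel; the blocked obc (if any) is unblocked, ops are optionally
 * requeued, on_flush is completed, and the FlushOp is dropped.
 */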
9136 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
9137 vector<ceph_tid_t> *tids)
9138 {
9139 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9140 << fop->objecter_tid << dendl;
9141 if (fop->objecter_tid) {
9142 tids->push_back(fop->objecter_tid);
9143 fop->objecter_tid = 0;
9144 }
9145 if (fop->io_tids.size()) {
9146 for (auto &p : fop->io_tids) {
9147 tids->push_back(p.second);
9148 p.second = 0;
9149 }
9150 }
9151 if (fop->blocking && fop->obc->is_blocked()) {
9152 fop->obc->stop_block();
9153 kick_object_context_blocked(fop->obc);
9154 }
9155 if (requeue) {
9156 if (fop->op)
9157 requeue_op(fop->op);
9158 requeue_ops(fop->dup_ops);
9159 }
9160 if (fop->on_flush) {
9161 (*(fop->on_flush))();
9162 fop->on_flush = boost::none;
9163 }
9164 flush_ops.erase(fop->obc->obs.oi.soid);
9165 }
9166
9167 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
9168 {
9169 dout(10) << __func__ << dendl;
9170 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9171 while (p != flush_ops.end()) {
9172 cancel_flush((p++)->second, requeue, tids);
9173 }
9174 }
9175
9176 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9177 {
9178 if (!pool.info.allow_incomplete_clones())
9179 return true;
9180 if (is_missing_object(coid))
9181 return true;
9182 ObjectContextRef obc = get_object_context(coid, false);
9183 return obc && obc->obs.exists;
9184 }
9185
9186 // ========================================================================
9187 // rep op gather
9188
9189 class C_OSD_RepopApplied : public Context {
9190 PrimaryLogPGRef pg;
9191 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9192 public:
9193 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9194 : pg(pg), repop(repop) {}
9195 void finish(int) override {
9196 pg->repop_all_applied(repop.get());
9197 }
9198 };
9199
9200
9201 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9202 {
9203 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9204 << dendl;
9205 assert(!repop->applies_with_commit);
9206 repop->all_applied = true;
9207 if (!repop->rep_aborted) {
9208 eval_repop(repop);
9209 }
9210 }
9211
9212 class C_OSD_RepopCommit : public Context {
9213 PrimaryLogPGRef pg;
9214 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9215 public:
9216 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9217 : pg(pg), repop(repop) {}
9218 void finish(int) override {
9219 pg->repop_all_committed(repop.get());
9220 }
9221 };
9222
9223 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9224 {
9225 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9226 << dendl;
9227 repop->all_committed = true;
9228 if (repop->applies_with_commit) {
9229 assert(!repop->all_applied);
9230 repop->all_applied = true;
9231 }
9232
9233 if (!repop->rep_aborted) {
9234 if (repop->v != eversion_t()) {
9235 last_update_ondisk = repop->v;
9236 last_complete_ondisk = repop->pg_local_last_complete;
9237 }
9238 eval_repop(repop);
9239 }
9240 }
9241
9242 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9243 {
9244 dout(10) << "op_applied version " << applied_version << dendl;
9245 if (applied_version == eversion_t())
9246 return;
9247 assert(applied_version > last_update_applied);
9248 assert(applied_version <= info.last_update);
9249 last_update_applied = applied_version;
9250 if (is_primary()) {
9251 if (scrubber.active) {
9252 if (last_update_applied >= scrubber.subset_last_update) {
9253 if (ops_blocked_by_scrub()) {
9254 requeue_scrub(true);
9255 } else {
9256 requeue_scrub(false);
9257 }
9258
9259 }
9260 } else {
9261 assert(scrubber.start == scrubber.end);
9262 }
9263 } else {
9264 if (scrubber.active_rep_scrub) {
9265 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9266 scrubber.active_rep_scrub->get_req())->scrub_to) {
9267 osd->enqueue_back(
9268 info.pgid,
9269 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9270 scrubber.active_rep_scrub = OpRequestRef();
9271 }
9272 }
9273 }
9274 }
9275
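/**
 * eval_repop - advance a RepGather through its lifecycle.
 *
 * Fires on_committed callbacks (and dup-op replies, in order) once all
 * replicas have committed, fires on_applied once all have applied, and
 * when both are done marks the repop complete and retires finished
 * repops from the front of repop_queue in order.
 */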
9276 void PrimaryLogPG::eval_repop(RepGather *repop)
9277 {
9278 const MOSDOp *m = NULL;
9279 if (repop->op)
9280 m = static_cast<const MOSDOp *>(repop->op->get_req());
9281
9282 if (m)
9283 dout(10) << "eval_repop " << *repop
9284 << (repop->rep_done ? " DONE" : "")
9285 << dendl;
9286 else
9287 dout(10) << "eval_repop " << *repop << " (no op)"
9288 << (repop->rep_done ? " DONE" : "")
9289 << dendl;
9290
9291 if (repop->rep_done)
9292 return;
9293
9294 // ondisk?
9295 if (repop->all_committed) {
9296 dout(10) << " commit: " << *repop << dendl;
9297 for (auto p = repop->on_committed.begin();
9298 p != repop->on_committed.end();
9299 repop->on_committed.erase(p++)) {
9300 (*p)();
9301 }
9302 // send dup commits, in order
9303 if (waiting_for_ondisk.count(repop->v)) {
9304 assert(waiting_for_ondisk.begin()->first == repop->v);
9305 for (list<pair<OpRequestRef, version_t> >::iterator i =
9306 waiting_for_ondisk[repop->v].begin();
9307 i != waiting_for_ondisk[repop->v].end();
9308 ++i) {
9309 osd->reply_op_error(i->first, repop->r, repop->v,
9310 i->second);
9311 }
9312 waiting_for_ondisk.erase(repop->v);
9313 }
9314 }
9315
9316 // applied?
9317 if (repop->all_applied) {
9318 if (repop->applies_with_commit) {
9319 assert(repop->on_applied.empty());
9320 }
9321 dout(10) << " applied: " << *repop << " " << dendl;
9322 for (auto p = repop->on_applied.begin();
9323 p != repop->on_applied.end();
9324 repop->on_applied.erase(p++)) {
9325 (*p)();
9326 }
9327 }
9328
9329 // done.
9330 if (repop->all_applied && repop->all_committed) {
9331 repop->rep_done = true;
9332
9333 publish_stats_to_osd();
9334 calc_min_last_complete_ondisk();
9335
9336 dout(10) << " removing " << *repop << dendl;
9337 assert(!repop_queue.empty());
9338 dout(20) << " q front is " << *repop_queue.front() << dendl;
9339 if (repop_queue.front() != repop) {
9340 if (!repop->applies_with_commit) {
9341 dout(0) << " removing " << *repop << dendl;
9342 dout(0) << " q front is " << *repop_queue.front() << dendl;
9343 assert(repop_queue.front() == repop);
9344 }
9345 } else {
9346 RepGather *to_remove = nullptr;
9347 while (!repop_queue.empty() &&
9348 (to_remove = repop_queue.front())->rep_done) {
9349 repop_queue.pop_front();
9350 for (auto p = to_remove->on_success.begin();
9351 p != to_remove->on_success.end();
9352 to_remove->on_success.erase(p++)) {
9353 (*p)();
9354 }
9355 remove_repop(to_remove);
9356 }
9357 }
9358 }
9359 }
9360
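/**
 * issue_repop - hand a prepared transaction to the backend.
 *
 * Updates peer_info and projected_log bookkeeping, takes ondisk write
 * locks on the obcs touched by the transaction, and submits it with
 * callbacks that feed repop_all_applied() / repop_all_committed().
 */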
9361 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9362 {
9363 FUNCTRACE();
9364 const hobject_t& soid = ctx->obs->oi.soid;
9365 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9366 << " o " << soid
9367 << dendl;
9368
9369 repop->v = ctx->at_version;
9370 if (ctx->at_version > eversion_t()) {
9371 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9372 i != actingbackfill.end();
9373 ++i) {
9374 if (*i == get_primary()) continue;
9375 pg_info_t &pinfo = peer_info[*i];
9376 // keep peer_info up to date
9377 if (pinfo.last_complete == pinfo.last_update)
9378 pinfo.last_complete = ctx->at_version;
9379 pinfo.last_update = ctx->at_version;
9380 }
9381 }
9382
9383 ctx->obc->ondisk_write_lock();
9384
9385 bool unlock_snapset_obc = false;
9386 ctx->op_t->add_obc(ctx->obc);
9387 if (ctx->clone_obc) {
9388 ctx->clone_obc->ondisk_write_lock();
9389 ctx->op_t->add_obc(ctx->clone_obc);
9390 }
9391 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9392 ctx->obc->obs.oi.soid) {
9393 ctx->snapset_obc->ondisk_write_lock();
9394 unlock_snapset_obc = true;
9395 ctx->op_t->add_obc(ctx->snapset_obc);
9396 }
9397
9398 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9399 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9400 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9401 ctx->obc,
9402 ctx->clone_obc,
9403 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9404 if (!(ctx->log.empty())) {
9405 assert(ctx->at_version >= projected_last_update);
9406 projected_last_update = ctx->at_version;
9407 }
9408 for (auto &&entry: ctx->log) {
9409 projected_log.add(entry);
9410 }
9411 pgbackend->submit_transaction(
9412 soid,
9413 ctx->delta_stats,
9414 ctx->at_version,
9415 std::move(ctx->op_t),
9416 pg_trim_to,
9417 min_last_complete_ondisk,
9418 ctx->log,
9419 ctx->updated_hset_history,
9420 onapplied_sync,
9421 on_all_applied,
9422 on_all_commit,
9423 repop->rep_tid,
9424 ctx->reqid,
9425 ctx->op);
9426 }
9427
9428 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9429 OpContext *ctx, ObjectContextRef obc,
9430 ceph_tid_t rep_tid)
9431 {
9432 if (ctx->op)
9433 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9434 else
9435 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9436
9437 RepGather *repop = new RepGather(
9438 ctx, rep_tid, info.last_complete, false);
9439
9440 repop->start = ceph_clock_now();
9441
9442 repop_queue.push_back(&repop->queue_item);
9443 repop->get();
9444
9445 osd->logger->inc(l_osd_op_wip);
9446
9447 dout(10) << __func__ << ": " << *repop << dendl;
9448 return repop;
9449 }
9450
9451 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9452 eversion_t version,
9453 int r,
9454 ObcLockManager &&manager,
9455 OpRequestRef &&op,
9456 boost::optional<std::function<void(void)> > &&on_complete)
9457 {
9458 RepGather *repop = new RepGather(
9459 std::move(manager),
9460 std::move(op),
9461 std::move(on_complete),
9462 osd->get_tid(),
9463 info.last_complete,
9464 true,
9465 r);
9466 repop->v = version;
9467
9468 repop->start = ceph_clock_now();
9469
9470 repop_queue.push_back(&repop->queue_item);
9471
9472 osd->logger->inc(l_osd_op_wip);
9473
9474 dout(10) << __func__ << ": " << *repop << dendl;
9475 return boost::intrusive_ptr<RepGather>(repop);
9476 }
9477
9478 void PrimaryLogPG::remove_repop(RepGather *repop)
9479 {
9480 dout(20) << __func__ << " " << *repop << dendl;
9481
9482 for (auto p = repop->on_finish.begin();
9483 p != repop->on_finish.end();
9484 repop->on_finish.erase(p++)) {
9485 (*p)();
9486 }
9487
9488 release_object_locks(
9489 repop->lock_manager);
9490 repop->put();
9491
9492 osd->logger->dec(l_osd_op_wip);
9493 }
9494
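/**
 * simple_opc_create / simple_opc_submit - helpers for internally
 * generated ops that have no client OpRequest, such as clearing a dirty
 * bit or expiring a watch.  Typical usage, as in handle_watch_timeout()
 * and try_flush_mark_clean() below:
 *
 *   OpContextUPtr ctx = simple_opc_create(obc);
 *   ctx->at_version = get_next_version();
 *   // ... mutate ctx->new_obs, ctx->log, ctx->op_t ...
 *   simple_opc_submit(std::move(ctx));
 */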
9495 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9496 {
9497 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9498 ceph_tid_t rep_tid = osd->get_tid();
9499 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9500 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9501 ctx->op_t.reset(new PGTransaction());
9502 ctx->mtime = ceph_clock_now();
9503 return ctx;
9504 }
9505
9506 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9507 {
9508 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9509 dout(20) << __func__ << " " << repop << dendl;
9510 issue_repop(repop, ctx.get());
9511 eval_repop(repop);
9512 calc_trim_to();
9513 repop->put();
9514 }
9515
9516
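/**
 * submit_log_entries - replicate log entries (with no accompanying data
 * mutation) to all of actingbackfill.
 *
 * On clusters at or above jewel this sends MOSDPGUpdateLogMissing and
 * uses a RepGather to track per-shard acks; older peers get an
 * MOSDPGLog instead, with _on_complete fired once the local transaction
 * completes.
 */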
9517 void PrimaryLogPG::submit_log_entries(
9518 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9519 ObcLockManager &&manager,
9520 boost::optional<std::function<void(void)> > &&_on_complete,
9521 OpRequestRef op,
9522 int r)
9523 {
9524 dout(10) << __func__ << " " << entries << dendl;
9525 assert(is_primary());
9526
9527 eversion_t version;
9528 if (!entries.empty()) {
9529 assert(entries.rbegin()->version >= projected_last_update);
9530 version = projected_last_update = entries.rbegin()->version;
9531 }
9532
9533 boost::intrusive_ptr<RepGather> repop;
9534 boost::optional<std::function<void(void)> > on_complete;
9535 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9536 repop = new_repop(
9537 version,
9538 r,
9539 std::move(manager),
9540 std::move(op),
9541 std::move(_on_complete));
9542 } else {
9543 on_complete = std::move(_on_complete);
9544 }
9545
9546 pgbackend->call_write_ordered(
9547 [this, entries, repop, on_complete]() {
9548 ObjectStore::Transaction t;
9549 eversion_t old_last_update = info.last_update;
9550 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
9551
9552
9553 set<pg_shard_t> waiting_on;
9554 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9555 i != actingbackfill.end();
9556 ++i) {
9557 pg_shard_t peer(*i);
9558 if (peer == pg_whoami) continue;
9559 assert(peer_missing.count(peer));
9560 assert(peer_info.count(peer));
9561 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9562 assert(repop);
9563 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9564 entries,
9565 spg_t(info.pgid.pgid, i->shard),
9566 pg_whoami.shard,
9567 get_osdmap()->get_epoch(),
9568 last_peering_reset,
9569 repop->rep_tid,
9570 pg_trim_to,
9571 min_last_complete_ondisk);
9572 osd->send_message_osd_cluster(
9573 peer.osd, m, get_osdmap()->get_epoch());
9574 waiting_on.insert(peer);
9575 } else {
9576 MOSDPGLog *m = new MOSDPGLog(
9577 peer.shard, pg_whoami.shard,
9578 info.last_update.epoch,
9579 info);
9580 m->log.log = entries;
9581 m->log.tail = old_last_update;
9582 m->log.head = info.last_update;
9583 osd->send_message_osd_cluster(
9584 peer.osd, m, get_osdmap()->get_epoch());
9585 }
9586 }
9587 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9588 ceph_tid_t rep_tid = repop->rep_tid;
9589 waiting_on.insert(pg_whoami);
9590 log_entry_update_waiting_on.insert(
9591 make_pair(
9592 rep_tid,
9593 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9594 ));
9595 struct OnComplete : public Context {
9596 PrimaryLogPGRef pg;
9597 ceph_tid_t rep_tid;
9598 epoch_t epoch;
9599 OnComplete(
9600 PrimaryLogPGRef pg,
9601 ceph_tid_t rep_tid,
9602 epoch_t epoch)
9603 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9604 void finish(int) override {
9605 pg->lock();
9606 if (!pg->pg_has_reset_since(epoch)) {
9607 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9608 assert(it != pg->log_entry_update_waiting_on.end());
9609 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9610 assert(it2 != it->second.waiting_on.end());
9611 it->second.waiting_on.erase(it2);
9612 if (it->second.waiting_on.empty()) {
9613 pg->repop_all_committed(it->second.repop.get());
9614 pg->log_entry_update_waiting_on.erase(it);
9615 }
9616 }
9617 pg->unlock();
9618 }
9619 };
9620 t.register_on_commit(
9621 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9622 } else {
9623 if (on_complete) {
9624 struct OnComplete : public Context {
9625 PrimaryLogPGRef pg;
9626 std::function<void(void)> on_complete;
9627 epoch_t epoch;
9628 OnComplete(
9629 PrimaryLogPGRef pg,
9630 const std::function<void(void)> &on_complete,
9631 epoch_t epoch)
9632 : pg(pg),
9633 on_complete(std::move(on_complete)),
9634 epoch(epoch) {}
9635 void finish(int) override {
9636 pg->lock();
9637 if (!pg->pg_has_reset_since(epoch))
9638 on_complete();
9639 pg->unlock();
9640 }
9641 };
9642 t.register_on_complete(
9643 new OnComplete{
9644 this, *on_complete, get_osdmap()->get_epoch()
9645 });
9646 }
9647 }
9648 t.register_on_applied(
9649 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9650 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9651 assert(r == 0);
9652 });
9653
9654 calc_trim_to();
9655 }
9656
9657 void PrimaryLogPG::cancel_log_updates()
9658 {
9659 // get rid of all the LogUpdateCtx so their references to repops are
9660 // dropped
9661 log_entry_update_waiting_on.clear();
9662 }
9663
9664 // -------------------------------------------------------
9665
9666 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9667 {
9668 pair<hobject_t, ObjectContextRef> i;
9669 while (object_contexts.get_next(i.first, &i)) {
9670 ObjectContextRef obc(i.second);
9671 get_obc_watchers(obc, pg_watchers);
9672 }
9673 }
9674
9675 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9676 {
9677 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9678 obc->watchers.begin();
9679 j != obc->watchers.end();
9680 ++j) {
9681 obj_watch_item_t owi;
9682
9683 owi.obj = obc->obs.oi.soid;
9684 owi.wi.addr = j->second->get_peer_addr();
9685 owi.wi.name = j->second->get_entity();
9686 owi.wi.cookie = j->second->get_cookie();
9687 owi.wi.timeout_seconds = j->second->get_timeout();
9688
9689 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9690 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9691
9692 pg_watchers.push_back(owi);
9693 }
9694 }
9695
9696 void PrimaryLogPG::check_blacklisted_watchers()
9697 {
9698 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9699 pair<hobject_t, ObjectContextRef> i;
9700 while (object_contexts.get_next(i.first, &i))
9701 check_blacklisted_obc_watchers(i.second);
9702 }
9703
9704 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9705 {
9706 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9707 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9708 obc->watchers.begin();
9709 k != obc->watchers.end();
9710 ) {
9711     // Advance the iterator now so handle_watch_timeout() can erase the element.
9712 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9713 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9714 entity_addr_t ea = j->second->get_peer_addr();
9715 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9716 if (get_osdmap()->is_blacklisted(ea)) {
9717 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9718 assert(j->second->get_pg() == this);
9719 j->second->unregister_cb();
9720 handle_watch_timeout(j->second);
9721 }
9722 }
9723 }
9724
9725 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9726 {
9727 assert(is_active());
9728 assert((recovering.count(obc->obs.oi.soid) ||
9729 !is_missing_object(obc->obs.oi.soid)) ||
9730 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9731 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9732 pg_log_entry_t::LOST_REVERT &&
9733 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9734 obc->obs.oi.version));
9735
9736 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9737 assert(obc->watchers.empty());
9738 // populate unconnected_watchers
9739 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9740 obc->obs.oi.watchers.begin();
9741 p != obc->obs.oi.watchers.end();
9742 ++p) {
9743 utime_t expire = info.stats.last_became_active;
9744 expire += p->second.timeout_seconds;
9745 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9746 WatchRef watch(
9747 Watch::makeWatchRef(
9748 this, osd, obc, p->second.timeout_seconds, p->first.first,
9749 p->first.second, p->second.addr));
9750 watch->disconnect();
9751 obc->watchers.insert(
9752 make_pair(
9753 make_pair(p->first.first, p->first.second),
9754 watch));
9755 }
9756   // Look for watchers from blacklisted clients and drop them.
9757 check_blacklisted_obc_watchers(obc);
9758 }
9759
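/**
 * handle_watch_timeout - a watcher failed to reconnect in time.
 *
 * If the PG is inactive this is a no-op; if the object is degraded or
 * blocked by scrub the disconnect is deferred via a callback.  Otherwise
 * the watcher is removed from the object_info and a MODIFY log entry is
 * submitted via simple_opc_submit().
 */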
9760 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9761 {
9762 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9763 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9764
9765 if (!is_active()) {
9766 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9767 return;
9768 }
9769 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9770 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9771 watch->get_delayed_cb()
9772 );
9773 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9774 << obc->obs.oi.soid
9775 << dendl;
9776 return;
9777 }
9778
9779 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9780 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9781 << obc->obs.oi.soid
9782 << dendl;
9783 scrubber.add_callback(
9784 watch->get_delayed_cb() // This callback!
9785 );
9786 return;
9787 }
9788
9789 OpContextUPtr ctx = simple_opc_create(obc);
9790 ctx->at_version = get_next_version();
9791
9792 object_info_t& oi = ctx->new_obs.oi;
9793 oi.watchers.erase(make_pair(watch->get_cookie(),
9794 watch->get_entity()));
9795
9796 list<watch_disconnect_t> watch_disconnects = {
9797 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9798 };
9799 ctx->register_on_success(
9800 [this, obc, watch_disconnects]() {
9801 complete_disconnect_watches(obc, watch_disconnects);
9802 });
9803
9804
9805 PGTransaction *t = ctx->op_t.get();
9806 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9807 ctx->at_version,
9808 oi.version,
9809 0,
9810 osd_reqid_t(), ctx->mtime, 0));
9811
9812 oi.prior_version = obc->obs.oi.version;
9813 oi.version = ctx->at_version;
9814 bufferlist bl;
9815 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9816 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9817
9818 // apply new object state.
9819 ctx->obc->obs = ctx->new_obs;
9820
9821 // no ctx->delta_stats
9822 simple_opc_submit(std::move(ctx));
9823 }
9824
9825 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9826 SnapSetContext *ssc)
9827 {
9828 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9829 assert(obc->destructor_callback == NULL);
9830 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9831 obc->obs.oi = oi;
9832 obc->obs.exists = false;
9833 obc->ssc = ssc;
9834 if (ssc)
9835 register_snapset_context(ssc);
9836 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9837 if (is_active())
9838 populate_obc_watchers(obc);
9839 return obc;
9840 }
9841
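/**
 * get_object_context - look up (or load/create) the ObjectContext for
 * @soid, consulting the obc cache first and falling back to the OI_ATTR
 * on disk (or the provided @attrs).  Returns a null ref if the object
 * does not exist and @can_create is false, or if the object info or
 * snapset context cannot be loaded.
 */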
9842 ObjectContextRef PrimaryLogPG::get_object_context(
9843 const hobject_t& soid,
9844 bool can_create,
9845 const map<string, bufferlist> *attrs)
9846 {
9847 assert(
9848 attrs || !pg_log.get_missing().is_missing(soid) ||
9849 // or this is a revert... see recover_primary()
9850 (pg_log.get_log().objects.count(soid) &&
9851 pg_log.get_log().objects.find(soid)->second->op ==
9852 pg_log_entry_t::LOST_REVERT));
9853 ObjectContextRef obc = object_contexts.lookup(soid);
9854 osd->logger->inc(l_osd_object_ctx_cache_total);
9855 if (obc) {
9856 osd->logger->inc(l_osd_object_ctx_cache_hit);
9857 dout(10) << __func__ << ": found obc in cache: " << obc
9858 << dendl;
9859 } else {
9860 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9861 // check disk
9862 bufferlist bv;
9863 if (attrs) {
9864 assert(attrs->count(OI_ATTR));
9865 bv = attrs->find(OI_ATTR)->second;
9866 } else {
9867 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9868 if (r < 0) {
9869 if (!can_create) {
9870 dout(10) << __func__ << ": no obc for soid "
9871 << soid << " and !can_create"
9872 << dendl;
9873 return ObjectContextRef(); // -ENOENT!
9874 }
9875
9876 dout(10) << __func__ << ": no obc for soid "
9877 << soid << " but can_create"
9878 << dendl;
9879 // new object.
9880 object_info_t oi(soid);
9881 SnapSetContext *ssc = get_snapset_context(
9882 soid, true, 0, false);
9883 assert(ssc);
9884 obc = create_object_context(oi, ssc);
9885 dout(10) << __func__ << ": " << obc << " " << soid
9886 << " " << obc->rwstate
9887 << " oi: " << obc->obs.oi
9888 << " ssc: " << obc->ssc
9889 << " snapset: " << obc->ssc->snapset << dendl;
9890 return obc;
9891 }
9892 }
9893
9894 object_info_t oi;
9895 try {
9896 bufferlist::iterator bliter = bv.begin();
9897 ::decode(oi, bliter);
9898 } catch (...) {
9899 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9900 return ObjectContextRef(); // -ENOENT!
9901 }
9902
9903 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9904
9905 obc = object_contexts.lookup_or_create(oi.soid);
9906 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9907 obc->obs.oi = oi;
9908 obc->obs.exists = true;
9909
9910 obc->ssc = get_snapset_context(
9911 soid, true,
9912 soid.has_snapset() ? attrs : 0);
9913
9914 if (is_active())
9915 populate_obc_watchers(obc);
9916
9917 if (pool.info.require_rollback()) {
9918 if (attrs) {
9919 obc->attr_cache = *attrs;
9920 } else {
9921 int r = pgbackend->objects_get_attrs(
9922 soid,
9923 &obc->attr_cache);
9924 assert(r == 0);
9925 }
9926 }
9927
9928 dout(10) << __func__ << ": creating obc from disk: " << obc
9929 << dendl;
9930 }
9931
9932 // XXX: Caller doesn't expect this
9933 if (obc->ssc == NULL) {
9934 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9935 return ObjectContextRef(); // -ENOENT!
9936 }
9937
9938 dout(10) << __func__ << ": " << obc << " " << soid
9939 << " " << obc->rwstate
9940 << " oi: " << obc->obs.oi
9941 << " exists: " << (int)obc->obs.exists
9942 << " ssc: " << obc->ssc
9943 << " snapset: " << obc->ssc->snapset << dendl;
9944 return obc;
9945 }
9946
9947 void PrimaryLogPG::context_registry_on_change()
9948 {
9949 pair<hobject_t, ObjectContextRef> i;
9950 while (object_contexts.get_next(i.first, &i)) {
9951 ObjectContextRef obc(i.second);
9952 if (obc) {
9953 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9954 obc->watchers.begin();
9955 j != obc->watchers.end();
9956 obc->watchers.erase(j++)) {
9957 j->second->discard();
9958 }
9959 }
9960 }
9961 }
9962
9963
9964 /*
9965 * If we return an error and set *pmissing, then promoting that
9966 * object may help.
9967 *
9968 * If we return -EAGAIN, we will always set *pmissing to the missing
9969 * object to wait for.
9970 *
9971 * If we return an error but do not set *pmissing, then we know the
9972 * object does not exist.
9973 */
9974 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9975 ObjectContextRef *pobc,
9976 bool can_create,
9977 bool map_snapid_to_clone,
9978 hobject_t *pmissing)
9979 {
9980 FUNCTRACE();
9981 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9982 // want the head?
9983 if (oid.snap == CEPH_NOSNAP) {
9984 ObjectContextRef obc = get_object_context(oid, can_create);
9985 if (!obc) {
9986 if (pmissing)
9987 *pmissing = oid;
9988 return -ENOENT;
9989 }
9990 dout(10) << "find_object_context " << oid
9991 << " @" << oid.snap
9992 << " oi=" << obc->obs.oi
9993 << dendl;
9994 *pobc = obc;
9995
9996 return 0;
9997 }
9998
9999 hobject_t head = oid.get_head();
10000
10001 // want the snapdir?
10002 if (oid.snap == CEPH_SNAPDIR) {
10003 // return head or snapdir, whichever exists.
10004 ObjectContextRef headobc = get_object_context(head, can_create);
10005 ObjectContextRef obc = headobc;
10006 if (!obc || !obc->obs.exists)
10007 obc = get_object_context(oid, can_create);
10008 if (!obc || !obc->obs.exists) {
10009 // if we have neither, we would want to promote the head.
10010 if (pmissing)
10011 *pmissing = head;
10012 if (pobc)
10013 *pobc = headobc; // may be null
10014 return -ENOENT;
10015 }
10016 dout(10) << "find_object_context " << oid
10017 << " @" << oid.snap
10018 << " oi=" << obc->obs.oi
10019 << dendl;
10020 *pobc = obc;
10021
10022 // always populate ssc for SNAPDIR...
10023 if (!obc->ssc)
10024 obc->ssc = get_snapset_context(
10025 oid, true);
10026 return 0;
10027 }
10028
10029 // we want a snap
10030 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
10031 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
10032 return -ENOENT;
10033 }
10034
10035 SnapSetContext *ssc = get_snapset_context(oid, can_create);
10036 if (!ssc || !(ssc->exists || can_create)) {
10037 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
10038 if (pmissing)
10039 *pmissing = head; // start by getting the head
10040 if (ssc)
10041 put_snapset_context(ssc);
10042 return -ENOENT;
10043 }
10044
10045 if (map_snapid_to_clone) {
10046 dout(10) << "find_object_context " << oid << " @" << oid.snap
10047 << " snapset " << ssc->snapset
10048 << " map_snapid_to_clone=true" << dendl;
10049 if (oid.snap > ssc->snapset.seq) {
10050 // already must be readable
10051 ObjectContextRef obc = get_object_context(head, false);
10052 dout(10) << "find_object_context " << oid << " @" << oid.snap
10053 << " snapset " << ssc->snapset
10054 << " maps to head" << dendl;
10055 *pobc = obc;
10056 put_snapset_context(ssc);
10057 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10058 } else {
10059 vector<snapid_t>::const_iterator citer = std::find(
10060 ssc->snapset.clones.begin(),
10061 ssc->snapset.clones.end(),
10062 oid.snap);
10063 if (citer == ssc->snapset.clones.end()) {
10064 dout(10) << "find_object_context " << oid << " @" << oid.snap
10065 << " snapset " << ssc->snapset
10066 << " maps to nothing" << dendl;
10067 put_snapset_context(ssc);
10068 return -ENOENT;
10069 }
10070
10071 dout(10) << "find_object_context " << oid << " @" << oid.snap
10072 << " snapset " << ssc->snapset
10073 << " maps to " << oid << dendl;
10074
10075 if (pg_log.get_missing().is_missing(oid)) {
10076 dout(10) << "find_object_context " << oid << " @" << oid.snap
10077 << " snapset " << ssc->snapset
10078 << " " << oid << " is missing" << dendl;
10079 if (pmissing)
10080 *pmissing = oid;
10081 put_snapset_context(ssc);
10082 return -EAGAIN;
10083 }
10084
10085 ObjectContextRef obc = get_object_context(oid, false);
10086 if (!obc || !obc->obs.exists) {
10087 dout(10) << "find_object_context " << oid << " @" << oid.snap
10088 << " snapset " << ssc->snapset
10089 << " " << oid << " is not present" << dendl;
10090 if (pmissing)
10091 *pmissing = oid;
10092 put_snapset_context(ssc);
10093 return -ENOENT;
10094 }
10095 dout(10) << "find_object_context " << oid << " @" << oid.snap
10096 << " snapset " << ssc->snapset
10097 << " " << oid << " HIT" << dendl;
10098 *pobc = obc;
10099 put_snapset_context(ssc);
10100 return 0;
10101 }
10102 ceph_abort(); //unreachable
10103 }
10104
10105 dout(10) << "find_object_context " << oid << " @" << oid.snap
10106 << " snapset " << ssc->snapset << dendl;
10107
10108 // head?
10109 if (oid.snap > ssc->snapset.seq) {
10110 if (ssc->snapset.head_exists) {
10111 ObjectContextRef obc = get_object_context(head, false);
10112 dout(10) << "find_object_context " << head
10113 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10114 << " -- HIT " << obc->obs
10115 << dendl;
10116 if (!obc->ssc)
10117 obc->ssc = ssc;
10118 else {
10119 assert(ssc == obc->ssc);
10120 put_snapset_context(ssc);
10121 }
10122 *pobc = obc;
10123 return 0;
10124 }
10125 dout(10) << "find_object_context " << head
10126 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10127 << " but head dne -- DNE"
10128 << dendl;
10129 put_snapset_context(ssc);
10130 return -ENOENT;
10131 }
10132
10133 // which clone would it be?
10134 unsigned k = 0;
10135 while (k < ssc->snapset.clones.size() &&
10136 ssc->snapset.clones[k] < oid.snap)
10137 k++;
10138 if (k == ssc->snapset.clones.size()) {
10139 dout(10) << "find_object_context no clones with last >= oid.snap "
10140 << oid.snap << " -- DNE" << dendl;
10141 put_snapset_context(ssc);
10142 return -ENOENT;
10143 }
10144 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10145 info.pgid.pool(), oid.get_namespace());
10146
10147 if (pg_log.get_missing().is_missing(soid)) {
10148 dout(20) << "find_object_context " << soid << " missing, try again later"
10149 << dendl;
10150 if (pmissing)
10151 *pmissing = soid;
10152 put_snapset_context(ssc);
10153 return -EAGAIN;
10154 }
10155
10156 ObjectContextRef obc = get_object_context(soid, false);
10157 if (!obc || !obc->obs.exists) {
10158 if (pmissing)
10159 *pmissing = soid;
10160 put_snapset_context(ssc);
10161 if (is_degraded_or_backfilling_object(soid)) {
10162 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10163 return -EAGAIN;
10164 } else {
10165 dout(20) << __func__ << " missing clone " << soid << dendl;
10166 return -ENOENT;
10167 }
10168 }
10169
10170 if (!obc->ssc) {
10171 obc->ssc = ssc;
10172 } else {
10173 assert(obc->ssc == ssc);
10174 put_snapset_context(ssc);
10175 }
10176 ssc = 0;
10177
10178 // clone
10179 dout(20) << "find_object_context " << soid
10180 << " snapset " << obc->ssc->snapset
10181 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10182 << dendl;
10183 snapid_t first, last;
10184 if (obc->ssc->snapset.is_legacy()) {
10185 first = obc->obs.oi.legacy_snaps.back();
10186 last = obc->obs.oi.legacy_snaps.front();
10187 } else {
10188 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10189 assert(p != obc->ssc->snapset.clone_snaps.end());
10190 first = p->second.back();
10191 last = p->second.front();
10192 }
10193 if (first <= oid.snap) {
10194 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10195 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10196 *pobc = obc;
10197 return 0;
10198 } else {
10199 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10200 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10201 return -ENOENT;
10202 }
10203 }
10204
10205 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10206 {
10207 if (obc->ssc)
10208 put_snapset_context(obc->ssc);
10209 }
10210
10211 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10212 {
10213 object_info_t& oi = obc->obs.oi;
10214
10215 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10216 object_stat_sum_t stat;
10217
10218 stat.num_bytes += oi.size;
10219
10220 if (oi.soid.snap != CEPH_SNAPDIR)
10221 stat.num_objects++;
10222 if (oi.is_dirty())
10223 stat.num_objects_dirty++;
10224 if (oi.is_whiteout())
10225 stat.num_whiteouts++;
10226 if (oi.is_omap())
10227 stat.num_objects_omap++;
10228 if (oi.is_cache_pinned())
10229 stat.num_objects_pinned++;
10230
10231 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10232 stat.num_object_clones++;
10233
10234 if (!obc->ssc)
10235 obc->ssc = get_snapset_context(oi.soid, false);
10236 assert(obc->ssc);
10237
10238 // subtract off clone overlap
10239 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10240 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10241 for (interval_set<uint64_t>::const_iterator r = o.begin();
10242 r != o.end();
10243 ++r) {
10244 stat.num_bytes -= r.get_len();
10245 }
10246 }
10247 }
10248
10249 // add it in
10250 pgstat->stats.sum.add(stat);
10251 }
10252
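/**
 * kick_object_context_blocked - requeue ops that were waiting on a
 * blocked obc once it is no longer blocked (e.g. a blocking flush or a
 * snap promotion completed).
 */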
10253 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10254 {
10255 const hobject_t& soid = obc->obs.oi.soid;
10256 if (obc->is_blocked()) {
10257 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10258 return;
10259 }
10260
10261 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10262 if (p != waiting_for_blocked_object.end()) {
10263 list<OpRequestRef>& ls = p->second;
10264 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10265 requeue_ops(ls);
10266 waiting_for_blocked_object.erase(p);
10267 }
10268
10269 map<hobject_t, ObjectContextRef>::iterator i =
10270 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10271 if (i != objects_blocked_on_snap_promotion.end()) {
10272 assert(i->second == obc);
10273 objects_blocked_on_snap_promotion.erase(i);
10274 }
10275
10276 if (obc->requeue_scrub_on_unblock) {
10277 obc->requeue_scrub_on_unblock = false;
10278 requeue_scrub();
10279 }
10280 }
10281
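/**
 * get_snapset_context - look up (or load) the SnapSetContext for the
 * object's snapdir, trying SS_ATTR on the head and then on the snapdir.
 * Takes a reference; the caller must release it with
 * put_snapset_context().
 */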
10282 SnapSetContext *PrimaryLogPG::get_snapset_context(
10283 const hobject_t& oid,
10284 bool can_create,
10285 const map<string, bufferlist> *attrs,
10286 bool oid_existed)
10287 {
10288 Mutex::Locker l(snapset_contexts_lock);
10289 SnapSetContext *ssc;
10290 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10291 oid.get_snapdir());
10292 if (p != snapset_contexts.end()) {
10293 if (can_create || p->second->exists) {
10294 ssc = p->second;
10295 } else {
10296 return NULL;
10297 }
10298 } else {
10299 bufferlist bv;
10300 if (!attrs) {
10301 int r = -ENOENT;
10302 if (!(oid.is_head() && !oid_existed))
10303 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10304 if (r < 0) {
10305 // try _snapset
10306 if (!(oid.is_snapdir() && !oid_existed))
10307 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10308 if (r < 0 && !can_create)
10309 return NULL;
10310 }
10311 } else {
10312 assert(attrs->count(SS_ATTR));
10313 bv = attrs->find(SS_ATTR)->second;
10314 }
10315 ssc = new SnapSetContext(oid.get_snapdir());
10316 _register_snapset_context(ssc);
10317 if (bv.length()) {
10318 bufferlist::iterator bvp = bv.begin();
10319 try {
10320 ssc->snapset.decode(bvp);
10321 } catch (buffer::error& e) {
10322 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10323 return NULL;
10324 }
10325 ssc->exists = true;
10326 } else {
10327 ssc->exists = false;
10328 }
10329 }
10330 assert(ssc);
10331 ssc->ref++;
10332 return ssc;
10333 }
10334
10335 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10336 {
10337 Mutex::Locker l(snapset_contexts_lock);
10338 --ssc->ref;
10339 if (ssc->ref == 0) {
10340 if (ssc->registered)
10341 snapset_contexts.erase(ssc->oid);
10342 delete ssc;
10343 }
10344 }
10345
10346 /** pull - request object from a peer
10347 */
10348
10349 /*
10350 * Return values:
10351 * NONE - didn't pull anything
10352 * YES - pulled what the caller wanted
10353 * OTHER - needed to pull something else first (_head or _snapdir)
10354 */
10355 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10356
10357 int PrimaryLogPG::recover_missing(
10358 const hobject_t &soid, eversion_t v,
10359 int priority,
10360 PGBackend::RecoveryHandle *h)
10361 {
10362 if (missing_loc.is_unfound(soid)) {
10363 dout(7) << "pull " << soid
10364 << " v " << v
10365 << " but it is unfound" << dendl;
10366 return PULL_NONE;
10367 }
10368
10369 if (missing_loc.is_deleted(soid)) {
10370 start_recovery_op(soid);
10371 assert(!recovering.count(soid));
10372 recovering.insert(make_pair(soid, ObjectContextRef()));
10373 epoch_t cur_epoch = get_osdmap()->get_epoch();
10374 remove_missing_object(soid, v, new FunctionContext(
10375 [=](int) {
10376 lock();
10377 if (!pg_has_reset_since(cur_epoch)) {
10378 bool object_missing = false;
10379 for (const auto& shard : actingbackfill) {
10380 if (shard == pg_whoami)
10381 continue;
10382 if (peer_missing[shard].is_missing(soid)) {
10383 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10384 object_missing = true;
10385 break;
10386 }
10387 }
10388 if (!object_missing) {
10389 object_stat_sum_t stat_diff;
10390 stat_diff.num_objects_recovered = 1;
10391 on_global_recover(soid, stat_diff, true);
10392 } else {
10393 auto recovery_handle = pgbackend->open_recovery_op();
10394 pgbackend->recover_delete_object(soid, v, recovery_handle);
10395 pgbackend->run_recovery_op(recovery_handle, priority);
10396 }
10397 }
10398 unlock();
10399 }));
10400 return PULL_YES;
10401 }
10402
10403   // is this a snapped object?  if so, consult the snapset; we may not need the entire object!
10404 ObjectContextRef obc;
10405 ObjectContextRef head_obc;
10406 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10407 // do we have the head and/or snapdir?
10408 hobject_t head = soid.get_head();
10409 if (pg_log.get_missing().is_missing(head)) {
10410 if (recovering.count(head)) {
10411 dout(10) << " missing but already recovering head " << head << dendl;
10412 return PULL_NONE;
10413 } else {
10414 int r = recover_missing(
10415 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10416 h);
10417 if (r != PULL_NONE)
10418 return PULL_OTHER;
10419 return PULL_NONE;
10420 }
10421 }
10422 head = soid.get_snapdir();
10423 if (pg_log.get_missing().is_missing(head)) {
10424 if (recovering.count(head)) {
10425 dout(10) << " missing but already recovering snapdir " << head << dendl;
10426 return PULL_NONE;
10427 } else {
10428 int r = recover_missing(
10429 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10430 h);
10431 if (r != PULL_NONE)
10432 return PULL_OTHER;
10433 return PULL_NONE;
10434 }
10435 }
10436
10437 // we must have one or the other
10438 head_obc = get_object_context(
10439 soid.get_head(),
10440 false,
10441 0);
10442 if (!head_obc)
10443 head_obc = get_object_context(
10444 soid.get_snapdir(),
10445 false,
10446 0);
10447 assert(head_obc);
10448 }
10449 start_recovery_op(soid);
10450 assert(!recovering.count(soid));
10451 recovering.insert(make_pair(soid, obc));
10452 int r = pgbackend->recover_object(
10453 soid,
10454 v,
10455 head_obc,
10456 obc,
10457 h);
10458   // This is only a pull, which shouldn't return an error.
10459 assert(r >= 0);
10460 return PULL_YES;
10461 }
10462
10463 void PrimaryLogPG::send_remove_op(
10464 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10465 {
10466 ceph_tid_t tid = osd->get_tid();
10467 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10468
10469 dout(10) << "send_remove_op " << oid << " from osd." << peer
10470 << " tid " << tid << dendl;
10471
10472 MOSDSubOp *subop = new MOSDSubOp(
10473 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10474 oid, CEPH_OSD_FLAG_ACK,
10475 get_osdmap()->get_epoch(), tid, v);
10476 subop->ops = vector<OSDOp>(1);
10477 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10478
10479 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10480 }
10481
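/**
 * remove_missing_object - locally remove a missing object that has been
 * deleted, then record it as locally recovered; on_complete runs once
 * the recovery bookkeeping commits, or with -EAGAIN if the PG was reset
 * in the meantime.
 */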
10482 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10483 eversion_t v, Context *on_complete)
10484 {
10485 dout(20) << __func__ << " " << soid << " " << v << dendl;
10486 assert(on_complete != nullptr);
10487 // delete locally
10488 ObjectStore::Transaction t;
10489 remove_snap_mapped_object(t, soid);
10490
10491 ObjectRecoveryInfo recovery_info;
10492 recovery_info.soid = soid;
10493 recovery_info.version = v;
10494
10495 epoch_t cur_epoch = get_osdmap()->get_epoch();
10496 t.register_on_complete(new FunctionContext(
10497 [=](int) {
10498 lock();
10499 if (!pg_has_reset_since(cur_epoch)) {
10500 ObjectStore::Transaction t2;
10501 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10502 t2.register_on_complete(on_complete);
10503 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10504 assert(r == 0);
10505 unlock();
10506 } else {
10507 unlock();
10508 on_complete->complete(-EAGAIN);
10509 }
10510 }));
10511 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10512 assert(r == 0);
10513 }
10514
10515 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10516 {
10517 dout(10) << "finish_degraded_object " << oid << dendl;
10518 if (callbacks_for_degraded_object.count(oid)) {
10519 list<Context*> contexts;
10520 contexts.swap(callbacks_for_degraded_object[oid]);
10521 callbacks_for_degraded_object.erase(oid);
10522 for (list<Context*>::iterator i = contexts.begin();
10523 i != contexts.end();
10524 ++i) {
10525 (*i)->complete(0);
10526 }
10527 }
10528 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10529 oid.get_head());
10530 if (i != objects_blocked_on_degraded_snap.end() &&
10531 i->second == oid.snap)
10532 objects_blocked_on_degraded_snap.erase(i);
10533 }
10534
10535 void PrimaryLogPG::_committed_pushed_object(
10536 epoch_t epoch, eversion_t last_complete)
10537 {
10538 lock();
10539 if (!pg_has_reset_since(epoch)) {
10540 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10541 last_complete_ondisk = last_complete;
10542
10543 if (last_complete_ondisk == info.last_update) {
10544 if (!is_primary()) {
10545         // We are either a replica or a backfill target,
10546         // and we are fully up to date.  Tell the primary!
10547 osd->send_message_osd_cluster(
10548 get_primary().osd,
10549 new MOSDPGTrim(
10550 get_osdmap()->get_epoch(),
10551 spg_t(info.pgid.pgid, get_primary().shard),
10552 last_complete_ondisk),
10553 get_osdmap()->get_epoch());
10554 } else {
10555 calc_min_last_complete_ondisk();
10556 }
10557 }
10558
10559 } else {
10560 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10561 }
10562
10563 unlock();
10564 }
10565
10566 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10567 {
10568 lock();
10569 dout(20) << __func__ << dendl;
10570 if (obc) {
10571 dout(20) << "obc = " << *obc << dendl;
10572 }
10573 assert(active_pushes >= 1);
10574 --active_pushes;
10575
10576 // requeue an active chunky scrub waiting on recovery ops
10577 if (!deleting && active_pushes == 0
10578 && scrubber.is_chunky_scrub_active()) {
10579 if (ops_blocked_by_scrub()) {
10580 requeue_scrub(true);
10581 } else {
10582 requeue_scrub(false);
10583 }
10584 }
10585 unlock();
10586 }
10587
10588 void PrimaryLogPG::_applied_recovered_object_replica()
10589 {
10590 lock();
10591 dout(20) << __func__ << dendl;
10592 assert(active_pushes >= 1);
10593 --active_pushes;
10594
10595 // requeue an active chunky scrub waiting on recovery ops
10596 if (!deleting && active_pushes == 0 &&
10597 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10598 scrubber.active_rep_scrub->get_req())->chunky) {
10599 osd->enqueue_back(
10600 info.pgid,
10601 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10602 scrubber.active_rep_scrub = OpRequestRef();
10603 }
10604 unlock();
10605 }
10606
10607 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10608 {
10609 dout(10) << "got missing " << oid << " v " << v << dendl;
10610 pg_log.recover_got(oid, v, info);
10611 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10612 dout(10) << "last_complete now " << info.last_complete
10613 << " log.complete_to " << pg_log.get_log().complete_to->version
10614 << dendl;
10615 } else {
10616 dout(10) << "last_complete now " << info.last_complete
10617 << " log.complete_to at end" << dendl;
10618     // below is not true in the repair case.
10619 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10620 assert(info.last_complete == info.last_update);
10621 }
10622 }
10623
10624 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10625 {
10626 list<pg_shard_t> fl = { pg_whoami };
10627 failed_push(fl, soid);
10628 }
10629
10630 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10631 {
10632 dout(20) << __func__ << ": " << soid << dendl;
10633 assert(recovering.count(soid));
10634 auto obc = recovering[soid];
10635 if (obc) {
10636 list<OpRequestRef> blocked_ops;
10637 obc->drop_recovery_read(&blocked_ops);
10638 requeue_ops(blocked_ops);
10639 }
10640 recovering.erase(soid);
10641 for (auto&& i : from)
10642 missing_loc.remove_location(soid, i);
10643 dout(0) << __func__ << " " << soid << " from shard " << from
10644 << ", reps on " << missing_loc.get_locations(soid)
10645 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10646   finish_recovery_op(soid);  // close out this attempt
10647 }
10648
10649 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10650 {
10651 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10652 assert(m->get_type() == MSG_OSD_SUBOP);
10653 dout(7) << "sub_op_remove " << m->poid << dendl;
10654
10655 op->mark_started();
10656
10657 ObjectStore::Transaction t;
10658 remove_snap_mapped_object(t, m->poid);
10659 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10660 assert(r == 0);
10661 }
10662
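/**
 * pick_newest_available - of the versions of @oid that we or any peer
 * in actingbackfill still "have", return the newest.  Used by
 * LOST_REVERT to pick a version to revert to.
 */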
10663 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10664 {
10665 eversion_t v;
10666 pg_missing_item pmi;
10667 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10668 assert(is_missing);
10669 v = pmi.have;
10670 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10671
10672 assert(!actingbackfill.empty());
10673 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10674 i != actingbackfill.end();
10675 ++i) {
10676 if (*i == get_primary()) continue;
10677 pg_shard_t peer = *i;
10678 if (!peer_missing[peer].is_missing(oid)) {
10679 continue;
10680 }
10681 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10682 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10683 if (h > v)
10684 v = h;
10685 }
10686
10687 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10688 return v;
10689 }
10690
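/**
 * do_update_log_missing - replica-side handler for
 * MOSDPGUpdateLogMissing: append the primary's log entries, update the
 * missing set, and reply with our last_complete_ondisk once the local
 * transaction commits.
 */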
10691 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10692 {
10693 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10694 op->get_req());
10695 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10696 ObjectStore::Transaction t;
10697 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
10698 if (m->pg_trim_to != eversion_t())
10699 op_trim_to = m->pg_trim_to;
10700 if (m->pg_roll_forward_to != eversion_t())
10701 op_roll_forward_to = m->pg_roll_forward_to;
10702
10703 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
10704
10705 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
10706 eversion_t new_lcod = info.last_complete;
10707
10708 Context *complete = new FunctionContext(
10709 [=](int) {
10710 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10711 op->get_req());
10712 lock();
10713 if (!pg_has_reset_since(msg->get_epoch())) {
10714 update_last_complete_ondisk(new_lcod);
10715 MOSDPGUpdateLogMissingReply *reply =
10716 new MOSDPGUpdateLogMissingReply(
10717 spg_t(info.pgid.pgid, primary_shard().shard),
10718 pg_whoami.shard,
10719 msg->get_epoch(),
10720 msg->min_epoch,
10721 msg->get_tid(),
10722 new_lcod);
10723 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10724 msg->get_connection()->send_message(reply);
10725 }
10726 unlock();
10727 });
10728
10729 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10730 t.register_on_commit(complete);
10731 } else {
10732 /* Hack to work around the fact that ReplicatedBackend sends
10733 * ack+commit if commit happens first
10734 *
10735 * This behavior is no longer necessary, but we preserve it so old
10736 * primaries can keep their repops in order */
10737 if (pool.info.ec_pool()) {
10738 t.register_on_complete(complete);
10739 } else {
10740 t.register_on_commit(complete);
10741 }
10742 }
10743 t.register_on_applied(
10744 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10745 int tr = osd->store->queue_transaction(
10746 osr.get(),
10747 std::move(t),
10748 nullptr);
10749 assert(tr == 0);
10750 }
10751
10752 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10753 {
10754 const MOSDPGUpdateLogMissingReply *m =
10755 static_cast<const MOSDPGUpdateLogMissingReply*>(
10756 op->get_req());
10757 dout(20) << __func__ << " got reply from "
10758 << m->get_from() << dendl;
10759
10760 auto it = log_entry_update_waiting_on.find(m->get_tid());
10761 if (it != log_entry_update_waiting_on.end()) {
10762 if (it->second.waiting_on.count(m->get_from())) {
10763 it->second.waiting_on.erase(m->get_from());
10764 if (m->last_complete_ondisk != eversion_t()) {
10765 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
10766 }
10767 } else {
10768 osd->clog->error()
10769 << info.pgid << " got reply "
10770 << *m << " from shard we are not waiting for "
10771 << m->get_from();
10772 }
10773
10774 if (it->second.waiting_on.empty()) {
10775 repop_all_committed(it->second.repop.get());
10776 log_entry_update_waiting_on.erase(it);
10777 }
10778 } else {
10779 osd->clog->error()
10780 << info.pgid << " got reply "
10781 << *m << " on unknown tid " << m->get_tid();
10782 }
10783 }
10784
10785 /* Mark all unfound objects as lost.
10786 */
10787 void PrimaryLogPG::mark_all_unfound_lost(
10788 int what,
10789 ConnectionRef con,
10790 ceph_tid_t tid)
10791 {
10792 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10793 list<hobject_t> oids;
10794
10795 dout(30) << __func__ << ": log before:\n";
10796 pg_log.get_log().print(*_dout);
10797 *_dout << dendl;
10798
10799 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10800
10801 utime_t mtime = ceph_clock_now();
10802 map<hobject_t, pg_missing_item>::const_iterator m =
10803 missing_loc.get_needs_recovery().begin();
10804 map<hobject_t, pg_missing_item>::const_iterator mend =
10805 missing_loc.get_needs_recovery().end();
10806
10807 ObcLockManager manager;
10808 eversion_t v = get_next_version();
10809 v.epoch = get_osdmap()->get_epoch();
10810 uint64_t num_unfound = missing_loc.num_unfound();
10811 while (m != mend) {
10812 const hobject_t &oid(m->first);
10813 if (!missing_loc.is_unfound(oid)) {
10814 // We only care about unfound objects
10815 ++m;
10816 continue;
10817 }
10818
10819 ObjectContextRef obc;
10820 eversion_t prev;
10821
10822 switch (what) {
10823 case pg_log_entry_t::LOST_MARK:
10824 assert(0 == "actually, not implemented yet!");
10825 break;
10826
10827 case pg_log_entry_t::LOST_REVERT:
10828 prev = pick_newest_available(oid);
10829 if (prev > eversion_t()) {
10830 // log it
10831 pg_log_entry_t e(
10832 pg_log_entry_t::LOST_REVERT, oid, v,
10833 m->second.need, 0, osd_reqid_t(), mtime, 0);
10834 e.reverting_to = prev;
10835 e.mark_unrollbackable();
10836 log_entries.push_back(e);
10837 dout(10) << e << dendl;
10838
10839 // we are now missing the new version; recovery code will sort it out.
10840 ++v.version;
10841 ++m;
10842 break;
10843 }
10844
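// fall through: no prior version is available, so treat as LOST_DELETE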
10845 case pg_log_entry_t::LOST_DELETE:
10846 {
10847 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10848 0, osd_reqid_t(), mtime, 0);
10849 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10850 if (pool.info.require_rollback()) {
10851 e.mod_desc.try_rmobject(v.version);
10852 } else {
10853 e.mark_unrollbackable();
10854 }
10855 } // otherwise, just do what we used to do
10856 dout(10) << e << dendl;
10857 log_entries.push_back(e);
10858 oids.push_back(oid);
10859
10860 // If we found a cached context, mark the object as deleted in
10861 // case we are racing with a new creation. This can happen if
10862 // the object was lost and the primary hit EIO.
10863 obc = object_contexts.lookup(oid);
10864 if (obc)
10865 obc->obs.exists = false;
10866
10867 ++v.version;
10868 ++m;
10869 }
10870 break;
10871
10872 default:
10873 ceph_abort();
10874 }
10875 }
10876
10877 info.stats.stats_invalid = true;
10878
10879 submit_log_entries(
10880 log_entries,
10881 std::move(manager),
10882 boost::optional<std::function<void(void)> >(
10883 [this, oids, con, num_unfound, tid]() {
10884 if (perform_deletes_during_peering()) {
10885 for (auto oid : oids) {
10886 // clear old locations - merge_new_log_entries will have
10887 // handled rebuilding missing_loc for each of these
10888 // objects if we have the RECOVERY_DELETES flag
10889 missing_loc.recovered(oid);
10890 }
10891 }
10892
10893 if (is_recovery_unfound()) {
10894 queue_peering_event(
10895 CephPeeringEvtRef(
10896 std::make_shared<CephPeeringEvt>(
10897 get_osdmap()->get_epoch(),
10898 get_osdmap()->get_epoch(),
10899 DoRecovery())));
10900 } else if (is_backfill_unfound()) {
10901 queue_peering_event(
10902 CephPeeringEvtRef(
10903 std::make_shared<CephPeeringEvt>(
10904 get_osdmap()->get_epoch(),
10905 get_osdmap()->get_epoch(),
10906 RequestBackfill())));
10907 } else {
10908 queue_recovery();
10909 }
10910
10911 stringstream ss;
10912 ss << "pg has " << num_unfound
10913 << " objects unfound and apparently lost; marking";
10914 string rs = ss.str();
10915 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10916 osd->clog->info() << rs;
10917 if (con) {
10918 MCommandReply *reply = new MCommandReply(0, rs);
10919 reply->set_tid(tid);
10920 con->send_message(reply);
10921 }
10922 }),
10923 OpRequestRef());
10924 }
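/* A minimal sketch (illustrative only; hypothetical simplified types) of the
 * per-object decision made above: LOST_REVERT is only usable when some prior
 * version of the object is still available, otherwise the object falls
 * through to LOST_DELETE.
 */
#if 0  // illustrative sketch, not compiled
enum class LostAction { Revert, Delete };

// 'have_prior' stands in for pick_newest_available() finding a usable
// version; 'requested' stands in for the 'what' argument above.
static LostAction choose_lost_action(bool have_prior, LostAction requested) {
  if (requested == LostAction::Revert && have_prior)
    return LostAction::Revert;  // roll back to the newest available version
  return LostAction::Delete;    // no usable copy anywhere: forget the object
}
#endif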
10925
10926 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10927 {
10928 assert(repop_queue.empty());
10929 }
10930
10931 /*
10932 * pg status change notification
10933 */
10934
10935 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10936 {
10937 list<OpRequestRef> rq;
10938
10939 // apply all repops
10940 while (!repop_queue.empty()) {
10941 RepGather *repop = repop_queue.front();
10942 repop_queue.pop_front();
10943 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10944 repop->rep_aborted = true;
10945 repop->on_applied.clear();
10946 repop->on_committed.clear();
10947 repop->on_success.clear();
10948
10949 if (requeue) {
10950 if (repop->op) {
10951 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10952 rq.push_back(repop->op);
10953 repop->op = OpRequestRef();
10954 }
10955
10956 // also requeue any dups, interleaved into position
10957 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10958 waiting_for_ondisk.find(repop->v);
10959 if (p != waiting_for_ondisk.end()) {
10960 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10961 for (list<pair<OpRequestRef, version_t> >::iterator i =
10962 p->second.begin();
10963 i != p->second.end();
10964 ++i) {
10965 rq.push_back(i->first);
10966 }
10967 waiting_for_ondisk.erase(p);
10968 }
10969 }
10970
10971 remove_repop(repop);
10972 }
10973
10974 assert(repop_queue.empty());
10975
10976 if (requeue) {
10977 requeue_ops(rq);
10978 if (!waiting_for_ondisk.empty()) {
10979 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10980 waiting_for_ondisk.begin();
10981 i != waiting_for_ondisk.end();
10982 ++i) {
10983 for (list<pair<OpRequestRef, version_t> >::iterator j =
10984 i->second.begin();
10985 j != i->second.end();
10986 ++j) {
10987 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10988 << i->first << dendl;
10989 }
10990 }
10991 assert(waiting_for_ondisk.empty());
10992 }
10993 }
10994
10995 waiting_for_ondisk.clear();
10996 }
10997
10998 void PrimaryLogPG::on_flushed()
10999 {
11000 assert(flushes_in_progress > 0);
11001 flushes_in_progress--;
11002 if (flushes_in_progress == 0) {
11003 requeue_ops(waiting_for_flush);
11004 }
11005 if (!is_peered() || !is_primary()) {
11006 pair<hobject_t, ObjectContextRef> i;
11007 while (object_contexts.get_next(i.first, &i)) {
11008 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
11009 }
11010 assert(object_contexts.empty());
11011 }
11012 pgbackend->on_flushed();
11013 }
11014
11015 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
11016 {
11017 dout(10) << "on_removal" << dendl;
11018
11019 // adjust info to backfill
11020 info.set_last_backfill(hobject_t());
11021 pg_log.reset_backfill();
11022 dirty_info = true;
11023
11024
11025 // clear log
11026 PGLogEntryHandler rollbacker{this, t};
11027 pg_log.roll_forward(&rollbacker);
11028
11029 write_if_dirty(*t);
11030
11031 if (!deleting)
11032 on_shutdown();
11033 }
11034
11035 void PrimaryLogPG::clear_async_reads()
11036 {
11037 dout(10) << __func__ << dendl;
11038 for(auto& i : in_progress_async_reads) {
11039 dout(10) << "clear ctx: "
11040 << "OpRequestRef " << i.first
11041 << " OpContext " << i.second
11042 << dendl;
11043 close_op_ctx(i.second);
11044 }
11045 }
11046
11047 void PrimaryLogPG::on_shutdown()
11048 {
11049 dout(10) << "on_shutdown" << dendl;
11050
11051 // remove from queues
11052 osd->pg_stat_queue_dequeue(this);
11053 osd->peering_wq.dequeue(this);
11054
11055 // handles queue races
11056 deleting = true;
11057
11058 if (recovery_queued) {
11059 recovery_queued = false;
11060 osd->clear_queued_recovery(this);
11061 }
11062
11063 clear_scrub_reserved();
11064 scrub_clear_state();
11065
11066 unreg_next_scrub();
11067
11068 vector<ceph_tid_t> tids;
11069 cancel_copy_ops(false, &tids);
11070 cancel_flush_ops(false, &tids);
11071 cancel_proxy_ops(false, &tids);
11072 osd->objecter->op_cancel(tids, -ECANCELED);
11073
11074 apply_and_flush_repops(false);
11075 cancel_log_updates();
11076 // we must remove PGRefs, so do this prior to the release_backoffs() callers
11077 clear_backoffs();
11078 // clean up snap trim references
11079 snap_trimmer_machine.process_event(Reset());
11080
11081 pgbackend->on_change();
11082
11083 context_registry_on_change();
11084 object_contexts.clear();
11085
11086 clear_async_reads();
11087
11088 osd->remote_reserver.cancel_reservation(info.pgid);
11089 osd->local_reserver.cancel_reservation(info.pgid);
11090
11091 clear_primary_state();
11092 cancel_recovery();
11093 }
11094
11095 void PrimaryLogPG::on_activate()
11096 {
11097 // all clean?
11098 if (needs_recovery()) {
11099 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11100 queue_peering_event(
11101 CephPeeringEvtRef(
11102 std::make_shared<CephPeeringEvt>(
11103 get_osdmap()->get_epoch(),
11104 get_osdmap()->get_epoch(),
11105 DoRecovery())));
11106 } else if (needs_backfill()) {
11107 dout(10) << "activate queueing backfill" << dendl;
11108 queue_peering_event(
11109 CephPeeringEvtRef(
11110 std::make_shared<CephPeeringEvt>(
11111 get_osdmap()->get_epoch(),
11112 get_osdmap()->get_epoch(),
11113 RequestBackfill())));
11114 } else {
11115 dout(10) << "activate all replicas clean, no recovery" << dendl;
11116 eio_errors_to_process = false;
11117 queue_peering_event(
11118 CephPeeringEvtRef(
11119 std::make_shared<CephPeeringEvt>(
11120 get_osdmap()->get_epoch(),
11121 get_osdmap()->get_epoch(),
11122 AllReplicasRecovered())));
11123 }
11124
11125 publish_stats_to_osd();
11126
11127 if (!backfill_targets.empty()) {
11128 last_backfill_started = earliest_backfill();
11129 new_backfill = true;
11130 assert(!last_backfill_started.is_max());
11131 dout(5) << "on activate: bft=" << backfill_targets
11132 << " from " << last_backfill_started << dendl;
11133 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11134 i != backfill_targets.end();
11135 ++i) {
11136 dout(5) << "target shard " << *i
11137 << " from " << peer_info[*i].last_backfill
11138 << dendl;
11139 }
11140 }
11141
11142 hit_set_setup();
11143 agent_setup();
11144 }
11145
11146 void PrimaryLogPG::_on_new_interval()
11147 {
11148 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11149 if (!pg_log.get_missing().may_include_deletes &&
11150 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11151 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11152 }
11153 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11154 }
11155
11156 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11157 {
11158 dout(10) << "on_change" << dendl;
11159
11160 if (hit_set && hit_set->insert_count() == 0) {
11161 dout(20) << " discarding empty hit_set" << dendl;
11162 hit_set_clear();
11163 }
11164
11165 if (recovery_queued) {
11166 recovery_queued = false;
11167 osd->clear_queued_recovery(this);
11168 }
11169
11170 // requeue everything in the reverse of the order in which it
11171 // should be reexamined.
11172 requeue_ops(waiting_for_peered);
11173 requeue_ops(waiting_for_flush);
11174 requeue_ops(waiting_for_active);
11175
11176 clear_scrub_reserved();
11177
11178 vector<ceph_tid_t> tids;
11179 cancel_copy_ops(is_primary(), &tids);
11180 cancel_flush_ops(is_primary(), &tids);
11181 cancel_proxy_ops(is_primary(), &tids);
11182 osd->objecter->op_cancel(tids, -ECANCELED);
11183
11184 // requeue object waiters
11185 for (auto& p : waiting_for_unreadable_object) {
11186 release_backoffs(p.first);
11187 }
11188 if (is_primary()) {
11189 requeue_object_waiters(waiting_for_unreadable_object);
11190 } else {
11191 waiting_for_unreadable_object.clear();
11192 }
11193 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11194 p != waiting_for_degraded_object.end();
11195 waiting_for_degraded_object.erase(p++)) {
11196 release_backoffs(p->first);
11197 if (is_primary())
11198 requeue_ops(p->second);
11199 else
11200 p->second.clear();
11201 finish_degraded_object(p->first);
11202 }
11203
11204 // requeues waiting_for_scrub
11205 scrub_clear_state();
11206
11207 for (auto p = waiting_for_blocked_object.begin();
11208 p != waiting_for_blocked_object.end();
11209 waiting_for_blocked_object.erase(p++)) {
11210 if (is_primary())
11211 requeue_ops(p->second);
11212 else
11213 p->second.clear();
11214 }
11215 for (auto i = callbacks_for_degraded_object.begin();
11216 i != callbacks_for_degraded_object.end();
11217 ) {
11218 finish_degraded_object((i++)->first);
11219 }
11220 assert(callbacks_for_degraded_object.empty());
11221
11222 if (is_primary()) {
11223 requeue_ops(waiting_for_cache_not_full);
11224 } else {
11225 waiting_for_cache_not_full.clear();
11226 }
11227 objects_blocked_on_cache_full.clear();
11228
11229 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11230 in_progress_async_reads.begin();
11231 i != in_progress_async_reads.end();
11232 in_progress_async_reads.erase(i++)) {
11233 close_op_ctx(i->second);
11234 if (is_primary())
11235 requeue_op(i->first);
11236 }
11237
11238 // this will requeue ops we were working on but didn't finish, and
11239 // any dups
11240 apply_and_flush_repops(is_primary());
11241 cancel_log_updates();
11242
11243 // do this *after* apply_and_flush_repops so that we catch any newly
11244 // registered watches.
11245 context_registry_on_change();
11246
11247 pgbackend->on_change_cleanup(t);
11248 scrubber.cleanup_store(t);
11249 pgbackend->on_change();
11250
11251 // clear snap_trimmer state
11252 snap_trimmer_machine.process_event(Reset());
11253
11254 debug_op_order.clear();
11255 unstable_stats.clear();
11256
11257 // we don't want to cache object_contexts through the interval change
11258 // NOTE: we actually assert that all currently live references are dead
11259 // by the time the flush for the next interval completes.
11260 object_contexts.clear();
11261
11262 // should have been cleared above by finishing all of the degraded objects
11263 assert(objects_blocked_on_degraded_snap.empty());
11264 }
11265
11266 void PrimaryLogPG::on_role_change()
11267 {
11268 dout(10) << "on_role_change" << dendl;
11269 if (get_role() != 0 && hit_set) {
11270 dout(10) << " clearing hit set" << dendl;
11271 hit_set_clear();
11272 }
11273 }
11274
11275 void PrimaryLogPG::on_pool_change()
11276 {
11277 dout(10) << __func__ << dendl;
11278 // requeue cache full waiters just in case the cache_mode is
11279 // changing away from writeback mode. note that if we are not
11280 // active the normal requeuing machinery is sufficient (and properly
11281 // ordered).
11282 if (is_active() &&
11283 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11284 !waiting_for_cache_not_full.empty()) {
11285 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11286 << dendl;
11287 requeue_ops(waiting_for_cache_not_full);
11288 objects_blocked_on_cache_full.clear();
11289 }
11290 hit_set_setup();
11291 agent_setup();
11292 }
11293
11294 // clear state. called on recovery completion AND cancellation.
11295 void PrimaryLogPG::_clear_recovery_state()
11296 {
11297 missing_loc.clear();
11298 #ifdef DEBUG_RECOVERY_OIDS
11299 recovering_oids.clear();
11300 #endif
11301 last_backfill_started = hobject_t();
11302 set<hobject_t>::iterator i = backfills_in_flight.begin();
11303 while (i != backfills_in_flight.end()) {
11304 assert(recovering.count(*i));
11305 backfills_in_flight.erase(i++);
11306 }
11307
11308 list<OpRequestRef> blocked_ops;
11309 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11310 i != recovering.end();
11311 recovering.erase(i++)) {
11312 if (i->second) {
11313 i->second->drop_recovery_read(&blocked_ops);
11314 requeue_ops(blocked_ops);
11315 }
11316 }
11317 assert(backfills_in_flight.empty());
11318 pending_backfill_updates.clear();
11319 assert(recovering.empty());
11320 pgbackend->clear_recovery_state();
11321 }
11322
11323 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11324 {
11325 dout(20) << __func__ << ": " << soid << dendl;
11326 assert(recovering.count(soid));
11327 ObjectContextRef obc = recovering[soid];
11328 if (obc) {
11329 list<OpRequestRef> blocked_ops;
11330 obc->drop_recovery_read(&blocked_ops);
11331 requeue_ops(blocked_ops);
11332 }
11333 recovering.erase(soid);
11334 finish_recovery_op(soid);
11335 release_backoffs(soid);
11336 if (waiting_for_degraded_object.count(soid)) {
11337 dout(20) << " kicking degraded waiters on " << soid << dendl;
11338 requeue_ops(waiting_for_degraded_object[soid]);
11339 waiting_for_degraded_object.erase(soid);
11340 }
11341 if (waiting_for_unreadable_object.count(soid)) {
11342 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11343 requeue_ops(waiting_for_unreadable_object[soid]);
11344 waiting_for_unreadable_object.erase(soid);
11345 }
11346 if (is_missing_object(soid))
11347 pg_log.set_last_requested(0); // get recover_primary to start over
11348 finish_degraded_object(soid);
11349 }
11350
11351 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11352 {
11353 /*
11354 * check that any peers we are planning to pull (or are currently
11355 * pulling) objects from are dealt with.
11356 */
11357 missing_loc.check_recovery_sources(osdmap);
11358 pgbackend->check_recovery_sources(osdmap);
11359
11360 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11361 i != peer_log_requested.end();
11362 ) {
11363 if (!osdmap->is_up(i->osd)) {
11364 dout(10) << "peer_log_requested removing " << *i << dendl;
11365 peer_log_requested.erase(i++);
11366 } else {
11367 ++i;
11368 }
11369 }
11370
11371 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11372 i != peer_missing_requested.end();
11373 ) {
11374 if (!osdmap->is_up(i->osd)) {
11375 dout(10) << "peer_missing_requested removing " << *i << dendl;
11376 peer_missing_requested.erase(i++);
11377 } else {
11378 ++i;
11379 }
11380 }
11381 }
11382
11383 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11384 {
11385 set<pg_shard_t> now_down;
11386 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11387 p != missing_loc_sources.end();
11388 ) {
11389 if (osdmap->is_up(p->osd)) {
11390 ++p;
11391 continue;
11392 }
11393 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11394 now_down.insert(*p);
11395 missing_loc_sources.erase(p++);
11396 }
11397
11398 if (now_down.empty()) {
11399 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11400 } else {
11401 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11402 << missing_loc_sources << dendl;
11403
11404 // filter missing_loc
11405 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11406 while (p != missing_loc.end()) {
11407 set<pg_shard_t>::iterator q = p->second.begin();
11408 while (q != p->second.end())
11409 if (now_down.count(*q)) {
11410 p->second.erase(q++);
11411 } else {
11412 ++q;
11413 }
11414 if (p->second.empty())
11415 missing_loc.erase(p++);
11416 else
11417 ++p;
11418 }
11419 }
11420 }
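/* A minimal sketch (illustrative only; ints stand in for pg_shard_t and
 * hobject_t) of the erase-while-iterating idiom used in the two functions
 * above: 'erase(it++)' advances the iterator before the element is removed,
 * which keeps the loop valid for associative containers.
 */
#if 0  // illustrative sketch, not compiled
#include <map>
#include <set>

static void drop_down_sources(std::map<int, std::set<int>>& locs,
                              const std::set<int>& down) {
  for (auto p = locs.begin(); p != locs.end(); ) {
    for (auto q = p->second.begin(); q != p->second.end(); ) {
      if (down.count(*q))
        p->second.erase(q++);  // erase current; iterator already advanced
      else
        ++q;
    }
    if (p->second.empty())
      locs.erase(p++);         // object has no remaining sources
    else
      ++p;
  }
}
#endif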
11421
11422
11423 bool PrimaryLogPG::start_recovery_ops(
11424 uint64_t max,
11425 ThreadPool::TPHandle &handle,
11426 uint64_t *ops_started)
11427 {
11428 uint64_t& started = *ops_started;
11429 started = 0;
11430 bool work_in_progress = false;
11431 assert(is_primary());
11432
11433 if (!state_test(PG_STATE_RECOVERING) &&
11434 !state_test(PG_STATE_BACKFILLING)) {
11435 /* TODO: I think this case is broken and will make do_recovery()
11436 * unhappy since we're returning false */
11437 dout(10) << "recovery raced and we were queued twice, ignoring!" << dendl;
11438 return false;
11439 }
11440
11441 const auto &missing = pg_log.get_missing();
11442
11443 unsigned int num_missing = missing.num_missing();
11444 uint64_t num_unfound = get_num_unfound();
11445
11446 if (num_missing == 0) {
11447 info.last_complete = info.last_update;
11448 }
11449
11450 if (num_missing == num_unfound) {
11451 // All of the missing objects we have are unfound.
11452 // Recover the replicas.
11453 started = recover_replicas(max, handle);
11454 }
11455 if (!started) {
11456 // We still have missing objects that we should grab from replicas.
11457 started += recover_primary(max, handle);
11458 }
11459 if (!started && num_unfound != get_num_unfound()) {
11460 // second chance to recover replicas
11461 started = recover_replicas(max, handle);
11462 }
11463
11464 if (started)
11465 work_in_progress = true;
11466
11467 bool deferred_backfill = false;
11468 if (recovering.empty() &&
11469 state_test(PG_STATE_BACKFILLING) &&
11470 !backfill_targets.empty() && started < max &&
11471 missing.num_missing() == 0 &&
11472 waiting_on_backfill.empty()) {
11473 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11474 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11475 deferred_backfill = true;
11476 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11477 !is_degraded()) {
11478 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11479 deferred_backfill = true;
11480 } else if (!backfill_reserved) {
11481 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11482 if (!backfill_reserving) {
11483 dout(10) << "queueing RequestBackfill" << dendl;
11484 backfill_reserving = true;
11485 queue_peering_event(
11486 CephPeeringEvtRef(
11487 std::make_shared<CephPeeringEvt>(
11488 get_osdmap()->get_epoch(),
11489 get_osdmap()->get_epoch(),
11490 RequestBackfill())));
11491 }
11492 deferred_backfill = true;
11493 } else {
11494 started += recover_backfill(max - started, handle, &work_in_progress);
11495 }
11496 }
11497
11498 dout(10) << " started " << started << dendl;
11499 osd->logger->inc(l_osd_rop, started);
11500
11501 if (!recovering.empty() ||
11502 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11503 return work_in_progress;
11504
11505 assert(recovering.empty());
11506 assert(recovery_ops_active == 0);
11507
11508 dout(10) << __func__ << " needs_recovery: "
11509 << missing_loc.get_needs_recovery()
11510 << dendl;
11511 dout(10) << __func__ << " missing_loc: "
11512 << missing_loc.get_missing_locs()
11513 << dendl;
11514 int unfound = get_num_unfound();
11515 if (unfound) {
11516 dout(10) << " still have " << unfound << " unfound" << dendl;
11517 return work_in_progress;
11518 }
11519
11520 if (missing.num_missing() > 0) {
11521 // this shouldn't happen!
11522 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11523 << missing.num_missing() << " missing objects: " << missing.get_items();
11524 return work_in_progress;
11525 }
11526
11527 if (needs_recovery()) {
11528 // this shouldn't happen!
11529 // We already checked num_missing() so we must have missing replicas
11530 osd->clog->error() << info.pgid
11531 << " Unexpected Error: recovery ending with missing replicas";
11532 return work_in_progress;
11533 }
11534
11535 if (state_test(PG_STATE_RECOVERING)) {
11536 state_clear(PG_STATE_RECOVERING);
11537 state_clear(PG_STATE_FORCED_RECOVERY);
11538 if (needs_backfill()) {
11539 dout(10) << "recovery done, queuing backfill" << dendl;
11540 queue_peering_event(
11541 CephPeeringEvtRef(
11542 std::make_shared<CephPeeringEvt>(
11543 get_osdmap()->get_epoch(),
11544 get_osdmap()->get_epoch(),
11545 RequestBackfill())));
11546 } else {
11547 dout(10) << "recovery done, no backfill" << dendl;
11548 eio_errors_to_process = false;
11549 state_clear(PG_STATE_FORCED_BACKFILL);
11550 queue_peering_event(
11551 CephPeeringEvtRef(
11552 std::make_shared<CephPeeringEvt>(
11553 get_osdmap()->get_epoch(),
11554 get_osdmap()->get_epoch(),
11555 AllReplicasRecovered())));
11556 }
11557 } else { // backfilling
11558 state_clear(PG_STATE_BACKFILLING);
11559 state_clear(PG_STATE_FORCED_BACKFILL);
11560 state_clear(PG_STATE_FORCED_RECOVERY);
11561 dout(10) << "recovery done, backfill done" << dendl;
11562 eio_errors_to_process = false;
11563 queue_peering_event(
11564 CephPeeringEvtRef(
11565 std::make_shared<CephPeeringEvt>(
11566 get_osdmap()->get_epoch(),
11567 get_osdmap()->get_epoch(),
11568 Backfilled())));
11569 }
11570
11571 return false;
11572 }
11573
11574 /**
11575  * start up to max recovery ops on missing primary objects.
11576  * return the number of ops started.
11577 */
11578 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11579 {
11580 assert(is_primary());
11581
11582 const auto &missing = pg_log.get_missing();
11583
11584 dout(10) << "recover_primary recovering " << recovering.size()
11585 << " in pg" << dendl;
11586 dout(10) << "recover_primary " << missing << dendl;
11587 dout(25) << "recover_primary " << missing.get_items() << dendl;
11588
11589 // look at log!
11590 pg_log_entry_t *latest = 0;
11591 unsigned started = 0;
11592 int skipped = 0;
11593
11594 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11595 map<version_t, hobject_t>::const_iterator p =
11596 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11597 while (p != missing.get_rmissing().end()) {
11598 handle.reset_tp_timeout();
11599 hobject_t soid;
11600 version_t v = p->first;
11601
11602 if (pg_log.get_log().objects.count(p->second)) {
11603 latest = pg_log.get_log().objects.find(p->second)->second;
11604 assert(latest->is_update() || latest->is_delete());
11605 soid = latest->soid;
11606 } else {
11607 latest = 0;
11608 soid = p->second;
11609 }
11610 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11611 ++p;
11612
11613 hobject_t head = soid.get_head();
11614
11615 eversion_t need = item.need;
11616
11617 dout(10) << "recover_primary "
11618 << soid << " " << item.need
11619 << (missing.is_missing(soid) ? " (missing)":"")
11620 << (missing.is_missing(head) ? " (missing head)":"")
11621 << (recovering.count(soid) ? " (recovering)":"")
11622 << (recovering.count(head) ? " (recovering head)":"")
11623 << dendl;
11624
11625 if (latest) {
11626 switch (latest->op) {
11627 case pg_log_entry_t::CLONE:
11628 /*
11629 * Handling for this special case removed for now, until we
11630 * can correctly construct an accurate SnapSet from the old
11631 * one.
11632 */
11633 break;
11634
11635 case pg_log_entry_t::LOST_REVERT:
11636 {
11637 if (item.have == latest->reverting_to) {
11638 ObjectContextRef obc = get_object_context(soid, true);
11639
11640 if (obc->obs.oi.version == latest->version) {
11641 // I'm already reverting
11642 dout(10) << " already reverting " << soid << dendl;
11643 } else {
11644 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11645 obc->ondisk_write_lock();
11646 obc->obs.oi.version = latest->version;
11647
11648 ObjectStore::Transaction t;
11649 bufferlist b2;
11650 obc->obs.oi.encode(
11651 b2,
11652 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11653 assert(!pool.info.require_rollback());
11654 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11655
11656 recover_got(soid, latest->version);
11657 missing_loc.add_location(soid, pg_whoami);
11658
11659 ++active_pushes;
11660
11661 osd->store->queue_transaction(osr.get(), std::move(t),
11662 new C_OSD_AppliedRecoveredObject(this, obc),
11663 new C_OSD_CommittedPushedObject(
11664 this,
11665 get_osdmap()->get_epoch(),
11666 info.last_complete),
11667 new C_OSD_OndiskWriteUnlock(obc));
11668 continue;
11669 }
11670 } else {
11671 /*
11672 * Pull the old version of the object. Update missing_loc here to have the location
11673 * of the version we want.
11674 *
11675 * This doesn't use the usual missing_loc paths, but that's okay:
11676 * - if we have it locally, we hit the case above, and go from there.
11677 * - if we don't, we always pass through this case during recovery and set up the location
11678 * properly.
11679 * - this way we don't need to mangle the missing code to be general about needing an old
11680 * version...
11681 */
11682 eversion_t alternate_need = latest->reverting_to;
11683 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11684
11685 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11686 p != peer_missing.end();
11687 ++p)
11688 if (p->second.is_missing(soid, need) &&
11689 p->second.get_items().at(soid).have == alternate_need) {
11690 missing_loc.add_location(soid, p->first);
11691 }
11692 dout(10) << " will pull " << alternate_need << " or " << need
11693 << " from one of " << missing_loc.get_locations(soid)
11694 << dendl;
11695 }
11696 }
11697 break;
11698 }
11699 }
11700
11701 if (!recovering.count(soid)) {
11702 if (recovering.count(head)) {
11703 ++skipped;
11704 } else {
11705 int r = recover_missing(
11706 soid, need, get_recovery_op_priority(), h);
11707 switch (r) {
11708 case PULL_YES:
11709 ++started;
11710 break;
11711 case PULL_OTHER:
11712 ++started;
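// fall through: we started pulling a different object (e.g. the head),
// so this one also counts as skipped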
11713 case PULL_NONE:
11714 ++skipped;
11715 break;
11716 default:
11717 ceph_abort();
11718 }
11719 if (started >= max)
11720 break;
11721 }
11722 }
11723
11724 // only advance last_requested if we haven't skipped anything
11725 if (!skipped)
11726 pg_log.set_last_requested(v);
11727 }
11728
11729 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11730 return started;
11731 }
11732
11733 bool PrimaryLogPG::primary_error(
11734 const hobject_t& soid, eversion_t v)
11735 {
11736 pg_log.missing_add(soid, v, eversion_t());
11737 pg_log.set_last_requested(0);
11738 missing_loc.remove_location(soid, pg_whoami);
11739 bool uhoh = true;
11740 assert(!actingbackfill.empty());
11741 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11742 i != actingbackfill.end();
11743 ++i) {
11744 if (*i == get_primary()) continue;
11745 pg_shard_t peer = *i;
11746 if (!peer_missing[peer].is_missing(soid, v)) {
11747 missing_loc.add_location(soid, peer);
11748 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11749 << ", there should be a copy on shard " << peer << dendl;
11750 uhoh = false;
11751 }
11752 }
11753 if (uhoh)
11754 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11755 else
11756 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11757 << ", will try copies on " << missing_loc.get_locations(soid);
11758 return uhoh;
11759 }
11760
11761 int PrimaryLogPG::prep_object_replica_deletes(
11762 const hobject_t& soid, eversion_t v,
11763 PGBackend::RecoveryHandle *h)
11764 {
11765 assert(is_primary());
11766 dout(10) << __func__ << ": on " << soid << dendl;
11767
11768 start_recovery_op(soid);
11769 assert(!recovering.count(soid));
11770 recovering.insert(make_pair(soid, ObjectContextRef()));
11771
11772 pgbackend->recover_delete_object(soid, v, h);
11773 return 1;
11774 }
11775
11776 int PrimaryLogPG::prep_object_replica_pushes(
11777 const hobject_t& soid, eversion_t v,
11778 PGBackend::RecoveryHandle *h)
11779 {
11780 assert(is_primary());
11781 dout(10) << __func__ << ": on " << soid << dendl;
11782
11783 // NOTE: we know we will get a valid oloc off of disk here.
11784 ObjectContextRef obc = get_object_context(soid, false);
11785 if (!obc) {
11786 primary_error(soid, v);
11787 return 0;
11788 }
11789
11790 if (!obc->get_recovery_read()) {
11791 dout(20) << "recovery delayed on " << soid
11792 << "; could not get rw_manager lock" << dendl;
11793 return 0;
11794 } else {
11795 dout(20) << "recovery got recovery read lock on " << soid
11796 << dendl;
11797 }
11798
11799 start_recovery_op(soid);
11800 assert(!recovering.count(soid));
11801 recovering.insert(make_pair(soid, obc));
11802
11803 /* We need this in case there is an in progress write on the object. In fact,
11804 * the only possible write is an update to the xattr due to a lost_revert --
11805 * a client write would be blocked since the object is degraded.
11806 * In almost all cases, therefore, this lock should be uncontended.
11807 */
11808 obc->ondisk_read_lock();
11809 int r = pgbackend->recover_object(
11810 soid,
11811 v,
11812 ObjectContextRef(),
11813 obc, // has snapset context
11814 h);
11815 obc->ondisk_read_unlock();
11816 if (r < 0) {
11817 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11818 primary_failed(soid);
11819 primary_error(soid, v);
11820 return 0;
11821 }
11822 return 1;
11823 }
11824
11825 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11826 {
11827 dout(10) << __func__ << "(" << max << ")" << dendl;
11828 uint64_t started = 0;
11829
11830 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11831
11832 // this is FAR from an optimal recovery order. pretty lame, really.
11833 assert(!actingbackfill.empty());
11834 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11835 i != actingbackfill.end();
11836 ++i) {
11837 if (*i == get_primary()) continue;
11838 pg_shard_t peer = *i;
11839 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11840 assert(pm != peer_missing.end());
11841 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11842 assert(pi != peer_info.end());
11843 size_t m_sz = pm->second.num_missing();
11844
11845 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11846 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11847
11848 // oldest first!
11849 const pg_missing_t &m(pm->second);
11850 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11851 p != m.get_rmissing().end() && started < max;
11852 ++p) {
11853 handle.reset_tp_timeout();
11854 const hobject_t soid(p->second);
11855
11856 if (missing_loc.is_unfound(soid)) {
11857 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11858 continue;
11859 }
11860
11861 if (soid > pi->second.last_backfill) {
11862 if (!recovering.count(soid)) {
11863 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11864 derr << __func__ << ": object added to missing set for backfill, but "
11865 << "is not in recovering, error!" << dendl;
11866 ceph_abort();
11867 }
11868 continue;
11869 }
11870
11871 if (recovering.count(soid)) {
11872 dout(10) << __func__ << ": already recovering " << soid << dendl;
11873 continue;
11874 }
11875
11876 if (missing_loc.is_deleted(soid)) {
11877 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11878 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11879 started += prep_object_replica_deletes(soid, r->second.need, h);
11880 continue;
11881 }
11882
11883 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11884 dout(10) << __func__ << ": " << soid.get_head()
11885 << " still missing on primary" << dendl;
11886 continue;
11887 }
11888
11889 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11890 dout(10) << __func__ << ": " << soid.get_snapdir()
11891 << " still missing on primary" << dendl;
11892 continue;
11893 }
11894
11895 if (pg_log.get_missing().is_missing(soid)) {
11896 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11897 continue;
11898 }
11899
11900 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11901 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11902 started += prep_object_replica_pushes(soid, r->second.need,
11903 h);
11904 }
11905 }
11906
11907 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11908 return started;
11909 }
11910
11911 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11912 {
11913 hobject_t e = hobject_t::get_max();
11914 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11915 i != backfill_targets.end();
11916 ++i) {
11917 pg_shard_t peer = *i;
11918 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11919 peer_backfill_info.find(peer);
11920 assert(iter != peer_backfill_info.end());
11921 if (iter->second.begin < e)
11922 e = iter->second.begin;
11923 }
11924 return e;
11925 }
11926
11927 bool PrimaryLogPG::all_peer_done() const
11928 {
11929 // Primary hasn't got any more objects
11930 assert(backfill_info.empty());
11931
11932 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11933 i != backfill_targets.end();
11934 ++i) {
11935 pg_shard_t bt = *i;
11936 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11937 peer_backfill_info.find(bt);
11938 assert(piter != peer_backfill_info.end());
11939 const BackfillInterval& pbi = piter->second;
11940 // See if peer has more to process
11941 if (!pbi.extends_to_end() || !pbi.empty())
11942 return false;
11943 }
11944 return true;
11945 }
11946
11947 /**
11948 * recover_backfill
11949 *
11950 * Invariants:
11951 *
11952 * backfilled: fully pushed to replica or present in replica's missing set (both
11953 * our copy and theirs).
11954 *
11955 * All objects on a backfill_target in
11956 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11957 * objects have been actually deleted and all logically-valid objects are replicated.
11958 * There may be PG objects in this interval yet to be backfilled.
11959 *
11960 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11961 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11962 *
11963 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11964 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11965 * interval remain on the backfill target.
11966 *
11967 * For a backfill target, all objects <= peer_info[target].last_backfill
11968 * have been backfilled to target
11969 *
11970 * There *MAY* be missing/outdated objects between last_backfill_started and
11971 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11972 * io created objects since the last scan. For this reason, we call
11973 * update_range() again before continuing backfill.
11974 */
11975 uint64_t PrimaryLogPG::recover_backfill(
11976 uint64_t max,
11977 ThreadPool::TPHandle &handle, bool *work_started)
11978 {
11979 dout(10) << "recover_backfill (" << max << ")"
11980 << " bft=" << backfill_targets
11981 << " last_backfill_started " << last_backfill_started
11982 << (new_backfill ? " new_backfill":"")
11983 << dendl;
11984 assert(!backfill_targets.empty());
11985
11986 // Initialize from prior backfill state
11987 if (new_backfill) {
11988 // on_activate() was called prior to getting here
11989 assert(last_backfill_started == earliest_backfill());
11990 new_backfill = false;
11991
11992 // initialize BackfillIntervals
11993 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11994 i != backfill_targets.end();
11995 ++i) {
11996 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11997 }
11998 backfill_info.reset(last_backfill_started);
11999
12000 backfills_in_flight.clear();
12001 pending_backfill_updates.clear();
12002 }
12003
12004 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12005 i != backfill_targets.end();
12006 ++i) {
12007 dout(10) << "peer osd." << *i
12008 << " info " << peer_info[*i]
12009 << " interval " << peer_backfill_info[*i].begin
12010 << "-" << peer_backfill_info[*i].end
12011 << " " << peer_backfill_info[*i].objects.size() << " objects"
12012 << dendl;
12013 }
12014
12015 // update our local interval to cope with recent changes
12016 backfill_info.begin = last_backfill_started;
12017 update_range(&backfill_info, handle);
12018
12019 unsigned ops = 0;
12020 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
12021 set<hobject_t> add_to_stat;
12022
12023 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12024 i != backfill_targets.end();
12025 ++i) {
12026 peer_backfill_info[*i].trim_to(
12027 std::max(peer_info[*i].last_backfill, last_backfill_started));
12028 }
12029 backfill_info.trim_to(last_backfill_started);
12030
12031 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12032 while (ops < max) {
12033 if (backfill_info.begin <= earliest_peer_backfill() &&
12034 !backfill_info.extends_to_end() && backfill_info.empty()) {
12035 hobject_t next = backfill_info.end;
12036 backfill_info.reset(next);
12037 backfill_info.end = hobject_t::get_max();
12038 update_range(&backfill_info, handle);
12039 backfill_info.trim();
12040 }
12041
12042 dout(20) << " my backfill interval " << backfill_info << dendl;
12043
12044 bool sent_scan = false;
12045 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12046 i != backfill_targets.end();
12047 ++i) {
12048 pg_shard_t bt = *i;
12049 BackfillInterval& pbi = peer_backfill_info[bt];
12050
12051 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
12052 if (pbi.begin <= backfill_info.begin &&
12053 !pbi.extends_to_end() && pbi.empty()) {
12054 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
12055 epoch_t e = get_osdmap()->get_epoch();
12056 MOSDPGScan *m = new MOSDPGScan(
12057 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
12058 spg_t(info.pgid.pgid, bt.shard),
12059 pbi.end, hobject_t());
12060 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12061 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
12062 waiting_on_backfill.insert(bt);
12063 sent_scan = true;
12064 }
12065 }
12066
12067 // Count simultaneous scans as a single op and let those complete
12068 if (sent_scan) {
12069 ops++;
12070 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12071 break;
12072 }
12073
12074 if (backfill_info.empty() && all_peer_done()) {
12075 dout(10) << " reached end for both local and all peers" << dendl;
12076 break;
12077 }
12078
12079 // Get the object within the set of peers to operate on, and
12080 // the set of targets to which that object applies.
12081 hobject_t check = earliest_peer_backfill();
12082
12083 if (check < backfill_info.begin) {
12084
12085 set<pg_shard_t> check_targets;
12086 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12087 i != backfill_targets.end();
12088 ++i) {
12089 pg_shard_t bt = *i;
12090 BackfillInterval& pbi = peer_backfill_info[bt];
12091 if (pbi.begin == check)
12092 check_targets.insert(bt);
12093 }
12094 assert(!check_targets.empty());
12095
12096 dout(20) << " BACKFILL removing " << check
12097 << " from peers " << check_targets << dendl;
12098 for (set<pg_shard_t>::iterator i = check_targets.begin();
12099 i != check_targets.end();
12100 ++i) {
12101 pg_shard_t bt = *i;
12102 BackfillInterval& pbi = peer_backfill_info[bt];
12103 assert(pbi.begin == check);
12104
12105 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12106 pbi.pop_front();
12107 }
12108
12109 /* This requires a bit of explanation. We compare head against
12110 * last_backfill to determine whether to send an operation
12111 * to the replica. A single write operation can touch up to three
12112 * objects: head, the snapdir, and a new clone which sorts closer to
12113 * head than any existing clone. If last_backfill points at a clone,
12114 * the transaction won't be sent and all 3 must lie on the right side
12115 * of the line (i.e., we'll backfill them later). If last_backfill
12116 * points at snapdir, it sorts greater than head, so we send the
12117 * transaction which is correct because all three must lie to the left
12118 * of the line.
12119 *
12120 * If it points at head, we have a bit of an issue. If head actually
12121 * exists, no problem, because any transaction which touches snapdir
12122 * must end up creating it (and deleting head), so sending the
12123 * operation won't pose a problem -- we'll end up having to scan it,
12124 * but it'll end up being the right version so we won't bother to
12125 * rebackfill it. However, if head doesn't exist, any write on head
12126 * will remove snapdir. For a replicated pool, this isn't a problem,
12127 * ENOENT on remove isn't an issue and it's in the backfill future anyway.
12128 * It only poses a problem for EC pools, because we never just delete
12129 * an object, we rename it into a rollback object. That operation
12130 * will end up crashing the osd with ENOENT. Tolerating the failure
12131 * wouldn't work either, even if snapdir exists, we'd be creating a
12132 * rollback object past the last_backfill line which wouldn't get
12133 * cleaned up (no rollback objects past the last_backfill line is an
12134 * existing important invariant). Thus, let's avoid the whole issue
12135 * by just not updating last_backfill_started here if head doesn't
12136 * exist and snapdir does. We aren't using up a recovery count here,
12137 * so we're going to recover snapdir immediately anyway. We'll only
12138 * fail "backward" if we fail to get the rw lock and that just means
12139 * we'll re-process this section of the hash space again.
12140 *
12141 * I'm choosing this hack here because the really "correct" answer is
12142 * going to be to unify snapdir and head into a single object (a
12143 * snapdir is really just a confusing way to talk about head existing
12144 * as a whiteout), but doing that is going to be a somewhat larger
12145 * undertaking.
12146 *
12147 * @see http://tracker.ceph.com/issues/17668
12148 */
12149 if (!(check.is_head() &&
12150 backfill_info.begin.is_snapdir() &&
12151 check == backfill_info.begin.get_head()))
12152 last_backfill_started = check;
12153
12154 // Don't increment ops here because deletions
12155 // are cheap and not replied to, unlike real recovery_ops,
12156 // and we can't increment ops without requeueing ourselves
12157 // for recovery.
12158 } else {
12159 eversion_t& obj_v = backfill_info.objects.begin()->second;
12160
12161 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12162 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12163 i != backfill_targets.end();
12164 ++i) {
12165 pg_shard_t bt = *i;
12166 BackfillInterval& pbi = peer_backfill_info[bt];
12167 // Find all check peers that have the wrong version
12168 if (check == backfill_info.begin && check == pbi.begin) {
12169 if (pbi.objects.begin()->second != obj_v) {
12170 need_ver_targs.push_back(bt);
12171 } else {
12172 keep_ver_targs.push_back(bt);
12173 }
12174 } else {
12175 pg_info_t& pinfo = peer_info[bt];
12176
12177 // Only include peers whose backfill line we have caught up to;
12178 // otherwise, they only appear to be missing this object
12179 // because their pbi.begin > backfill_info.begin.
12180 if (backfill_info.begin > pinfo.last_backfill)
12181 missing_targs.push_back(bt);
12182 else
12183 skip_targs.push_back(bt);
12184 }
12185 }
12186
12187 if (!keep_ver_targs.empty()) {
12188 // These peers have version obj_v
12189 dout(20) << " BACKFILL keeping " << check
12190 << " with ver " << obj_v
12191 << " on peers " << keep_ver_targs << dendl;
12192 //assert(!waiting_for_degraded_object.count(check));
12193 }
12194 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12195 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12196 assert(obc);
12197 if (obc->get_recovery_read()) {
12198 if (!need_ver_targs.empty()) {
12199 dout(20) << " BACKFILL replacing " << check
12200 << " with ver " << obj_v
12201 << " to peers " << need_ver_targs << dendl;
12202 }
12203 if (!missing_targs.empty()) {
12204 dout(20) << " BACKFILL pushing " << backfill_info.begin
12205 << " with ver " << obj_v
12206 << " to peers " << missing_targs << dendl;
12207 }
12208 vector<pg_shard_t> all_push = need_ver_targs;
12209 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12210
12211 handle.reset_tp_timeout();
12212 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12213 if (r < 0) {
12214 *work_started = true;
12215 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12216 break;
12217 }
12218 ops++;
12219 } else {
12220 *work_started = true;
12221 dout(20) << "backfill blocking on " << backfill_info.begin
12222 << "; could not get rw_manager lock" << dendl;
12223 break;
12224 }
12225 }
12226 dout(20) << "need_ver_targs=" << need_ver_targs
12227 << " keep_ver_targs=" << keep_ver_targs << dendl;
12228 dout(20) << "backfill_targets=" << backfill_targets
12229 << " missing_targs=" << missing_targs
12230 << " skip_targs=" << skip_targs << dendl;
12231
12232 last_backfill_started = backfill_info.begin;
12233 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12234 backfill_info.pop_front();
12235 vector<pg_shard_t> check_targets = need_ver_targs;
12236 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12237 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12238 i != check_targets.end();
12239 ++i) {
12240 pg_shard_t bt = *i;
12241 BackfillInterval& pbi = peer_backfill_info[bt];
12242 pbi.pop_front();
12243 }
12244 }
12245 }
12246
12247 hobject_t backfill_pos =
12248 std::min(backfill_info.begin, earliest_peer_backfill());
12249
12250 for (set<hobject_t>::iterator i = add_to_stat.begin();
12251 i != add_to_stat.end();
12252 ++i) {
12253 ObjectContextRef obc = get_object_context(*i, false);
12254 assert(obc);
12255 pg_stat_t stat;
12256 add_object_context_to_pg_stat(obc, &stat);
12257 pending_backfill_updates[*i] = stat;
12258 }
12259 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12260 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12261 for (unsigned i = 0; i < to_remove.size(); ++i) {
12262 handle.reset_tp_timeout();
12263 const hobject_t& oid = to_remove[i].get<0>();
12264 eversion_t v = to_remove[i].get<1>();
12265 pg_shard_t peer = to_remove[i].get<2>();
12266 MOSDPGBackfillRemove *m;
12267 auto it = reqs.find(peer);
12268 if (it != reqs.end()) {
12269 m = it->second;
12270 } else {
12271 m = reqs[peer] = new MOSDPGBackfillRemove(
12272 spg_t(info.pgid.pgid, peer.shard),
12273 get_osdmap()->get_epoch());
12274 }
12275 m->ls.push_back(make_pair(oid, v));
12276
12277 if (oid <= last_backfill_started)
12278 pending_backfill_updates[oid]; // add empty stat!
12279 }
12280 for (auto p : reqs) {
12281 osd->send_message_osd_cluster(p.first.osd, p.second,
12282 get_osdmap()->get_epoch());
12283 }
12284 } else {
12285 // for jewel targets
12286 for (unsigned i = 0; i < to_remove.size(); ++i) {
12287 handle.reset_tp_timeout();
12288
12289 // ordered before any subsequent updates
12290 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12291 to_remove[i].get<2>());
12292
12293 if (to_remove[i].get<0>() <= last_backfill_started)
12294 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12295 }
12296 }
12297
12298 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12299
12300 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12301 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12302 i != backfills_in_flight.end();
12303 ++i) {
12304 dout(20) << *i << " is still in flight" << dendl;
12305 }
12306
12307 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12308 backfill_pos : *(backfills_in_flight.begin());
12309 hobject_t new_last_backfill = earliest_backfill();
12310 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12311 for (map<hobject_t, pg_stat_t>::iterator i =
12312 pending_backfill_updates.begin();
12313 i != pending_backfill_updates.end() &&
12314 i->first < next_backfill_to_complete;
12315 pending_backfill_updates.erase(i++)) {
12316 dout(20) << " pending_backfill_update " << i->first << dendl;
12317 assert(i->first > new_last_backfill);
12318 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12319 j != backfill_targets.end();
12320 ++j) {
12321 pg_shard_t bt = *j;
12322 pg_info_t& pinfo = peer_info[bt];
12323 //Add stats to all peers that were missing object
12324 if (i->first > pinfo.last_backfill)
12325 pinfo.stats.add(i->second);
12326 }
12327 new_last_backfill = i->first;
12328 }
12329 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12330
12331 assert(!pending_backfill_updates.empty() ||
12332 new_last_backfill == last_backfill_started);
12333 if (pending_backfill_updates.empty() &&
12334 backfill_pos.is_max()) {
12335 assert(backfills_in_flight.empty());
12336 new_last_backfill = backfill_pos;
12337 last_backfill_started = backfill_pos;
12338 }
12339 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12340
12341 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12342 // all the backfill targets. Otherwise, we will move last_backfill up on
12343 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12344 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12345 i != backfill_targets.end();
12346 ++i) {
12347 pg_shard_t bt = *i;
12348 pg_info_t& pinfo = peer_info[bt];
12349
12350 if (new_last_backfill > pinfo.last_backfill) {
12351 pinfo.set_last_backfill(new_last_backfill);
12352 epoch_t e = get_osdmap()->get_epoch();
12353 MOSDPGBackfill *m = NULL;
12354 if (pinfo.last_backfill.is_max()) {
12355 m = new MOSDPGBackfill(
12356 MOSDPGBackfill::OP_BACKFILL_FINISH,
12357 e,
12358 last_peering_reset,
12359 spg_t(info.pgid.pgid, bt.shard));
12360 // Use default priority here, must match sub_op priority
12361 /* pinfo.stats might be wrong if we did log-based recovery on the
12362 * backfilled portion in addition to continuing backfill.
12363 */
12364 pinfo.stats = info.stats;
12365 start_recovery_op(hobject_t::get_max());
12366 } else {
12367 m = new MOSDPGBackfill(
12368 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12369 e,
12370 last_peering_reset,
12371 spg_t(info.pgid.pgid, bt.shard));
12372 // Use default priority here, must match sub_op priority
12373 }
12374 m->last_backfill = pinfo.last_backfill;
12375 m->stats = pinfo.stats;
12376 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12377 dout(10) << " peer " << bt
12378 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12379 << " / " << info.stats.stats.sum.num_objects << dendl;
12380 }
12381 }
12382
12383 if (ops)
12384 *work_started = true;
12385 return ops;
12386 }
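/* A minimal sketch (illustrative only; ints stand in for hobject_t) of the
 * position arithmetic performed above: the global backfill position is the
 * minimum of the local scan front and every peer's scan front, and only
 * updates strictly below the oldest in-flight push may be folded into each
 * peer's last_backfill.
 */
#if 0  // illustrative sketch, not compiled
#include <algorithm>
#include <map>
#include <set>

// Analogue of std::min(backfill_info.begin, earliest_peer_backfill()).
static int compute_backfill_pos(int local_begin,
                                const std::map<int, int>& peer_begin) {
  int pos = local_begin;
  for (const auto& p : peer_begin)
    pos = std::min(pos, p.second);
  return pos;
}

// Pending stat updates below this bound are safe to account for.
static int next_backfill_to_complete(int backfill_pos,
                                     const std::set<int>& in_flight) {
  return in_flight.empty() ? backfill_pos : *in_flight.begin();
}
#endif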
12387
12388 int PrimaryLogPG::prep_backfill_object_push(
12389 hobject_t oid, eversion_t v,
12390 ObjectContextRef obc,
12391 vector<pg_shard_t> peers,
12392 PGBackend::RecoveryHandle *h)
12393 {
12394 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12395 assert(!peers.empty());
12396
12397 backfills_in_flight.insert(oid);
12398 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12399 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12400 assert(bpm != peer_missing.end());
12401 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12402 }
12403
12404 assert(!recovering.count(oid));
12405
12406 start_recovery_op(oid);
12407 recovering.insert(make_pair(oid, obc));
12408
12409 // We need to take the read_lock here in order to flush in-progress writes
12410 obc->ondisk_read_lock();
12411 int r = pgbackend->recover_object(
12412 oid,
12413 v,
12414 ObjectContextRef(),
12415 obc,
12416 h);
12417 obc->ondisk_read_unlock();
12418 if (r < 0) {
12419 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12420 primary_failed(oid);
12421 primary_error(oid, v);
12422 backfills_in_flight.erase(oid);
12423 missing_loc.add_missing(oid, v, eversion_t());
12424 }
12425 return r;
12426 }
12427
12428 void PrimaryLogPG::update_range(
12429 BackfillInterval *bi,
12430 ThreadPool::TPHandle &handle)
12431 {
12432 int local_min = cct->_conf->osd_backfill_scan_min;
12433 int local_max = cct->_conf->osd_backfill_scan_max;
12434
12435 if (bi->version < info.log_tail) {
12436 dout(10) << __func__ << ": bi is old, rescanning local backfill_info"
12437 << dendl;
12438 if (last_update_applied >= info.log_tail) {
12439 bi->version = last_update_applied;
12440 } else {
12441 osr->flush();
12442 bi->version = info.last_update;
12443 }
12444 scan_range(local_min, local_max, bi, handle);
12445 }
12446
12447 if (bi->version >= projected_last_update) {
12448 dout(10) << __func__ << ": bi is current" << dendl;
12449 assert(bi->version == projected_last_update);
12450 } else if (bi->version >= info.log_tail) {
12451 if (pg_log.get_log().empty() && projected_log.empty()) {
12452 /* Because we don't move log_tail on split, the log might be
12453 * empty even if log_tail != last_update. However, the only
12454 * way to get here with an empty log is if log_tail is actually
12455 * eversion_t(), because otherwise the entry which changed
12456 * last_update since the last scan would have to be present.
12457 */
12458 assert(bi->version == eversion_t());
12459 return;
12460 }
12461
12462 dout(10) << __func__ << ": bi is old (" << bi->version
12463 << ") can be updated with log to projected_last_update "
12464 << projected_last_update << dendl;
12465
12466 auto func = [&](const pg_log_entry_t &e) {
12467 dout(10) << __func__ << ": updating from version " << e.version
12468 << dendl;
12469 const hobject_t &soid = e.soid;
12470 if (soid >= bi->begin &&
12471 soid < bi->end) {
12472 if (e.is_update()) {
12473 dout(10) << __func__ << ": " << e.soid << " updated to version "
12474 << e.version << dendl;
12475 bi->objects.erase(e.soid);
12476 bi->objects.insert(
12477 make_pair(
12478 e.soid,
12479 e.version));
12480 } else if (e.is_delete()) {
12481 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12482 bi->objects.erase(e.soid);
12483 }
12484 }
12485 };
12486 dout(10) << "scanning pg log first" << dendl;
12487 pg_log.get_log().scan_log_after(bi->version, func);
12488 dout(10) << "scanning projected log" << dendl;
12489 projected_log.scan_log_after(bi->version, func);
12490 bi->version = projected_last_update;
12491 } else {
12492 assert(0 == "scan_range should have raised bi->version past log_tail");
12493 }
12494 }
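/* A minimal sketch (illustrative only; ints stand in for hobject_t and
 * eversion_t) of how the lambda above folds a log entry into a scanned
 * interval: an update inside [begin, end) replaces the recorded version,
 * a delete drops the object, and everything else is ignored.
 */
#if 0  // illustrative sketch, not compiled
#include <map>

struct SimpleEntry { int oid; int version; bool is_delete; };

static void apply_entry(std::map<int, int>& objects,  // oid -> version
                        int begin, int end,           // interval [begin, end)
                        const SimpleEntry& e) {
  if (e.oid < begin || e.oid >= end)
    return;                      // outside the scanned interval: ignore
  if (e.is_delete)
    objects.erase(e.oid);        // object removed since the scan
  else
    objects[e.oid] = e.version;  // object created/updated since the scan
}
#endif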
12495
12496 void PrimaryLogPG::scan_range(
12497 int min, int max, BackfillInterval *bi,
12498 ThreadPool::TPHandle &handle)
12499 {
12500 assert(is_locked());
12501 dout(10) << "scan_range from " << bi->begin << dendl;
12502 bi->clear_objects();
12503
12504 vector<hobject_t> ls;
12505 ls.reserve(max);
12506 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12507 assert(r >= 0);
12508 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12509 dout(20) << ls << dendl;
12510
12511 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12512 handle.reset_tp_timeout();
12513 ObjectContextRef obc;
12514 if (is_primary())
12515 obc = object_contexts.lookup(*p);
12516 if (obc) {
12517 bi->objects[*p] = obc->obs.oi.version;
12518 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12519 } else {
12520 bufferlist bl;
12521 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12522
12523 /* If the object does not exist here, it must have been removed
12524	       * between the objects_list_partial call above and here. This can happen
12525 * for the first item in the range, which is usually last_backfill.
12526 */
12527 if (r == -ENOENT)
12528 continue;
12529
12530 assert(r >= 0);
12531 object_info_t oi(bl);
12532 bi->objects[*p] = oi.version;
12533 dout(20) << " " << *p << " " << oi.version << dendl;
12534 }
12535 }
12536 }
12537
12538
12539 /** check_local
12540 *
12541 * verifies that stray objects have been deleted
12542 */
12543 void PrimaryLogPG::check_local()
12544 {
12545 dout(10) << __func__ << dendl;
12546
12547 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12548
12549 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12550 return;
12551
12552 // just scan the log.
12553 set<hobject_t> did;
12554 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12555 p != pg_log.get_log().log.rend();
12556 ++p) {
12557 if (did.count(p->soid))
12558 continue;
12559 did.insert(p->soid);
12560
12561 if (p->is_delete() && !is_missing_object(p->soid)) {
12562 dout(10) << " checking " << p->soid
12563 << " at " << p->version << dendl;
12564 struct stat st;
12565 int r = osd->store->stat(
12566 ch,
12567 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12568 &st);
12569 if (r != -ENOENT) {
12570 derr << __func__ << " " << p->soid << " exists, but should have been "
12571 << "deleted" << dendl;
12572 assert(0 == "erroneously present object");
12573 }
12574 } else {
12575 // ignore old(+missing) objects
12576 }
12577 }
12578 }
12579
12580
12581
12582 // ===========================
12583 // hit sets
12584
12585 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12586 {
12587 ostringstream ss;
12588 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12589 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12590 info.pgid.ps(), info.pgid.pool(),
12591 cct->_conf->osd_hit_set_namespace);
12592 dout(20) << __func__ << " " << hoid << dendl;
12593 return hoid;
12594 }
12595
12596 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12597 utime_t end,
12598 bool using_gmt)
12599 {
12600 ostringstream ss;
12601 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12602 if (using_gmt) {
12603 start.gmtime(ss) << "_";
12604 end.gmtime(ss);
12605 } else {
12606 start.localtime(ss) << "_";
12607 end.localtime(ss);
12608 }
12609 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12610 info.pgid.ps(), info.pgid.pool(),
12611 cct->_conf->osd_hit_set_namespace);
12612 dout(20) << __func__ << " " << hoid << dendl;
12613 return hoid;
12614 }
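// For illustration only (the exact rendering comes from utime_t's
// stream operators; these values are assumptions): with pgid 1.0 and
// using_gmt=true the archive object name looks something like
//
//   hit_set_1.0_archive_2018-01-01 00:00:00.000000_2018-01-01 00:15:00.000000
//
// All hit_set_* objects share hash info.pgid.ps() and live in the
// configured osd_hit_set_namespace, so they sort together at the very
// start of the PG's hash space (which hit_set_persist relies on).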
12615
12616 void PrimaryLogPG::hit_set_clear()
12617 {
12618 dout(20) << __func__ << dendl;
12619 hit_set.reset();
12620 hit_set_start_stamp = utime_t();
12621 }
12622
12623 void PrimaryLogPG::hit_set_setup()
12624 {
12625 if (!is_active() ||
12626 !is_primary()) {
12627 hit_set_clear();
12628 return;
12629 }
12630
12631 if (is_active() && is_primary() &&
12632 (!pool.info.hit_set_count ||
12633 !pool.info.hit_set_period ||
12634 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12635 hit_set_clear();
12636
12637 // only primary is allowed to remove all the hit set objects
12638 hit_set_remove_all();
12639 return;
12640 }
12641
12642 // FIXME: discard any previous data for now
12643 hit_set_create();
12644
12645 // include any writes we know about from the pg log. this doesn't
12646 // capture reads, but it is better than nothing!
12647 hit_set_apply_log();
12648 }
12649
12650 void PrimaryLogPG::hit_set_remove_all()
12651 {
12652 // If any archives are degraded we skip this
12653 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12654 p != info.hit_set.history.end();
12655 ++p) {
12656 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12657
12658 // Once we hit a degraded object just skip
12659 if (is_degraded_or_backfilling_object(aoid))
12660 return;
12661 if (scrubber.write_blocked_by_scrub(aoid))
12662 return;
12663 }
12664
12665 if (!info.hit_set.history.empty()) {
12666 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12667 assert(p != info.hit_set.history.rend());
12668 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12669 assert(!is_degraded_or_backfilling_object(oid));
12670 ObjectContextRef obc = get_object_context(oid, false);
12671 assert(obc);
12672
12673 OpContextUPtr ctx = simple_opc_create(obc);
12674 ctx->at_version = get_next_version();
12675 ctx->updated_hset_history = info.hit_set;
12676 utime_t now = ceph_clock_now();
12677 ctx->mtime = now;
12678 hit_set_trim(ctx, 0);
12679 simple_opc_submit(std::move(ctx));
12680 }
12681
12682 info.hit_set = pg_hit_set_history_t();
12683 if (agent_state) {
12684 agent_state->discard_hit_sets();
12685 }
12686 }
12687
12688 void PrimaryLogPG::hit_set_create()
12689 {
12690 utime_t now = ceph_clock_now();
12691 // make a copy of the params to modify
12692 HitSet::Params params(pool.info.hit_set_params);
12693
12694 dout(20) << __func__ << " " << params << dendl;
12695 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12696 BloomHitSet::Params *p =
12697 static_cast<BloomHitSet::Params*>(params.impl.get());
12698
12699 // convert false positive rate so it holds up across the full period
12700 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12701 if (p->get_fpp() <= 0.0)
12702 p->set_fpp(.01); // fpp cannot be zero!
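    // Worked example of the conversion above (numbers illustrative):
    // a pool fpp of 0.05 with hit_set_count = 4 gives each bloom bin
    // fpp 0.05 / 4 = 0.0125. A never-inserted object is falsely
    // reported as hit if any of the 4 bins fires; by the union bound
    // that is at most 4 * 0.0125 = 0.05, so the configured rate holds
    // across the whole period rather than per bin.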
12703
12704	    // if we don't have a specified size, estimate the target size based on the
12705 // previous bin!
12706 if (p->target_size == 0 && hit_set) {
12707 utime_t dur = now - hit_set_start_stamp;
12708 unsigned unique = hit_set->approx_unique_insert_count();
12709 dout(20) << __func__ << " previous set had approx " << unique
12710 << " unique items over " << dur << " seconds" << dendl;
12711 p->target_size = (double)unique * (double)pool.info.hit_set_period
12712 / (double)dur;
12713 }
12714 if (p->target_size <
12715 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12716 p->target_size = cct->_conf->osd_hit_set_min_size;
12717
12718 if (p->target_size
12719 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12720 p->target_size = cct->_conf->osd_hit_set_max_size;
12721
12722 p->seed = now.sec();
12723
12724 dout(10) << __func__ << " target_size " << p->target_size
12725 << " fpp " << p->get_fpp() << dendl;
12726 }
12727 hit_set.reset(new HitSet(params));
12728 hit_set_start_stamp = now;
12729 }
12730
12731 /**
12732 * apply log entries to set
12733 *
12734 * this would only happen after peering, to at least capture writes
12735 * during an interval that was potentially lost.
12736 */
12737 bool PrimaryLogPG::hit_set_apply_log()
12738 {
12739 if (!hit_set)
12740 return false;
12741
12742 eversion_t to = info.last_update;
12743 eversion_t from = info.hit_set.current_last_update;
12744 if (to <= from) {
12745 dout(20) << __func__ << " no update" << dendl;
12746 return false;
12747 }
12748
12749	  dout(20) << __func__ << " " << from << " .. " << to << dendl;
12750 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12751 while (p != pg_log.get_log().log.rend() && p->version > to)
12752 ++p;
12753 while (p != pg_log.get_log().log.rend() && p->version > from) {
12754 hit_set->insert(p->soid);
12755 ++p;
12756 }
12757
12758 return true;
12759 }
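// Sketch of the window applied above (versions illustrative): with
// from = 5'10 and to = 5'14, the first reverse scan skips entries
// newer than 5'14, the second inserts the soid of every entry in the
// half-open window (5'10, 5'14], and the walk stops at 5'10 because
// those writes were already captured when the hit set history was
// last persisted.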
12760
12761 void PrimaryLogPG::hit_set_persist()
12762 {
12763 dout(10) << __func__ << dendl;
12764 bufferlist bl;
12765 unsigned max = pool.info.hit_set_count;
12766
12767 utime_t now = ceph_clock_now();
12768 hobject_t oid;
12769
12770 // If any archives are degraded we skip this persist request
12771 // account for the additional entry being added below
12772 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12773 p != info.hit_set.history.end();
12774 ++p) {
12775 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12776
12777 // Once we hit a degraded object just skip further trim
12778 if (is_degraded_or_backfilling_object(aoid))
12779 return;
12780 if (scrubber.write_blocked_by_scrub(aoid))
12781 return;
12782 }
12783
12784 // If backfill is in progress and we could possibly overlap with the
12785 // hit_set_* objects, back off. Since these all have
12786 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12787 // look just at that. This is necessary because our transactions
12788 // may include a modify of the new hit_set *and* a delete of the
12789 // old one, and this may span the backfill boundary.
12790 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12791 p != backfill_targets.end();
12792 ++p) {
12793 assert(peer_info.count(*p));
12794 const pg_info_t& pi = peer_info[*p];
12795 if (pi.last_backfill == hobject_t() ||
12796 pi.last_backfill.get_hash() == info.pgid.ps()) {
12797 dout(10) << __func__ << " backfill target osd." << *p
12798 << " last_backfill has not progressed past pgid ps"
12799 << dendl;
12800 return;
12801 }
12802 }
12803
12804
12805 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12806 new_hset.begin = hit_set_start_stamp;
12807 new_hset.end = now;
12808 oid = get_hit_set_archive_object(
12809 new_hset.begin,
12810 new_hset.end,
12811 new_hset.using_gmt);
12812
12813	  // If writing the new hit set object is blocked by scrub we skip this persist request
12814 if (scrubber.write_blocked_by_scrub(oid))
12815 return;
12816
12817 hit_set->seal();
12818 ::encode(*hit_set, bl);
12819 dout(20) << __func__ << " archive " << oid << dendl;
12820
12821 if (agent_state) {
12822 agent_state->add_hit_set(new_hset.begin, hit_set);
12823 uint32_t size = agent_state->hit_set_map.size();
12824 if (size >= pool.info.hit_set_count) {
12825 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12826 }
12827 hit_set_in_memory_trim(size);
12828 }
12829
12830 ObjectContextRef obc = get_object_context(oid, true);
12831 OpContextUPtr ctx = simple_opc_create(obc);
12832
12833 ctx->at_version = get_next_version();
12834 ctx->updated_hset_history = info.hit_set;
12835 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12836
12837 updated_hit_set_hist.current_last_update = info.last_update;
12838 new_hset.version = ctx->at_version;
12839
12840 updated_hit_set_hist.history.push_back(new_hset);
12841 hit_set_create();
12842
12843 // fabricate an object_info_t and SnapSet
12844 obc->obs.oi.version = ctx->at_version;
12845 obc->obs.oi.mtime = now;
12846 obc->obs.oi.size = bl.length();
12847 obc->obs.exists = true;
12848 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12849
12850 ctx->new_obs = obc->obs;
12851
12852 obc->ssc->snapset.head_exists = true;
12853 ctx->new_snapset = obc->ssc->snapset;
12854
12855 ctx->delta_stats.num_objects++;
12856 ctx->delta_stats.num_objects_hit_set_archive++;
12857 ctx->delta_stats.num_bytes += bl.length();
12858 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12859
12860 bufferlist bss;
12861 ::encode(ctx->new_snapset, bss);
12862 bufferlist boi(sizeof(ctx->new_obs.oi));
12863 ::encode(ctx->new_obs.oi, boi,
12864 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12865
12866 ctx->op_t->create(oid);
12867 if (bl.length()) {
12868 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12869 }
12870 map <string, bufferlist> attrs;
12871 attrs[OI_ATTR].claim(boi);
12872 attrs[SS_ATTR].claim(bss);
12873 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12874 ctx->log.push_back(
12875 pg_log_entry_t(
12876 pg_log_entry_t::MODIFY,
12877 oid,
12878 ctx->at_version,
12879 eversion_t(),
12880 0,
12881 osd_reqid_t(),
12882 ctx->mtime,
12883 0)
12884 );
12885
12886 hit_set_trim(ctx, max);
12887
12888 simple_opc_submit(std::move(ctx));
12889 }
12890
12891 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12892 {
12893 assert(ctx->updated_hset_history);
12894 pg_hit_set_history_t &updated_hit_set_hist =
12895 *(ctx->updated_hset_history);
12896 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12897 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12898 assert(p != updated_hit_set_hist.history.end());
12899 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12900
12901 assert(!is_degraded_or_backfilling_object(oid));
12902
12903 dout(20) << __func__ << " removing " << oid << dendl;
12904 ++ctx->at_version.version;
12905 ctx->log.push_back(
12906 pg_log_entry_t(pg_log_entry_t::DELETE,
12907 oid,
12908 ctx->at_version,
12909 p->version,
12910 0,
12911 osd_reqid_t(),
12912 ctx->mtime,
12913 0));
12914
12915 ctx->op_t->remove(oid);
12916 updated_hit_set_hist.history.pop_front();
12917
12918 ObjectContextRef obc = get_object_context(oid, false);
12919 assert(obc);
12920 --ctx->delta_stats.num_objects;
12921 --ctx->delta_stats.num_objects_hit_set_archive;
12922 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12923 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12924 }
12925 }
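// Trim bookkeeping sketch (numbers illustrative): with max = 4 and a
// history of 6 archives, the loop above pops the 2 oldest entries,
// emits one DELETE log entry per archive at successive versions, and
// subtracts each archive's object count and size from the
// *_hit_set_archive stats so scrub accounting stays consistent.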
12926
12927 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12928 {
12929 while (agent_state->hit_set_map.size() > max_in_memory) {
12930 agent_state->remove_oldest_hit_set();
12931 }
12932 }
12933
12934
12935 // =======================================
12936 // cache agent
12937
12938 void PrimaryLogPG::agent_setup()
12939 {
12940 assert(is_locked());
12941 if (!is_active() ||
12942 !is_primary() ||
12943 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12944 pool.info.tier_of < 0 ||
12945 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12946 agent_clear();
12947 return;
12948 }
12949 if (!agent_state) {
12950 agent_state.reset(new TierAgentState);
12951
12952 // choose random starting position
12953 agent_state->position = hobject_t();
12954 agent_state->position.pool = info.pgid.pool();
12955 agent_state->position.set_hash(pool.info.get_random_pg_position(
12956 info.pgid.pgid,
12957 rand()));
12958 agent_state->start = agent_state->position;
12959
12960 dout(10) << __func__ << " allocated new state, position "
12961 << agent_state->position << dendl;
12962 } else {
12963 dout(10) << __func__ << " keeping existing state" << dendl;
12964 }
12965
12966 if (info.stats.stats_invalid) {
12967 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12968 }
12969
12970 agent_choose_mode();
12971 }
12972
12973 void PrimaryLogPG::agent_clear()
12974 {
12975 agent_stop();
12976 agent_state.reset(NULL);
12977 }
12978
12979	// Return false if no objects were operated on since the start of the object hash space
12980 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12981 {
12982 lock();
12983 if (!agent_state) {
12984 dout(10) << __func__ << " no agent state, stopping" << dendl;
12985 unlock();
12986 return true;
12987 }
12988
12989 assert(!deleting);
12990
12991 if (agent_state->is_idle()) {
12992 dout(10) << __func__ << " idle, stopping" << dendl;
12993 unlock();
12994 return true;
12995 }
12996
12997 osd->logger->inc(l_osd_agent_wake);
12998
12999 dout(10) << __func__
13000 << " max " << start_max
13001 << ", flush " << agent_state->get_flush_mode_name()
13002 << ", evict " << agent_state->get_evict_mode_name()
13003 << ", pos " << agent_state->position
13004 << dendl;
13005 assert(is_primary());
13006 assert(is_active());
13007
13008 agent_load_hit_sets();
13009
13010 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13011 assert(base_pool);
13012
13013 int ls_min = 1;
13014 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13015
13016 // list some objects. this conveniently lists clones (oldest to
13017 // newest) before heads... the same order we want to flush in.
13018 //
13019 // NOTE: do not flush the Sequencer. we will assume that the
13020 // listing we get back is imprecise.
13021 vector<hobject_t> ls;
13022 hobject_t next;
13023 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13024 &ls, &next);
13025 assert(r >= 0);
13026 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13027 int started = 0;
13028 for (vector<hobject_t>::iterator p = ls.begin();
13029 p != ls.end();
13030 ++p) {
13031 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13032 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13033 osd->logger->inc(l_osd_agent_skip);
13034 continue;
13035 }
13036 if (is_degraded_or_backfilling_object(*p)) {
13037 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13038 osd->logger->inc(l_osd_agent_skip);
13039 continue;
13040 }
13041 if (is_missing_object(p->get_head())) {
13042 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13043 osd->logger->inc(l_osd_agent_skip);
13044 continue;
13045 }
13046 ObjectContextRef obc = get_object_context(*p, false, NULL);
13047 if (!obc) {
13048 // we didn't flush; we may miss something here.
13049 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13050 osd->logger->inc(l_osd_agent_skip);
13051 continue;
13052 }
13053 if (!obc->obs.exists) {
13054 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13055 osd->logger->inc(l_osd_agent_skip);
13056 continue;
13057 }
13058 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
13059 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13060 osd->logger->inc(l_osd_agent_skip);
13061 continue;
13062 }
13063 if (obc->is_blocked()) {
13064 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13065 osd->logger->inc(l_osd_agent_skip);
13066 continue;
13067 }
13068 if (obc->is_request_pending()) {
13069 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13070 osd->logger->inc(l_osd_agent_skip);
13071 continue;
13072 }
13073
13074 // be careful flushing omap to an EC pool.
13075 if (!base_pool->supports_omap() &&
13076 obc->obs.oi.is_omap()) {
13077 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13078 osd->logger->inc(l_osd_agent_skip);
13079 continue;
13080 }
13081
13082 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13083 agent_maybe_evict(obc, false))
13084 ++started;
13085 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13086 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13087 ++started;
13088 --agent_flush_quota;
13089 }
13090 if (started >= start_max) {
13091 // If finishing early, set "next" to the next object
13092 if (++p != ls.end())
13093 next = *p;
13094 break;
13095 }
13096 }
13097
13098 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13099 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13100 agent_state->hist_age = 0;
13101 agent_state->temp_hist.decay();
13102 }
13103
13104 // Total objects operated on so far
13105 int total_started = agent_state->started + started;
13106 bool need_delay = false;
13107
13108 dout(20) << __func__ << " start pos " << agent_state->position
13109 << " next start pos " << next
13110 << " started " << total_started << dendl;
13111
13112 // See if we've made a full pass over the object hash space
13113 // This might check at most ls_max objects a second time to notice that
13114	  // we've checked every object at least once.
13115 if (agent_state->position < agent_state->start &&
13116 next >= agent_state->start) {
13117 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13118 if (total_started == 0)
13119 need_delay = true;
13120 else
13121 total_started = 0;
13122 agent_state->start = next;
13123 }
13124 agent_state->started = total_started;
13125
13126	  // See if we should start over from the beginning
13127 if (next.is_max())
13128 agent_state->position = hobject_t();
13129 else
13130 agent_state->position = next;
13131
13132 // Discard old in memory HitSets
13133 hit_set_in_memory_trim(pool.info.hit_set_count);
13134
13135 if (need_delay) {
13136 assert(agent_state->delaying == false);
13137 agent_delay();
13138 unlock();
13139 return false;
13140 }
13141 agent_choose_mode();
13142 unlock();
13143 return true;
13144 }
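// Pass-tracking sketch (illustrative): suppose agent_state->start was
// saved at hash position H. Each agent_work call advances "position"
// through the hash space; once a call begins below H and "next" lands
// at or past H, a full pass has completed. If nothing was started in
// that entire pass the agent delays itself (there is no work right
// now); otherwise the started counter resets and a new pass is
// measured from "next".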
13145
13146 void PrimaryLogPG::agent_load_hit_sets()
13147 {
13148 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13149 return;
13150 }
13151
13152 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13153 dout(10) << __func__ << dendl;
13154 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13155 p != info.hit_set.history.end(); ++p) {
13156 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13157 dout(10) << __func__ << " loading " << p->begin << "-"
13158 << p->end << dendl;
13159 if (!pool.info.is_replicated()) {
13160 // FIXME: EC not supported here yet
13161 derr << __func__ << " on non-replicated pool" << dendl;
13162 break;
13163 }
13164
13165 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13166 if (is_unreadable_object(oid)) {
13167 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13168 break;
13169 }
13170
13171 ObjectContextRef obc = get_object_context(oid, false);
13172 if (!obc) {
13173 derr << __func__ << ": could not load hitset " << oid << dendl;
13174 break;
13175 }
13176
13177 bufferlist bl;
13178 {
13179 obc->ondisk_read_lock();
13180 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13181 assert(r >= 0);
13182 obc->ondisk_read_unlock();
13183 }
13184 HitSetRef hs(new HitSet);
13185 bufferlist::iterator pbl = bl.begin();
13186 ::decode(*hs, pbl);
13187 agent_state->add_hit_set(p->begin.sec(), hs);
13188 }
13189 }
13190 }
13191 }
13192
13193 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13194 {
13195 if (!obc->obs.oi.is_dirty()) {
13196 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13197 osd->logger->inc(l_osd_agent_skip);
13198 return false;
13199 }
13200 if (obc->obs.oi.is_cache_pinned()) {
13201 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13202 osd->logger->inc(l_osd_agent_skip);
13203 return false;
13204 }
13205
13206 utime_t now = ceph_clock_now();
13207 utime_t ob_local_mtime;
13208 if (obc->obs.oi.local_mtime != utime_t()) {
13209 ob_local_mtime = obc->obs.oi.local_mtime;
13210 } else {
13211 ob_local_mtime = obc->obs.oi.mtime;
13212 }
13213 bool evict_mode_full =
13214 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13215 if (!evict_mode_full &&
13216 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13217 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13218 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13219 osd->logger->inc(l_osd_agent_skip);
13220 return false;
13221 }
13222
13223 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13224 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13225 osd->logger->inc(l_osd_agent_skip);
13226 return false;
13227 }
13228
13229 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13230
13231 // FIXME: flush anything dirty, regardless of what distribution of
13232 // ages we expect.
13233
13234 hobject_t oid = obc->obs.oi.soid;
13235 osd->agent_start_op(oid);
13236 // no need to capture a pg ref, can't outlive fop or ctx
13237 std::function<void()> on_flush = [this, oid]() {
13238 osd->agent_finish_op(oid);
13239 };
13240
13241 int result = start_flush(
13242 OpRequestRef(), obc, false, NULL,
13243 on_flush);
13244 if (result != -EINPROGRESS) {
13245 on_flush();
13246 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13247 << " with " << result << dendl;
13248 osd->logger->inc(l_osd_agent_skip);
13249 return false;
13250 }
13251
13252 osd->logger->inc(l_osd_agent_flush);
13253 return true;
13254 }
13255
13256 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13257 {
13258 const hobject_t& soid = obc->obs.oi.soid;
13259 if (!after_flush && obc->obs.oi.is_dirty()) {
13260 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13261 return false;
13262 }
13263 if (!obc->obs.oi.watchers.empty()) {
13264 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13265 return false;
13266 }
13267 if (obc->is_blocked()) {
13268 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13269 return false;
13270 }
13271 if (obc->obs.oi.is_cache_pinned()) {
13272 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13273 return false;
13274 }
13275
13276 if (soid.snap == CEPH_NOSNAP) {
13277 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13278 if (result < 0) {
13279 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13280 return false;
13281 }
13282 }
13283
13284 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13285	    // is this object older than cache_min_evict_age?
13286 utime_t now = ceph_clock_now();
13287 utime_t ob_local_mtime;
13288 if (obc->obs.oi.local_mtime != utime_t()) {
13289 ob_local_mtime = obc->obs.oi.local_mtime;
13290 } else {
13291 ob_local_mtime = obc->obs.oi.mtime;
13292 }
13293 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13294 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13295 osd->logger->inc(l_osd_agent_skip);
13296 return false;
13297 }
13298 // is this object old and/or cold enough?
13299 int temp = 0;
13300 uint64_t temp_upper = 0, temp_lower = 0;
13301 if (hit_set)
13302 agent_estimate_temp(soid, &temp);
13303 agent_state->temp_hist.add(temp);
13304 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13305
13306 dout(20) << __func__
13307 << " temp " << temp
13308 << " pos " << temp_lower << "-" << temp_upper
13309 << ", evict_effort " << agent_state->evict_effort
13310 << dendl;
13311 dout(30) << "agent_state:\n";
13312 Formatter *f = Formatter::create("");
13313 f->open_object_section("agent_state");
13314 agent_state->dump(f);
13315 f->close_section();
13316 f->flush(*_dout);
13317 delete f;
13318 *_dout << dendl;
13319
13320 if (1000000 - temp_upper >= agent_state->evict_effort)
13321 return false;
13322 }
13323
13324 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13325 OpContextUPtr ctx = simple_opc_create(obc);
13326
13327 if (!ctx->lock_manager.get_lock_type(
13328 ObjectContext::RWState::RWWRITE,
13329 obc->obs.oi.soid,
13330 obc,
13331 OpRequestRef())) {
13332 close_op_ctx(ctx.release());
13333 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13334 return false;
13335 }
13336
13337 osd->agent_start_evict_op();
13338 ctx->register_on_finish(
13339 [this]() {
13340 osd->agent_finish_evict_op();
13341 });
13342
13343 ctx->at_version = get_next_version();
13344 assert(ctx->new_obs.exists);
13345 int r = _delete_oid(ctx.get(), true, false);
13346 if (obc->obs.oi.is_omap())
13347 ctx->delta_stats.num_objects_omap--;
13348 ctx->delta_stats.num_evict++;
13349 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13350 if (obc->obs.oi.is_dirty())
13351 --ctx->delta_stats.num_objects_dirty;
13352 assert(r == 0);
13353 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13354 simple_opc_submit(std::move(ctx));
13355 osd->logger->inc(l_osd_tier_evict);
13356 osd->logger->inc(l_osd_agent_evict);
13357 return true;
13358 }
13359
13360 void PrimaryLogPG::agent_stop()
13361 {
13362 dout(20) << __func__ << dendl;
13363 if (agent_state && !agent_state->is_idle()) {
13364 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13365 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13366 osd->agent_disable_pg(this, agent_state->evict_effort);
13367 }
13368 }
13369
13370 void PrimaryLogPG::agent_delay()
13371 {
13372 dout(20) << __func__ << dendl;
13373 if (agent_state && !agent_state->is_idle()) {
13374 assert(agent_state->delaying == false);
13375 agent_state->delaying = true;
13376 osd->agent_disable_pg(this, agent_state->evict_effort);
13377 }
13378 }
13379
13380 void PrimaryLogPG::agent_choose_mode_restart()
13381 {
13382 dout(20) << __func__ << dendl;
13383 lock();
13384 if (agent_state && agent_state->delaying) {
13385 agent_state->delaying = false;
13386 agent_choose_mode(true);
13387 }
13388 unlock();
13389 }
13390
13391 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13392 {
13393 bool requeued = false;
13394 // Let delay play out
13395 if (agent_state->delaying) {
13396	    dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13397 return requeued;
13398 }
13399
13400 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13401 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13402 unsigned evict_effort = 0;
13403
13404 if (info.stats.stats_invalid) {
13405 // idle; stats can't be trusted until we scrub.
13406 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13407 goto skip_calc;
13408 }
13409
13410 {
13411 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13412 assert(divisor > 0);
13413
13414 // adjust (effective) user objects down based on the number
13415 // of HitSet objects, which should not count toward our total since
13416 // they cannot be flushed.
13417 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13418
13419 // also exclude omap objects if ec backing pool
13420 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13421 assert(base_pool);
13422 if (!base_pool->supports_omap())
13423 unflushable += info.stats.stats.sum.num_objects_omap;
13424
13425 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13426 if (num_user_objects > unflushable)
13427 num_user_objects -= unflushable;
13428 else
13429 num_user_objects = 0;
13430
13431 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13432 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13433 num_user_bytes -= unflushable_bytes;
13434 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13435 num_user_bytes += num_overhead_bytes;
13436
13437 // also reduce the num_dirty by num_objects_omap
13438 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13439 if (!base_pool->supports_omap()) {
13440 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13441 num_dirty -= info.stats.stats.sum.num_objects_omap;
13442 else
13443 num_dirty = 0;
13444 }
13445
13446 dout(10) << __func__
13447 << " flush_mode: "
13448 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13449 << " evict_mode: "
13450 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13451 << " num_objects: " << info.stats.stats.sum.num_objects
13452 << " num_bytes: " << info.stats.stats.sum.num_bytes
13453 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13454 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13455 << " num_dirty: " << num_dirty
13456 << " num_user_objects: " << num_user_objects
13457 << " num_user_bytes: " << num_user_bytes
13458 << " num_overhead_bytes: " << num_overhead_bytes
13459 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13460 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13461 << dendl;
13462
13463 // get dirty, full ratios
13464 uint64_t dirty_micro = 0;
13465 uint64_t full_micro = 0;
13466 if (pool.info.target_max_bytes && num_user_objects > 0) {
13467 uint64_t avg_size = num_user_bytes / num_user_objects;
13468 dirty_micro =
13469 num_dirty * avg_size * 1000000 /
13470 MAX(pool.info.target_max_bytes / divisor, 1);
13471 full_micro =
13472 num_user_objects * avg_size * 1000000 /
13473 MAX(pool.info.target_max_bytes / divisor, 1);
13474 }
13475 if (pool.info.target_max_objects > 0) {
13476 uint64_t dirty_objects_micro =
13477 num_dirty * 1000000 /
13478 MAX(pool.info.target_max_objects / divisor, 1);
13479 if (dirty_objects_micro > dirty_micro)
13480 dirty_micro = dirty_objects_micro;
13481 uint64_t full_objects_micro =
13482 num_user_objects * 1000000 /
13483 MAX(pool.info.target_max_objects / divisor, 1);
13484 if (full_objects_micro > full_micro)
13485 full_micro = full_objects_micro;
13486 }
13487 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13488 << " full " << ((float)full_micro / 1000000.0)
13489 << dendl;
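    // Worked example of the ratios above (numbers illustrative): with
    // target_max_bytes giving this PG a share of 1 GB
    // (target_max_bytes / divisor), 512 user objects at avg_size 1 MB
    // yield full_micro = 512 MB / 1 GB = 500000 (50% full), and 128
    // dirty objects give dirty_micro = 128 MB / 1 GB = 125000 (12.5%
    // dirty). The object-count targets below can only raise these
    // figures, since the larger byte- or object-based ratio wins.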
13490
13491 // flush mode
13492 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13493 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13494 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13495 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13496 flush_target += flush_slop;
13497 flush_high_target += flush_slop;
13498 } else {
13499 flush_target -= MIN(flush_target, flush_slop);
13500 flush_high_target -= MIN(flush_high_target, flush_slop);
13501 }
13502
13503 if (dirty_micro > flush_high_target) {
13504 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13505 } else if (dirty_micro > flush_target) {
13506 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13507 }
13508
13509 // evict mode
13510 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13511 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13512 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13513 evict_target += evict_slop;
13514 else
13515 evict_target -= MIN(evict_target, evict_slop);
13516
13517 if (full_micro > 1000000) {
13518 // evict anything clean
13519 evict_mode = TierAgentState::EVICT_MODE_FULL;
13520 evict_effort = 1000000;
13521 } else if (full_micro > evict_target) {
13522	      // set effort in [0..1] range based on where we are between evict_target and full
13523 evict_mode = TierAgentState::EVICT_MODE_SOME;
13524 uint64_t over = full_micro - evict_target;
13525 uint64_t span = 1000000 - evict_target;
13526 evict_effort = MAX(over * 1000000 / span,
13527 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13528
13529 // quantize effort to avoid too much reordering in the agent_queue.
13530 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13531 assert(inc > 0);
13532 uint64_t was = evict_effort;
13533 evict_effort -= evict_effort % inc;
13534 if (evict_effort < inc)
13535 evict_effort = inc;
13536 assert(evict_effort >= inc && evict_effort <= 1000000);
13537 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13538 }
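      // Worked example (illustrative config): evict_target = 800000
      // (80%) and full_micro = 850000 give over = 50000 and
      // span = 200000, so the raw effort is
      // 50000 * 1000000 / 200000 = 250000 (25%). With
      // osd_agent_quantize_effort = 0.1, inc = 100000 and the effort
      // is rounded down to 200000; quantizing keeps the agent_queue
      // from being reshuffled every time the ratio drifts slightly.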
13539 }
13540
13541 skip_calc:
13542 bool old_idle = agent_state->is_idle();
13543 if (flush_mode != agent_state->flush_mode) {
13544 dout(5) << __func__ << " flush_mode "
13545 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13546 << " -> "
13547 << TierAgentState::get_flush_mode_name(flush_mode)
13548 << dendl;
13549 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13550 osd->agent_inc_high_count();
13551 info.stats.stats.sum.num_flush_mode_high = 1;
13552 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13553 info.stats.stats.sum.num_flush_mode_low = 1;
13554 }
13555 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13556 osd->agent_dec_high_count();
13557 info.stats.stats.sum.num_flush_mode_high = 0;
13558 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13559 info.stats.stats.sum.num_flush_mode_low = 0;
13560 }
13561 agent_state->flush_mode = flush_mode;
13562 }
13563 if (evict_mode != agent_state->evict_mode) {
13564 dout(5) << __func__ << " evict_mode "
13565 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13566 << " -> "
13567 << TierAgentState::get_evict_mode_name(evict_mode)
13568 << dendl;
13569 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13570 is_active()) {
13571 if (op)
13572 requeue_op(op);
13573 requeue_ops(waiting_for_flush);
13574 requeue_ops(waiting_for_active);
13575 requeue_ops(waiting_for_scrub);
13576 requeue_ops(waiting_for_cache_not_full);
13577 objects_blocked_on_cache_full.clear();
13578 requeued = true;
13579 }
13580 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13581 info.stats.stats.sum.num_evict_mode_some = 1;
13582 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13583 info.stats.stats.sum.num_evict_mode_full = 1;
13584 }
13585 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13586 info.stats.stats.sum.num_evict_mode_some = 0;
13587 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13588 info.stats.stats.sum.num_evict_mode_full = 0;
13589 }
13590 agent_state->evict_mode = evict_mode;
13591 }
13592 uint64_t old_effort = agent_state->evict_effort;
13593 if (evict_effort != agent_state->evict_effort) {
13594 dout(5) << __func__ << " evict_effort "
13595 << ((float)agent_state->evict_effort / 1000000.0)
13596 << " -> "
13597 << ((float)evict_effort / 1000000.0)
13598 << dendl;
13599 agent_state->evict_effort = evict_effort;
13600 }
13601
13602 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13603 // (including flush). This is probably fine (they should be
13604 // correlated) but it is not precisely correct.
13605 if (agent_state->is_idle()) {
13606 if (!restart && !old_idle) {
13607 osd->agent_disable_pg(this, old_effort);
13608 }
13609 } else {
13610 if (restart || old_idle) {
13611 osd->agent_enable_pg(this, agent_state->evict_effort);
13612 } else if (old_effort != agent_state->evict_effort) {
13613 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13614 }
13615 }
13616 return requeued;
13617 }
13618
13619 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13620 {
13621 assert(hit_set);
13622 assert(temp);
13623 *temp = 0;
13624 if (hit_set->contains(oid))
13625 *temp = 1000000;
13626 unsigned i = 0;
13627 int last_n = pool.info.hit_set_search_last_n;
13628 for (map<time_t,HitSetRef>::reverse_iterator p =
13629 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13630 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13631 if (p->second->contains(oid)) {
13632 *temp += pool.info.get_grade(i);
13633 --last_n;
13634 }
13635 }
13636 }
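// Walk-through of the estimate above (grades are pool-configured;
// values here are assumptions): an object in the current hit set
// starts at temp = 1000000. With hit_set_search_last_n = 2, if the
// reverse scan then finds the object in the 2nd and 5th most recent
// archived sets, it adds get_grade(1) and get_grade(4) and stops,
// since last_n only decrements on a hit while the age index i
// advances every iteration.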
13637
13638 // Dup op detection
13639
13640 bool PrimaryLogPG::already_complete(eversion_t v)
13641 {
13642 dout(20) << __func__ << ": " << v << dendl;
13643 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13644 !i.end();
13645 ++i) {
13646 dout(20) << __func__ << ": " << **i << dendl;
13647 // skip copy from temp object ops
13648 if ((*i)->v == eversion_t()) {
13649 dout(20) << __func__ << ": " << **i
13650 << " version is empty" << dendl;
13651 continue;
13652 }
13653 if ((*i)->v > v) {
13654 dout(20) << __func__ << ": " << **i
13655 << " (*i)->v past v" << dendl;
13656 break;
13657 }
13658 if (!(*i)->all_committed) {
13659 dout(20) << __func__ << ": " << **i
13660 << " not committed, returning false"
13661 << dendl;
13662 return false;
13663 }
13664 }
13665 dout(20) << __func__ << ": returning true" << dendl;
13666 return true;
13667 }
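// Dup-op sketch (versions illustrative): repop_queue holds in-flight
// repops in version order, say 5'10, 5'11, 5'12. already_complete(5'11)
// scans from the front: 5'10 and 5'11 must both be all_committed
// (temp-object repops with an empty version are skipped), while 5'12
// is past v and ends the scan. Any uncommitted repop at or below v
// means the earlier client op is not yet durable, so we answer false.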
13668
13669 bool PrimaryLogPG::already_ack(eversion_t v)
13670 {
13671 dout(20) << __func__ << ": " << v << dendl;
13672 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13673 !i.end();
13674 ++i) {
13675 // skip copy from temp object ops
13676 if ((*i)->v == eversion_t()) {
13677 dout(20) << __func__ << ": " << **i
13678 << " version is empty" << dendl;
13679 continue;
13680 }
13681 if ((*i)->v > v) {
13682 dout(20) << __func__ << ": " << **i
13683 << " (*i)->v past v" << dendl;
13684 break;
13685 }
13686 if (!(*i)->all_applied) {
13687 dout(20) << __func__ << ": " << **i
13688 << " not applied, returning false"
13689 << dendl;
13690 return false;
13691 }
13692 }
13693 dout(20) << __func__ << ": returning true" << dendl;
13694 return true;
13695 }
13696
13697
13698 // ==========================================================================================
13699 // SCRUB
13700
13701
13702 bool PrimaryLogPG::_range_available_for_scrub(
13703 const hobject_t &begin, const hobject_t &end)
13704 {
13705 pair<hobject_t, ObjectContextRef> next;
13706 next.second = object_contexts.lookup(begin);
13707 next.first = begin;
13708 bool more = true;
13709 while (more && next.first < end) {
13710 if (next.second && next.second->is_blocked()) {
13711 next.second->requeue_scrub_on_unblock = true;
13712 dout(10) << __func__ << ": scrub delayed, "
13713 << next.first << " is blocked"
13714 << dendl;
13715 return false;
13716 }
13717 more = object_contexts.get_next(next.first, &next);
13718 }
13719 return true;
13720 }
13721
13722 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13723 const vector<snapid_t>::reverse_iterator &curclone) {
13724 return snapset && curclone != snapset.get().clones.rend();
13725 }
13726
13727 void PrimaryLogPG::log_missing(unsigned missing,
13728 const boost::optional<hobject_t> &head,
13729 LogChannelRef clog,
13730 const spg_t &pgid,
13731 const char *func,
13732 const char *mode,
13733 bool allow_incomplete_clones)
13734 {
13735 assert(head);
13736 if (allow_incomplete_clones) {
13737 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13738 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13739 } else {
13740 clog->info() << mode << " " << pgid << " " << head.get()
13741 << " " << missing << " missing clone(s)";
13742 }
13743 }
13744
13745 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13746 const boost::optional<SnapSet> &snapset,
13747 LogChannelRef clog,
13748 const spg_t &pgid,
13749 const char *mode,
13750 bool allow_incomplete_clones,
13751 boost::optional<snapid_t> target,
13752 vector<snapid_t>::reverse_iterator *curclone,
13753 inconsistent_snapset_wrapper &e)
13754 {
13755 assert(head);
13756 assert(snapset);
13757 unsigned missing = 0;
13758
13759	  // NOTE: clones are in descending order, hence the **curclone > *target test here
13760 hobject_t next_clone(head.get());
13761 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13762 ++missing;
13763 // it is okay to be missing one or more clones in a cache tier.
13764 // skip higher-numbered clones in the list.
13765 if (!allow_incomplete_clones) {
13766 next_clone.snap = **curclone;
13767 clog->error() << mode << " " << pgid << " " << head.get()
13768 << " expected clone " << next_clone << " " << missing
13769 << " missing";
13770 ++scrubber.shallow_errors;
13771 e.set_clone_missing(next_clone.snap);
13772 }
13773 // Clones are descending
13774 ++(*curclone);
13775 }
13776 return missing;
13777 }
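// Walk-through (illustrative): with snapset clones [1, 2, 4, 6] the
// reverse iterator yields 6, 4, 2, 1. If the next on-disk object is
// clone 2, target = 2 and the loop above advances past 6 and 4,
// counting both as missing (logging and flagging them only when the
// pool does not allow incomplete clones), leaving *curclone at 2 for
// the caller's match check.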
13778
13779 /*
13780 * Validate consistency of the object info and snap sets.
13781 *
13782 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13783 * the comparison of the objects is against multiple snapset.clones. There are
13784 * multiple clone lists and in between lists we expect head or snapdir.
13785 *
13786 * Example
13787 *
13788 * objects expected
13789 * ======= =======
13790 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13791 * obj2 head head/snapdir, head ok
13792 * [SnapSet clones 6 4 2 1]
13793 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13794 * obj2 snap 6 obj2 snap 6, match
13795 * obj2 snap 4 obj2 snap 4, match
13796 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13797 * [Snapset clones 3 1]
13798 * obj3 snap 3 obj3 snap 3 match
13799 * obj3 snap 1 obj3 snap 1 match
13800 * obj4 snapdir head/snapdir, snapdir ok
13801 * [Snapset clones 4]
13802 * EOL obj4 snap 4, (expected)
13803 */
13804 void PrimaryLogPG::scrub_snapshot_metadata(
13805 ScrubMap &scrubmap,
13806 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13807 {
13808 dout(10) << __func__ << dendl;
13809
13810 coll_t c(info.pgid);
13811 bool repair = state_test(PG_STATE_REPAIR);
13812 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13813 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13814 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13815
13816 /// snapsets to repair
13817 map<hobject_t,SnapSet> snapset_to_repair;
13818
13819 // traverse in reverse order.
13820 boost::optional<hobject_t> head;
13821	  boost::optional<SnapSet> snapset; // If initialized, head (above) will be too
13822 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13823 unsigned missing = 0;
13824 inconsistent_snapset_wrapper soid_error, head_error;
13825 unsigned soid_error_count = 0;
13826
13827 bufferlist last_data;
13828
13829 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13830 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13831 const hobject_t& soid = p->first;
13832 soid_error = inconsistent_snapset_wrapper{soid};
13833 object_stat_sum_t stat;
13834 boost::optional<object_info_t> oi;
13835
13836 if (!soid.is_snapdir())
13837 stat.num_objects++;
13838
13839 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13840 stat.num_objects_hit_set_archive++;
13841
13842 if (soid.is_snap()) {
13843 // it's a clone
13844 stat.num_object_clones++;
13845 }
13846
13847 // basic checks.
13848 if (p->second.attrs.count(OI_ATTR) == 0) {
13849 oi = boost::none;
13850 osd->clog->error() << mode << " " << info.pgid << " " << soid
13851 << " no '" << OI_ATTR << "' attr";
13852 ++scrubber.shallow_errors;
13853 soid_error.set_info_missing();
13854 } else {
13855 bufferlist bv;
13856 bv.push_back(p->second.attrs[OI_ATTR]);
13857 try {
13858 oi = object_info_t(); // Initialize optional<> before decode into it
13859 oi.get().decode(bv);
13860 } catch (buffer::error& e) {
13861 oi = boost::none;
13862 osd->clog->error() << mode << " " << info.pgid << " " << soid
13863 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13864 ++scrubber.shallow_errors;
13865 soid_error.set_info_corrupted();
13866 soid_error.set_info_missing(); // Not available too
13867 }
13868 }
13869
13870 if (oi) {
13871 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13872 osd->clog->error() << mode << " " << info.pgid << " " << soid
13873 << " on disk size (" << p->second.size
13874 << ") does not match object info size ("
13875 << oi->size << ") adjusted for ondisk to ("
13876 << pgbackend->be_get_ondisk_size(oi->size)
13877 << ")";
13878 soid_error.set_size_mismatch();
13879 ++scrubber.shallow_errors;
13880 }
13881
13882 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13883
13884	      // A clone's num_bytes will be added later when we have the snapset
13885 if (!soid.is_snap()) {
13886 stat.num_bytes += oi->size;
13887 }
13888 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13889 stat.num_bytes_hit_set_archive += oi->size;
13890
13891 if (!soid.is_snapdir()) {
13892 if (oi->is_dirty())
13893 ++stat.num_objects_dirty;
13894 if (oi->is_whiteout())
13895 ++stat.num_whiteouts;
13896 if (oi->is_omap())
13897 ++stat.num_objects_omap;
13898 if (oi->is_cache_pinned())
13899 ++stat.num_objects_pinned;
13900 }
13901 } else {
13902 // pessimistic assumption that this object might contain a
13903 // legacy SnapSet
13904 stat.num_legacy_snapsets++;
13905 }
13906
13907 // Check for any problems while processing clones
13908 if (doing_clones(snapset, curclone)) {
13909 boost::optional<snapid_t> target;
13910 // Expecting an object with snap for current head
13911 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13912
13913 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13914 << soid << " while processing " << head.get() << dendl;
13915
13916 target = all_clones;
13917 } else {
13918 assert(soid.is_snap());
13919 target = soid.snap;
13920 }
13921
13922 // Log any clones we were expecting to be there up to target
13923	      // This will set missing, but will be a no-op if soid.snap == *curclone.
13924 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13925 pool.info.allow_incomplete_clones(), target, &curclone,
13926 head_error);
13927 }
13928 bool expected;
13929 // Check doing_clones() again in case we ran process_clones_to()
13930 if (doing_clones(snapset, curclone)) {
13931 // A head/snapdir would have processed all clones above
13932 // or all greater than *curclone.
13933 assert(soid.is_snap() && *curclone <= soid.snap);
13934
13935	      // After the processing above, soid.snap should match the expected *curclone
13936 expected = (*curclone == soid.snap);
13937 } else {
13938 // If we aren't doing clones any longer, then expecting head/snapdir
13939 expected = soid.has_snapset();
13940 }
13941 if (!expected) {
13942 // If we couldn't read the head's snapset, just ignore clones
13943 if (head && !snapset) {
13944 osd->clog->error() << mode << " " << info.pgid << " " << soid
13945 << " clone ignored due to missing snapset";
13946 } else {
13947 osd->clog->error() << mode << " " << info.pgid << " " << soid
13948 << " is an unexpected clone";
13949 }
13950 ++scrubber.shallow_errors;
13951 soid_error.set_headless();
13952 scrubber.store->add_snap_error(pool.id, soid_error);
13953 ++soid_error_count;
13954 if (head && soid.get_head() == head->get_head())
13955 head_error.set_clone(soid.snap);
13956 continue;
13957 }
13958
13959 // new snapset?
13960 if (soid.has_snapset()) {
13961
13962 if (missing) {
13963 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13964 pool.info.allow_incomplete_clones());
13965 }
13966
13967 // Save previous head error information
13968 if (head && (head_error.errors || soid_error_count))
13969 scrubber.store->add_snap_error(pool.id, head_error);
13970 // Set this as a new head object
13971 head = soid;
13972 missing = 0;
13973 head_error = soid_error;
13974 soid_error_count = 0;
13975
13976 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13977
13978 if (p->second.attrs.count(SS_ATTR) == 0) {
13979 osd->clog->error() << mode << " " << info.pgid << " " << soid
13980 << " no '" << SS_ATTR << "' attr";
13981 ++scrubber.shallow_errors;
13982 snapset = boost::none;
13983 head_error.set_snapset_missing();
13984 } else {
13985 bufferlist bl;
13986 bl.push_back(p->second.attrs[SS_ATTR]);
13987 bufferlist::iterator blp = bl.begin();
13988 try {
13989 snapset = SnapSet(); // Initialize optional<> before decoding into it
13990 ::decode(snapset.get(), blp);
13991 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
13992 } catch (buffer::error& e) {
13993 snapset = boost::none;
13994 osd->clog->error() << mode << " " << info.pgid << " " << soid
13995 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13996 ++scrubber.shallow_errors;
13997 head_error.set_snapset_corrupted();
13998 }
13999 }
14000
14001 if (snapset) {
14002 // what will be next?
14003 curclone = snapset->clones.rbegin();
14004
14005 if (!snapset->clones.empty()) {
14006 dout(20) << " snapset " << snapset.get() << dendl;
14007 if (snapset->seq == 0) {
14008 osd->clog->error() << mode << " " << info.pgid << " " << soid
14009 << " snaps.seq not set";
14010 ++scrubber.shallow_errors;
14011 head_error.set_snapset_error();
14012 }
14013 }
14014
14015 if (soid.is_head() && !snapset->head_exists) {
14016 osd->clog->error() << mode << " " << info.pgid << " " << soid
14017 << " snapset.head_exists=false, but head exists";
14018 ++scrubber.shallow_errors;
14019 head_error.set_head_mismatch();
14020 // Fix head_exists locally so is_legacy() returns correctly
14021 snapset->head_exists = true;
14022 }
14023 if (soid.is_snapdir() && snapset->head_exists) {
14024 osd->clog->error() << mode << " " << info.pgid << " " << soid
14025 << " snapset.head_exists=true, but snapdir exists";
14026 ++scrubber.shallow_errors;
14027 head_error.set_head_mismatch();
14028 // For symmetry fix this too, but probably doesn't matter
14029 snapset->head_exists = false;
14030 }
14031
14032 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
14033 if (soid.is_snapdir()) {
14034 dout(10) << " will move snapset to head from " << soid << dendl;
14035 snapset_to_repair[soid.get_head()] = *snapset;
14036 } else if (snapset->is_legacy()) {
14037 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
14038 << dendl;
14039 snapset_to_repair[soid.get_head()] = *snapset;
14040 }
14041 } else {
14042 stat.num_legacy_snapsets++;
14043 }
14044 } else {
14045 // pessimistic assumption that this object might contain a
14046 // legacy SnapSet
14047 stat.num_legacy_snapsets++;
14048 }
14049 } else {
14050 assert(soid.is_snap());
14051 assert(head);
14052 assert(snapset);
14053 assert(soid.snap == *curclone);
14054
14055 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14056
14057 if (snapset->clone_size.count(soid.snap) == 0) {
14058 osd->clog->error() << mode << " " << info.pgid << " " << soid
14059 << " is missing in clone_size";
14060 ++scrubber.shallow_errors;
14061 soid_error.set_size_mismatch();
14062 } else {
14063 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14064 osd->clog->error() << mode << " " << info.pgid << " " << soid
14065 << " size " << oi->size << " != clone_size "
14066 << snapset->clone_size[*curclone];
14067 ++scrubber.shallow_errors;
14068 soid_error.set_size_mismatch();
14069 }
14070
14071 if (snapset->clone_overlap.count(soid.snap) == 0) {
14072 osd->clog->error() << mode << " " << info.pgid << " " << soid
14073 << " is missing in clone_overlap";
14074 ++scrubber.shallow_errors;
14075 soid_error.set_size_mismatch();
14076 } else {
14077 // This checking is based on get_clone_bytes(). The first 2 asserts
14078 // can't happen because we know we have a clone_size and
14079 // a clone_overlap. Now we check that the interval_set won't
14080 // cause the last assert.
14081 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14082 const interval_set<uint64_t> &overlap =
14083 snapset->clone_overlap.find(soid.snap)->second;
14084 bool bad_interval_set = false;
14085 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14086 i != overlap.end(); ++i) {
14087 if (size < i.get_len()) {
14088 bad_interval_set = true;
14089 break;
14090 }
14091 size -= i.get_len();
14092 }
14093
14094 if (bad_interval_set) {
14095 osd->clog->error() << mode << " " << info.pgid << " " << soid
14096 << " bad interval_set in clone_overlap";
14097 ++scrubber.shallow_errors;
14098 soid_error.set_size_mismatch();
14099 } else {
14100 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14101 }
14102 }
14103 }
14104
14105 // migrate legacy_snaps to snapset?
14106 auto p = snapset_to_repair.find(soid.get_head());
14107 if (p != snapset_to_repair.end()) {
14108 if (!oi || oi->legacy_snaps.empty()) {
14109 osd->clog->error() << mode << " " << info.pgid << " " << soid
14110 << " has no oi or legacy_snaps; cannot convert "
14111 << *snapset;
14112 ++scrubber.shallow_errors;
14113 } else {
14114 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14115 << " to snapset " << p->second << dendl;
14116 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14117 }
14118 }
14119
14120 // what's next?
14121 ++curclone;
14122 if (soid_error.errors) {
14123 scrubber.store->add_snap_error(pool.id, soid_error);
14124 ++soid_error_count;
14125 }
14126 }
14127
14128 scrub_cstat.add(stat);
14129 }
14130
14131 if (doing_clones(snapset, curclone)) {
14132 dout(10) << __func__ << " " << mode << " " << info.pgid
14133 << " No more objects while processing " << head.get() << dendl;
14134
14135 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14136 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14137 head_error);
14138 }
14139	  // There could be missing clones found by the check above, or ones noticed
14140	  // even before dropping out of the loop, for the last head.
14141 if (missing) {
14142 log_missing(missing, head, osd->clog, info.pgid, __func__,
14143 mode, pool.info.allow_incomplete_clones());
14144 }
14145 if (head && (head_error.errors || soid_error_count))
14146 scrubber.store->add_snap_error(pool.id, head_error);
14147
14148 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14149 missing_digest.begin();
14150 p != missing_digest.end();
14151 ++p) {
14152 if (p->first.is_snapdir())
14153 continue;
14154 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14155 ObjectContextRef obc = get_object_context(p->first, false);
14156 if (!obc) {
14157 osd->clog->error() << info.pgid << " " << mode
14158 << " cannot get object context for object "
14159 << p->first;
14160 continue;
14161 } else if (obc->obs.oi.soid != p->first) {
14162 osd->clog->error() << info.pgid << " " << mode
14163 << " object " << p->first
14164 << " has a valid oi attr with a mismatched name, "
14165 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14166 continue;
14167 }
14168 OpContextUPtr ctx = simple_opc_create(obc);
14169 ctx->at_version = get_next_version();
14170 ctx->mtime = utime_t(); // do not update mtime
14171 ctx->new_obs.oi.set_data_digest(p->second.first);
14172 ctx->new_obs.oi.set_omap_digest(p->second.second);
14173 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14174
14175 ctx->register_on_success(
14176 [this]() {
14177 dout(20) << "updating scrub digest" << dendl;
14178 if (--scrubber.num_digest_updates_pending == 0) {
14179 requeue_scrub();
14180 }
14181 });
14182
14183 simple_opc_submit(std::move(ctx));
14184 ++scrubber.num_digest_updates_pending;
14185 }
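// Note: num_digest_updates_pending counts every repop queued by the digest
// loop above and by the snapset-repair loop below; each on-success callback
// decrements it, and the scrub is requeued only once the count drains to zero.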
14186 for (auto& p : snapset_to_repair) {
14187 // Cache pools may not hold the clones, which means we won't know
14188 // which snaps they cover. Fake out the clone_snaps entries anyway (with
14189 // blank snap lists).
14190 p.second.head_exists = true;
14191 if (pool.info.allow_incomplete_clones()) {
14192 for (auto s : p.second.clones) {
14193 if (p.second.clone_snaps.count(s) == 0) {
14194 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14195 << s << dendl;
14196 p.second.clone_snaps[s]; // operator[] inserts an empty snap list
14197 }
14198 }
14199 }
14200 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14201 p.second.is_legacy()) {
14202 // this happens if we encounter other errors above, like a missing
14203 // or extra clone.
14204 dout(10) << __func__ << " not writing snapset to " << p.first
14205 << " snapset " << p.second << " clones " << p.second.clones
14206 << "; didn't convert fully" << dendl;
14207 scrub_cstat.sum.num_legacy_snapsets++;
14208 continue;
14209 }
14210 dout(10) << __func__ << " writing snapset to " << p.first
14211 << " " << p.second << dendl;
14212 ObjectContextRef obc = get_object_context(p.first, true);
14213 if (!obc) {
14214 osd->clog->error() << info.pgid << " " << mode
14215 << " cannot get object context for object "
14216 << p.first;
14217 continue;
14218 } else if (obc->obs.oi.soid != p.first) {
14219 osd->clog->error() << info.pgid << " " << mode
14220 << " object " << p.first
14221 << " has a valid oi attr with a mismatched name, "
14222 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14223 continue;
14224 }
14225 ObjectContextRef snapset_obc;
14226 if (!obc->obs.exists) {
14227 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14228 if (!snapset_obc) {
14229 osd->clog->error() << info.pgid << " " << mode
14230 << " cannot get object context for "
14231 << p.first.get_snapdir();
14232 continue;
14233 }
14234 }
14235 OpContextUPtr ctx = simple_opc_create(obc);
14236 PGTransaction *t = ctx->op_t.get();
14237 ctx->snapset_obc = snapset_obc;
14238 ctx->at_version = get_next_version();
14239 ctx->mtime = utime_t(); // do not update mtime
14240 ctx->new_snapset = p.second;
14241 if (!ctx->new_obs.exists) {
14242 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14243 ctx->new_obs.exists = true;
14244 ctx->new_snapset.head_exists = true;
14245 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14246 ++ctx->delta_stats.num_whiteouts;
14247 ++ctx->delta_stats.num_objects;
14248 t->create(p.first);
14249 if (p.first < scrubber.start) {
14250 dout(20) << __func__ << " kludging around update outside of scrub range"
14251 << dendl;
14252 } else {
14253 scrub_cstat.add(ctx->delta_stats);
14254 }
14255 }
14256 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14257 assert(!ctx->new_snapset.is_legacy());
14258 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14259 ctx->register_on_success(
14260 [this]() {
14261 dout(20) << "updating snapset" << dendl;
14262 if (--scrubber.num_digest_updates_pending == 0) {
14263 requeue_scrub();
14264 }
14265 });
14266
14267 simple_opc_submit(std::move(ctx));
14268 ++scrubber.num_digest_updates_pending;
14269 }
14270
14271 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14272 }
14273
14274 void PrimaryLogPG::_scrub_clear_state()
14275 {
14276 scrub_cstat = object_stat_collection_t();
14277 }
14278
14279 void PrimaryLogPG::_scrub_finish()
14280 {
14281 bool repair = state_test(PG_STATE_REPAIR);
14282 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14283 const char *mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
14284
14285 if (info.stats.stats_invalid) {
14286 info.stats.stats = scrub_cstat;
14287 info.stats.stats_invalid = false;
14288
14289 if (agent_state)
14290 agent_choose_mode();
14291 }
14292
14293 dout(10) << mode << " got "
14294 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14295 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14296 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14297 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14298 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14299 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14300 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14301 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14302 << dendl;
14303
14304 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14305 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14306 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14307 !info.stats.dirty_stats_invalid) ||
14308 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14309 !info.stats.omap_stats_invalid) ||
14310 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14311 !info.stats.pin_stats_invalid) ||
14312 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14313 !info.stats.hitset_stats_invalid) ||
14314 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14315 !info.stats.hitset_bytes_stats_invalid) ||
14316 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14317 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14318 osd->clog->error() << info.pgid << " " << mode
14319 << " stat mismatch, got "
14320 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14321 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14322 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14323 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14324 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14325 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14326 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14327 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14328 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14329 ++scrubber.shallow_errors;
14330
14331 if (repair) {
14332 ++scrubber.fixed;
14333 info.stats.stats = scrub_cstat;
14334 info.stats.dirty_stats_invalid = false;
14335 info.stats.omap_stats_invalid = false;
14336 info.stats.hitset_stats_invalid = false;
14337 info.stats.hitset_bytes_stats_invalid = false;
14338 publish_stats_to_osd();
14339 share_pg_info();
14340 }
14341 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14342 info.stats.stats.sum.num_legacy_snapsets) {
14343 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14344 << " from " << info.stats.stats.sum.num_legacy_snapsets
14345 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14346 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14347 publish_stats_to_osd();
14348 share_pg_info();
14349 }
14350 // Clear the object context cache so that repair sees fresh object info
14351 if (repair)
14352 object_contexts.clear();
14353 }
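// The gating pattern used in _scrub_finish() above: a per-category mismatch
// only counts as an error while the corresponding *_stats_invalid flag is
// clear, i.e. (num_foo is a hypothetical category):
//
//   bool mismatch = scrub_cstat.sum.num_foo != info.stats.stats.sum.num_foo &&
//                   !info.stats.foo_stats_invalid;
//
// num_objects, num_object_clones, num_whiteouts and num_bytes have no such
// flag and are always compared.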
14354
14355 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14356 {
14357 return osd->check_osdmap_full(missing_on);
14358 }
14359
14360 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14361 {
14362 // Only supports replicated pools
14363 assert(!pool.info.require_rollback());
14364 assert(is_primary());
14365
14366 dout(10) << __func__ << " " << soid
14367 << " peers osd.{" << actingbackfill << "}" << dendl;
14368
14369 if (!is_clean()) {
14370 block_for_clean(soid, op);
14371 return -EAGAIN;
14372 }
14373
14374 assert(!pg_log.get_missing().is_missing(soid));
14375 bufferlist bv;
14376 object_info_t oi;
14377 eversion_t v;
14378 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14379 if (r < 0) {
14380 // Leave v unset and try to repair without a version; fetching the attr failed
14381 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14382 << soid << " error=" << r << dendl;
14383 } else try {
14384 bufferlist::iterator bliter = bv.begin();
14385 ::decode(oi, bliter);
14386 v = oi.version;
14387 } catch (...) {
14388 // Leave v as default constructed. This will fail when sent to older OSDs, but
14389 // not much worse than failing here.
14390 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14391 }
14392
14393 missing_loc.add_missing(soid, v, eversion_t());
14394 if (primary_error(soid, v)) {
14395 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14396 // XXX: If we knew that no down osd could hold a copy of this
14397 // object, it would be nice to return EIO here.
14398 // If a "never fail" flag were available, rbd could use it to
14399 // avoid returning EIO until the object is marked lost.
14400
14401 // Fall through to save this op in case an osd comes up with the object.
14402 }
14403
14404 // Restart the op after object becomes readable again
14405 waiting_for_unreadable_object[soid].push_back(op);
14406 op->mark_delayed("waiting for missing object");
14407
14408 if (!eio_errors_to_process) {
14409 eio_errors_to_process = true;
14410 assert(is_clean());
14411 queue_peering_event(
14412 CephPeeringEvtRef(
14413 std::make_shared<CephPeeringEvt>(
14414 get_osdmap()->get_epoch(),
14415 get_osdmap()->get_epoch(),
14416 DoRecovery())));
14417 } else {
14418 // A prior error must already have cleared the clean state and queued
14419 // recovery, or a map change has triggered re-peering.
14420 // We deliberately do not inline recovery here via maybe_kick_recovery(soid).
14421 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14422 }
14423
14424 return -EAGAIN;
14425 }
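// Hedged usage sketch (not the verbatim call site): a primary read path that
// hits an EIO is expected to do something like
//
//   int r = do_read(...);                       // hypothetical helper
//   if (r == -EIO)
//     r = rep_repair_primary_object(soid, op);  // returns -EAGAIN; op now
//                                               // waits on waiting_for_unreadable_object
//
// and then simply drop the op, which is restarted once recovery makes the
// object readable again.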
14426
14427 /*---SnapTrimmer Logging---*/
14428 #undef dout_prefix
14429 #define dout_prefix *_dout << pg->gen_prefix()
14430
14431 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14432 {
14433 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14434 }
14435
14436 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14437 {
14438 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14439 }
14440
14441 /*---SnapTrimmer states---*/
14442 #undef dout_prefix
14443 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14444 << "SnapTrimmer state<" << get_state_name() << ">: ")
14445
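// Shape of the SnapTrimmer state machine, as visible from the reactions in
// this excerpt (transitions handled elsewhere are not shown):
//
//   NotTrimming --KickTrim--> WaitScrub                  (scrub in progress)
//   NotTrimming --KickTrim--> Trimming/WaitReservation   (otherwise)
//   WaitReservation --SnapTrimReserved--> AwaitAsyncWork (or back to
//                                         NotTrimming if we can no longer trim)
//   AwaitAsyncWork --DoSnapWork--> WaitRepops   (repops queued or in flight)
//   AwaitAsyncWork --DoSnapWork--> WaitRWLock   (-ENOLCK, nothing in flight)
//   AwaitAsyncWork --DoSnapWork--> NotTrimming  (snap fully trimmed, via KickTrim)
//
// When the last in-flight repop completes, the on-success callback posts
// RepopsComplete, or Reset if PG_STATE_SNAPTRIM_ERROR was set.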
14446 /* NotTrimming */
14447 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14448 : my_base(ctx),
14449 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14450 {
14451 context< SnapTrimmer >().log_enter(state_name);
14452 }
14453
14454 void PrimaryLogPG::NotTrimming::exit()
14455 {
14456 context< SnapTrimmer >().log_exit(state_name, enter_time);
14457 }
14458
14459 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14460 {
14461 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14462 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14463
14464 if (!(pg->is_primary() && pg->is_active())) {
14465 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14466 return discard_event();
14467 }
14468 if (!pg->is_clean() ||
14469 pg->snap_trimq.empty()) {
14470 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14471 return discard_event();
14472 }
14473 if (pg->scrubber.active) {
14474 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14475 return transit< WaitScrub >();
14476 } else {
14477 return transit< Trimming >();
14478 }
14479 }
14480
14481 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14482 {
14483 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14484 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14485
14486 pending = nullptr;
14487 if (!context< SnapTrimmer >().can_trim()) {
14488 post_event(KickTrim());
14489 return transit< NotTrimming >();
14490 }
14491
14492 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14493 ldout(pg->cct, 10) << "NotTrimming: trimming "
14494 << pg->snap_trimq.range_start()
14495 << dendl;
14496 return transit< AwaitAsyncWork >();
14497 }
14498
14499 /* AwaitAsyncWork */
14500 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14501 : my_base(ctx),
14502 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14503 {
14504 auto *pg = context< SnapTrimmer >().pg;
14505 context< SnapTrimmer >().log_enter(state_name);
14506 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14507 pg->state_set(PG_STATE_SNAPTRIM);
14508 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14509 pg->publish_stats_to_osd();
14510 }
14511
14512 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14513 {
14514 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14515 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14516 auto &in_flight = context<Trimming>().in_flight;
14517 assert(in_flight.empty());
14518
14519 assert(pg->is_primary() && pg->is_active());
14520 if (!context< SnapTrimmer >().can_trim()) {
14521 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14522 post_event(KickTrim());
14523 return transit< NotTrimming >();
14524 }
14525
14526 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14527
14528 vector<hobject_t> to_trim;
14529 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14530 to_trim.reserve(max);
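// osd_pg_max_concurrent_snap_trims bounds how many objects a single pass
// queues for trimming; in luminous its default is 2 (assumption: see
// common/options.cc for the authoritative value).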
14531 int r = pg->snap_mapper.get_next_objects_to_trim(
14532 snap_to_trim,
14533 max,
14534 &to_trim);
14535 if (r != 0 && r != -ENOENT) {
14536 lderr(pg->cct) << "get_next_objects_to_trim returned "
14537 << cpp_strerror(r) << dendl;
14538 assert(0 == "get_next_objects_to_trim returned an invalid code");
14539 } else if (r == -ENOENT) {
14540 // Done!
14541 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14542
14543 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14544 << " to purged_snaps"
14545 << dendl;
14546 pg->info.purged_snaps.insert(snap_to_trim);
14547 pg->snap_trimq.erase(snap_to_trim);
14548 ldout(pg->cct, 10) << "purged_snaps now "
14549 << pg->info.purged_snaps << ", snap_trimq now "
14550 << pg->snap_trimq << dendl;
14551
14552 ObjectStore::Transaction t;
14553 pg->dirty_big_info = true;
14554 pg->write_if_dirty(t);
14555 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14556 assert(tr == 0);
14557
14558 pg->share_pg_info();
14559 post_event(KickTrim());
14560 return transit< NotTrimming >();
14561 }
14562 assert(!to_trim.empty());
14563
14564 for (auto &&object: to_trim) {
14565 // Trim this object
14566 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14567 OpContextUPtr ctx;
14568 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14569 if (error) {
14570 if (error == -ENOLCK) {
14571 ldout(pg->cct, 10) << "could not get write lock on obj "
14572 << object << dendl;
14573 } else {
14574 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14575 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14576 }
14577 if (!in_flight.empty()) {
14578 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14579 return transit< WaitRepops >();
14580 }
14581 if (error == -ENOLCK) {
14582 ldout(pg->cct, 10) << "waiting for it to clear"
14583 << dendl;
14584 return transit< WaitRWLock >();
14585 } else {
14586 return transit< NotTrimming >();
14587 }
14588 }
14589
14590 in_flight.insert(object);
14591 ctx->register_on_success(
14592 [pg, object, &in_flight]() {
14593 assert(in_flight.find(object) != in_flight.end());
14594 in_flight.erase(object);
14595 if (in_flight.empty()) {
14596 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14597 pg->snap_trimmer_machine.process_event(Reset());
14598 } else {
14599 pg->snap_trimmer_machine.process_event(RepopsComplete());
14600 }
14601 }
14602 });
14603
14604 pg->simple_opc_submit(std::move(ctx));
14605 }
14606
14607 return transit< WaitRepops >();
14608 }
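// Summary of the per-object flow above: trim_object() builds a repop for one
// clone; -ENOLCK means the object is write-locked and is retried via
// WaitRWLock, while any other error flags PG_STATE_SNAPTRIM_ERROR. Each
// queued repop removes itself from in_flight on success, firing
// RepopsComplete (or Reset on error) once the set drains.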
14609
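// The *_maybe_cache helpers below pair with obc->attr_cache: for pools that
// require rollback (i.e. erasure-coded pools) the getters are answered from
// the cached attr map, while the setters always write through to the
// PGTransaction; keeping attr_cache in sync happens outside this excerpt.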
14610 void PrimaryLogPG::setattr_maybe_cache(
14611 ObjectContextRef obc,
14612 OpContext *op,
14613 PGTransaction *t,
14614 const string &key,
14615 bufferlist &val)
14616 {
14617 t->setattr(obc->obs.oi.soid, key, val);
14618 }
14619
14620 void PrimaryLogPG::setattrs_maybe_cache(
14621 ObjectContextRef obc,
14622 OpContext *op,
14623 PGTransaction *t,
14624 map<string, bufferlist> &attrs)
14625 {
14626 t->setattrs(obc->obs.oi.soid, attrs);
14627 }
14628
14629 void PrimaryLogPG::rmattr_maybe_cache(
14630 ObjectContextRef obc,
14631 OpContext *op,
14632 PGTransaction *t,
14633 const string &key)
14634 {
14635 t->rmattr(obc->obs.oi.soid, key);
14636 }
14637
14638 int PrimaryLogPG::getattr_maybe_cache(
14639 ObjectContextRef obc,
14640 const string &key,
14641 bufferlist *val)
14642 {
14643 if (pool.info.require_rollback()) {
14644 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14645 if (i != obc->attr_cache.end()) {
14646 if (val)
14647 *val = i->second;
14648 return 0;
14649 } else {
14650 return -ENODATA;
14651 }
14652 }
14653 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14654 }
14655
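// Naming convention assumed here: user xattrs are stored with a leading '_'
// (e.g. "_mykey"), while internal attrs such as the object_info attr (a bare
// "_") and the snapset attr carry no such user prefix; the filter below keeps
// only the user attrs, with the prefix stripped.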
14656 int PrimaryLogPG::getattrs_maybe_cache(
14657 ObjectContextRef obc,
14658 map<string, bufferlist> *out)
14659 {
14660 int r = 0;
14661 assert(out);
14662 if (pool.info.require_rollback()) {
14663 *out = obc->attr_cache;
14664 } else {
14665 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14666 }
14667 map<string, bufferlist> tmp;
14668 for (map<string, bufferlist>::iterator i = out->begin();
14669 i != out->end();
14670 ++i) {
14671 if (i->first.size() > 1 && i->first[0] == '_')
14672 tmp[i->first.substr(1, i->first.size())].claim(i->second);
14673 }
14674 tmp.swap(*out);
14675 return r;
14676 }
14677
14678 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14679 return osd->check_failsafe_full(ss);
14680 }
14681
14682 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14683 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14684
14685 #ifdef PG_DEBUG_REFS
14686 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14687 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14688 #endif
14689
14690 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14691 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }