// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

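// Completion context queued once a repop has been applied: re-takes the
// PG lock and, unless the PG has been reset since the recorded epoch,
// notifies the PG via op_applied(v).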
struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

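// "Blessing" a callback wraps it so that when it fires it takes the PG
// lock first, and is silently dropped if the PG has been reset since the
// epoch at which the callback was created.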
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
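// Async-read flow: start_async_reads() hands all queued reads to the
// backend as a single batch whose completion runs OnReadComplete, which
// in turn calls finish_read(). Once no reads remain inflight the op
// context is popped off in_progress_async_reads and re-executed.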
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher.
    pg->execute_ctx(this);
  }
}

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

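// Called when an object has been recovered locally. Re-establishes the
// snap mapping for recovered clones, rewrites the object_info if an old
// LOST_REVERT target version was pushed, rolls the log forward when the
// recovered version is past can_rollback_to, and (on the primary) marks
// the object as recovered here and requeues any waiters once the
// transaction applies.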
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

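// Called once an object has been recovered on every shard that needed
// it: update stats, drop the recovery read lock, and requeue any ops
// that were waiting on the degraded or unreadable object.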
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

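// If the PG log has grown past osd_max_pg_log_entries *
// osd_force_recovery_pg_log_entries_factor, kick recovery of the oldest
// missing object (here or on any acting/backfill peer) so the log can
// eventually be trimmed.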
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

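// Built-in PGLS filters: "plain" matches objects whose named xattr
// equals a supplied value; "parent" decodes the "_parent" backtrace
// xattr written by CephFS and matches objects under a given directory
// inode. Further filters can be loaded from object classes (see
// get_pgls_filter() below).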
class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

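// Decode a filter description and construct the matching PGLSFilter.
// The type is either a built-in name ("parent", "plain") or
// "<class>.<filter>", which is resolved through the class handler; for
// example, a hypothetical type string "myclass.myfilter" would load
// object class "myclass" and ask it to construct filter "myfilter".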
int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}

// ==========================================================

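// Handle pg-wide ops: PGLS/PGNLS listing, hit-set queries, and scrub
// error listing. Listing merges the backend's on-disk listing with the
// missing set so objects that are mid-recovery still appear exactly once.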
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

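// Decide how far the PG log may be trimmed. The trim point is capped by
// both min_last_complete_ondisk and the log's can_rollback_to, and we
// only bother once at least osd_pg_log_trim_min entries are trimmable;
// degraded or backfilling PGs keep up to osd_max_pg_log_entries instead
// of osd_min_pg_log_entries.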
void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

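// Handle a client's ack of a backoff. The acked range is clamped to the
// PG's current hobject range before being passed to the session,
// presumably because the PG's range may have changed (e.g. via split)
// since the backoff was sent.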
void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

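// Top-level dispatch for every request reaching this PG. Ordering is
// preserved per source: an op queues behind any earlier op from the same
// source still waiting for a newer OSDMap, and nothing is processed
// until the PG is peered and in-flight flushes have completed.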
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

1833 /** do_op - do an op
1834 * pg lock will be held (if multithreaded)
1835 * osd_lock NOT held.
1836 */
1837 void PrimaryLogPG::do_op(OpRequestRef& op)
1838 {
1839 FUNCTRACE();
1840 // NOTE: take a non-const pointer here; we must be careful not to
1841 // change anything that will break other reads on m (operator<<).
1842 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1843 assert(m->get_type() == CEPH_MSG_OSD_OP);
1844 if (m->finish_decode()) {
1845 op->reset_desc(); // for TrackedOp
1846 m->clear_payload();
1847 }
1848
1849 dout(20) << __func__ << ": op " << *m << dendl;
1850
1851 hobject_t head = m->get_hobj();
1852 head.snap = CEPH_NOSNAP;
1853
1854 if (!info.pgid.pgid.contains(
1855 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1856 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1857 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1858 << std::hex << head.get_hash() << std::dec << dendl;
1859 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1860 << " op " << *m;
1861 assert(!cct->_conf->osd_debug_misdirected_ops);
1862 return;
1863 }
1864
1865 bool can_backoff =
1866 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1867 SessionRef session;
1868 if (can_backoff) {
1869 session = static_cast<Session*>(m->get_connection()->get_priv());
1870 if (!session.get()) {
1871 dout(10) << __func__ << " no session" << dendl;
1872 return;
1873 }
1874 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1875
1876 if (session->check_backoff(cct, info.pgid, head, m)) {
1877 return;
1878 }
1879 }
1880
1881 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1882 // not implemented.
1883 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1884 osd->reply_op_error(op, -EINVAL);
1885 return;
1886 }
1887
1888 if (op->rmw_flags == 0) {
1889 int r = osd->osd->init_op_flags(op);
1890 if (r) {
1891 osd->reply_op_error(op, r);
1892 return;
1893 }
1894 }
1895
1896 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1897 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1898 op->may_read() &&
1899 !(op->may_write() || op->may_cache())) {
1900 // balanced reads; any replica will do
1901 if (!(is_primary() || is_replica())) {
1902 osd->handle_misdirected_op(this, op);
1903 return;
1904 }
1905 } else {
1906 // normal case; must be primary
1907 if (!is_primary()) {
1908 osd->handle_misdirected_op(this, op);
1909 return;
1910 }
1911 }
1912
1913 if (!op_has_sufficient_caps(op)) {
1914 osd->reply_op_error(op, -EPERM);
1915 return;
1916 }
1917
1918 if (op->includes_pg_op()) {
1919 return do_pg_op(op);
1920 }
1921
1922 // object name too long?
1923 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1924 dout(4) << "do_op name is longer than "
1925 << cct->_conf->osd_max_object_name_len
1926 << " bytes" << dendl;
1927 osd->reply_op_error(op, -ENAMETOOLONG);
1928 return;
1929 }
1930 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1931 dout(4) << "do_op locator is longer than "
1932 << cct->_conf->osd_max_object_name_len
1933 << " bytes" << dendl;
1934 osd->reply_op_error(op, -ENAMETOOLONG);
1935 return;
1936 }
1937 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1938 dout(4) << "do_op namespace is longer than "
1939 << cct->_conf->osd_max_object_namespace_len
1940 << " bytes" << dendl;
1941 osd->reply_op_error(op, -ENAMETOOLONG);
1942 return;
1943 }
1944
1945 if (int r = osd->store->validate_hobject_key(head)) {
1946 dout(4) << "do_op object " << head << " invalid for backing store: "
1947 << r << dendl;
1948 osd->reply_op_error(op, r);
1949 return;
1950 }
1951
1952 // blacklisted?
1953 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1954 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1955 osd->reply_op_error(op, -EBLACKLISTED);
1956 return;
1957 }
1958
1959 // order this op as a write?
1960 bool write_ordered = op->rwordered();
1961
1962 // discard due to cluster full transition? (we discard any op that
1963 // originates before the cluster or pool is marked full; the client
1964 // will resend after the full flag is removed or if they expect the
1965 // op to succeed despite being full). The exceptions are FULL_FORCE and
1966 // FULL_TRY ops, which there is no reason to discard because they
1967 // bypass all full checks anyway. If this op isn't write-ordered, we
1968 // skip this check.
1969 // FIXME: we exclude mds writes for now.
1970 if (write_ordered && !(m->get_source().is_mds() ||
1971 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1972 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1973 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1974 dout(10) << __func__ << " discarding op sent before full " << m << " "
1975 << *m << dendl;
1976 return;
1977 }
1978 // mds should have stopped writing before this point.
1979 // We can't allow OSD to become non-startable even if mds
1980 // could be writing as part of file removals.
1981 ostringstream ss;
1982 if (write_ordered && osd->check_failsafe_full(ss)) {
1983 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1984 << ss.str()
1985 << dendl;
1986 return;
1987 }
1988 int64_t poolid = get_pgid().pool();
1989 if (op->may_write()) {
1990
1991 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1992 if (!pi) {
1993 return;
1994 }
1995
1996 // invalid?
1997 if (m->get_snapid() != CEPH_NOSNAP) {
1998 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1999 osd->reply_op_error(op, -EINVAL);
2000 return;
2001 }
2002
2003 // too big?
2004 if (cct->_conf->osd_max_write_size &&
2005 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2006 // journal can't hold commit!
2007 derr << "do_op msg data len " << m->get_data_len()
2008 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2009 << " on " << *m << dendl;
2010 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2011 return;
2012 }
2013 }
2014
2015 dout(10) << "do_op " << *m
2016 << (op->may_write() ? " may_write" : "")
2017 << (op->may_read() ? " may_read" : "")
2018 << (op->may_cache() ? " may_cache" : "")
2019 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2020 << " flags " << ceph_osd_flag_string(m->get_flags())
2021 << dendl;
2022
2023 // missing object?
2024 if (is_unreadable_object(head)) {
2025 if (!is_primary()) {
2026 osd->reply_op_error(op, -EAGAIN);
2027 return;
2028 }
2029 if (can_backoff &&
2030 (g_conf->osd_backoff_on_degraded ||
2031 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2032 add_backoff(session, head, head);
2033 maybe_kick_recovery(head);
2034 } else {
2035 wait_for_unreadable_object(head, op);
2036 }
2037 return;
2038 }
2039
2040 // degraded object?
2041 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2042 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2043 add_backoff(session, head, head);
2044 maybe_kick_recovery(head);
2045 } else {
2046 wait_for_degraded_object(head, op);
2047 }
2048 return;
2049 }
2050
2051 if (write_ordered &&
2052 scrubber.write_blocked_by_scrub(head)) {
2053 dout(20) << __func__ << ": waiting for scrub" << dendl;
2054 waiting_for_scrub.push_back(op);
2055 op->mark_delayed("waiting for scrub");
2056 return;
2057 }
2058
2059 // blocked on snap?
2060 map<hobject_t, snapid_t>::iterator blocked_iter =
2061 objects_blocked_on_degraded_snap.find(head);
2062 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2063 hobject_t to_wait_on(head);
2064 to_wait_on.snap = blocked_iter->second;
2065 wait_for_degraded_object(to_wait_on, op);
2066 return;
2067 }
2068 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2069 objects_blocked_on_snap_promotion.find(head);
2070 if (write_ordered &&
2071 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2072 wait_for_blocked_object(
2073 blocked_snap_promote_iter->second->obs.oi.soid,
2074 op);
2075 return;
2076 }
2077 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2078 block_write_on_full_cache(head, op);
2079 return;
2080 }
2081
2082 // missing snapdir?
2083 hobject_t snapdir = head.get_snapdir();
2084
2085 if (is_unreadable_object(snapdir)) {
2086 wait_for_unreadable_object(snapdir, op);
2087 return;
2088 }
2089
2090 // degraded object?
2091 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2092 wait_for_degraded_object(snapdir, op);
2093 return;
2094 }
2095
2096 // dup/resent?
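  // (e.g. after a reconnect a client may resend a write that has already
  // committed; check_in_progress_op() finds the reqid in the log or among
  // in-flight repops and we answer with the recorded result instead of
  // re-executing it)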
2097 if (op->may_write() || op->may_cache()) {
2098 // warning: we will get back *a* request for this reqid, but not
2099 // necessarily the most recent. this happens with flush and
2100 // promote ops, but we can't possibly have both in our log where
2101 // the original request is still not stable on disk, so for our
2102 // purposes here it doesn't matter which one we get.
2103 eversion_t version;
2104 version_t user_version;
2105 int return_code = 0;
2106 bool got = check_in_progress_op(
2107 m->get_reqid(), &version, &user_version, &return_code);
2108 if (got) {
2109 dout(3) << __func__ << " dup " << m->get_reqid()
2110 << " version " << version << dendl;
2111 if (already_complete(version)) {
2112 osd->reply_op_error(op, return_code, version, user_version);
2113 } else {
2114 dout(10) << " waiting for " << version << " to commit" << dendl;
2115 // always queue ondisk waiters, so that we can requeue if needed
2116 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2117 op->mark_delayed("waiting for ondisk");
2118 }
2119 return;
2120 }
2121 }
2122
2123 ObjectContextRef obc;
2124 bool can_create = op->may_write() || op->may_cache();
2125 hobject_t missing_oid;
2126 const hobject_t& oid = m->get_hobj();
2127
2128 // io blocked on obc?
2129 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2130 maybe_await_blocked_snapset(oid, op)) {
2131 return;
2132 }
2133
2134 int r = find_object_context(
2135 oid, &obc, can_create,
2136 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2137 &missing_oid);
2138
2139 if (r == -EAGAIN) {
2140 // If we're not the primary for this PG, we just reply -EAGAIN below.
2141 // Otherwise, we have to wait for the object.
2142 if (is_primary()) {
2143 // missing the specific snap we need; requeue and wait.
2144 assert(!op->may_write()); // only happens on a read/cache
2145 wait_for_unreadable_object(missing_oid, op);
2146 return;
2147 }
2148 } else if (r == 0) {
2149 if (is_unreadable_object(obc->obs.oi.soid)) {
2150 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2151 << " is unreadable, waiting" << dendl;
2152 wait_for_unreadable_object(obc->obs.oi.soid, op);
2153 return;
2154 }
2155
2156 // degraded object? (the check above was for head; this could be a clone)
2157 if (write_ordered &&
2158 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2159 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2160 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2161 << " is degraded, waiting" << dendl;
2162 wait_for_degraded_object(obc->obs.oi.soid, op);
2163 return;
2164 }
2165 }
2166
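  // Hit-set accounting: the in-memory hit set records recent accesses
  // (typically as a bloom filter). Note whether this object is already in
  // it, record the current access, and persist the set once it fills up or
  // its period expires; maybe_promote() later uses this to gauge hotness.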
2167 bool in_hit_set = false;
2168 if (hit_set) {
2169 if (obc.get()) {
2170 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2171 in_hit_set = true;
2172 } else {
2173 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2174 in_hit_set = true;
2175 }
2176 if (!op->hitset_inserted) {
2177 hit_set->insert(oid);
2178 op->hitset_inserted = true;
2179 if (hit_set->is_full() ||
2180 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2181 hit_set_persist();
2182 }
2183 }
2184 }
2185
2186 if (agent_state) {
2187 if (agent_choose_mode(false, op))
2188 return;
2189 }
2190
2191 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2192 if (maybe_handle_manifest(op,
2193 write_ordered,
2194 obc))
2195 return;
2196 }
2197
2198 if (maybe_handle_cache(op,
2199 write_ordered,
2200 obc,
2201 r,
2202 missing_oid,
2203 false,
2204 in_hit_set))
2205 return;
2206
2207 if (r && (r != -ENOENT || !obc)) {
2208 // copy the reqids for copy get on ENOENT
2209 if (r == -ENOENT &&
2210 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2211 fill_in_copy_get_noent(op, oid, m->ops[0]);
2212 return;
2213 }
2214 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2215 if (op->may_write() &&
2216 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2217 record_write_error(op, oid, nullptr, r);
2218 } else {
2219 osd->reply_op_error(op, r);
2220 }
2221 return;
2222 }
2223
2224 // make sure locator is consistent
2225 object_locator_t oloc(obc->obs.oi.soid);
2226 if (m->get_object_locator() != oloc) {
2227 dout(10) << " provided locator " << m->get_object_locator()
2228 << " != object's " << obc->obs.oi.soid << dendl;
2229 osd->clog->warn() << "bad locator " << m->get_object_locator()
2230 << " on object " << oloc
2231 << " op " << *m;
2232 }
2233
2234 // io blocked on obc?
2235 if (obc->is_blocked() &&
2236 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2237 wait_for_blocked_object(obc->obs.oi.soid, op);
2238 return;
2239 }
2240
2241 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2242
2243 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2244 OSDOp& osd_op = *p;
2245
2246 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2247 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2248 m->get_snapid() != CEPH_SNAPDIR) {
2249 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2250 osd->reply_op_error(op, -EINVAL);
2251 return;
2252 }
2253 }
2254
2255 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2256
2257 if (!obc->obs.exists)
2258 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2259
2260 /* Due to obc caching, we might have a cached non-existent snapset_obc
2261 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2262 * do_op pipeline make decisions based on whether snapset_obc is
2263 * populated.
2264 */
2265 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2266 ctx->snapset_obc = ObjectContextRef();
2267
2268 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2269 dout(20) << __func__ << ": skipping rw locks" << dendl;
2270 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2271 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2272
2273 // verify there is in fact a flush in progress
2274 // FIXME: we could make this a stronger test.
2275 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2276 if (p == flush_ops.end()) {
2277 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2278 reply_ctx(ctx, -EINVAL);
2279 return;
2280 }
2281 } else if (!get_rw_locks(write_ordered, ctx)) {
2282 dout(20) << __func__ << " waiting for rw locks " << dendl;
2283 op->mark_delayed("waiting for rw locks");
2284 close_op_ctx(ctx);
2285 return;
2286 }
2287 dout(20) << __func__ << " obc " << *obc << dendl;
2288
2289 if (r) {
2290 dout(20) << __func__ << " returned an error: " << r << dendl;
2291 close_op_ctx(ctx);
2292 if (op->may_write() &&
2293 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2294 record_write_error(op, oid, nullptr, r);
2295 } else {
2296 osd->reply_op_error(op, r);
2297 }
2298 return;
2299 }
2300
2301 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2302 ctx->ignore_cache = true;
2303 }
2304
2305 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2306 // This object is lost. Reading from it returns an error.
2307 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2308 << " is lost" << dendl;
2309 reply_ctx(ctx, -ENFILE);
2310 return;
2311 }
2312 if (!op->may_write() &&
2313 !op->may_cache() &&
2314 (!obc->obs.exists ||
2315 ((m->get_snapid() != CEPH_SNAPDIR) &&
2316 obc->obs.oi.is_whiteout()))) {
2317 // copy the reqids for copy get on ENOENT
2318 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2319 fill_in_copy_get_noent(op, oid, m->ops[0]);
2320 close_op_ctx(ctx);
2321 return;
2322 }
2323 reply_ctx(ctx, -ENOENT);
2324 return;
2325 }
2326
2327 op->mark_started();
2328
2329 execute_ctx(ctx);
2330 utime_t prepare_latency = ceph_clock_now();
2331 prepare_latency -= op->get_dequeued_time();
2332 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2333 if (op->may_read() && op->may_write()) {
2334 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2335 } else if (op->may_read()) {
2336 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2337 } else if (op->may_write() || op->may_cache()) {
2338 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2339 }
2340
2341 // force recovery of the oldest missing object if too many logs
2342 maybe_force_recovery();
2343 }
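
// Decide what to do with an op that touches a manifest object: NOOP lets
// normal processing continue (e.g. for SET_REDIRECT itself), while
// HANDLED_PROXY means the op was proxied to the redirect target.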
2344 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2345 OpRequestRef op,
2346 bool write_ordered,
2347 ObjectContextRef obc)
2348 {
2349 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2350 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2351 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2352 return cache_result_t::NOOP;
2353 }
2354
2355 if (obc)
2356 dout(10) << __func__ << " " << obc->obs.oi << " "
2357 << (obc->obs.exists ? "exists" : "DNE")
2358 << dendl;
2359
2360 // if it is write-ordered and blocked, stop now
2361 if (obc.get() && obc->is_blocked() && write_ordered) {
2362 // we're already doing something with this object
2363 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2364 return cache_result_t::NOOP;
2365 }
2366
2367 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2368 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2369 OSDOp& osd_op = *p;
2370 ceph_osd_op& op = osd_op.op;
2371 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2372 return cache_result_t::NOOP;
2373 }
2374 }
2375
2376 switch (obc->obs.oi.manifest.type) {
2377 case object_manifest_t::TYPE_REDIRECT:
2378 if (op->may_write() || write_ordered) {
2379 do_proxy_write(op, obc->obs.oi.soid, obc);
2380 } else {
2381 do_proxy_read(op, obc);
2382 }
2383 return cache_result_t::HANDLED_PROXY;
2384 case object_manifest_t::TYPE_CHUNKED:
2385 default:
2386 assert(0 == "unrecognized manifest type");
2387 }
2388
2389 return cache_result_t::NOOP;
2390 }
2391
2392 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2393 MOSDOpReply *orig_reply, int r)
2394 {
2395 dout(20) << __func__ << " r=" << r << dendl;
2396 assert(op->may_write());
2397 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2398 mempool::osd_pglog::list<pg_log_entry_t> entries;
2399 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2400 get_next_version(), eversion_t(), 0,
2401 reqid, utime_t(), r));
2402
2403 struct OnComplete {
2404 PrimaryLogPG *pg;
2405 OpRequestRef op;
2406 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2407 int r;
2408 OnComplete(
2409 PrimaryLogPG *pg,
2410 OpRequestRef op,
2411 MOSDOpReply *orig_reply,
2412 int r)
2413 : pg(pg), op(op),
2414 orig_reply(orig_reply, false /* take over ref */), r(r)
2415 {}
2416 void operator()() {
2417 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2418 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2419 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2420 MOSDOpReply *reply = orig_reply.detach();
2421 if (reply == nullptr) {
2422 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2423 flags, true);
2424 }
2425 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2426 pg->osd->send_message_osd_client(reply, m->get_connection());
2427 }
2428 };
2429
2430 ObcLockManager lock_manager;
2431 submit_log_entries(
2432 entries,
2433 std::move(lock_manager),
2434 boost::optional<std::function<void(void)> >(
2435 OnComplete(this, op, orig_reply, r)),
2436 op,
2437 r);
2438 }
2439
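// Cache-tier dispatch: returns NOOP when the op should proceed locally,
// HANDLED_PROXY / HANDLED_REDIRECT when it was proxied or the client was
// redirected to the base tier, and BLOCKED_PROMOTE / BLOCKED_FULL when the
// op is parked behind a promotion or a full cache pool.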
2440 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2441 OpRequestRef op,
2442 bool write_ordered,
2443 ObjectContextRef obc,
2444 int r, hobject_t missing_oid,
2445 bool must_promote,
2446 bool in_hit_set,
2447 ObjectContextRef *promote_obc)
2448 {
2449 if (op &&
2450 op->get_req() &&
2451 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2452 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2453 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2454 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2455 return cache_result_t::NOOP;
2456 }
2457 // return quickly if caching is not enabled
2458 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2459 return cache_result_t::NOOP;
2460
2461 must_promote = must_promote || op->need_promote();
2462
2463 if (obc)
2464 dout(25) << __func__ << " " << obc->obs.oi << " "
2465 << (obc->obs.exists ? "exists" : "DNE")
2466 << " missing_oid " << missing_oid
2467 << " must_promote " << (int)must_promote
2468 << " in_hit_set " << (int)in_hit_set
2469 << dendl;
2470 else
2471 dout(25) << __func__ << " (no obc)"
2472 << " missing_oid " << missing_oid
2473 << " must_promote " << (int)must_promote
2474 << " in_hit_set " << (int)in_hit_set
2475 << dendl;
2476
2477 // if it is write-ordered and blocked, stop now
2478 if (obc.get() && obc->is_blocked() && write_ordered) {
2479 // we're already doing something with this object
2480 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2481 return cache_result_t::NOOP;
2482 }
2483
2484 if (r == -ENOENT && missing_oid == hobject_t()) {
2485 // we know this object is logically absent (e.g., an undefined clone)
2486 return cache_result_t::NOOP;
2487 }
2488
2489 if (obc.get() && obc->obs.exists) {
2490 osd->logger->inc(l_osd_op_cache_hit);
2491 return cache_result_t::NOOP;
2492 }
2493
2494 if (missing_oid == hobject_t() && obc.get()) {
2495 missing_oid = obc->obs.oi.soid;
2496 }
2497
2498 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2499 const object_locator_t oloc = m->get_object_locator();
2500
2501 if (op->need_skip_handle_cache()) {
2502 return cache_result_t::NOOP;
2503 }
2504
2505 // older versions do not proxy the feature bits.
2506 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2507 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2508 OpRequestRef promote_op;
2509
2510 switch (pool.info.cache_mode) {
2511 case pg_pool_t::CACHEMODE_WRITEBACK:
2512 if (agent_state &&
2513 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2514 if (!op->may_write() && !op->may_cache() &&
2515 !write_ordered && !must_promote) {
2516 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2517 do_proxy_read(op);
2518 return cache_result_t::HANDLED_PROXY;
2519 }
2520 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2521 block_write_on_full_cache(missing_oid, op);
2522 return cache_result_t::BLOCKED_FULL;
2523 }
2524
2525 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2526 promote_object(obc, missing_oid, oloc, op, promote_obc);
2527 return cache_result_t::BLOCKED_PROMOTE;
2528 }
2529
2530 if (op->may_write() || op->may_cache()) {
2531 if (can_proxy_write) {
2532 do_proxy_write(op, missing_oid);
2533 } else {
2534 // promote if can't proxy the write
2535 promote_object(obc, missing_oid, oloc, op, promote_obc);
2536 return cache_result_t::BLOCKED_PROMOTE;
2537 }
2538
2539 // Promote too?
2540 if (!op->need_skip_promote() &&
2541 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2542 pool.info.min_write_recency_for_promote,
2543 OpRequestRef(),
2544 promote_obc)) {
2545 return cache_result_t::BLOCKED_PROMOTE;
2546 }
2547 return cache_result_t::HANDLED_PROXY;
2548 } else {
2549 do_proxy_read(op);
2550
2551 // Avoid duplicate promotion
2552 if (obc.get() && obc->is_blocked()) {
2553 if (promote_obc)
2554 *promote_obc = obc;
2555 return cache_result_t::BLOCKED_PROMOTE;
2556 }
2557
2558 // Promote too?
2559 if (!op->need_skip_promote()) {
2560 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2561 pool.info.min_read_recency_for_promote,
2562 promote_op, promote_obc);
2563 }
2564
2565 return cache_result_t::HANDLED_PROXY;
2566 }
2567 assert(0 == "unreachable");
2568 return cache_result_t::NOOP;
2569
2570 case pg_pool_t::CACHEMODE_FORWARD:
2571 // FIXME: this mode allows requests to be reordered.
2572 do_cache_redirect(op);
2573 return cache_result_t::HANDLED_REDIRECT;
2574
2575 case pg_pool_t::CACHEMODE_READONLY:
2576 // TODO: clean this case up
2577 if (!obc.get() && r == -ENOENT) {
2578 // we don't have the object and op's a read
2579 promote_object(obc, missing_oid, oloc, op, promote_obc);
2580 return cache_result_t::BLOCKED_PROMOTE;
2581 }
2582 if (!r) { // it must be a write
2583 do_cache_redirect(op);
2584 return cache_result_t::HANDLED_REDIRECT;
2585 }
2586 // crap, there was a failure of some kind
2587 return cache_result_t::NOOP;
2588
2589 case pg_pool_t::CACHEMODE_READFORWARD:
2590 // Do writeback to the cache tier for writes
2591 if (op->may_write() || write_ordered || must_promote) {
2592 if (agent_state &&
2593 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2594 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2595 block_write_on_full_cache(missing_oid, op);
2596 return cache_result_t::BLOCKED_FULL;
2597 }
2598 promote_object(obc, missing_oid, oloc, op, promote_obc);
2599 return cache_result_t::BLOCKED_PROMOTE;
2600 }
2601
2602 // This is a read; redirect the client to the base tier
2603 do_cache_redirect(op);
2604 return cache_result_t::HANDLED_REDIRECT;
2605
2606 case pg_pool_t::CACHEMODE_PROXY:
2607 if (!must_promote) {
2608 if (op->may_write() || op->may_cache() || write_ordered) {
2609 if (can_proxy_write) {
2610 do_proxy_write(op, missing_oid);
2611 return cache_result_t::HANDLED_PROXY;
2612 }
2613 } else {
2614 do_proxy_read(op);
2615 return cache_result_t::HANDLED_PROXY;
2616 }
2617 }
2618 // ugh, we're forced to promote.
2619 if (agent_state &&
2620 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2621 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2622 block_write_on_full_cache(missing_oid, op);
2623 return cache_result_t::BLOCKED_FULL;
2624 }
2625 promote_object(obc, missing_oid, oloc, op, promote_obc);
2626 return cache_result_t::BLOCKED_PROMOTE;
2627
2628 case pg_pool_t::CACHEMODE_READPROXY:
2629 // Do writeback to the cache tier for writes
2630 if (op->may_write() || write_ordered || must_promote) {
2631 if (agent_state &&
2632 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2633 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2634 block_write_on_full_cache(missing_oid, op);
2635 return cache_result_t::BLOCKED_FULL;
2636 }
2637 promote_object(obc, missing_oid, oloc, op, promote_obc);
2638 return cache_result_t::BLOCKED_PROMOTE;
2639 }
2640
2641 // This is a read; proxy it to the base tier
2642 do_proxy_read(op);
2643 return cache_result_t::HANDLED_PROXY;
2644
2645 default:
2646 assert(0 == "unrecognized cache_mode");
2647 }
2648 return cache_result_t::NOOP;
2649 }
2650
2651 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2652 const hobject_t& missing_oid,
2653 const object_locator_t& oloc,
2654 bool in_hit_set,
2655 uint32_t recency,
2656 OpRequestRef promote_op,
2657 ObjectContextRef *promote_obc)
2658 {
2659 dout(20) << __func__ << " missing_oid " << missing_oid
2660 << " in_hit_set " << in_hit_set << dendl;
2661
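  // recency semantics: 0 always promotes; 1 requires a hit in the current
  // in-memory hit set; N > 1 additionally requires consecutive hits in the
  // most recent persisted hit sets until N is reached (the reverse walk
  // below).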
2662 switch (recency) {
2663 case 0:
2664 break;
2665 case 1:
2666 // Check if in the current hit set
2667 if (in_hit_set) {
2668 break;
2669 } else {
2670 // not promoting
2671 return false;
2672 }
2673 break;
2674 default:
2675 {
2676 unsigned count = (int)in_hit_set;
2677 if (count) {
2678 // Check if in other hit sets
2679 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2680 for (map<time_t,HitSetRef>::reverse_iterator itor =
2681 agent_state->hit_set_map.rbegin();
2682 itor != agent_state->hit_set_map.rend();
2683 ++itor) {
2684 if (!itor->second->contains(oid)) {
2685 break;
2686 }
2687 ++count;
2688 if (count >= recency) {
2689 break;
2690 }
2691 }
2692 }
2693 if (count >= recency) {
2694 break;
2695 }
2696 return false; // not promoting
2697 }
2698 break;
2699 }
2700
2701 if (osd->promote_throttle()) {
2702 dout(10) << __func__ << " promote throttled" << dendl;
2703 return false;
2704 }
2705 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2706 return true;
2707 }
2708
2709 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2710 {
2711 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2712 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2713 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2714 get_osdmap()->get_epoch(), flags, false);
2715 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2716 reply->set_redirect(redir);
2717 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2718 << op << dendl;
2719 m->get_connection()->send_message(reply);
2720 return;
2721 }
2722
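// Objecter completion for a proxied read. finish() tests 'canceled' twice:
// once unlocked as a fast path, then again under pg->lock() to close the
// race with cancel_proxy_read() running in another thread.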
2723 struct C_ProxyRead : public Context {
2724 PrimaryLogPGRef pg;
2725 hobject_t oid;
2726 epoch_t last_peering_reset;
2727 ceph_tid_t tid;
2728 PrimaryLogPG::ProxyReadOpRef prdop;
2729 utime_t start;
2730 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2731 const PrimaryLogPG::ProxyReadOpRef& prd)
2732 : pg(p), oid(o), last_peering_reset(lpr),
2733 tid(0), prdop(prd), start(ceph_clock_now())
2734 {}
2735 void finish(int r) override {
2736 if (prdop->canceled)
2737 return;
2738 pg->lock();
2739 if (prdop->canceled) {
2740 pg->unlock();
2741 return;
2742 }
2743 if (last_peering_reset == pg->get_last_peering_reset()) {
2744 pg->finish_proxy_read(oid, tid, r);
2745 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2746 }
2747 pg->unlock();
2748 }
2749 };
2750
2751 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2752 {
2753 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2754 // stash the result in the request's OSDOp vector
2755 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2756 object_locator_t oloc;
2757 hobject_t soid;
2758 /* extensible tier */
2759 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2760 switch (obc->obs.oi.manifest.type) {
2761 case object_manifest_t::TYPE_REDIRECT:
2762 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2763 soid = obc->obs.oi.manifest.redirect_target;
2764 break;
2765 case object_manifest_t::TYPE_CHUNKED:
2766 default:
2767 assert(0 == "unrecognized manifest type");
2768 }
2769 } else {
2770 /* proxy */
2771 soid = m->get_hobj();
2772 oloc = object_locator_t(m->get_object_locator());
2773 oloc.pool = pool.info.tier_of;
2774 }
2775 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2776
2777 // pass through some original flags that make sense.
2778 // - leave out redirection and balancing flags since we are
2779 // already proxying through the primary
2780 // - leave off read/write/exec flags that are derived from the op
2781 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2782 CEPH_OSD_FLAG_ORDERSNAP |
2783 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2784 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2785
2786 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2787
2788 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2789
2790 ObjectOperation obj_op;
2791 obj_op.dup(prdop->ops);
2792
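  // In writeback mode (unless the agent is in FULL eviction) a promote is
  // likely to follow this read, so adjust the fadvise hints sent to the
  // base tier: force SEQUENTIAL and drop DONTNEED/NOCACHE to keep the data
  // warm for the upcoming copy.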
2793 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2794 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2795 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2796 ceph_osd_op& op = obj_op.ops[i].op; // reference, so the flag rewrite below sticks
2797 switch (op.op) {
2798 case CEPH_OSD_OP_READ:
2799 case CEPH_OSD_OP_SYNC_READ:
2800 case CEPH_OSD_OP_SPARSE_READ:
2801 case CEPH_OSD_OP_CHECKSUM:
2802 case CEPH_OSD_OP_CMPEXT:
2803 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2804 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2805 }
2806 }
2807 }
2808
2809 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2810 prdop);
2811 ceph_tid_t tid = osd->objecter->read(
2812 soid.oid, oloc, obj_op,
2813 m->get_snapid(), NULL,
2814 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2815 &prdop->user_version,
2816 &prdop->data_offset,
2817 m->get_features());
2818 fin->tid = tid;
2819 prdop->objecter_tid = tid;
2820 proxyread_ops[tid] = prdop;
2821 in_progress_proxy_ops[soid].push_back(op);
2822 }
2823
2824 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2825 {
2826 dout(10) << __func__ << " " << oid << " tid " << tid
2827 << " " << cpp_strerror(r) << dendl;
2828
2829 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2830 if (p == proxyread_ops.end()) {
2831 dout(10) << __func__ << " no proxyread_op found" << dendl;
2832 return;
2833 }
2834 ProxyReadOpRef prdop = p->second;
2835 if (tid != prdop->objecter_tid) {
2836 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2837 << " tid " << prdop->objecter_tid << dendl;
2838 return;
2839 }
2840 if (oid != prdop->soid) {
2841 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2842 << " soid " << prdop->soid << dendl;
2843 return;
2844 }
2845 proxyread_ops.erase(tid);
2846
2847 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2848 if (q == in_progress_proxy_ops.end()) {
2849 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2850 return;
2851 }
2852 assert(q->second.size());
2853 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2854 q->second.end(),
2855 prdop->op);
2856 assert(it != q->second.end());
2857 OpRequestRef op = *it;
2858 q->second.erase(it);
2859 if (q->second.size() == 0) {
2860 in_progress_proxy_ops.erase(oid);
2861 }
2862
2863 osd->logger->inc(l_osd_tier_proxy_read);
2864
2865 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2866 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2867 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2868 ctx->user_at_version = prdop->user_version;
2869 ctx->data_off = prdop->data_offset;
2870 ctx->ignore_log_op_stats = true;
2871 complete_read_ctx(r, ctx);
2872 }
2873
2874 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2875 {
2876 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2877 if (p == in_progress_proxy_ops.end())
2878 return;
2879
2880 list<OpRequestRef>& ls = p->second;
2881 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2882 requeue_ops(ls);
2883 in_progress_proxy_ops.erase(p);
2884 }
2885
2886 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2887 {
2888 dout(10) << __func__ << " " << prdop->soid << dendl;
2889 prdop->canceled = true;
2890
2891 // cancel objecter op, if we can
2892 if (prdop->objecter_tid) {
2893 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2894 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2895 prdop->ops[i].outdata.clear();
2896 }
2897 proxyread_ops.erase(prdop->objecter_tid);
2898 prdop->objecter_tid = 0;
2899 }
2900 }
2901
2902 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2903 {
2904 dout(10) << __func__ << dendl;
2905
2906 // cancel proxy reads
2907 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2908 while (p != proxyread_ops.end()) {
2909 cancel_proxy_read((p++)->second);
2910 }
2911
2912 // cancel proxy writes
2913 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2914 while (q != proxywrite_ops.end()) {
2915 cancel_proxy_write((q++)->second);
2916 }
2917
2918 if (requeue) {
2919 map<hobject_t, list<OpRequestRef>>::iterator p =
2920 in_progress_proxy_ops.begin();
2921 while (p != in_progress_proxy_ops.end()) {
2922 list<OpRequestRef>& ls = p->second;
2923 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2924 << " requests" << dendl;
2925 requeue_ops(ls);
2926 in_progress_proxy_ops.erase(p++);
2927 }
2928 } else {
2929 in_progress_proxy_ops.clear();
2930 }
2931 }
2932
2933 struct C_ProxyWrite_Commit : public Context {
2934 PrimaryLogPGRef pg;
2935 hobject_t oid;
2936 epoch_t last_peering_reset;
2937 ceph_tid_t tid;
2938 PrimaryLogPG::ProxyWriteOpRef pwop;
2939 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2940 const PrimaryLogPG::ProxyWriteOpRef& pw)
2941 : pg(p), oid(o), last_peering_reset(lpr),
2942 tid(0), pwop(pw)
2943 {}
2944 void finish(int r) override {
2945 if (pwop->canceled)
2946 return;
2947 pg->lock();
2948 if (pwop->canceled) {
2949 pg->unlock();
2950 return;
2951 }
2952 if (last_peering_reset == pg->get_last_peering_reset()) {
2953 pg->finish_proxy_write(oid, tid, r);
2954 }
2955 pg->unlock();
2956 }
2957 };
2958
2959 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2960 {
2961 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2962 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2963 object_locator_t oloc;
2964 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2965 hobject_t soid;
2966 /* extensible tier */
2967 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2968 switch (obc->obs.oi.manifest.type) {
2969 case object_manifest_t::TYPE_REDIRECT:
2970 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2971 soid = obc->obs.oi.manifest.redirect_target;
2972 break;
2973 case object_manifest_t::TYPE_CHUNKED:
2974 default:
2975 assert(0 == "unrecognized manifest type");
2976 }
2977 } else {
2978 /* proxy */
2979 soid = m->get_hobj();
2980 oloc = object_locator_t(m->get_object_locator());
2981 oloc.pool = pool.info.tier_of;
2982 }
2983
2984 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
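  // a read only reaches this proxy-write path when it is write-ordered;
  // tag it RWORDERED so the base tier also orders it against writes.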
2985 if (!(op->may_write() || op->may_cache())) {
2986 flags |= CEPH_OSD_FLAG_RWORDERED;
2987 }
2988 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2989
2990 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2991 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
2992 pwop->mtime = m->get_mtime();
2993
2994 ObjectOperation obj_op;
2995 obj_op.dup(pwop->ops);
2996
2997 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2998 this, soid, get_last_peering_reset(), pwop);
2999 ceph_tid_t tid = osd->objecter->mutate(
3000 soid.oid, oloc, obj_op, snapc,
3001 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3002 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3003 &pwop->user_version, pwop->reqid);
3004 fin->tid = tid;
3005 pwop->objecter_tid = tid;
3006 proxywrite_ops[tid] = pwop;
3007 in_progress_proxy_ops[soid].push_back(op);
3008 }
3009
3010 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3011 {
3012 dout(10) << __func__ << " " << oid << " tid " << tid
3013 << " " << cpp_strerror(r) << dendl;
3014
3015 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3016 if (p == proxywrite_ops.end()) {
3017 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3018 return;
3019 }
3020 ProxyWriteOpRef pwop = p->second;
3021 assert(tid == pwop->objecter_tid);
3022 assert(oid == pwop->soid);
3023
3024 proxywrite_ops.erase(tid);
3025
3026 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3027 if (q == in_progress_proxy_ops.end()) {
3028 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3029 delete pwop->ctx;
3030 pwop->ctx = NULL;
3031 return;
3032 }
3033 list<OpRequestRef>& in_progress_op = q->second;
3034 assert(in_progress_op.size());
3035 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3036 in_progress_op.end(),
3037 pwop->op);
3038 assert(it != in_progress_op.end());
3039 in_progress_op.erase(it);
3040 if (in_progress_op.size() == 0) {
3041 in_progress_proxy_ops.erase(oid);
3042 }
3043
3044 osd->logger->inc(l_osd_tier_proxy_write);
3045
3046 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3047 assert(m != NULL);
3048
3049 if (!pwop->sent_reply) {
3050 // send commit.
3051 MOSDOpReply *reply = pwop->ctx->reply;
3052 if (reply)
3053 pwop->ctx->reply = NULL;
3054 else {
3055 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3056 reply->set_reply_versions(eversion_t(), pwop->user_version);
3057 }
3058 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3059 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3060 osd->send_message_osd_client(reply, m->get_connection());
3061 pwop->sent_reply = true;
3062 pwop->ctx->op->mark_commit_sent();
3063 }
3064
3065 delete pwop->ctx;
3066 pwop->ctx = NULL;
3067 }
3068
3069 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3070 {
3071 dout(10) << __func__ << " " << pwop->soid << dendl;
3072 pwop->canceled = true;
3073
3074 // cancel objecter op, if we can
3075 if (pwop->objecter_tid) {
3076 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3077 delete pwop->ctx;
3078 pwop->ctx = NULL;
3079 proxywrite_ops.erase(pwop->objecter_tid);
3080 pwop->objecter_tid = 0;
3081 }
3082 }
3083
3084 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3085 ObjectContextRef obc;
3086 PrimaryLogPG *pg;
3087 utime_t start;
3088 public:
3089 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3090 : obc(obc_),
3091 pg(pg_),
3092 start(ceph_clock_now()) {}
3093
3094 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3095 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3096 int r = results.get<0>();
3097 pg->finish_promote(r, results_data, obc);
3098 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3099 }
3100 };
3101
3102 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3103 const hobject_t& missing_oid,
3104 const object_locator_t& oloc,
3105 OpRequestRef op,
3106 ObjectContextRef *promote_obc)
3107 {
3108 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3109 assert(hoid != hobject_t());
3110 if (scrubber.write_blocked_by_scrub(hoid)) {
3111 dout(10) << __func__ << " " << hoid
3112 << " blocked by scrub" << dendl;
3113 if (op) {
3114 waiting_for_scrub.push_back(op);
3115 op->mark_delayed("waiting for scrub");
3116 dout(10) << __func__ << " " << hoid
3117 << " placing op in waiting_for_scrub" << dendl;
3118 } else {
3119 dout(10) << __func__ << " " << hoid
3120 << " no op, dropping on the floor" << dendl;
3121 }
3122 return;
3123 }
3124 if (!obc) { // we need to create an ObjectContext
3125 assert(missing_oid != hobject_t());
3126 obc = get_object_context(missing_oid, true);
3127 }
3128 if (promote_obc)
3129 *promote_obc = obc;
3130
3131 /*
3132 * If proxy-reads are in flight for this object while the promote is
3133 * pending, skip the DONTNEED hint so the source data stays cached.
3134 */
3135 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3136 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3137 if (q == in_progress_proxy_ops.end()) {
3138 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3139 }
3140
3141 PromoteCallback *cb = new PromoteCallback(obc, this);
3142 object_locator_t my_oloc = oloc;
3143 my_oloc.pool = pool.info.tier_of;
3144
3145 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3146 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3147 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3148 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3149 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3150 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3151 src_fadvise_flags, 0);
3152
3153 assert(obc->is_blocked());
3154
3155 if (op)
3156 wait_for_blocked_object(obc->obs.oi.soid, op);
3157 info.stats.stats.sum.num_promote++;
3158 }
3159
3160 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3161 {
3162 FUNCTRACE();
3163 dout(10) << __func__ << " " << ctx << dendl;
3164 ctx->reset_obs(ctx->obc);
3165 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3166 OpRequestRef op = ctx->op;
3167 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3168 ObjectContextRef obc = ctx->obc;
3169 const hobject_t& soid = obc->obs.oi.soid;
3170
3171 // this method must be idempotent since we may call it several times
3172 // before we finally apply the resulting transaction.
3173 ctx->op_t.reset(new PGTransaction);
3174
3175 if (op->may_write() || op->may_cache()) {
3176 // snap
3177 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3178 pool.info.is_pool_snaps_mode()) {
3179 // use pool's snapc
3180 ctx->snapc = pool.snapc;
3181 } else {
3182 // client specified snapc
3183 ctx->snapc.seq = m->get_snap_seq();
3184 ctx->snapc.snaps = m->get_snaps();
3185 filter_snapc(ctx->snapc.snaps);
3186 }
3187 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3188 ctx->snapc.seq < obc->ssc->snapset.seq) {
3189 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3190 << " < snapset seq " << obc->ssc->snapset.seq
3191 << " on " << obc->obs.oi.soid << dendl;
3192 reply_ctx(ctx, -EOLDSNAPC);
3193 return;
3194 }
3195
3196 // version
3197 ctx->at_version = get_next_version();
3198 ctx->mtime = m->get_mtime();
3199
3200 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3201 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3202 << " snapc " << ctx->snapc
3203 << " snapset " << obc->ssc->snapset
3204 << dendl;
3205 } else {
3206 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3207 << " ov " << obc->obs.oi.version
3208 << dendl;
3209 }
3210
3211 if (!ctx->user_at_version)
3212 ctx->user_at_version = obc->obs.oi.user_version;
3213 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3214
3215 if (op->may_read()) {
3216 dout(10) << " taking ondisk_read_lock" << dendl;
3217 obc->ondisk_read_lock();
3218 }
3219
3220 {
3221 #ifdef WITH_LTTNG
3222 osd_reqid_t reqid = ctx->op->get_reqid();
3223 #endif
3224 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3225 reqid.name._num, reqid.tid, reqid.inc);
3226 }
3227
3228 int result = prepare_transaction(ctx);
3229
3230 {
3231 #ifdef WITH_LTTNG
3232 osd_reqid_t reqid = ctx->op->get_reqid();
3233 #endif
3234 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3235 reqid.name._num, reqid.tid, reqid.inc);
3236 }
3237
3238 if (op->may_read()) {
3239 dout(10) << " dropping ondisk_read_lock" << dendl;
3240 obc->ondisk_read_unlock();
3241 }
3242
3243 bool pending_async_reads = !ctx->pending_async_reads.empty();
3244 if (result == -EINPROGRESS || pending_async_reads) {
3245 // come back later.
3246 if (pending_async_reads) {
3247 in_progress_async_reads.push_back(make_pair(op, ctx));
3248 ctx->start_async_reads(this);
3249 }
3250 return;
3251 }
3252
3253 if (result == -EAGAIN) {
3254 // clean up after the ctx
3255 close_op_ctx(ctx);
3256 return;
3257 }
3258
3259 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3260 // prepare the reply
3261 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3262 successful_write);
3263
3264 // Write operations aren't allowed to return a data payload because
3265 // we can't do so reliably. If the client has to resend the request
3266 // and it has already been applied, we will return 0 with no
3267 // payload. Non-deterministic behavior is no good. However, it is
3268 // possible to construct an operation that does a read, does a guard
3269 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3270 // with the write, or return a CMPXATTR and the read value.
3271 if (successful_write) {
3272 // write. normalize the result code.
3273 dout(20) << " zeroing write result code " << result << dendl;
3274 result = 0;
3275 }
3276 ctx->reply->set_result(result);
3277
3278 // read or error?
3279 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3280 // finish side-effects
3281 if (result >= 0)
3282 do_osd_op_effects(ctx, m->get_connection());
3283
3284 complete_read_ctx(result, ctx);
3285 return;
3286 }
3287
3288 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3289
3290 assert(op->may_write() || op->may_cache());
3291
3292 // trim log?
3293 calc_trim_to();
3294
3295 // verify that we are doing this in order?
3296 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3297 !pool.info.is_tier() && !pool.info.has_tiers()) {
3298 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3299 ceph_tid_t t = m->get_tid();
3300 client_t n = m->get_source().num();
3301 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3302 if (p == cm.end()) {
3303 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3304 cm[n] = t;
3305 } else {
3306 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3307 if (p->second > t) {
3308 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3309 assert(0 == "out of order op");
3310 }
3311 p->second = t;
3312 }
3313 }
3314
3315 if (ctx->update_log_only) {
3316 if (result >= 0)
3317 do_osd_op_effects(ctx, m->get_connection());
3318
3319 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3320 // save just what we need from ctx
3321 MOSDOpReply *reply = ctx->reply;
3322 ctx->reply = nullptr;
3323 reply->claim_op_out_data(*ctx->ops);
3324 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3325 close_op_ctx(ctx);
3326
3327 if (result == -ENOENT) {
3328 reply->set_enoent_reply_versions(info.last_update,
3329 info.last_user_version);
3330 }
3331 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3332 // append to pg log for dup detection - don't save buffers for now
3333 record_write_error(op, soid, reply, result);
3334 return;
3335 }
3336
3337 // no need to capture PG ref, repop cancel will handle that
3338 // Can capture the ctx by pointer, it's owned by the repop
3339 ctx->register_on_commit(
3340 [m, ctx, this](){
3341 if (ctx->op)
3342 log_op_stats(
3343 ctx);
3344
3345 if (m && !ctx->sent_reply) {
3346 MOSDOpReply *reply = ctx->reply;
3347 if (reply)
3348 ctx->reply = nullptr;
3349 else {
3350 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3351 reply->set_reply_versions(ctx->at_version,
3352 ctx->user_at_version);
3353 }
3354 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3355 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3356 osd->send_message_osd_client(reply, m->get_connection());
3357 ctx->sent_reply = true;
3358 ctx->op->mark_commit_sent();
3359 }
3360 });
3361 ctx->register_on_success(
3362 [ctx, this]() {
3363 do_osd_op_effects(
3364 ctx,
3365 ctx->op ? ctx->op->get_req()->get_connection() :
3366 ConnectionRef());
3367 });
3368 ctx->register_on_finish(
3369 [ctx, this]() {
3370 delete ctx;
3371 });
3372
3373 // issue replica writes
3374 ceph_tid_t rep_tid = osd->get_tid();
3375
3376 RepGather *repop = new_repop(ctx, obc, rep_tid);
3377
3378 issue_repop(repop, ctx);
3379 eval_repop(repop);
3380 repop->put();
3381 }
3382
3383 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3384 release_object_locks(ctx->lock_manager);
3385
3386 ctx->op_t.reset();
3387
3388 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3389 ctx->on_finish.erase(p++)) {
3390 (*p)();
3391 }
3392 delete ctx;
3393 }
3394
3395 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3396 {
3397 if (ctx->op)
3398 osd->reply_op_error(ctx->op, r);
3399 close_op_ctx(ctx);
3400 }
3401
3402 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3403 {
3404 if (ctx->op)
3405 osd->reply_op_error(ctx->op, r, v, uv);
3406 close_op_ctx(ctx);
3407 }
3408
3409 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3410 {
3411 OpRequestRef op = ctx->op;
3412 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3413
3414 utime_t now = ceph_clock_now();
3415 utime_t latency = now;
3416 latency -= ctx->op->get_req()->get_recv_stamp();
3417 utime_t process_latency = now;
3418 process_latency -= ctx->op->get_dequeued_time();
3419
3420 uint64_t inb = ctx->bytes_written;
3421 uint64_t outb = ctx->bytes_read;
3422
3423 osd->logger->inc(l_osd_op);
3424
3425 osd->logger->inc(l_osd_op_outb, outb);
3426 osd->logger->inc(l_osd_op_inb, inb);
3427 osd->logger->tinc(l_osd_op_lat, latency);
3428 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3429
3430 if (op->may_read() && op->may_write()) {
3431 osd->logger->inc(l_osd_op_rw);
3432 osd->logger->inc(l_osd_op_rw_inb, inb);
3433 osd->logger->inc(l_osd_op_rw_outb, outb);
3434 osd->logger->tinc(l_osd_op_rw_lat, latency);
3435 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3436 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3437 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3438 } else if (op->may_read()) {
3439 osd->logger->inc(l_osd_op_r);
3440 osd->logger->inc(l_osd_op_r_outb, outb);
3441 osd->logger->tinc(l_osd_op_r_lat, latency);
3442 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3443 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3444 } else if (op->may_write() || op->may_cache()) {
3445 osd->logger->inc(l_osd_op_w);
3446 osd->logger->inc(l_osd_op_w_inb, inb);
3447 osd->logger->tinc(l_osd_op_w_lat, latency);
3448 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3449 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3450 } else
3451 ceph_abort();
3452
3453 dout(15) << "log_op_stats " << *m
3454 << " inb " << inb
3455 << " outb " << outb
3456 << " lat " << latency << dendl;
3457 }
3458
3459 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3460 {
3461 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3462 assert(have_same_or_newer_map(m->map_epoch));
3463 assert(m->get_type() == MSG_OSD_SUBOP);
3464 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3465
3466 if (!is_peered()) {
3467 waiting_for_peered.push_back(op);
3468 op->mark_delayed("waiting for active");
3469 return;
3470 }
3471
3472 const OSDOp *first = NULL;
3473 if (m->ops.size() >= 1) {
3474 first = &m->ops[0];
3475 }
3476
3477 if (first) {
3478 switch (first->op.op) {
3479 case CEPH_OSD_OP_DELETE:
3480 sub_op_remove(op);
3481 return;
3482 case CEPH_OSD_OP_SCRUB_RESERVE:
3483 handle_scrub_reserve_request(op);
3484 return;
3485 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3486 handle_scrub_reserve_release(op);
3487 return;
3488 case CEPH_OSD_OP_SCRUB_MAP:
3489 sub_op_scrub_map(op);
3490 return;
3491 }
3492 }
3493 }
3494
3495 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3496 {
3497 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3498 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3499 if (r->ops.size() >= 1) {
3500 const OSDOp& first = r->ops[0];
3501 switch (first.op.op) {
3502 case CEPH_OSD_OP_SCRUB_RESERVE:
3503 {
3504 pg_shard_t from = r->from;
3505 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3506 bool reserved;
3507 ::decode(reserved, p);
3508 if (reserved) {
3509 handle_scrub_reserve_grant(op, from);
3510 } else {
3511 handle_scrub_reserve_reject(op, from);
3512 }
3513 }
3514 return;
3515 }
3516 }
3517 }
3518
3519 void PrimaryLogPG::do_scan(
3520 OpRequestRef op,
3521 ThreadPool::TPHandle &handle)
3522 {
3523 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3524 assert(m->get_type() == MSG_OSD_PG_SCAN);
3525 dout(10) << "do_scan " << *m << dendl;
3526
3527 op->mark_started();
3528
3529 switch (m->op) {
3530 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3531 {
3532 ostringstream ss;
3533 if (osd->check_backfill_full(ss)) {
3534 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3535 queue_peering_event(
3536 CephPeeringEvtRef(
3537 std::make_shared<CephPeeringEvt>(
3538 get_osdmap()->get_epoch(),
3539 get_osdmap()->get_epoch(),
3540 BackfillTooFull())));
3541 return;
3542 }
3543
3544 BackfillInterval bi;
3545 bi.begin = m->begin;
3546 // No need to flush; there won't be any in-progress writes occurring
3547 // past m->begin
3548 scan_range(
3549 cct->_conf->osd_backfill_scan_min,
3550 cct->_conf->osd_backfill_scan_max,
3551 &bi,
3552 handle);
3553 MOSDPGScan *reply = new MOSDPGScan(
3554 MOSDPGScan::OP_SCAN_DIGEST,
3555 pg_whoami,
3556 get_osdmap()->get_epoch(), m->query_epoch,
3557 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3558 ::encode(bi.objects, reply->get_data());
3559 osd->send_message_osd_cluster(reply, m->get_connection());
3560 }
3561 break;
3562
3563 case MOSDPGScan::OP_SCAN_DIGEST:
3564 {
3565 pg_shard_t from = m->from;
3566
3567 // Check that from is in backfill_targets vector
3568 assert(is_backfill_targets(from));
3569
3570 BackfillInterval& bi = peer_backfill_info[from];
3571 bi.begin = m->begin;
3572 bi.end = m->end;
3573 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3574
3575 // take care to preserve ordering!
3576 bi.clear_objects();
3577 ::decode_noclear(bi.objects, p);
3578
3579 if (waiting_on_backfill.erase(from)) {
3580 if (waiting_on_backfill.empty()) {
3581 assert(peer_backfill_info.size() == backfill_targets.size());
3582 finish_recovery_op(hobject_t::get_max());
3583 }
3584 } else {
3585 // we canceled backfill for a while because a target was too full, and
3586 // this is an extra response from a non-too-full peer
3587 }
3588 }
3589 break;
3590 }
3591 }
3592
3593 void PrimaryLogPG::do_backfill(OpRequestRef op)
3594 {
3595 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3596 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3597 dout(10) << "do_backfill " << *m << dendl;
3598
3599 op->mark_started();
3600
3601 switch (m->op) {
3602 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3603 {
3604 assert(cct->_conf->osd_kill_backfill_at != 1);
3605
3606 MOSDPGBackfill *reply = new MOSDPGBackfill(
3607 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3608 get_osdmap()->get_epoch(),
3609 m->query_epoch,
3610 spg_t(info.pgid.pgid, get_primary().shard));
3611 reply->set_priority(get_recovery_op_priority());
3612 osd->send_message_osd_cluster(reply, m->get_connection());
3613 queue_peering_event(
3614 CephPeeringEvtRef(
3615 std::make_shared<CephPeeringEvt>(
3616 get_osdmap()->get_epoch(),
3617 get_osdmap()->get_epoch(),
3618 RecoveryDone())));
3619 }
3620 // fall-thru
3621
3622 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3623 {
3624 assert(cct->_conf->osd_kill_backfill_at != 2);
3625
3626 info.set_last_backfill(m->last_backfill);
3627 info.stats = m->stats;
3628
3629 ObjectStore::Transaction t;
3630 dirty_info = true;
3631 write_if_dirty(t);
3632 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3633 assert(tr == 0);
3634 }
3635 break;
3636
3637 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3638 {
3639 assert(is_primary());
3640 assert(cct->_conf->osd_kill_backfill_at != 3);
3641 finish_recovery_op(hobject_t::get_max());
3642 }
3643 break;
3644 }
3645 }
3646
3647 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3648 {
3649 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3650 op->get_req());
3651 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3652 dout(7) << __func__ << " " << m->ls << dendl;
3653
3654 op->mark_started();
3655
3656 ObjectStore::Transaction t;
3657 for (auto& p : m->ls) {
3658 remove_snap_mapped_object(t, p.first);
3659 }
3660 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3661 assert(r == 0);
3662 }
3663
3664 int PrimaryLogPG::trim_object(
3665 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3666 {
3667 *ctxp = NULL;
3668 // load clone info
3669 bufferlist bl;
3670 ObjectContextRef obc = get_object_context(coid, false, NULL);
3671 if (!obc || !obc->ssc || !obc->ssc->exists) {
3672 osd->clog->error() << __func__ << ": Cannot trim " << coid
3673 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3674 return -ENOENT;
3675 }
3676
3677 hobject_t snapoid(
3678 coid.oid, coid.get_key(),
3679 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3680 info.pgid.pool(), coid.get_namespace());
3681 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3682 if (!snapset_obc) {
3683 osd->clog->error() << __func__ << ": Cannot trim " << coid
3684 << " repair needed, no snapset obc for " << snapoid;
3685 return -ENOENT;
3686 }
3687
3688 SnapSet& snapset = obc->ssc->snapset;
3689
3690 bool legacy = snapset.is_legacy() ||
3691 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3692
3693 object_info_t &coi = obc->obs.oi;
3694 set<snapid_t> old_snaps;
3695 if (legacy) {
3696 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3697 } else {
3698 auto p = snapset.clone_snaps.find(coid.snap);
3699 if (p == snapset.clone_snaps.end()) {
3700 osd->clog->error() << "No clone_snaps in snapset " << snapset
3701 << " for object " << coid << "\n";
3702 return -ENOENT;
3703 }
3704 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3705 snapset.clone_snaps[coid.snap].end());
3706 }
3707 if (old_snaps.empty()) {
3708 osd->clog->error() << "No object info snaps for object " << coid;
3709 return -ENOENT;
3710 }
3711
3712 dout(10) << coid << " old_snaps " << old_snaps
3713 << " old snapset " << snapset << dendl;
3714 if (snapset.seq == 0) {
3715 osd->clog->error() << "No snapset.seq for object " << coid;
3716 return -ENOENT;
3717 }
3718
3719 set<snapid_t> new_snaps;
3720 for (set<snapid_t>::iterator i = old_snaps.begin();
3721 i != old_snaps.end();
3722 ++i) {
3723 if (!pool.info.is_removed_snap(*i))
3724 new_snaps.insert(*i);
3725 }
3726
3727 vector<snapid_t>::iterator p = snapset.clones.end();
3728
3729 if (new_snaps.empty()) {
3730 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3731 if (p == snapset.clones.end()) {
3732 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3733 return -ENOENT;
3734 }
3735 }
3736
3737 OpContextUPtr ctx = simple_opc_create(obc);
3738 ctx->snapset_obc = snapset_obc;
3739
3740 if (!ctx->lock_manager.get_snaptrimmer_write(
3741 coid,
3742 obc,
3743 first)) {
3744 close_op_ctx(ctx.release());
3745 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3746 return -ENOLCK;
3747 }
3748
3749 if (!ctx->lock_manager.get_snaptrimmer_write(
3750 snapoid,
3751 snapset_obc,
3752 first)) {
3753 close_op_ctx(ctx.release());
3754 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3755 return -ENOLCK;
3756 }
3757
3758 ctx->at_version = get_next_version();
3759
3760 PGTransaction *t = ctx->op_t.get();
3761
3762 if (new_snaps.empty()) {
3763 // remove clone
3764 dout(10) << coid << " snaps " << old_snaps << " -> "
3765 << new_snaps << " ... deleting" << dendl;
3766
3767 // ...from snapset
3768 assert(p != snapset.clones.end());
3769
3770 snapid_t last = coid.snap;
3771 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3772
3773 if (p != snapset.clones.begin()) {
3774 // not the oldest... merge overlap into next older clone
3775 vector<snapid_t>::iterator n = p - 1;
3776 hobject_t prev_coid = coid;
3777 prev_coid.snap = *n;
3778 bool adjust_prev_bytes = is_present_clone(prev_coid);
3779
3780 if (adjust_prev_bytes)
3781 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3782
3783 snapset.clone_overlap[*n].intersection_of(
3784 snapset.clone_overlap[*p]);
3785
3786 if (adjust_prev_bytes)
3787 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3788 }
3789 ctx->delta_stats.num_objects--;
3790 if (coi.is_dirty())
3791 ctx->delta_stats.num_objects_dirty--;
3792 if (coi.is_omap())
3793 ctx->delta_stats.num_objects_omap--;
3794 if (coi.is_whiteout()) {
3795 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3796 ctx->delta_stats.num_whiteouts--;
3797 }
3798 ctx->delta_stats.num_object_clones--;
3799 if (coi.is_cache_pinned())
3800 ctx->delta_stats.num_objects_pinned--;
3801 obc->obs.exists = false;
3802
3803 snapset.clones.erase(p);
3804 snapset.clone_overlap.erase(last);
3805 snapset.clone_size.erase(last);
3806 snapset.clone_snaps.erase(last);
3807
3808 ctx->log.push_back(
3809 pg_log_entry_t(
3810 pg_log_entry_t::DELETE,
3811 coid,
3812 ctx->at_version,
3813 ctx->obs->oi.version,
3814 0,
3815 osd_reqid_t(),
3816 ctx->mtime,
3817 0)
3818 );
3819 t->remove(coid);
3820 t->update_snaps(
3821 coid,
3822 old_snaps,
3823 new_snaps);
3824
3825 coi = object_info_t(coid);
3826
3827 ctx->at_version.version++;
3828 } else {
3829 // save adjusted snaps for this object
3830 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3831 if (legacy) {
3832 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3833 } else {
3834 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3835 new_snaps.rend());
3836 // we still do a 'modify' event on this object just to trigger a
3837 // snapmapper.update ... :(
3838 }
3839
3840 coi.prior_version = coi.version;
3841 coi.version = ctx->at_version;
3842 bl.clear();
3843 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3844 t->setattr(coid, OI_ATTR, bl);
3845
3846 ctx->log.push_back(
3847 pg_log_entry_t(
3848 pg_log_entry_t::MODIFY,
3849 coid,
3850 coi.version,
3851 coi.prior_version,
3852 0,
3853 osd_reqid_t(),
3854 ctx->mtime,
3855 0)
3856 );
3857 ctx->at_version.version++;
3858
3859 t->update_snaps(
3860 coid,
3861 old_snaps,
3862 new_snaps);
3863 }
3864
3865 // save head snapset
3866 dout(10) << coid << " new snapset " << snapset << " on "
3867 << snapset_obc->obs.oi << dendl;
3868 if (snapset.clones.empty() &&
3869 (!snapset.head_exists ||
3870 (snapset_obc->obs.oi.is_whiteout() &&
3871 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3872 !snapset_obc->obs.oi.is_cache_pinned()))) {
3873 // NOTE: this arguably constitutes minor interference with the
3874 // tiering agent if this is a cache tier since a snap trim event
3875 // is effectively evicting a whiteout we might otherwise want to
3876 // keep around.
3877 dout(10) << coid << " removing " << snapoid << dendl;
3878 ctx->log.push_back(
3879 pg_log_entry_t(
3880 pg_log_entry_t::DELETE,
3881 snapoid,
3882 ctx->at_version,
3883 ctx->snapset_obc->obs.oi.version,
3884 0,
3885 osd_reqid_t(),
3886 ctx->mtime,
3887 0)
3888 );
3889 if (snapoid.is_head()) {
3890 derr << "removing snap head" << dendl;
3891 object_info_t& oi = ctx->snapset_obc->obs.oi;
3892 ctx->delta_stats.num_objects--;
3893 if (oi.is_dirty()) {
3894 ctx->delta_stats.num_objects_dirty--;
3895 }
3896 if (oi.is_omap())
3897 ctx->delta_stats.num_objects_omap--;
3898 if (oi.is_whiteout()) {
3899 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3900 ctx->delta_stats.num_whiteouts--;
3901 }
3902 if (oi.is_cache_pinned()) {
3903 ctx->delta_stats.num_objects_pinned--;
3904 }
3905 }
3906 ctx->snapset_obc->obs.exists = false;
3907 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3908 t->remove(snapoid);
3909 } else {
3910 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3911 snapset.filter(pool.info);
3912 dout(10) << coid << " writing updated snapset on " << snapoid
3913 << ", snapset is " << snapset << dendl;
3914 ctx->log.push_back(
3915 pg_log_entry_t(
3916 pg_log_entry_t::MODIFY,
3917 snapoid,
3918 ctx->at_version,
3919 ctx->snapset_obc->obs.oi.version,
3920 0,
3921 osd_reqid_t(),
3922 ctx->mtime,
3923 0)
3924 );
3925
3926 ctx->snapset_obc->obs.oi.prior_version =
3927 ctx->snapset_obc->obs.oi.version;
3928 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3929
3930 map <string, bufferlist> attrs;
3931 bl.clear();
3932 ::encode(snapset, bl);
3933 attrs[SS_ATTR].claim(bl);
3934
3935 bl.clear();
3936 ::encode(ctx->snapset_obc->obs.oi, bl,
3937 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3938 attrs[OI_ATTR].claim(bl);
3939 t->setattrs(snapoid, attrs);
3940 }
3941
3942 *ctxp = std::move(ctx);
3943 return 0;
3944 }
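
// Worked example (illustrative, not part of the original source): suppose
// snapset.clones = [4, 7] with clone_snaps = {4: [4,3], 7: [7,6]} and the
// pool has removed snaps 6 and 7. Trimming clone 7 computes an empty
// new_snaps, so the clone object is deleted, its entries are erased from
// clones/clone_overlap/clone_size/clone_snaps, and its overlap is
// intersected into clone 4's clone_overlap before the stats deltas are
// adjusted. If instead only snap 3 had been removed, trimming clone 4
// would merely rewrite its object_info and the snapset with
// new_snaps = {4}.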
3945
3946 void PrimaryLogPG::kick_snap_trim()
3947 {
3948 assert(is_active());
3949 assert(is_primary());
3950 if (is_clean() && !snap_trimq.empty()) {
3951 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3952 snap_trimmer_machine.process_event(KickTrim());
3953 }
3954 }
3955
3956 void PrimaryLogPG::snap_trimmer_scrub_complete()
3957 {
3958 if (is_primary() && is_active() && is_clean()) {
3959 assert(!snap_trimq.empty());
3960 snap_trimmer_machine.process_event(ScrubComplete());
3961 }
3962 }
3963
3964 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3965 {
3966 if (deleting || pg_has_reset_since(queued)) {
3967 return;
3968 }
3969
3970 assert(is_primary());
3971
3972 dout(10) << "snap_trimmer posting" << dendl;
3973 snap_trimmer_machine.process_event(DoSnapWork());
3974 dout(10) << "snap_trimmer complete" << dendl;
3975 return;
3976 }
3977
3978 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3979 {
3980 __u64 v2;
3981
3982 string v2s(xattr.c_str(), xattr.length());
3983 if (v2s.length())
3984 v2 = strtoull(v2s.c_str(), NULL, 10);
3985 else
3986 v2 = 0;
3987
3988 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3989
3990 switch (op) {
3991 case CEPH_OSD_CMPXATTR_OP_EQ:
3992 return (v1 == v2);
3993 case CEPH_OSD_CMPXATTR_OP_NE:
3994 return (v1 != v2);
3995 case CEPH_OSD_CMPXATTR_OP_GT:
3996 return (v1 > v2);
3997 case CEPH_OSD_CMPXATTR_OP_GTE:
3998 return (v1 >= v2);
3999 case CEPH_OSD_CMPXATTR_OP_LT:
4000 return (v1 < v2);
4001 case CEPH_OSD_CMPXATTR_OP_LTE:
4002 return (v1 <= v2);
4003 default:
4004 return -EINVAL;
4005 }
4006 }
4007
4008 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4009 {
4010 string v2s(xattr.c_str(), xattr.length());
4011
4012 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4013
4014 switch (op) {
4015 case CEPH_OSD_CMPXATTR_OP_EQ:
4016 return (v1s.compare(v2s) == 0);
4017 case CEPH_OSD_CMPXATTR_OP_NE:
4018 return (v1s.compare(v2s) != 0);
4019 case CEPH_OSD_CMPXATTR_OP_GT:
4020 return (v1s.compare(v2s) > 0);
4021 case CEPH_OSD_CMPXATTR_OP_GTE:
4022 return (v1s.compare(v2s) >= 0);
4023 case CEPH_OSD_CMPXATTR_OP_LT:
4024 return (v1s.compare(v2s) < 0);
4025 case CEPH_OSD_CMPXATTR_OP_LTE:
4026 return (v1s.compare(v2s) <= 0);
4027 default:
4028 return -EINVAL;
4029 }
4030 }
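
// Client-side sketch (illustrative, not part of the original source) of how
// these comparisons are typically exercised through the librados C++ API;
// the ioctx, object name, and attribute values below are hypothetical.
//
//   librados::ObjectReadOperation rop;
//   bufferlist val;
//   val.append("gold");
//   // string mode: succeed only if xattr "tag" == "gold"
//   rop.cmpxattr("tag", CEPH_OSD_CMPXATTR_OP_EQ, val);
//   // u64 mode: succeed only if xattr "gen" >= 42
//   rop.cmpxattr("gen", CEPH_OSD_CMPXATTR_OP_GTE, (uint64_t)42);
//   int r = ioctx.operate("myobject", &rop, nullptr);
//   // a false comparison surfaces as -ECANCELED (see do_osd_ops() below)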
4031
4032 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4033 {
4034 ceph_osd_op& op = osd_op.op;
4035 vector<OSDOp> write_ops(1);
4036 OSDOp& write_op = write_ops[0];
4037 uint64_t write_length = op.writesame.length;
4038 int result = 0;
4039
4040 if (!write_length)
4041 return 0;
4042
4043 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4044 return -EINVAL;
4045
4046 if (op.writesame.data_length != osd_op.indata.length()) {
4047 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4048 return -EINVAL;
4049 }
4050
4051 while (write_length) {
4052 write_op.indata.append(osd_op.indata);
4053 write_length -= op.writesame.data_length;
4054 }
4055
4056 write_op.op.op = CEPH_OSD_OP_WRITE;
4057 write_op.op.extent.offset = op.writesame.offset;
4058 write_op.op.extent.length = op.writesame.length;
4059 result = do_osd_ops(ctx, write_ops);
4060 if (result < 0)
4061 derr << "do_writesame do_osd_ops failed " << result << dendl;
4062
4063 return result;
4064 }
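
// Worked example (illustrative, not part of the original source): with
// op.writesame.offset = 0, op.writesame.length = 12 and a 4-byte pattern
// "abcd" in osd_op.indata, the loop above appends the pattern three times
// and issues a single 12-byte CEPH_OSD_OP_WRITE of "abcdabcdabcd". A
// length that is not a multiple of data_length (e.g. 10) is rejected with
// -EINVAL before the loop runs.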
4065
4066 // ========================================================================
4067 // low level osd ops
4068
4069 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4070 {
4071 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4072 bufferlist header, vals;
4073 int r = _get_tmap(ctx, &header, &vals);
4074 if (r < 0) {
4075 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4076 r = 0;
4077 return r;
4078 }
4079
4080 vector<OSDOp> ops(3);
4081
4082 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4083 ops[0].op.extent.offset = 0;
4084 ops[0].op.extent.length = 0;
4085
4086 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4087 ops[1].indata.claim(header);
4088
4089 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4090 ops[2].indata.claim(vals);
4091
4092 return do_osd_ops(ctx, ops);
4093 }
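
// Sketch (illustrative, not part of the original source) of the legacy
// tmap layout that _get_tmap() yields: the object's data is an encoded
// header bufferlist followed by an encoded map<string,bufferlist>.
//
//   bufferlist tmap;                 // hypothetical construction
//   bufferlist header;
//   header.append("hdr");
//   map<string, bufferlist> kv;
//   kv["key"].append("value");
//   ::encode(header, tmap);          // becomes the omap header (ops[1])
//   ::encode(kv, tmap);              // becomes the omap values (ops[2])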
4094
4095 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4096 bufferlist& bl)
4097 {
4098 // decode
4099 bufferlist header;
4100 map<string, bufferlist> m;
4101 if (bl.length()) {
4102 bufferlist::iterator p = bl.begin();
4103 ::decode(header, p);
4104 ::decode(m, p);
4105 assert(p.end());
4106 }
4107
4108 // do the update(s)
4109 while (!bp.end()) {
4110 __u8 op;
4111 string key;
4112 ::decode(op, bp);
4113
4114 switch (op) {
4115 case CEPH_OSD_TMAP_SET: // insert key
4116 {
4117 ::decode(key, bp);
4118 bufferlist data;
4119 ::decode(data, bp);
4120 m[key] = data;
4121 }
4122 break;
4123 case CEPH_OSD_TMAP_RM: // remove key
4124 ::decode(key, bp);
4125 if (!m.count(key)) {
4126 return -ENOENT;
4127 }
4128 m.erase(key);
4129 break;
4130 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4131 ::decode(key, bp);
4132 m.erase(key);
4133 break;
4134 case CEPH_OSD_TMAP_HDR: // update header
4135 {
4136 ::decode(header, bp);
4137 }
4138 break;
4139 default:
4140 return -EINVAL;
4141 }
4142 }
4143
4144 // reencode
4145 bufferlist obl;
4146 ::encode(header, obl);
4147 ::encode(m, obl);
4148
4149 // write it out
4150 vector<OSDOp> nops(1);
4151 OSDOp& newop = nops[0];
4152 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4153 newop.op.extent.offset = 0;
4154 newop.op.extent.length = obl.length();
4155 newop.indata = obl;
4156 do_osd_ops(ctx, nops);
4157 osd_op.outdata.claim(newop.outdata);
4158 return 0;
4159 }
4160
4161 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4162 {
4163 bufferlist::iterator orig_bp = bp;
4164 int result = 0;
4165 if (bp.end()) {
4166 dout(10) << "tmapup is a no-op" << dendl;
4167 } else {
4168 // read the whole object
4169 vector<OSDOp> nops(1);
4170 OSDOp& newop = nops[0];
4171 newop.op.op = CEPH_OSD_OP_READ;
4172 newop.op.extent.offset = 0;
4173 newop.op.extent.length = 0;
4174 result = do_osd_ops(ctx, nops);
4175
4176 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4177
4178 dout(30) << " starting is \n";
4179 newop.outdata.hexdump(*_dout);
4180 *_dout << dendl;
4181
4182 bufferlist::iterator ip = newop.outdata.begin();
4183 bufferlist obl;
4184
4185 dout(30) << "the update command is: \n";
4186 osd_op.indata.hexdump(*_dout);
4187 *_dout << dendl;
4188
4189 // header
4190 bufferlist header;
4191 __u32 nkeys = 0;
4192 if (newop.outdata.length()) {
4193 ::decode(header, ip);
4194 ::decode(nkeys, ip);
4195 }
4196 dout(10) << "tmapup header " << header.length() << dendl;
4197
4198 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4199 ++bp;
4200 ::decode(header, bp);
4201 dout(10) << "tmapup new header " << header.length() << dendl;
4202 }
4203
4204 ::encode(header, obl);
4205
4206 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4207
4208 // update keys
4209 bufferlist newkeydata;
4210 string nextkey, last_in_key;
4211 bufferlist nextval;
4212 bool have_next = false;
4213 if (!ip.end()) {
4214 have_next = true;
4215 ::decode(nextkey, ip);
4216 ::decode(nextval, ip);
4217 }
4218 while (!bp.end() && !result) {
4219 __u8 op;
4220 string key;
4221 try {
4222 ::decode(op, bp);
4223 ::decode(key, bp);
4224 }
4225 catch (buffer::error& e) {
4226 return -EINVAL;
4227 }
4228 if (key < last_in_key) {
4229 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4230 << "', falling back to an inefficient (unsorted) update" << dendl;
4231 bp = orig_bp;
4232 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4233 }
4234 last_in_key = key;
4235
4236 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4237
4238 // skip existing intervening keys
4239 bool key_exists = false;
4240 while (have_next && !key_exists) {
4241 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4242 if (nextkey > key)
4243 break;
4244 if (nextkey < key) {
4245 // copy untouched.
4246 ::encode(nextkey, newkeydata);
4247 ::encode(nextval, newkeydata);
4248 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4249 } else {
4250 // don't copy; discard the old value and stop.
4251 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4252 key_exists = true;
4253 nkeys--;
4254 }
4255 if (!ip.end()) {
4256 ::decode(nextkey, ip);
4257 ::decode(nextval, ip);
4258 } else {
4259 have_next = false;
4260 }
4261 }
4262
4263 if (op == CEPH_OSD_TMAP_SET) {
4264 bufferlist val;
4265 try {
4266 ::decode(val, bp);
4267 }
4268 catch (buffer::error& e) {
4269 return -EINVAL;
4270 }
4271 ::encode(key, newkeydata);
4272 ::encode(val, newkeydata);
4273 dout(20) << " set " << key << " " << val.length() << dendl;
4274 nkeys++;
4275 } else if (op == CEPH_OSD_TMAP_CREATE) {
4276 if (key_exists) {
4277 return -EEXIST;
4278 }
4279 bufferlist val;
4280 try {
4281 ::decode(val, bp);
4282 }
4283 catch (buffer::error& e) {
4284 return -EINVAL;
4285 }
4286 ::encode(key, newkeydata);
4287 ::encode(val, newkeydata);
4288 dout(20) << " create " << key << " " << val.length() << dendl;
4289 nkeys++;
4290 } else if (op == CEPH_OSD_TMAP_RM) {
4291 // do nothing.
4292 if (!key_exists) {
4293 return -ENOENT;
4294 }
4295 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4296 // do nothing
4297 } else {
4298 dout(10) << " invalid tmap op " << (int)op << dendl;
4299 return -EINVAL;
4300 }
4301 }
4302
4303 // copy remaining
4304 if (have_next) {
4305 ::encode(nextkey, newkeydata);
4306 ::encode(nextval, newkeydata);
4307 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4308 }
4309 if (!ip.end()) {
4310 bufferlist rest;
4311 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4312 dout(20) << " keep trailing " << rest.length()
4313 << " at " << newkeydata.length() << dendl;
4314 newkeydata.claim_append(rest);
4315 }
4316
4317 // encode final key count + key data
4318 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4319 ::encode(nkeys, obl);
4320 obl.claim_append(newkeydata);
4321
4322 if (0) {
4323 dout(30) << " final is \n";
4324 obl.hexdump(*_dout);
4325 *_dout << dendl;
4326
4327 // sanity check
4328 bufferlist::iterator tp = obl.begin();
4329 bufferlist h;
4330 ::decode(h, tp);
4331 map<string,bufferlist> d;
4332 ::decode(d, tp);
4333 assert(tp.end());
4334 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4335 }
4336
4337 // write it out
4338 if (!result) {
4339 dout(20) << "tmapput write " << obl.length() << dendl;
4340 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4341 newop.op.extent.offset = 0;
4342 newop.op.extent.length = obl.length();
4343 newop.indata = obl;
4344 do_osd_ops(ctx, nops);
4345 osd_op.outdata.claim(newop.outdata);
4346 }
4347 }
4348 return result;
4349 }
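
// Sketch (illustrative, not part of the original source) of the TMAPUP
// input stream that do_tmapup()/do_tmapup_slow() consume: a sequence of
// (__u8 op, key[, value]) records, which clients should emit in sorted key
// order (unsorted input falls back to do_tmapup_slow()).
//
//   bufferlist updates;                  // hypothetical construction
//   __u8 op = CEPH_OSD_TMAP_SET;
//   ::encode(op, updates);
//   ::encode(string("alpha"), updates);
//   bufferlist val;
//   val.append("value");
//   ::encode(val, updates);              // SET/CREATE carry a value...
//   op = CEPH_OSD_TMAP_RM;
//   ::encode(op, updates);
//   ::encode(string("beta"), updates);   // ...RM/RMSLOPPY carry only a key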
4350
4351 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4352 {
4353 if (offset >= max ||
4354 length > max ||
4355 offset + length > max)
4356 return -EFBIG;
4357
4358 return 0;
4359 }
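
// Example (illustrative, not part of the original source): with an
// osd_max_object_size of 128 MB, check_offset_and_length(128M - 4096,
// 8192, 128M) returns -EFBIG because the extent would end past the
// configured maximum; checking offset and length individually first also
// keeps the unsigned sum from wrapping.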
4360
4361 struct FillInVerifyExtent : public Context {
4362 ceph_le64 *r;
4363 int32_t *rval;
4364 bufferlist *outdatap;
4365 boost::optional<uint32_t> maybe_crc;
4366 uint64_t size;
4367 OSDService *osd;
4368 hobject_t soid;
4369 __le32 flags;
4370 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4371 boost::optional<uint32_t> mc, uint64_t size,
4372 OSDService *osd, hobject_t soid, __le32 flags) :
4373 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4374 size(size), osd(osd), soid(soid), flags(flags) {}
4375 void finish(int len) override {
4376 *r = len;
4377 if (len < 0) {
4378 *rval = len;
4379 return;
4380 }
4381 *rval = 0;
4382
4383 // whole object? can we verify the checksum?
4384 if (maybe_crc && *r == size) {
4385 uint32_t crc = outdatap->crc32c(-1);
4386 if (maybe_crc != crc) {
4387 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4388 << " != expected 0x" << *maybe_crc
4389 << std::dec << " on " << soid;
4390 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4391 *rval = -EIO;
4392 *r = 0;
4393 }
4394 }
4395 }
4396 }
4397 };
4398
4399 struct ToSparseReadResult : public Context {
4400 int* result;
4401 bufferlist* data_bl;
4402 uint64_t data_offset;
4403 ceph_le64* len;
4404 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4405 ceph_le64* len)
4406 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4407 void finish(int r) override {
4408 if (r < 0) {
4409 *result = r;
4410 return;
4411 }
4412 *result = 0;
4413 *len = r;
4414 bufferlist outdata;
4415 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4416 ::encode(extents, outdata);
4417 ::encode_destructively(*data_bl, outdata);
4418 data_bl->swap(outdata);
4419 }
4420 };
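
// Sketch (illustrative, not part of the original source) of decoding the
// sparse-read reply assembled above (and by do_sparse_read() below): an
// encoded extent map followed by the packed extent data.
//
//   bufferlist::iterator p = osd_op.outdata.begin();  // hypothetical
//   map<uint64_t, uint64_t> extents;                  // offset -> length
//   ::decode(extents, p);
//   bufferlist data;
//   ::decode(data, p);   // bytes of all extents, concatenated in order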
4421
4422 template<typename V>
4423 static string list_keys(const map<string, V>& m) {
4424 string s;
4425 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4426 if (!s.empty()) {
4427 s.push_back(',');
4428 }
4429 s.append(itr->first);
4430 }
4431 return s;
4432 }
4433
4434 template<typename T>
4435 static string list_entries(const T& m) {
4436 string s;
4437 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4438 if (!s.empty()) {
4439 s.push_back(',');
4440 }
4441 s.append(*itr);
4442 }
4443 return s;
4444 }
4445
4446 void PrimaryLogPG::maybe_create_new_object(
4447 OpContext *ctx,
4448 bool ignore_transaction)
4449 {
4450 ObjectState& obs = ctx->new_obs;
4451 if (!obs.exists) {
4452 ctx->delta_stats.num_objects++;
4453 obs.exists = true;
4454 assert(!obs.oi.is_whiteout());
4455 obs.oi.new_object();
4456 if (!ignore_transaction)
4457 ctx->op_t->create(obs.oi.soid);
4458 } else if (obs.oi.is_whiteout()) {
4459 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4460 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4461 --ctx->delta_stats.num_whiteouts;
4462 }
4463 }
4464
4465 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4466 OSDOp& osd_op;
4467
4468 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4469 }
4470
4471 int execute() override {
4472 return osd_op.rval;
4473 }
4474 };
4475
4476 struct C_ChecksumRead : public Context {
4477 PrimaryLogPG *primary_log_pg;
4478 OSDOp &osd_op;
4479 Checksummer::CSumType csum_type;
4480 bufferlist init_value_bl;
4481 ceph_le64 read_length;
4482 bufferlist read_bl;
4483 Context *fill_extent_ctx;
4484
4485 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4486 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4487 boost::optional<uint32_t> maybe_crc, uint64_t size,
4488 OSDService *osd, hobject_t soid, __le32 flags)
4489 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4490 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4491 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4492 &read_bl, maybe_crc, size,
4493 osd, soid, flags)) {
4494 }
4495 ~C_ChecksumRead() override {
4496 delete fill_extent_ctx;
4497 }
4498
4499 void finish(int r) override {
4500 fill_extent_ctx->complete(r);
4501 fill_extent_ctx = nullptr;
4502
4503 if (osd_op.rval >= 0) {
4504 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4505 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4506 &init_value_bl_it, read_bl);
4507 }
4508 }
4509 };
4510
4511 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4512 bufferlist::iterator *bl_it)
4513 {
4514 dout(20) << __func__ << dendl;
4515
4516 auto& op = osd_op.op;
4517 if (op.checksum.chunk_size > 0) {
4518 if (op.checksum.length == 0) {
4519 dout(10) << __func__ << ": length required when chunk size provided"
4520 << dendl;
4521 return -EINVAL;
4522 }
4523 if (op.checksum.length % op.checksum.chunk_size != 0) {
4524 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4525 return -EINVAL;
4526 }
4527 }
4528
4529 auto& oi = ctx->new_obs.oi;
4530 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4531 // zeroed offset+length implies checksumming the whole object
4532 op.checksum.length = oi.size;
4533 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4534 return -EOVERFLOW;
4535 }
4536
4537 Checksummer::CSumType csum_type;
4538 switch (op.checksum.type) {
4539 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4540 csum_type = Checksummer::CSUM_XXHASH32;
4541 break;
4542 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4543 csum_type = Checksummer::CSUM_XXHASH64;
4544 break;
4545 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4546 csum_type = Checksummer::CSUM_CRC32C;
4547 break;
4548 default:
4549 dout(10) << __func__ << ": unknown crc type ("
4550 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4551 return -EINVAL;
4552 }
4553
4554 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4555 if (bl_it->get_remaining() < csum_init_value_size) {
4556 dout(10) << __func__ << ": init value not provided" << dendl;
4557 return -EINVAL;
4558 }
4559
4560 bufferlist init_value_bl;
4561 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4562 csum_init_value_size);
4563 bl_it->advance(csum_init_value_size);
4564
4565 if (pool.info.require_rollback() && op.checksum.length > 0) {
4566 // If there is a data digest and it is possible we are reading
4567 // entire object, pass the digest.
4568 boost::optional<uint32_t> maybe_crc;
4569 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4570 op.checksum.length >= oi.size) {
4571 maybe_crc = oi.data_digest;
4572 }
4573
4574 // async read
4575 auto& soid = oi.soid;
4576 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4577 std::move(init_value_bl), maybe_crc,
4578 oi.size, osd, soid, op.flags);
4579
4580 ctx->pending_async_reads.push_back({
4581 {op.checksum.offset, op.checksum.length, op.flags},
4582 {&checksum_ctx->read_bl, checksum_ctx}});
4583
4584 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4585 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4586 new ReadFinisher(osd_op));
4587 return -EINPROGRESS;
4588 }
4589
4590 // sync read
4591 std::vector<OSDOp> read_ops(1);
4592 auto& read_op = read_ops[0];
4593 if (op.checksum.length > 0) {
4594 read_op.op.op = CEPH_OSD_OP_READ;
4595 read_op.op.flags = op.flags;
4596 read_op.op.extent.offset = op.checksum.offset;
4597 read_op.op.extent.length = op.checksum.length;
4598 read_op.op.extent.truncate_size = 0;
4599 read_op.op.extent.truncate_seq = 0;
4600
4601 int r = do_osd_ops(ctx, read_ops);
4602 if (r < 0) {
4603 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4604 return r;
4605 }
4606 }
4607
4608 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4609 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4610 read_op.outdata);
4611 }
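
// Sketch (illustrative, not part of the original source) of the
// type-specific initial value that the code above pops off the op's input
// data via bl_it; e.g. for CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C a client
// appends a 4-byte seed:
//
//   bufferlist indata;                 // hypothetical construction
//   ::encode((uint32_t)-1, indata);    // crc32c initial value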
4612
4613 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4614 Checksummer::CSumType csum_type,
4615 bufferlist::iterator *init_value_bl_it,
4616 const bufferlist &read_bl) {
4617 dout(20) << __func__ << dendl;
4618
4619 auto& op = osd_op.op;
4620
4621 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4622 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4623 << op.checksum.length << dendl;
4624 return -EINVAL;
4625 }
4626
4627 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4628 op.checksum.chunk_size : read_bl.length());
4629 uint32_t csum_count = (csum_chunk_size > 0 ?
4630 read_bl.length() / csum_chunk_size : 0);
4631
4632 bufferlist csum;
4633 bufferptr csum_data;
4634 if (csum_count > 0) {
4635 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4636 csum_data = buffer::create(csum_value_size * csum_count);
4637 csum_data.zero();
4638 csum.append(csum_data);
4639
4640 switch (csum_type) {
4641 case Checksummer::CSUM_XXHASH32:
4642 {
4643 Checksummer::xxhash32::init_value_t init_value;
4644 ::decode(init_value, *init_value_bl_it);
4645 Checksummer::calculate<Checksummer::xxhash32>(
4646 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4647 &csum_data);
4648 }
4649 break;
4650 case Checksummer::CSUM_XXHASH64:
4651 {
4652 Checksummer::xxhash64::init_value_t init_value;
4653 ::decode(init_value, *init_value_bl_it);
4654 Checksummer::calculate<Checksummer::xxhash64>(
4655 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4656 &csum_data);
4657 }
4658 break;
4659 case Checksummer::CSUM_CRC32C:
4660 {
4661 Checksummer::crc32c::init_value_t init_value;
4662 ::decode(init_value, *init_value_bl_it);
4663 Checksummer::calculate<Checksummer::crc32c>(
4664 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4665 &csum_data);
4666 }
4667 break;
4668 default:
4669 break;
4670 }
4671 }
4672
4673 ::encode(csum_count, osd_op.outdata);
4674 osd_op.outdata.claim_append(csum);
4675 return 0;
4676 }
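
// Sketch (illustrative, not part of the original source) of decoding the
// CHECKSUM reply encoded above: a u32 count followed by count fixed-size
// values (4 bytes for crc32c/xxhash32, 8 bytes for xxhash64).
//
//   bufferlist::iterator p = osd_op.outdata.begin();  // hypothetical
//   uint32_t count;
//   ::decode(count, p);
//   for (uint32_t i = 0; i < count; ++i) {
//     uint32_t crc;                    // one value per chunk_size chunk
//     ::decode(crc, p);
//   }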
4677
4678 struct C_ExtentCmpRead : public Context {
4679 PrimaryLogPG *primary_log_pg;
4680 OSDOp &osd_op;
4681 ceph_le64 read_length;
4682 bufferlist read_bl;
4683 Context *fill_extent_ctx;
4684
4685 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4686 boost::optional<uint32_t> maybe_crc, uint64_t size,
4687 OSDService *osd, hobject_t soid, __le32 flags)
4688 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4689 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4690 &read_bl, maybe_crc, size,
4691 osd, soid, flags)) {
4692 }
4693 ~C_ExtentCmpRead() override {
4694 delete fill_extent_ctx;
4695 }
4696
4697 void finish(int r) override {
4698 if (r == -ENOENT) {
4699 osd_op.rval = 0;
4700 read_bl.clear();
4701 delete fill_extent_ctx;
4702 } else {
4703 fill_extent_ctx->complete(r);
4704 }
4705 fill_extent_ctx = nullptr;
4706
4707 if (osd_op.rval >= 0) {
4708 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4709 }
4710 }
4711 };
4712
4713 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4714 {
4715 dout(20) << __func__ << dendl;
4716 ceph_osd_op& op = osd_op.op;
4717
4718 auto& oi = ctx->new_obs.oi;
4719 uint64_t size = oi.size;
4720 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4721 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4722 size = op.extent.truncate_size;
4723 }
4724
4725 if (op.extent.offset >= size) {
4726 op.extent.length = 0;
4727 } else if (op.extent.offset + op.extent.length > size) {
4728 op.extent.length = size - op.extent.offset;
4729 }
4730
4731 if (op.extent.length == 0) {
4732 dout(20) << __func__ << " zero length extent" << dendl;
4733 return finish_extent_cmp(osd_op, bufferlist{});
4734 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4735 dout(20) << __func__ << " object DNE" << dendl;
4736 return finish_extent_cmp(osd_op, {});
4737 } else if (pool.info.require_rollback()) {
4738 // If there is a data digest and it is possible we are reading
4739 // entire object, pass the digest.
4740 boost::optional<uint32_t> maybe_crc;
4741 if (oi.is_data_digest() && op.extent.offset == 0 &&
4742 op.extent.length >= oi.size) {
4743 maybe_crc = oi.data_digest;
4744 }
4745
4746 // async read
4747 auto& soid = oi.soid;
4748 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4749 osd, soid, op.flags);
4750 ctx->pending_async_reads.push_back({
4751 {op.extent.offset, op.extent.length, op.flags},
4752 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4753
4754 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4755
4756 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4757 new ReadFinisher(osd_op));
4758 return -EINPROGRESS;
4759 }
4760
4761 // sync read
4762 vector<OSDOp> read_ops(1);
4763 OSDOp& read_op = read_ops[0];
4764
4765 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4766 read_op.op.extent.offset = op.extent.offset;
4767 read_op.op.extent.length = op.extent.length;
4768 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4769 read_op.op.extent.truncate_size = op.extent.truncate_size;
4770
4771 int result = do_osd_ops(ctx, read_ops);
4772 if (result < 0) {
4773 derr << __func__ << " failed " << result << dendl;
4774 return result;
4775 }
4776 return finish_extent_cmp(osd_op, read_op.outdata);
4777 }
4778
4779 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4780 {
4781 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4782 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4783 if (osd_op.indata[idx] != read_byte) {
4784 return (-MAX_ERRNO - idx);
4785 }
4786 }
4787
4788 return 0;
4789 }
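
// Sketch (illustrative, not part of the original source): a CMPEXT
// mismatch is reported as -MAX_ERRNO - idx, so a client can recover the
// offset of the first differing byte from the returned value:
//
//   int rval = ...;                     // hypothetical CMPEXT result
//   if (rval <= -MAX_ERRNO) {
//     uint64_t mismatch_off = static_cast<uint64_t>(-rval - MAX_ERRNO);
//   }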
4790
4791 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4792 dout(20) << __func__ << dendl;
4793 auto& op = osd_op.op;
4794 auto& oi = ctx->new_obs.oi;
4795 auto& soid = oi.soid;
4796 __u32 seq = oi.truncate_seq;
4797 uint64_t size = oi.size;
4798 bool trimmed_read = false;
4799
4800 // are we beyond truncate_size?
4801 if ( (seq < op.extent.truncate_seq) &&
4802 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4803 size = op.extent.truncate_size;
4804
4805 if (op.extent.length == 0) // length of zero means read the whole object
4806 op.extent.length = size;
4807
4808 if (op.extent.offset >= size) {
4809 op.extent.length = 0;
4810 trimmed_read = true;
4811 } else if (op.extent.offset + op.extent.length > size) {
4812 op.extent.length = size - op.extent.offset;
4813 trimmed_read = true;
4814 }
4815
4816 // read into a buffer
4817 int result = 0;
4818 if (trimmed_read && op.extent.length == 0) {
4819 // the read was trimmed to zero bytes and is expected to do nothing;
4820 // a plain zero-length read means 'read the whole object' (see above),
4821 // which is why the trimmed_read flag is needed
4822 } else if (pool.info.require_rollback()) {
4823 boost::optional<uint32_t> maybe_crc;
4824 // If there is a data digest and it is possible we are reading
4825 // entire object, pass the digest. FillInVerifyExtent
4826 // will check oi.size again.
4827 if (oi.is_data_digest() && op.extent.offset == 0 &&
4828 op.extent.length >= oi.size)
4829 maybe_crc = oi.data_digest;
4830 ctx->pending_async_reads.push_back(
4831 make_pair(
4832 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4833 make_pair(&osd_op.outdata,
4834 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4835 &osd_op.outdata, maybe_crc, oi.size,
4836 osd, soid, op.flags))));
4837 dout(10) << " async_read noted for " << soid << dendl;
4838
4839 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4840 new ReadFinisher(osd_op));
4841 } else {
4842 int r = pgbackend->objects_read_sync(
4843 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4844 if (r == -EIO) {
4845 r = rep_repair_primary_object(soid, ctx->op);
4846 }
4847 if (r >= 0)
4848 op.extent.length = r;
4849 else {
4850 result = r;
4851 op.extent.length = 0;
4852 }
4853 dout(10) << " read got " << r << " / " << op.extent.length
4854 << " bytes from obj " << soid << dendl;
4855
4856 // whole object? can we verify the checksum?
4857 if (op.extent.length == oi.size && oi.is_data_digest()) {
4858 uint32_t crc = osd_op.outdata.crc32c(-1);
4859 if (oi.data_digest != crc) {
4860 osd->clog->error() << info.pgid << std::hex
4861 << " full-object read crc 0x" << crc
4862 << " != expected 0x" << oi.data_digest
4863 << std::dec << " on " << soid;
4864 // FIXME fall back to replica or something?
4865 result = -EIO;
4866 }
4867 }
4868 }
4869
4870 // XXX for an async read, op.extent.length is the requested length;
4871 // on error it is reset to 0 once the error comes back.
4872 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4873 ctx->delta_stats.num_rd++;
4874 return result;
4875 }
4876
4877 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4878 dout(20) << __func__ << dendl;
4879 auto& op = osd_op.op;
4880 auto& oi = ctx->new_obs.oi;
4881 auto& soid = oi.soid;
4882
4883 if (op.extent.truncate_seq) {
4884 dout(0) << "sparse_read does not support truncate_seq" << dendl;
4885 return -EINVAL;
4886 }
4887
4888 ++ctx->num_read;
4889 if (pool.info.ec_pool()) {
4890 // translate sparse read to a normal one if not supported
4891 uint64_t offset = op.extent.offset;
4892 uint64_t length = op.extent.length;
4893 if (offset > oi.size) {
4894 length = 0;
4895 } else if (offset + length > oi.size) {
4896 length = oi.size - offset;
4897 }
4898
4899 if (length > 0) {
4900 ctx->pending_async_reads.push_back(
4901 make_pair(
4902 boost::make_tuple(offset, length, op.flags),
4903 make_pair(
4904 &osd_op.outdata,
4905 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4906 &op.extent.length))));
4907 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4908
4909 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4910 new ReadFinisher(osd_op));
4911 } else {
4912 dout(10) << " sparse read ended up empty for " << soid << dendl;
4913 map<uint64_t, uint64_t> extents;
4914 ::encode(extents, osd_op.outdata);
4915 }
4916 } else {
4917 // read into a buffer
4918 map<uint64_t, uint64_t> m;
4919 uint32_t total_read = 0;
4920 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4921 info.pgid.shard),
4922 op.extent.offset, op.extent.length, m);
4923 if (r < 0) {
4924 return r;
4925 }
4926
4927 map<uint64_t, uint64_t>::iterator miter;
4928 bufferlist data_bl;
4929 uint64_t last = op.extent.offset;
4930 for (miter = m.begin(); miter != m.end(); ++miter) {
4931 // verify hole?
4932 if (cct->_conf->osd_verify_sparse_read_holes &&
4933 last < miter->first) {
4934 bufferlist t;
4935 uint64_t len = miter->first - last;
4936 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4937 if (r < 0) {
4938 osd->clog->error() << coll << " " << soid
4939 << " sparse-read failed to read: "
4940 << r;
4941 } else if (!t.is_zero()) {
4942 osd->clog->error() << coll << " " << soid
4943 << " sparse-read found data in hole "
4944 << last << "~" << len;
4945 }
4946 }
4947
4948 bufferlist tmpbl;
4949 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4950 op.flags, &tmpbl);
4951 if (r == -EIO) {
4952 r = rep_repair_primary_object(soid, ctx->op);
4953 }
4954 if (r < 0) {
4955 return r;
4956 }
4957
4958 // this usually happens when fiemap returns an extent that exceeds the
4959 // actual file size
4960 if (r < (int)miter->second)
4961 miter->second = r;
4962 total_read += r;
4963 dout(10) << "sparse-read " << miter->first << "@" << miter->second
4964 << dendl;
4965 data_bl.claim_append(tmpbl);
4966 last = miter->first + r;
4967 }
4968
4969 if (r < 0) {
4970 return r;
4971 }
4972
4973 // verify trailing hole?
4974 if (cct->_conf->osd_verify_sparse_read_holes) {
4975 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4976 if (last < end) {
4977 bufferlist t;
4978 uint64_t len = end - last;
4979 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4980 if (r < 0) {
4981 osd->clog->error() << coll << " " << soid
4982 << " sparse-read failed to read: " << r;
4983 } else if (!t.is_zero()) {
4984 osd->clog->error() << coll << " " << soid
4985 << " sparse-read found data in hole "
4986 << last << "~" << len;
4987 }
4988 }
4989 }
4990
4991 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read.
4992 // At first there may be few whole objects, but with continued use more
4993 // and more whole objects exist, so verifying the data digest on a
4994 // whole-object sparse-read is worthwhile.
4995 if (total_read == oi.size && oi.is_data_digest()) {
4996 uint32_t crc = data_bl.crc32c(-1);
4997 if (oi.data_digest != crc) {
4998 osd->clog->error() << info.pgid << std::hex
4999 << " full-object read crc 0x" << crc
5000 << " != expected 0x" << oi.data_digest
5001 << std::dec << " on " << soid;
5002 // FIXME fall back to replica or something?
5003 return -EIO;
5004 }
5005 }
5006
5007 op.extent.length = total_read;
5008
5009 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5010 ::encode_destructively(data_bl, osd_op.outdata);
5011
5012 dout(10) << " sparse_read got " << total_read << " bytes from object "
5013 << soid << dendl;
5014 }
5015
5016 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5017 ctx->delta_stats.num_rd++;
5018 return 0;
5019 }
5020
5021 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5022 {
5023 int result = 0;
5024 SnapSetContext *ssc = ctx->obc->ssc;
5025 ObjectState& obs = ctx->new_obs;
5026 object_info_t& oi = obs.oi;
5027 const hobject_t& soid = oi.soid;
5028
5029 PGTransaction* t = ctx->op_t.get();
5030
5031 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5032
5033 ctx->current_osd_subop_num = 0;
5034 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
5035 OSDOp& osd_op = *p;
5036 ceph_osd_op& op = osd_op.op;
5037
5038 OpFinisher* op_finisher = nullptr;
5039 {
5040 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5041 if (op_finisher_it != ctx->op_finishers.end()) {
5042 op_finisher = op_finisher_it->second.get();
5043 }
5044 }
5045
5046 // TODO: check endianness (__le32 vs uint32_t, etc.)
5047 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5048 // but the code in this function seems to treat them as native-endian. What should the
5049 // tracepoints do?
5050 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5051
5052 dout(10) << "do_osd_op " << osd_op << dendl;
5053
5054 bufferlist::iterator bp = osd_op.indata.begin();
5055
5056 // user-visible modification?
5057 switch (op.op) {
5058 // non user-visible modifications
5059 case CEPH_OSD_OP_WATCH:
5060 case CEPH_OSD_OP_CACHE_EVICT:
5061 case CEPH_OSD_OP_CACHE_FLUSH:
5062 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5063 case CEPH_OSD_OP_UNDIRTY:
5064 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5065 case CEPH_OSD_OP_CACHE_PIN:
5066 case CEPH_OSD_OP_CACHE_UNPIN:
5067 case CEPH_OSD_OP_SET_REDIRECT:
5068 break;
5069 default:
5070 if (op.op & CEPH_OSD_OP_MODE_WR)
5071 ctx->user_modify = true;
5072 }
5073
5074 // munge -1 truncate to 0 truncate
5075 if (ceph_osd_op_uses_extent(op.op) &&
5076 op.extent.truncate_seq == 1 &&
5077 op.extent.truncate_size == (-1ULL)) {
5078 op.extent.truncate_size = 0;
5079 op.extent.truncate_seq = 0;
5080 }
5081
5082 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5083 if (op.op == CEPH_OSD_OP_ZERO &&
5084 obs.exists &&
5085 op.extent.offset < cct->_conf->osd_max_object_size &&
5086 op.extent.length >= 1 &&
5087 op.extent.length <= cct->_conf->osd_max_object_size &&
5088 op.extent.offset + op.extent.length >= oi.size) {
5089 if (op.extent.offset >= oi.size) {
5090 // no-op
5091 goto fail;
5092 }
5093 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5094 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5095 op.op = CEPH_OSD_OP_TRUNCATE;
5096 }
5097
5098 switch (op.op) {
5099
5100 // --- READS ---
5101
5102 case CEPH_OSD_OP_CMPEXT:
5103 ++ctx->num_read;
5104 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5105 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5106 op.extent.length, op.extent.truncate_size,
5107 op.extent.truncate_seq);
5108
5109 if (op_finisher == nullptr) {
5110 result = do_extent_cmp(ctx, osd_op);
5111 } else {
5112 result = op_finisher->execute();
5113 }
5114 break;
5115
5116 case CEPH_OSD_OP_SYNC_READ:
5117 if (pool.info.require_rollback()) {
5118 result = -EOPNOTSUPP;
5119 break;
5120 }
5121 // fall through
5122 case CEPH_OSD_OP_READ:
5123 ++ctx->num_read;
5124 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5125 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5126 op.extent.length, op.extent.truncate_size,
5127 op.extent.truncate_seq);
5128 if (op_finisher == nullptr) {
5129 if (!ctx->data_off) {
5130 ctx->data_off = op.extent.offset;
5131 }
5132 result = do_read(ctx, osd_op);
5133 } else {
5134 result = op_finisher->execute();
5135 }
5136 break;
5137
5138 case CEPH_OSD_OP_CHECKSUM:
5139 ++ctx->num_read;
5140 {
5141 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5142 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5143 op.checksum.offset, op.checksum.length,
5144 op.checksum.chunk_size);
5145
5146 if (op_finisher == nullptr) {
5147 result = do_checksum(ctx, osd_op, &bp);
5148 } else {
5149 result = op_finisher->execute();
5150 }
5151 }
5152 break;
5153
5154 /* map extents */
5155 case CEPH_OSD_OP_MAPEXT:
5156 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5157 if (pool.info.require_rollback()) {
5158 result = -EOPNOTSUPP;
5159 break;
5160 }
5161 ++ctx->num_read;
5162 {
5163 // read into a buffer
5164 bufferlist bl;
5165 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5166 info.pgid.shard),
5167 op.extent.offset, op.extent.length, bl);
5168 osd_op.outdata.claim(bl);
5169 if (r < 0)
5170 result = r;
5171 else
5172 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5173 ctx->delta_stats.num_rd++;
5174 dout(10) << " map_extents done on object " << soid << dendl;
5175 }
5176 break;
5177
5178 /* map extents */
5179 case CEPH_OSD_OP_SPARSE_READ:
5180 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5181 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5182 op.extent.length, op.extent.truncate_size,
5183 op.extent.truncate_seq);
5184 if (op_finisher == nullptr) {
5185 result = do_sparse_read(ctx, osd_op);
5186 } else {
5187 result = op_finisher->execute();
5188 }
5189 break;
5190
5191 case CEPH_OSD_OP_CALL:
5192 {
5193 string cname, mname;
5194 bufferlist indata;
5195 try {
5196 bp.copy(op.cls.class_len, cname);
5197 bp.copy(op.cls.method_len, mname);
5198 bp.copy(op.cls.indata_len, indata);
5199 } catch (buffer::error& e) {
5200 dout(10) << "call unable to decode class + method + indata" << dendl;
5201 dout(30) << "in dump: ";
5202 osd_op.indata.hexdump(*_dout);
5203 *_dout << dendl;
5204 result = -EINVAL;
5205 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5206 break;
5207 }
5208 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5209
5210 ClassHandler::ClassData *cls;
5211 result = osd->class_handler->open_class(cname, &cls);
5212 assert(result == 0); // init_op_flags() already verified this works.
5213
5214 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5215 if (!method) {
5216 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5217 result = -EOPNOTSUPP;
5218 break;
5219 }
5220
5221 int flags = method->get_flags();
5222 if (flags & CLS_METHOD_WR)
5223 ctx->user_modify = true;
5224
5225 bufferlist outdata;
5226 dout(10) << "call method " << cname << "." << mname << dendl;
5227 int prev_rd = ctx->num_read;
5228 int prev_wr = ctx->num_write;
5229 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5230
5231 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5232 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5233 result = -EIO;
5234 break;
5235 }
5236 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5237 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5238 result = -EIO;
5239 break;
5240 }
5241
5242 dout(10) << "method called response length=" << outdata.length() << dendl;
5243 op.extent.length = outdata.length();
5244 osd_op.outdata.claim_append(outdata);
5245 dout(30) << "out dump: ";
5246 osd_op.outdata.hexdump(*_dout);
5247 *_dout << dendl;
5248 }
5249 break;
5250
5251 case CEPH_OSD_OP_STAT:
5252 // note: stat does not require RD
5253 {
5254 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5255
5256 if (obs.exists && !oi.is_whiteout()) {
5257 ::encode(oi.size, osd_op.outdata);
5258 ::encode(oi.mtime, osd_op.outdata);
5259 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5260 } else {
5261 result = -ENOENT;
5262 dout(10) << "stat oi object does not exist" << dendl;
5263 }
5264
5265 ctx->delta_stats.num_rd++;
5266 }
5267 break;
5268
5269 case CEPH_OSD_OP_ISDIRTY:
5270 ++ctx->num_read;
5271 {
5272 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5273 bool is_dirty = obs.oi.is_dirty();
5274 ::encode(is_dirty, osd_op.outdata);
5275 ctx->delta_stats.num_rd++;
5276 result = 0;
5277 }
5278 break;
5279
5280 case CEPH_OSD_OP_UNDIRTY:
5281 ++ctx->num_write;
5282 {
5283 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5284 if (oi.is_dirty()) {
5285 ctx->undirty = true; // see make_writeable()
5286 ctx->modify = true;
5287 ctx->delta_stats.num_wr++;
5288 }
5289 result = 0;
5290 }
5291 break;
5292
5293 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5294 ++ctx->num_write;
5295 {
5296 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5297 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5298 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5299 result = -EINVAL;
5300 break;
5301 }
5302 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5303 result = -EINVAL;
5304 break;
5305 }
5306 if (!obs.exists) {
5307 result = 0;
5308 break;
5309 }
5310 if (oi.is_cache_pinned()) {
5311 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5312 result = -EPERM;
5313 break;
5314 }
5315 if (oi.is_dirty()) {
5316 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5317 if (result == -EINPROGRESS)
5318 result = -EAGAIN;
5319 } else {
5320 result = 0;
5321 }
5322 }
5323 break;
5324
5325 case CEPH_OSD_OP_CACHE_FLUSH:
5326 ++ctx->num_write;
5327 {
5328 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5329 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5330 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5331 result = -EINVAL;
5332 break;
5333 }
5334 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5335 result = -EINVAL;
5336 break;
5337 }
5338 if (!obs.exists) {
5339 result = 0;
5340 break;
5341 }
5342 if (oi.is_cache_pinned()) {
5343 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5344 result = -EPERM;
5345 break;
5346 }
5347 hobject_t missing;
5348 if (oi.is_dirty()) {
5349 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5350 if (result == -EINPROGRESS)
5351 result = -EAGAIN;
5352 } else {
5353 result = 0;
5354 }
5355 // Check special return value which has set missing_return
5356 if (result == -ENOENT) {
5357 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5358 assert(!missing.is_min());
5359 wait_for_unreadable_object(missing, ctx->op);
5360 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5361 result = -EAGAIN;
5362 }
5363 }
5364 break;
5365
5366 case CEPH_OSD_OP_CACHE_EVICT:
5367 ++ctx->num_write;
5368 {
5369 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5370 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5371 result = -EINVAL;
5372 break;
5373 }
5374 if (!obs.exists) {
5375 result = 0;
5376 break;
5377 }
5378 if (oi.is_cache_pinned()) {
5379 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5380 result = -EPERM;
5381 break;
5382 }
5383 if (oi.is_dirty()) {
5384 result = -EBUSY;
5385 break;
5386 }
5387 if (!oi.watchers.empty()) {
5388 result = -EBUSY;
5389 break;
5390 }
5391 if (soid.snap == CEPH_NOSNAP) {
5392 result = _verify_no_head_clones(soid, ssc->snapset);
5393 if (result < 0)
5394 break;
5395 }
5396 result = _delete_oid(ctx, true, false);
5397 if (result >= 0) {
5398 // mark that this is a cache eviction to avoid triggering normal
5399 // make_writeable() clone or snapdir object creation in finish_ctx()
5400 ctx->cache_evict = true;
5401 }
5402 osd->logger->inc(l_osd_tier_evict);
5403 }
5404 break;
5405
5406 case CEPH_OSD_OP_GETXATTR:
5407 ++ctx->num_read;
5408 {
5409 string aname;
5410 bp.copy(op.xattr.name_len, aname);
5411 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5412 string name = "_" + aname;
5413 int r = getattr_maybe_cache(
5414 ctx->obc,
5415 name,
5416 &(osd_op.outdata));
5417 if (r >= 0) {
5418 op.xattr.value_len = osd_op.outdata.length();
5419 result = 0;
5420 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5421 } else
5422 result = r;
5423
5424 ctx->delta_stats.num_rd++;
5425 }
5426 break;
5427
5428 case CEPH_OSD_OP_GETXATTRS:
5429 ++ctx->num_read;
5430 {
5431 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5432 map<string, bufferlist> out;
5433 result = getattrs_maybe_cache(
5434 ctx->obc,
5435 &out,
5436 true);
5437
5438 bufferlist bl;
5439 ::encode(out, bl);
5440 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5441 ctx->delta_stats.num_rd++;
5442 osd_op.outdata.claim_append(bl);
5443 }
5444 break;
5445
5446 case CEPH_OSD_OP_CMPXATTR:
5447 ++ctx->num_read;
5448 {
5449 string aname;
5450 bp.copy(op.xattr.name_len, aname);
5451 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5452 string name = "_" + aname;
5453 name[op.xattr.name_len + 1] = 0;
5454
5455 bufferlist xattr;
5456 result = getattr_maybe_cache(
5457 ctx->obc,
5458 name,
5459 &xattr);
5460 if (result < 0 && result != -EEXIST && result != -ENODATA)
5461 break;
5462
5463 ctx->delta_stats.num_rd++;
5464 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5465
5466 switch (op.xattr.cmp_mode) {
5467 case CEPH_OSD_CMPXATTR_MODE_STRING:
5468 {
5469 string val;
5470 bp.copy(op.xattr.value_len, val);
5471 val[op.xattr.value_len] = 0;
5472 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5473 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5474 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5475 }
5476 break;
5477
5478 case CEPH_OSD_CMPXATTR_MODE_U64:
5479 {
5480 uint64_t u64val;
5481 try {
5482 ::decode(u64val, bp);
5483 }
5484 catch (buffer::error& e) {
5485 result = -EINVAL;
5486 goto fail;
5487 }
5488 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5489 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5490 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5491 }
5492 break;
5493
5494 default:
5495 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5496 result = -EINVAL;
5497 }
5498
5499 if (!result) {
5500 dout(10) << "comparison returned false" << dendl;
5501 result = -ECANCELED;
5502 break;
5503 }
5504 if (result < 0) {
5505 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5506 break;
5507 }
5508
5509 dout(10) << "comparison returned true" << dendl;
5510 }
5511 break;
5512
5513 case CEPH_OSD_OP_ASSERT_VER:
5514 ++ctx->num_read;
5515 {
5516 uint64_t ver = op.assert_ver.ver;
5517 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5518 if (!ver)
5519 result = -EINVAL;
5520 else if (ver < oi.user_version)
5521 result = -ERANGE;
5522 else if (ver > oi.user_version)
5523 result = -EOVERFLOW;
5524 }
5525 break;
5526
5527 case CEPH_OSD_OP_LIST_WATCHERS:
5528 ++ctx->num_read;
5529 {
5530 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5531 obj_list_watch_response_t resp;
5532
5533 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5534 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5535 ++oi_iter) {
5536 dout(20) << "key cookie=" << oi_iter->first.first
5537 << " entity=" << oi_iter->first.second << " "
5538 << oi_iter->second << dendl;
5539 assert(oi_iter->first.first == oi_iter->second.cookie);
5540 assert(oi_iter->first.second.is_client());
5541
5542 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5543 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5544 resp.entries.push_back(wi);
5545 }
5546
5547 resp.encode(osd_op.outdata, ctx->get_features());
5548 result = 0;
5549
5550 ctx->delta_stats.num_rd++;
5551 break;
5552 }
5553
5554 case CEPH_OSD_OP_LIST_SNAPS:
5555 ++ctx->num_read;
5556 {
5557 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5558 obj_list_snap_response_t resp;
5559
5560 if (!ssc) {
5561 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5562 }
5563 assert(ssc);
5564
5565 int clonecount = ssc->snapset.clones.size();
5566 if (ssc->snapset.head_exists)
5567 clonecount++;
5568 resp.clones.reserve(clonecount);
5569 for (auto clone_iter = ssc->snapset.clones.begin();
5570 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5571 clone_info ci;
5572 ci.cloneid = *clone_iter;
5573
5574 hobject_t clone_oid = soid;
5575 clone_oid.snap = *clone_iter;
5576
5577 if (!ssc->snapset.is_legacy()) {
5578 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5579 if (p == ssc->snapset.clone_snaps.end()) {
5580 osd->clog->error() << "osd." << osd->whoami
5581 << ": inconsistent clone_snaps found for oid "
5582 << soid << " clone " << *clone_iter
5583 << " snapset " << ssc->snapset;
5584 result = -EINVAL;
5585 break;
5586 }
5587 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5588 ci.snaps.push_back(*q);
5589 }
5590 } else {
5591 /* No need to take a lock here. We are only inspecting state cached
5592 * in the ObjectContext, so we aren't performing an actual read unless
5593 * the clone obc is not already loaded (in which case, it cannot have
5594 * an in progress write). We also do not risk exposing uncommitted
5595 * state since we do have a read lock on the head object or snapdir,
5596 * which we would have to write lock in order to make user visible
5597 * modifications to the snapshot state (snap trim related mutations
5598 * are not user visible).
5599 */
5600 if (is_missing_object(clone_oid)) {
5601 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5602 wait_for_unreadable_object(clone_oid, ctx->op);
5603 result = -EAGAIN;
5604 break;
5605 }
5606
5607 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5608 if (!clone_obc) {
5609 if (maybe_handle_cache(
5610 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5611 // promoting the clone
5612 result = -EAGAIN;
5613 } else {
5614 osd->clog->error() << "osd." << osd->whoami
5615 << ": missing clone " << clone_oid
5616 << " for oid "
5617 << soid;
5618 // should not happen
5619 result = -ENOENT;
5620 }
5621 break;
5622 }
5623 for (vector<snapid_t>::reverse_iterator p =
5624 clone_obc->obs.oi.legacy_snaps.rbegin();
5625 p != clone_obc->obs.oi.legacy_snaps.rend();
5626 ++p) {
5627 ci.snaps.push_back(*p);
5628 }
5629 }
5630
5631 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5632
5633 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5634 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5635 if (coi == ssc->snapset.clone_overlap.end()) {
5636 osd->clog->error() << "osd." << osd->whoami
5637 << ": inconsistent clone_overlap found for oid "
5638 << soid << " clone " << *clone_iter;
5639 result = -EINVAL;
5640 break;
5641 }
5642 const interval_set<uint64_t> &o = coi->second;
5643 ci.overlap.reserve(o.num_intervals());
5644 for (interval_set<uint64_t>::const_iterator r = o.begin();
5645 r != o.end(); ++r) {
5646 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5647 r.get_len()));
5648 }
5649
5650 map<snapid_t, uint64_t>::const_iterator si;
5651 si = ssc->snapset.clone_size.find(ci.cloneid);
5652 if (si == ssc->snapset.clone_size.end()) {
5653 osd->clog->error() << "osd." << osd->whoami
5654 << ": inconsistent clone_size found for oid "
5655 << soid << " clone " << *clone_iter;
5656 result = -EINVAL;
5657 break;
5658 }
5659 ci.size = si->second;
5660
5661 resp.clones.push_back(ci);
5662 }
5663 if (result < 0) {
5664 break;
5665 }
5666 if (ssc->snapset.head_exists &&
5667 !ctx->obc->obs.oi.is_whiteout()) {
5668 assert(obs.exists);
5669 clone_info ci;
5670 ci.cloneid = CEPH_NOSNAP;
5671
5672 // Size for HEAD is oi.size
5673 ci.size = oi.size;
5674
5675 resp.clones.push_back(ci);
5676 }
5677 resp.seq = ssc->snapset.seq;
5678
5679 resp.encode(osd_op.outdata);
5680 result = 0;
5681
5682 ctx->delta_stats.num_rd++;
5683 break;
5684 }
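    // A hedged client-side sketch (assumption): LIST_SNAPS backs librados'
    // IoCtx::list_snaps(), whose snap_set_t mirrors obj_list_snap_response_t
    // (per-clone snaps, overlap extents, sizes, and the SnapSet seq).
    // Reads typically must target the snap dir first:
    //
    //   ioctx.snap_set_read(LIBRADOS_SNAP_DIR);
    //   librados::snap_set_t ss;
    //   int r = ioctx.list_snaps("oid", &ss);
    //   for (const auto& c : ss.clones)
    //     std::cout << "clone " << c.cloneid << " size " << c.size << "\n";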
5685
5686 case CEPH_OSD_OP_NOTIFY:
5687 ++ctx->num_read;
5688 {
5689 uint32_t timeout;
5690 bufferlist bl;
5691
5692 try {
5693 uint32_t ver; // obsolete
5694 ::decode(ver, bp);
5695 ::decode(timeout, bp);
5696 ::decode(bl, bp);
5697 } catch (const buffer::error &e) {
5698 timeout = 0;
5699 }
5700 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5701 if (!timeout)
5702 timeout = cct->_conf->osd_default_notify_timeout;
5703
5704 notify_info_t n;
5705 n.timeout = timeout;
5706 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5707 n.cookie = op.watch.cookie;
5708 n.bl = bl;
5709 ctx->notifies.push_back(n);
5710
5711 // return our unique notify id to the client
5712 ::encode(n.notify_id, osd_op.outdata);
5713 }
5714 break;
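    // A hedged client-side sketch (assumption): this path is driven by
    // librados' IoCtx::notify2(); the client-side timeout is in ms, and a
    // zero timeout falls back to osd_default_notify_timeout as above.
    //
    //   bufferlist payload, reply;
    //   payload.append("ping");
    //   int r = ioctx.notify2("oid", payload, 10000 /* ms */, &reply);
    //   // reply aggregates the per-watcher acks delivered via NOTIFY_ACK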
5715
5716 case CEPH_OSD_OP_NOTIFY_ACK:
5717 ++ctx->num_read;
5718 {
5719 try {
5720 uint64_t notify_id = 0;
5721 uint64_t watch_cookie = 0;
5722 ::decode(notify_id, bp);
5723 ::decode(watch_cookie, bp);
5724 bufferlist reply_bl;
5725 if (!bp.end()) {
5726 ::decode(reply_bl, bp);
5727 }
5728 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5729 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5730 ctx->notify_acks.push_back(ack);
5731 } catch (const buffer::error &e) {
5732 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5733 OpContext::NotifyAck ack(
5734 // op.watch.cookie is actually the notify_id for historical reasons
5735 op.watch.cookie
5736 );
5737 ctx->notify_acks.push_back(ack);
5738 }
5739 }
5740 break;
5741
5742 case CEPH_OSD_OP_SETALLOCHINT:
5743 ++ctx->num_write;
5744 {
5745 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5746 maybe_create_new_object(ctx);
5747 oi.expected_object_size = op.alloc_hint.expected_object_size;
5748 oi.expected_write_size = op.alloc_hint.expected_write_size;
5749 oi.alloc_hint_flags = op.alloc_hint.flags;
5750 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5751 op.alloc_hint.expected_write_size,
5752 op.alloc_hint.flags);
5753 ctx->delta_stats.num_wr++;
5754 result = 0;
5755 }
5756 break;
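    // A hedged client-side sketch (assumption): SETALLOCHINT corresponds
    // to librados' IoCtx::set_alloc_hint(); it only records sizing hints
    // for the backend allocator and is not expected to fail on its own.
    //
    //   // hint: ~4 MiB objects written in ~1 MiB chunks
    //   int r = ioctx.set_alloc_hint("oid", 4 << 20, 1 << 20);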
5757
5758
5759 // --- WRITES ---
5760
5761 // -- object data --
5762
5763 case CEPH_OSD_OP_WRITE:
5764 ++ctx->num_write;
5765 { // write
5766 __u32 seq = oi.truncate_seq;
5767 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5768 if (op.extent.length != osd_op.indata.length()) {
5769 result = -EINVAL;
5770 break;
5771 }
5772
5773 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5774 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5775
5776 if (pool.info.requires_aligned_append() &&
5777 (op.extent.offset % pool.info.required_alignment() != 0)) {
5778 result = -EOPNOTSUPP;
5779 break;
5780 }
5781
5782 if (!obs.exists) {
5783 if (pool.info.requires_aligned_append() && op.extent.offset) {
5784 result = -EOPNOTSUPP;
5785 break;
5786 }
5787 } else if (op.extent.offset != oi.size &&
5788 pool.info.requires_aligned_append()) {
5789 result = -EOPNOTSUPP;
5790 break;
5791 }
5792
5793 if (seq && (seq > op.extent.truncate_seq) &&
5794 (op.extent.offset + op.extent.length > oi.size)) {
5795 // old write, arrived after trimtrunc
5796 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5797 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5798 << ", adjusting write length to " << op.extent.length << dendl;
5799 bufferlist t;
5800 t.substr_of(osd_op.indata, 0, op.extent.length);
5801 osd_op.indata.swap(t);
5802 }
5803 if (op.extent.truncate_seq > seq) {
5804 // write arrives before trimtrunc
5805 if (obs.exists && !oi.is_whiteout()) {
5806 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5807 << ", truncating to " << op.extent.truncate_size << dendl;
5808 t->truncate(soid, op.extent.truncate_size);
5809 oi.truncate_seq = op.extent.truncate_seq;
5810 oi.truncate_size = op.extent.truncate_size;
5811 if (op.extent.truncate_size != oi.size) {
5812 ctx->delta_stats.num_bytes -= oi.size;
5813 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5814 oi.size = op.extent.truncate_size;
5815 }
5816 } else {
5817 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5818 << ", but object is new" << dendl;
5819 oi.truncate_seq = op.extent.truncate_seq;
5820 oi.truncate_size = op.extent.truncate_size;
5821 }
5822 }
5823 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5824 if (result < 0)
5825 break;
5826
5827 maybe_create_new_object(ctx);
5828
5829 if (op.extent.length == 0) {
5830 if (op.extent.offset > oi.size) {
5831 t->truncate(
5832 soid, op.extent.offset);
5833 } else {
5834 t->nop(soid);
5835 }
5836 } else {
5837 t->write(
5838 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5839 }
5840
5841 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5842 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5843 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5844 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5845 else
5846 obs.oi.clear_data_digest();
5847 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5848 op.extent.offset, op.extent.length);
5849
5850 }
5851 break;
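    // A worked example of the truncate_seq reconciliation above
    // (illustrative numbers): suppose oi.size == 8192 and
    // oi.truncate_seq == 3.
    //  - WRITE(offset=4096, length=8192, truncate_seq=2): the write
    //    predates the last trimtrunc, so its length is clamped to
    //    oi.size - offset == 4096 and the excess indata is dropped.
    //  - WRITE(offset=0, length=4096, truncate_seq=4, truncate_size=1024):
    //    the write carries a newer truncate_seq, so the object is first
    //    truncated to 1024 bytes and oi.truncate_{seq,size} are updated,
    //    then the write is applied.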
5852
5853 case CEPH_OSD_OP_WRITEFULL:
5854 ++ctx->num_write;
5855 { // write full object
5856 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5857
5858 if (op.extent.length != osd_op.indata.length()) {
5859 result = -EINVAL;
5860 break;
5861 }
5862 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5863 if (result < 0)
5864 break;
5865
5866 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5867 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5868
5869 maybe_create_new_object(ctx);
5870 if (pool.info.require_rollback()) {
5871 t->truncate(soid, 0);
5872 } else if (obs.exists && op.extent.length < oi.size) {
5873 t->truncate(soid, op.extent.length);
5874 }
5875 if (op.extent.length) {
5876 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5877 }
5878 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5879
5880 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5881 0, op.extent.length, true);
5882 }
5883 break;
5884
5885 case CEPH_OSD_OP_WRITESAME:
5886 ++ctx->num_write;
5887 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5888 result = do_writesame(ctx, osd_op);
5889 break;
5890
5891 case CEPH_OSD_OP_ROLLBACK :
5892 ++ctx->num_write;
5893 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5894 result = _rollback_to(ctx, op);
5895 break;
5896
5897 case CEPH_OSD_OP_ZERO:
5898 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5899 if (pool.info.requires_aligned_append()) {
5900 result = -EOPNOTSUPP;
5901 break;
5902 }
5903 ++ctx->num_write;
5904 { // zero
5905 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5906 if (result < 0)
5907 break;
5908 assert(op.extent.length);
5909 if (obs.exists && !oi.is_whiteout()) {
5910 t->zero(soid, op.extent.offset, op.extent.length);
5911 interval_set<uint64_t> ch;
5912 ch.insert(op.extent.offset, op.extent.length);
5913 ctx->modified_ranges.union_of(ch);
5914 ctx->delta_stats.num_wr++;
5915 oi.clear_data_digest();
5916 } else {
5917 // no-op
5918 }
5919 }
5920 break;
5921 case CEPH_OSD_OP_CREATE:
5922 ++ctx->num_write;
5923 {
5924 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5925 int flags = le32_to_cpu(op.flags);
5926 if (obs.exists && !oi.is_whiteout() &&
5927 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5928 result = -EEXIST; /* this is an exclusive create */
5929 } else {
5930 if (osd_op.indata.length()) {
5931 bufferlist::iterator p = osd_op.indata.begin();
5932 string category;
5933 try {
5934 ::decode(category, p);
5935 }
5936 catch (buffer::error& e) {
5937 result = -EINVAL;
5938 goto fail;
5939 }
5940 // category is no longer implemented.
5941 }
5942 if (result >= 0) {
5943 maybe_create_new_object(ctx);
5944 t->nop(soid);
5945 }
5946 }
5947 }
5948 break;
5949
5950 case CEPH_OSD_OP_TRIMTRUNC:
5951 op.extent.offset = op.extent.truncate_size;
5952 // falling through
5953
5954 case CEPH_OSD_OP_TRUNCATE:
5955 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5956 if (pool.info.requires_aligned_append()) {
5957 result = -EOPNOTSUPP;
5958 break;
5959 }
5960 ++ctx->num_write;
5961 {
5962 // truncate
5963 if (!obs.exists || oi.is_whiteout()) {
5964 dout(10) << " object dne, truncate is a no-op" << dendl;
5965 break;
5966 }
5967
5968 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5969 result = -EFBIG;
5970 break;
5971 }
5972
5973 if (op.extent.truncate_seq) {
5974 assert(op.extent.offset == op.extent.truncate_size);
5975 if (op.extent.truncate_seq <= oi.truncate_seq) {
5976 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5977 << ", no-op" << dendl;
5978 break; // old
5979 }
5980 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5981 << ", truncating" << dendl;
5982 oi.truncate_seq = op.extent.truncate_seq;
5983 oi.truncate_size = op.extent.truncate_size;
5984 }
5985
5986 maybe_create_new_object(ctx);
5987 t->truncate(soid, op.extent.offset);
5988 if (oi.size > op.extent.offset) {
5989 interval_set<uint64_t> trim;
5990 trim.insert(op.extent.offset, oi.size-op.extent.offset);
5991 ctx->modified_ranges.union_of(trim);
5992 }
5993 if (op.extent.offset != oi.size) {
5994 ctx->delta_stats.num_bytes -= oi.size;
5995 ctx->delta_stats.num_bytes += op.extent.offset;
5996 oi.size = op.extent.offset;
5997 }
5998 ctx->delta_stats.num_wr++;
5999 // do not set exists, or we will break the DELETE -> TRUNCATE munging above.
6000
6001 oi.clear_data_digest();
6002 }
6003 break;
6004
6005 case CEPH_OSD_OP_DELETE:
6006 ++ctx->num_write;
6007 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6008 {
6009 result = _delete_oid(ctx, false, ctx->ignore_cache);
6010 }
6011 break;
6012
6013 case CEPH_OSD_OP_WATCH:
6014 ++ctx->num_write;
6015 {
6016 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6017 op.watch.cookie, op.watch.op);
6018 if (!obs.exists) {
6019 result = -ENOENT;
6020 break;
6021 }
6022 uint64_t cookie = op.watch.cookie;
6023 entity_name_t entity = ctx->reqid.name;
6024 ObjectContextRef obc = ctx->obc;
6025
6026 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6027 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6028 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6029 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6030 dout(10) << "watch: peer_addr="
6031 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6032
6033 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6034 if (op.watch.timeout != 0) {
6035 timeout = op.watch.timeout;
6036 }
6037
6038 watch_info_t w(cookie, timeout,
6039 ctx->op->get_req()->get_connection()->get_peer_addr());
6040 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6041 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6042 if (oi.watchers.count(make_pair(cookie, entity))) {
6043 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6044 } else {
6045 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6046 oi.watchers[make_pair(cookie, entity)] = w;
6047 t->nop(soid); // make sure we update the object_info on disk!
6048 }
6049 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6050 ctx->watch_connects.push_back(make_pair(w, will_ping));
6051 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6052 if (!oi.watchers.count(make_pair(cookie, entity))) {
6053 result = -ENOTCONN;
6054 break;
6055 }
6056 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6057 ctx->watch_connects.push_back(make_pair(w, true));
6058 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6059 /* Note: WATCH with PING doesn't cause may_write() to return true,
6060 * so if there is nothing else in the transaction, this is going
6061 * to run do_osd_op_effects, but not write out a log entry */
6062 if (!oi.watchers.count(make_pair(cookie, entity))) {
6063 result = -ENOTCONN;
6064 break;
6065 }
6066 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6067 obc->watchers.find(make_pair(cookie, entity));
6068 if (p == obc->watchers.end() ||
6069 !p->second->is_connected()) {
6070 // client needs to reconnect
6071 result = -ETIMEDOUT;
6072 break;
6073 }
6074 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6075 p->second->got_ping(ceph_clock_now());
6076 result = 0;
6077 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6078 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6079 oi.watchers.find(make_pair(cookie, entity));
6080 if (oi_iter != oi.watchers.end()) {
6081 dout(10) << " removed watch " << oi_iter->second << " by "
6082 << entity << dendl;
6083 oi.watchers.erase(oi_iter);
6084 t->nop(soid); // update oi on disk
6085 ctx->watch_disconnects.push_back(
6086 watch_disconnect_t(cookie, entity, false));
6087 } else {
6088 dout(10) << " can't remove: no watch by " << entity << dendl;
6089 }
6090 }
6091 }
6092 break;
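    // A hedged client-side sketch (assumption): the WATCH subops above are
    // driven by librados' watch2()/unwatch2(); pings are issued
    // automatically by the client once the watch is established.
    //
    //   struct Ctx : public librados::WatchCtx2 {
    //     void handle_notify(uint64_t notify_id, uint64_t cookie,
    //                        uint64_t notifier_id, bufferlist& bl) override
    //     { /* ack via IoCtx::notify_ack(oid, notify_id, cookie, reply) */ }
    //     void handle_error(uint64_t cookie, int err) override {}
    //   } wctx;
    //   uint64_t handle = 0;
    //   int r = ioctx.watch2("oid", &handle, &wctx); // CEPH_OSD_WATCH_OP_WATCH
    //   ioctx.unwatch2(handle);                      // CEPH_OSD_WATCH_OP_UNWATCH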
6093
6094 case CEPH_OSD_OP_CACHE_PIN:
6095 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6096 if ((!pool.info.is_tier() ||
6097 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6098 result = -EINVAL;
6099 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6100 break;
6101 }
6102 ++ctx->num_write;
6103 {
6104 if (!obs.exists || oi.is_whiteout()) {
6105 result = -ENOENT;
6106 break;
6107 }
6108
6109 if (!oi.is_cache_pinned()) {
6110 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6111 ctx->modify = true;
6112 ctx->delta_stats.num_objects_pinned++;
6113 ctx->delta_stats.num_wr++;
6114 }
6115 result = 0;
6116 }
6117 break;
6118
6119 case CEPH_OSD_OP_CACHE_UNPIN:
6120 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6121 if ((!pool.info.is_tier() ||
6122 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6123 result = -EINVAL;
6124 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6125 break;
6126 }
6127 ++ctx->num_write;
6128 {
6129 if (!obs.exists || oi.is_whiteout()) {
6130 result = -ENOENT;
6131 break;
6132 }
6133
6134 if (oi.is_cache_pinned()) {
6135 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6136 ctx->modify = true;
6137 ctx->delta_stats.num_objects_pinned--;
6138 ctx->delta_stats.num_wr++;
6139 }
6140 result = 0;
6141 }
6142 break;
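    // A hedged client-side sketch (assumption): pin/unpin map to librados'
    // ObjectWriteOperation::cache_pin()/cache_unpin() and, per the checks
    // above, are rejected with -EINVAL outside a cache tier.
    //
    //   librados::ObjectWriteOperation wop;
    //   wop.cache_pin();                // or wop.cache_unpin();
    //   int r = cache_ioctx.operate("oid", &wop);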
6143
6144 case CEPH_OSD_OP_SET_REDIRECT:
6145 ++ctx->num_write;
6146 {
6147 if (pool.info.is_tier()) {
6148 result = -EINVAL;
6149 break;
6150 }
6151 if (!obs.exists) {
6152 result = -ENOENT;
6153 break;
6154 }
6155 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6156 result = -EOPNOTSUPP;
6157 break;
6158 }
6159
6160 object_t target_name;
6161 object_locator_t target_oloc;
6162 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6163 version_t target_version = op.copy_from.src_version;
6164 try {
6165 ::decode(target_name, bp);
6166 ::decode(target_oloc, bp);
6167 }
6168 catch (buffer::error& e) {
6169 result = -EINVAL;
6170 goto fail;
6171 }
6172 pg_t raw_pg;
6173 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6174 hobject_t target(target_name, target_oloc.key, target_snapid,
6175 raw_pg.ps(), raw_pg.pool(),
6176 target_oloc.nspace);
6177 if (target == soid) {
6178 dout(20) << " set-redirect self is invalid" << dendl;
6179 result = -EINVAL;
6180 break;
6181 }
6182 oi.set_flag(object_info_t::FLAG_MANIFEST);
6183 oi.manifest.redirect_target = target;
6184 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6185 t->truncate(soid, 0);
6186 if (oi.is_omap() && pool.info.supports_omap()) {
6187 t->omap_clear(soid);
6188 obs.oi.clear_omap_digest();
6189 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6190 }
6191 ctx->delta_stats.num_bytes -= oi.size;
6192 oi.size = 0;
6193 oi.new_object();
6194 oi.user_version = target_version;
6195 ctx->user_at_version = target_version;
6196 /* rm_attrs */
6197 map<string,bufferlist> rmattrs;
6198 result = getattrs_maybe_cache(ctx->obc,
6199 &rmattrs,
6200 true);
6201 if (result < 0) {
6202 return result;
6203 }
6204 map<string, bufferlist>::iterator iter;
6205 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6206 const string& name = iter->first;
6207 t->rmattr(soid, name);
6208 }
6209 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6210 }
6211
6212 break;
6213
6214 // -- object attrs --
6215
6216 case CEPH_OSD_OP_SETXATTR:
6217 ++ctx->num_write;
6218 {
6219 if (cct->_conf->osd_max_attr_size > 0 &&
6220 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6221 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6222 result = -EFBIG;
6223 break;
6224 }
6225 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6226 cct->_conf->osd_max_attr_name_len);
6227 if (op.xattr.name_len > max_name_len) {
6228 result = -ENAMETOOLONG;
6229 break;
6230 }
6231 maybe_create_new_object(ctx);
6232 string aname;
6233 bp.copy(op.xattr.name_len, aname);
6234 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6235 string name = "_" + aname;
6236 bufferlist bl;
6237 bp.copy(op.xattr.value_len, bl);
6238 t->setattr(soid, name, bl);
6239 ctx->delta_stats.num_wr++;
6240 }
6241 break;
6242
6243 case CEPH_OSD_OP_RMXATTR:
6244 ++ctx->num_write;
6245 {
6246 string aname;
6247 bp.copy(op.xattr.name_len, aname);
6248 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6249 if (!obs.exists || oi.is_whiteout()) {
6250 result = -ENOENT;
6251 break;
6252 }
6253 string name = "_" + aname;
6254 t->rmattr(soid, name);
6255 ctx->delta_stats.num_wr++;
6256 }
6257 break;
6258
6259
6260 // -- fancy writers --
6261 case CEPH_OSD_OP_APPEND:
6262 {
6263 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6264 // just do it inline; this works because we are happy to execute
6265 // fancy op on replicas as well.
6266 vector<OSDOp> nops(1);
6267 OSDOp& newop = nops[0];
6268 newop.op.op = CEPH_OSD_OP_WRITE;
6269 newop.op.extent.offset = oi.size;
6270 newop.op.extent.length = op.extent.length;
6271 newop.op.extent.truncate_seq = oi.truncate_seq;
6272 newop.indata = osd_op.indata;
6273 result = do_osd_ops(ctx, nops);
6274 osd_op.outdata.claim(newop.outdata);
6275 }
6276 break;
6277
6278 case CEPH_OSD_OP_STARTSYNC:
6279 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6280 t->nop(soid);
6281 break;
6282
6283
6284 // -- trivial map --
6285 case CEPH_OSD_OP_TMAPGET:
6286 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6287 if (pool.info.require_rollback()) {
6288 result = -EOPNOTSUPP;
6289 break;
6290 }
6291 {
6292 vector<OSDOp> nops(1);
6293 OSDOp& newop = nops[0];
6294 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6295 newop.op.extent.offset = 0;
6296 newop.op.extent.length = 0;
6297 do_osd_ops(ctx, nops);
6298 osd_op.outdata.claim(newop.outdata);
6299 }
6300 break;
6301
6302 case CEPH_OSD_OP_TMAPPUT:
6303 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6304 if (pool.info.require_rollback()) {
6305 result = -EOPNOTSUPP;
6306 break;
6307 }
6308 {
6309 //_dout_lock.Lock();
6310 //osd_op.data.hexdump(*_dout);
6311 //_dout_lock.Unlock();
6312
6313 // verify sort order
6314 bool unsorted = false;
6315 if (true) {
6316 bufferlist header;
6317 ::decode(header, bp);
6318 uint32_t n;
6319 ::decode(n, bp);
6320 string last_key;
6321 while (n--) {
6322 string key;
6323 ::decode(key, bp);
6324 dout(10) << "tmapput key " << key << dendl;
6325 bufferlist val;
6326 ::decode(val, bp);
6327 if (key < last_key) {
6328 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6329 unsorted = true;
6330 break;
6331 }
6332 last_key = key;
6333 }
6334 }
6335
6336 // write it
6337 vector<OSDOp> nops(1);
6338 OSDOp& newop = nops[0];
6339 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6340 newop.op.extent.offset = 0;
6341 newop.op.extent.length = osd_op.indata.length();
6342 newop.indata = osd_op.indata;
6343
6344 if (unsorted) {
6345 bp = osd_op.indata.begin();
6346 bufferlist header;
6347 map<string, bufferlist> m;
6348 ::decode(header, bp);
6349 ::decode(m, bp);
6350 assert(bp.end());
6351 bufferlist newbl;
6352 ::encode(header, newbl);
6353 ::encode(m, newbl);
6354 newop.indata = newbl;
6355 }
6356 result = do_osd_ops(ctx, nops);
6357 assert(result == 0);
6358 }
6359 break;
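    // For reference, the TMAP wire format implied by the decodes above:
    //
    //   bufferlist header;                    // opaque application header
    //   __le32     n;                         // number of entries
    //   n x { string key; bufferlist val; }   // keys in ascending order
    //
    // TMAPPUT repairs an unordered payload by round-tripping it through a
    // std::map (which re-sorts the keys) before writing, as done above.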
6360
6361 case CEPH_OSD_OP_TMAPUP:
6362 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6363 if (pool.info.require_rollback()) {
6364 result = -EOPNOTSUPP;
6365 break;
6366 }
6367 ++ctx->num_write;
6368 result = do_tmapup(ctx, bp, osd_op);
6369 break;
6370
6371 case CEPH_OSD_OP_TMAP2OMAP:
6372 ++ctx->num_write;
6373 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6374 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6375 break;
6376
6377 // OMAP Read ops
6378 case CEPH_OSD_OP_OMAPGETKEYS:
6379 ++ctx->num_read;
6380 {
6381 string start_after;
6382 uint64_t max_return;
6383 try {
6384 ::decode(start_after, bp);
6385 ::decode(max_return, bp);
6386 }
6387 catch (buffer::error& e) {
6388 result = -EINVAL;
6389 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6390 goto fail;
6391 }
6392 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6393 max_return = cct->_conf->osd_max_omap_entries_per_request;
6394 }
6395 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6396
6397 bufferlist bl;
6398 uint32_t num = 0;
6399 bool truncated = false;
6400 if (oi.is_omap()) {
6401 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6402 coll, ghobject_t(soid)
6403 );
6404 assert(iter);
6405 iter->upper_bound(start_after);
6406 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6407 if (num >= max_return ||
6408 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6409 truncated = true;
6410 break;
6411 }
6412 ::encode(iter->key(), bl);
6413 }
6414 } // else return empty out_set
6415 ::encode(num, osd_op.outdata);
6416 osd_op.outdata.claim_append(bl);
6417 ::encode(truncated, osd_op.outdata);
6418 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6419 ctx->delta_stats.num_rd++;
6420 }
6421 break;
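    // A hedged client-side sketch (assumption): OMAPGETKEYS is paginated
    // by osd_max_omap_entries_per_request; callers resume from the last
    // key returned until the trailing `truncated` bool is false (older
    // librados calls do not surface that flag directly).
    //
    //   std::set<std::string> keys;
    //   int r = ioctx.omap_get_keys("oid", "" /* start_after */,
    //                               512 /* max_return */, &keys);
    //   // if more keys remain, call again with start_after = *keys.rbegin()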
6422
6423 case CEPH_OSD_OP_OMAPGETVALS:
6424 ++ctx->num_read;
6425 {
6426 string start_after;
6427 uint64_t max_return;
6428 string filter_prefix;
6429 try {
6430 ::decode(start_after, bp);
6431 ::decode(max_return, bp);
6432 ::decode(filter_prefix, bp);
6433 }
6434 catch (buffer::error& e) {
6435 result = -EINVAL;
6436 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6437 goto fail;
6438 }
6439 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6440 max_return = cct->_conf->osd_max_omap_entries_per_request;
6441 }
6442 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6443
6444 uint32_t num = 0;
6445 bool truncated = false;
6446 bufferlist bl;
6447 if (oi.is_omap()) {
6448 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6449 coll, ghobject_t(soid)
6450 );
6451 if (!iter) {
6452 result = -ENOENT;
6453 goto fail;
6454 }
6455 iter->upper_bound(start_after);
6456 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6457 for (num = 0;
6458 iter->valid() &&
6459 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6460 ++num, iter->next(false)) {
6461 dout(20) << "Found key " << iter->key() << dendl;
6462 if (num >= max_return ||
6463 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6464 truncated = true;
6465 break;
6466 }
6467 ::encode(iter->key(), bl);
6468 ::encode(iter->value(), bl);
6469 }
6470 } // else return empty out_set
6471 ::encode(num, osd_op.outdata);
6472 osd_op.outdata.claim_append(bl);
6473 ::encode(truncated, osd_op.outdata);
6474 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6475 ctx->delta_stats.num_rd++;
6476 }
6477 break;
6478
6479 case CEPH_OSD_OP_OMAPGETHEADER:
6480 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6481 if (!oi.is_omap()) {
6482 // return empty header
6483 break;
6484 }
6485 ++ctx->num_read;
6486 {
6487 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6488 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6489 ctx->delta_stats.num_rd++;
6490 }
6491 break;
6492
6493 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6494 ++ctx->num_read;
6495 {
6496 set<string> keys_to_get;
6497 try {
6498 ::decode(keys_to_get, bp);
6499 }
6500 catch (buffer::error& e) {
6501 result = -EINVAL;
6502 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6503 goto fail;
6504 }
6505 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6506 map<string, bufferlist> out;
6507 if (oi.is_omap()) {
6508 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6509 } // else return empty omap entries
6510 ::encode(out, osd_op.outdata);
6511 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6512 ctx->delta_stats.num_rd++;
6513 }
6514 break;
6515
6516 case CEPH_OSD_OP_OMAP_CMP:
6517 ++ctx->num_read;
6518 {
6519 if (!obs.exists || oi.is_whiteout()) {
6520 result = -ENOENT;
6521 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6522 break;
6523 }
6524 map<string, pair<bufferlist, int> > assertions;
6525 try {
6526 ::decode(assertions, bp);
6527 }
6528 catch (buffer::error& e) {
6529 result = -EINVAL;
6530 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6531 goto fail;
6532 }
6533 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6534
6535 map<string, bufferlist> out;
6536
6537 if (oi.is_omap()) {
6538 set<string> to_get;
6539 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6540 i != assertions.end();
6541 ++i)
6542 to_get.insert(i->first);
6543 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6544 to_get, &out);
6545 if (r < 0) {
6546 result = r;
6547 break;
6548 }
6549 } // else leave out empty
6550
6551 // Should set num_rd_kb based on the encoded length of the map
6552 ctx->delta_stats.num_rd++;
6553
6554 int r = 0;
6555 bufferlist empty;
6556 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6557 i != assertions.end();
6558 ++i) {
6559 auto out_entry = out.find(i->first);
6560 bufferlist &bl = (out_entry != out.end()) ?
6561 out_entry->second : empty;
6562 switch (i->second.second) {
6563 case CEPH_OSD_CMPXATTR_OP_EQ:
6564 if (!(bl == i->second.first)) {
6565 r = -ECANCELED;
6566 }
6567 break;
6568 case CEPH_OSD_CMPXATTR_OP_LT:
6569 if (!(bl < i->second.first)) {
6570 r = -ECANCELED;
6571 }
6572 break;
6573 case CEPH_OSD_CMPXATTR_OP_GT:
6574 if (!(bl > i->second.first)) {
6575 r = -ECANCELED;
6576 }
6577 break;
6578 default:
6579 r = -EINVAL;
6580 break;
6581 }
6582 if (r < 0)
6583 break;
6584 }
6585 if (r < 0) {
6586 result = r;
6587 }
6588 }
6589 break;
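    // A hedged client-side sketch (assumption): OMAP_CMP takes a map of
    // key -> (expected value, CEPH_OSD_CMPXATTR_OP_*) assertions, as
    // decoded above; any failed assertion cancels the op with -ECANCELED.
    //
    //   std::map<std::string, std::pair<bufferlist, int>> asserts;
    //   bufferlist v;
    //   v.append("expected");
    //   asserts["key"] = std::make_pair(v, CEPH_OSD_CMPXATTR_OP_EQ);
    //   int prval = 0;
    //   librados::ObjectReadOperation rop;
    //   rop.omap_cmp(asserts, &prval);
    //   int r = ioctx.operate("oid", &rop, nullptr);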
6590
6591 // OMAP Write ops
6592 case CEPH_OSD_OP_OMAPSETVALS:
6593 if (!pool.info.supports_omap()) {
6594 result = -EOPNOTSUPP;
6595 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6596 break;
6597 }
6598 ++ctx->num_write;
6599 {
6600 maybe_create_new_object(ctx);
6601 bufferlist to_set_bl;
6602 try {
6603 decode_str_str_map_to_bl(bp, &to_set_bl);
6604 }
6605 catch (buffer::error& e) {
6606 result = -EINVAL;
6607 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6608 goto fail;
6609 }
6610 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6611 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6612 dout(20) << "setting vals: " << dendl;
6613 map<string,bufferlist> to_set;
6614 bufferlist::iterator pt = to_set_bl.begin();
6615 ::decode(to_set, pt);
6616 for (map<string, bufferlist>::iterator i = to_set.begin();
6617 i != to_set.end();
6618 ++i) {
6619 dout(20) << "\t" << i->first << dendl;
6620 }
6621 }
6622 t->omap_setkeys(soid, to_set_bl);
6623 ctx->delta_stats.num_wr++;
6624 }
6625 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6626 obs.oi.clear_omap_digest();
6627 break;
6628
6629 case CEPH_OSD_OP_OMAPSETHEADER:
6630 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6631 if (!pool.info.supports_omap()) {
6632 result = -EOPNOTSUPP;
6633 break;
6634 }
6635 ++ctx->num_write;
6636 {
6637 maybe_create_new_object(ctx);
6638 t->omap_setheader(soid, osd_op.indata);
6639 ctx->delta_stats.num_wr++;
6640 }
6641 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6642 obs.oi.clear_omap_digest();
6643 break;
6644
6645 case CEPH_OSD_OP_OMAPCLEAR:
6646 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6647 if (!pool.info.supports_omap()) {
6648 result = -EOPNOTSUPP;
6649 break;
6650 }
6651 ++ctx->num_write;
6652 {
6653 if (!obs.exists || oi.is_whiteout()) {
6654 result = -ENOENT;
6655 break;
6656 }
6657 if (oi.is_omap()) {
6658 t->omap_clear(soid);
6659 ctx->delta_stats.num_wr++;
6660 obs.oi.clear_omap_digest();
6661 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6662 }
6663 }
6664 break;
6665
6666 case CEPH_OSD_OP_OMAPRMKEYS:
6667 if (!pool.info.supports_omap()) {
6668 result = -EOPNOTSUPP;
6669 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6670 break;
6671 }
6672 ++ctx->num_write;
6673 {
6674 if (!obs.exists || oi.is_whiteout()) {
6675 result = -ENOENT;
6676 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6677 break;
6678 }
6679 bufferlist to_rm_bl;
6680 try {
6681 decode_str_set_to_bl(bp, &to_rm_bl);
6682 }
6683 catch (buffer::error& e) {
6684 result = -EINVAL;
6685 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6686 goto fail;
6687 }
6688 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6689 t->omap_rmkeys(soid, to_rm_bl);
6690 ctx->delta_stats.num_wr++;
6691 }
6692 obs.oi.clear_omap_digest();
6693 break;
6694
6695 case CEPH_OSD_OP_COPY_GET:
6696 ++ctx->num_read;
6697 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6698 soid.snap.val);
6699 if (op_finisher == nullptr) {
6700 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6701 } else {
6702 result = op_finisher->execute();
6703 }
6704 break;
6705
6706 case CEPH_OSD_OP_COPY_FROM:
6707 ++ctx->num_write;
6708 {
6709 object_t src_name;
6710 object_locator_t src_oloc;
6711 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6712 version_t src_version = op.copy_from.src_version;
6713 try {
6714 ::decode(src_name, bp);
6715 ::decode(src_oloc, bp);
6716 }
6717 catch (buffer::error& e) {
6718 result = -EINVAL;
6719 tracepoint(osd,
6720 do_osd_op_pre_copy_from,
6721 soid.oid.name.c_str(),
6722 soid.snap.val,
6723 "???",
6724 0,
6725 "???",
6726 "???",
6727 0,
6728 src_snapid,
6729 src_version);
6730 goto fail;
6731 }
6732 tracepoint(osd,
6733 do_osd_op_pre_copy_from,
6734 soid.oid.name.c_str(),
6735 soid.snap.val,
6736 src_name.name.c_str(),
6737 src_oloc.pool,
6738 src_oloc.key.c_str(),
6739 src_oloc.nspace.c_str(),
6740 src_oloc.hash,
6741 src_snapid,
6742 src_version);
6743 if (op_finisher == nullptr) {
6744 // start
6745 pg_t raw_pg;
6746 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6747 hobject_t src(src_name, src_oloc.key, src_snapid,
6748 raw_pg.ps(), raw_pg.pool(),
6749 src_oloc.nspace);
6750 if (src == soid) {
6751 dout(20) << " copy from self is invalid" << dendl;
6752 result = -EINVAL;
6753 break;
6754 }
6755 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6756 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6757 new CopyFromFinisher(cb));
6758 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6759 op.copy_from.flags,
6760 false,
6761 op.copy_from.src_fadvise_flags,
6762 op.flags);
6763 result = -EINPROGRESS;
6764 } else {
6765 // finish
6766 result = op_finisher->execute();
6767 assert(result == 0);
6768
6769 // COPY_FROM cannot be executed multiple times -- it must restart
6770 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6771 }
6772 }
6773 break;
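    // A hedged client-side sketch (assumption): COPY_FROM is issued via
    // librados' ObjectWriteOperation::copy_from(). The first pass above
    // kicks off start_copy() and returns -EINPROGRESS; the op is then
    // re-executed once the CopyFromFinisher has the data.
    //
    //   librados::ObjectWriteOperation wop;
    //   wop.copy_from("src-oid", src_ioctx, 0 /* any src version */);
    //   int r = dst_ioctx.operate("dst-oid", &wop);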
6774
6775 default:
6776 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6777 dout(1) << "unrecognized osd op " << op.op
6778 << " " << ceph_osd_op_name(op.op)
6779 << dendl;
6780 result = -EOPNOTSUPP;
6781 }
6782
6783 fail:
6784 osd_op.rval = result;
6785 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6786 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6787 result = 0;
6788
6789 if (result < 0)
6790 break;
6791 }
6792 return result;
6793 }
6794
6795 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6796 {
6797 if (ctx->new_obs.oi.size == 0) {
6798 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6799 return -ENODATA;
6800 }
6801 vector<OSDOp> nops(1);
6802 OSDOp &newop = nops[0];
6803 newop.op.op = CEPH_OSD_OP_TMAPGET;
6804 do_osd_ops(ctx, nops);
6805 try {
6806 bufferlist::iterator i = newop.outdata.begin();
6807 ::decode(*header, i);
6808 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6809 } catch (...) {
6810 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6811 << dendl;
6812 return -EINVAL;
6813 }
6814 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6815 << dendl;
6816 return 0;
6817 }
6818
6819 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6820 const SnapSet& ss)
6821 {
6822 // verify that all clones have been evicted
6823 dout(20) << __func__ << " verifying clones are absent "
6824 << ss << dendl;
6825 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6826 p != ss.clones.end();
6827 ++p) {
6828 hobject_t clone_oid = soid;
6829 clone_oid.snap = *p;
6830 if (is_missing_object(clone_oid))
6831 return -EBUSY;
6832 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6833 if (clone_obc && clone_obc->obs.exists) {
6834 dout(10) << __func__ << " cannot evict head before clone "
6835 << clone_oid << dendl;
6836 return -EBUSY;
6837 }
6838 if (copy_ops.count(clone_oid)) {
6839 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6840 << clone_oid << dendl;
6841 return -EBUSY;
6842 }
6843 }
6844 return 0;
6845 }
6846
6847 inline int PrimaryLogPG::_delete_oid(
6848 OpContext *ctx,
6849 bool no_whiteout, // no whiteouts, no matter what.
6850 bool try_no_whiteout) // try not to whiteout
6851 {
6852 SnapSet& snapset = ctx->new_snapset;
6853 ObjectState& obs = ctx->new_obs;
6854 object_info_t& oi = obs.oi;
6855 const hobject_t& soid = oi.soid;
6856 PGTransaction* t = ctx->op_t.get();
6857
6858 // cache: set whiteout on delete?
6859 bool whiteout = false;
6860 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6861 && !no_whiteout
6862 && !try_no_whiteout) {
6863 whiteout = true;
6864 }
6865 bool legacy;
6866 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6867 legacy = false;
6868 // in luminous or later, we can't delete the head if there are
6869 // clones. we trust the caller passing no_whiteout has already
6870 // verified they don't exist.
6871 if (!snapset.clones.empty() ||
6872 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6873 if (no_whiteout) {
6874 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6875 << dendl;
6876 } else {
6877 dout(20) << __func__ << " has or will have clones; will whiteout"
6878 << dendl;
6879 whiteout = true;
6880 }
6881 }
6882 } else {
6883 legacy = true;
6884 }
6885 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6886 << " no_whiteout=" << (int)no_whiteout
6887 << " try_no_whiteout=" << (int)try_no_whiteout
6888 << dendl;
6889 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6890 return -ENOENT;
6891
6892 t->remove(soid);
6893
6894 if (oi.size > 0) {
6895 interval_set<uint64_t> ch;
6896 ch.insert(0, oi.size);
6897 ctx->modified_ranges.union_of(ch);
6898 }
6899
6900 ctx->delta_stats.num_wr++;
6901 if (soid.is_snap()) {
6902 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6903 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6904 } else {
6905 ctx->delta_stats.num_bytes -= oi.size;
6906 }
6907 oi.size = 0;
6908 oi.new_object();
6909
6910 // disconnect all watchers
6911 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6912 oi.watchers.begin();
6913 p != oi.watchers.end();
6914 ++p) {
6915 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6916 ctx->watch_disconnects.push_back(
6917 watch_disconnect_t(p->first.first, p->first.second, true));
6918 }
6919 oi.watchers.clear();
6920
6921 if (whiteout) {
6922 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6923 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6924 ctx->delta_stats.num_whiteouts++;
6925 t->create(soid);
6926 osd->logger->inc(l_osd_tier_whiteout);
6927 return 0;
6928 }
6929
6930 // delete the head
6931 ctx->delta_stats.num_objects--;
6932 if (soid.is_snap())
6933 ctx->delta_stats.num_object_clones--;
6934 if (oi.is_whiteout()) {
6935 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6936 ctx->delta_stats.num_whiteouts--;
6937 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6938 }
6939 if (oi.is_cache_pinned()) {
6940 ctx->delta_stats.num_objects_pinned--;
6941 }
6942 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6943 snapset.head_exists = false;
6944 }
6945 obs.exists = false;
6946 return 0;
6947 }
6948
6949 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6950 {
6951 SnapSet& snapset = ctx->new_snapset;
6952 ObjectState& obs = ctx->new_obs;
6953 object_info_t& oi = obs.oi;
6954 const hobject_t& soid = oi.soid;
6955 PGTransaction* t = ctx->op_t.get();
6956 snapid_t snapid = (uint64_t)op.snap.snapid;
6957 hobject_t missing_oid;
6958
6959 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6960
6961 ObjectContextRef rollback_to;
6962 int ret = find_object_context(
6963 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6964 soid.get_namespace()),
6965 &rollback_to, false, false, &missing_oid);
6966 if (ret == -EAGAIN) {
6967 /* clone must be missing */
6968 assert(is_degraded_or_backfilling_object(missing_oid));
6969 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6970 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6971 block_write_on_degraded_snap(missing_oid, ctx->op);
6972 return ret;
6973 }
6974 {
6975 ObjectContextRef promote_obc;
6976 cache_result_t tier_mode_result;
6977 if (obs.exists && obs.oi.has_manifest()) {
6978 tier_mode_result =
6979 maybe_handle_manifest_detail(
6980 ctx->op,
6981 true,
6982 rollback_to);
6983 } else {
6984 tier_mode_result =
6985 maybe_handle_cache_detail(
6986 ctx->op,
6987 true,
6988 rollback_to,
6989 ret,
6990 missing_oid,
6991 true,
6992 false,
6993 &promote_obc);
6994 }
6995 switch (tier_mode_result) {
6996 case cache_result_t::NOOP:
6997 break;
6998 case cache_result_t::BLOCKED_PROMOTE:
6999 assert(promote_obc);
7000 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7001 return -EAGAIN;
7002 case cache_result_t::BLOCKED_FULL:
7003 block_write_on_full_cache(soid, ctx->op);
7004 return -EAGAIN;
7005 default:
7006 assert(0 == "must promote was set, other values are not valid");
7007 return -EAGAIN;
7008 }
7009 }
7010
7011 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7012 // there's no snapshot here, or there's no object.
7013 // if there's no snapshot, we delete the object; otherwise, do nothing.
7014 dout(20) << "_rollback_to deleting head on " << soid.oid
7015 << " because got ENOENT|whiteout on find_object_context" << dendl;
7016 if (ctx->obc->obs.oi.watchers.size()) {
7017 // Cannot delete an object with watchers
7018 ret = -EBUSY;
7019 } else {
7020 _delete_oid(ctx, false, false);
7021 ret = 0;
7022 }
7023 } else if (ret) {
7024 // huh? find_object_context() can't return anything else at time of writing.
7025 assert(0 == "unexpected error code in _rollback_to");
7026 } else { //we got our context, let's use it to do the rollback!
7027 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7028 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7029 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7030 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7031 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7032 ret = -EAGAIN;
7033 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7034 // rolling back to the head; we just need to clone it.
7035 ctx->modify = true;
7036 } else {
7037 /* 1) Delete current head
7038 * 2) Clone correct snapshot into head
7039 * 3) Calculate clone_overlaps by following overlaps
7040 * forward from rollback snapshot */
7041 dout(10) << "_rollback_to deleting " << soid.oid
7042 << " and rolling back to old snap" << dendl;
7043
7044 if (obs.exists) {
7045 t->remove(soid);
7046 }
7047 t->clone(soid, rollback_to_sobject);
7048 snapset.head_exists = true;
7049 t->add_obc(rollback_to);
7050
7051 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7052 snapset.clone_overlap.lower_bound(snapid);
7053 assert(iter != snapset.clone_overlap.end());
7054 interval_set<uint64_t> overlaps = iter->second;
7055 for ( ;
7056 iter != snapset.clone_overlap.end();
7057 ++iter)
7058 overlaps.intersection_of(iter->second);
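      // Worked example (illustrative numbers): rolling back to snap 2 with
      // clone_overlap = { 2: [0,4096), 4: [0,1024) }. Intersecting forward
      // leaves [0,1024): the only bytes of head still identical to snap 2.
      // Everything else within the head's size is recorded below in
      // modified_ranges.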
7059
7060 if (obs.oi.size > 0) {
7061 interval_set<uint64_t> modified;
7062 modified.insert(0, obs.oi.size);
7063 overlaps.intersection_of(modified);
7064 modified.subtract(overlaps);
7065 ctx->modified_ranges.union_of(modified);
7066 }
7067
7068 // Adjust the cached objectcontext
7069 maybe_create_new_object(ctx, true);
7070 ctx->delta_stats.num_bytes -= obs.oi.size;
7071 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7072 obs.oi.size = rollback_to->obs.oi.size;
7073 if (rollback_to->obs.oi.is_data_digest())
7074 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7075 else
7076 obs.oi.clear_data_digest();
7077 if (rollback_to->obs.oi.is_omap_digest())
7078 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7079 else
7080 obs.oi.clear_omap_digest();
7081
7082 if (rollback_to->obs.oi.is_omap()) {
7083 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7084 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7085 } else {
7086 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7087 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7088 }
7089
7090 snapset.head_exists = true;
7091 }
7092 }
7093 return ret;
7094 }
7095
7096 void PrimaryLogPG::_make_clone(
7097 OpContext *ctx,
7098 PGTransaction* t,
7099 ObjectContextRef obc,
7100 const hobject_t& head, const hobject_t& coid,
7101 object_info_t *poi)
7102 {
7103 bufferlist bv;
7104 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7105
7106 t->clone(coid, head);
7107 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7108 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7109 }
7110
7111 void PrimaryLogPG::make_writeable(OpContext *ctx)
7112 {
7113 const hobject_t& soid = ctx->obs->oi.soid;
7114 SnapContext& snapc = ctx->snapc;
7115
7116 // clone?
7117 assert(soid.snap == CEPH_NOSNAP);
7118 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7119 << " snapc=" << snapc << dendl;
7120
7121 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7122 if (ctx->new_obs.exists) {
7123 // we will mark the object dirty
7124 if (ctx->undirty && was_dirty) {
7125 dout(20) << " clearing DIRTY flag" << dendl;
7126 assert(ctx->new_obs.oi.is_dirty());
7127 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7128 --ctx->delta_stats.num_objects_dirty;
7129 osd->logger->inc(l_osd_tier_clean);
7130 } else if (!was_dirty && !ctx->undirty) {
7131 dout(20) << " setting DIRTY flag" << dendl;
7132 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7133 ++ctx->delta_stats.num_objects_dirty;
7134 osd->logger->inc(l_osd_tier_dirty);
7135 }
7136 } else {
7137 if (was_dirty) {
7138 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7139 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7140 --ctx->delta_stats.num_objects_dirty;
7141 }
7142 }
7143
7144 if ((ctx->new_obs.exists &&
7145 ctx->new_obs.oi.is_omap()) &&
7146 (!ctx->obc->obs.exists ||
7147 !ctx->obc->obs.oi.is_omap())) {
7148 ++ctx->delta_stats.num_objects_omap;
7149 }
7150 if ((!ctx->new_obs.exists ||
7151 !ctx->new_obs.oi.is_omap()) &&
7152 (ctx->obc->obs.exists &&
7153 ctx->obc->obs.oi.is_omap())) {
7154 --ctx->delta_stats.num_objects_omap;
7155 }
7156
7157 // use newer snapc?
7158 if (ctx->new_snapset.seq > snapc.seq) {
7159 snapc.seq = ctx->new_snapset.seq;
7160 snapc.snaps = ctx->new_snapset.snaps;
7161 filter_snapc(snapc.snaps);
7162 dout(10) << " using newer snapc " << snapc << dendl;
7163 }
7164
7165 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7166 snapc.snaps.size() && // there are snaps
7167 !ctx->cache_evict &&
7168 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7169 // clone
7170 hobject_t coid = soid;
7171 coid.snap = snapc.seq;
7172
7173 unsigned l;
7174 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7175
7176 vector<snapid_t> snaps(l);
7177 for (unsigned i=0; i<l; i++)
7178 snaps[i] = snapc.snaps[i];
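    // Worked example (illustrative): snapc.snaps is newest-first, e.g.
    // {8, 7, 5, 3} with ctx->new_snapset.seq == 5. The loop above stops at
    // the first id <= 5, so l == 2 and the clone is tagged with
    // snaps == {8, 7}: exactly the snaps taken since the last clone.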
7179
7180 // prepare clone
7181 object_info_t static_snap_oi(coid);
7182 object_info_t *snap_oi;
7183 if (is_primary()) {
7184 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7185 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7186 ctx->clone_obc->obs.oi = static_snap_oi;
7187 ctx->clone_obc->obs.exists = true;
7188 ctx->clone_obc->ssc = ctx->obc->ssc;
7189 ctx->clone_obc->ssc->ref++;
7190 if (pool.info.require_rollback())
7191 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7192 snap_oi = &ctx->clone_obc->obs.oi;
7193 bool got = ctx->lock_manager.get_write_greedy(
7194 coid,
7195 ctx->clone_obc,
7196 ctx->op);
7197 assert(got);
7198 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7199 } else {
7200 snap_oi = &static_snap_oi;
7201 }
7202 snap_oi->version = ctx->at_version;
7203 snap_oi->prior_version = ctx->obs->oi.version;
7204 snap_oi->copy_user_bits(ctx->obs->oi);
7205
7206 bool legacy = ctx->new_snapset.is_legacy() ||
7207 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7208 if (legacy) {
7209 snap_oi->legacy_snaps = snaps;
7210 }
7211
7212 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7213
7214 ctx->delta_stats.num_objects++;
7215 if (snap_oi->is_dirty()) {
7216 ctx->delta_stats.num_objects_dirty++;
7217 osd->logger->inc(l_osd_tier_dirty);
7218 }
7219 if (snap_oi->is_omap())
7220 ctx->delta_stats.num_objects_omap++;
7221 if (snap_oi->is_cache_pinned())
7222 ctx->delta_stats.num_objects_pinned++;
7223 ctx->delta_stats.num_object_clones++;
7224 ctx->new_snapset.clones.push_back(coid.snap);
7225 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7226 if (!legacy) {
7227 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7228 }
7229
7230 // clone_overlap should contain an entry for each clone
7231 // (an empty interval_set if there is no overlap)
7232 ctx->new_snapset.clone_overlap[coid.snap];
7233 if (ctx->obs->oi.size)
7234 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7235
7236 // log clone
7237 dout(10) << " cloning v " << ctx->obs->oi.version
7238 << " to " << coid << " v " << ctx->at_version
7239 << " snaps=" << snaps
7240 << " snapset=" << ctx->new_snapset << dendl;
7241 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7242 ctx->obs->oi.version,
7243 ctx->obs->oi.user_version,
7244 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7245 ::encode(snaps, ctx->log.back().snaps);
7246
7247 ctx->at_version.version++;
7248 }
7249
7250 // update most recent clone_overlap and usage stats
7251 if (ctx->new_snapset.clones.size() > 0) {
7252 /* we need to check whether the most recent clone exists; if it has been
7253 * evicted, it's not included in the stats */
7254 hobject_t last_clone_oid = soid;
7255 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7256 if (is_present_clone(last_clone_oid)) {
7257 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7258 ctx->modified_ranges.intersection_of(newest_overlap);
7259 // modified_ranges is still in use by the clone
7260 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7261 newest_overlap.subtract(ctx->modified_ranges);
7262 }
7263 }
7264
7265 // update snapset with latest snap context
7266 ctx->new_snapset.seq = snapc.seq;
7267 ctx->new_snapset.snaps = snapc.snaps;
7268 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7269 // pessimistic assumption that this is a net-new legacy SnapSet
7270 ctx->delta_stats.num_legacy_snapsets++;
7271 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7272 } else if (ctx->new_snapset.is_legacy()) {
7273 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7274 }
7275 dout(20) << "make_writeable " << soid
7276 << " done, snapset=" << ctx->new_snapset << dendl;
7277 }
7278
7279
7280 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7281 interval_set<uint64_t>& modified, uint64_t offset,
7282 uint64_t length, bool write_full)
7283 {
7284 interval_set<uint64_t> ch;
7285 if (write_full) {
7286 if (oi.size)
7287 ch.insert(0, oi.size);
7288 } else if (length)
7289 ch.insert(offset, length);
7290 modified.union_of(ch);
7291 if (write_full || offset + length > oi.size) {
7292 uint64_t new_size = offset + length;
7293 delta_stats.num_bytes -= oi.size;
7294 delta_stats.num_bytes += new_size;
7295 oi.size = new_size;
7296 }
7297 delta_stats.num_wr++;
7298 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7299 }
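// Worked example (illustrative numbers): with oi.size == 4096, a write of
// length 1024 at offset 4096 extends the object: num_bytes moves by
// -4096 + 5120 = +1024, oi.size becomes 5120, num_wr increments, and
// num_wr_kb grows by SHIFT_ROUND_UP(1024, 10) == 1.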
7300
7301 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7302 {
7303 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7304 delta_stats.num_bytes += p.get_len();
7305 }
7306 }
7307
7308 void PrimaryLogPG::complete_disconnect_watches(
7309 ObjectContextRef obc,
7310 const list<watch_disconnect_t> &to_disconnect)
7311 {
7312 for (list<watch_disconnect_t>::const_iterator i =
7313 to_disconnect.begin();
7314 i != to_disconnect.end();
7315 ++i) {
7316 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7317 auto watchers_entry = obc->watchers.find(watcher);
7318 if (watchers_entry != obc->watchers.end()) {
7319 WatchRef watch = watchers_entry->second;
7320 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7321 obc->watchers.erase(watcher);
7322 watch->remove(i->send_disconnect);
7323 } else {
7324 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7325 << watcher << dendl;
7326 }
7327 }
7328 }
7329
7330 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7331 {
7332 entity_name_t entity = ctx->reqid.name;
7333 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7334
7335 // disconnects first
7336 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7337
7338 assert(conn);
7339
7340 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7341 if (!session.get())
7342 return;
7343 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7344
7345 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7346 i != ctx->watch_connects.end();
7347 ++i) {
7348 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7349 dout(15) << "do_osd_op_effects applying watch connect on session "
7350 << session.get() << " watcher " << watcher << dendl;
7351 WatchRef watch;
7352 if (ctx->obc->watchers.count(watcher)) {
7353 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7354 << dendl;
7355 watch = ctx->obc->watchers[watcher];
7356 } else {
7357 dout(15) << "do_osd_op_effects new watcher " << watcher
7358 << dendl;
7359 watch = Watch::makeWatchRef(
7360 this, osd, ctx->obc, i->first.timeout_seconds,
7361 i->first.cookie, entity, conn->get_peer_addr());
7362 ctx->obc->watchers.insert(
7363 make_pair(
7364 watcher,
7365 watch));
7366 }
7367 watch->connect(conn, i->second);
7368 }
7369
7370 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7371 p != ctx->notifies.end();
7372 ++p) {
7373 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7374 ConnectionRef conn(ctx->op->get_req()->get_connection());
7375 NotifyRef notif(
7376 Notify::makeNotifyRef(
7377 conn,
7378 ctx->reqid.name.num(),
7379 p->bl,
7380 p->timeout,
7381 p->cookie,
7382 p->notify_id,
7383 ctx->obc->obs.oi.user_version,
7384 osd));
7385 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7386 ctx->obc->watchers.begin();
7387 i != ctx->obc->watchers.end();
7388 ++i) {
7389 dout(10) << "starting notify on watch " << i->first << dendl;
7390 i->second->start_notify(notif);
7391 }
7392 notif->init();
7393 }
7394
7395 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7396 p != ctx->notify_acks.end();
7397 ++p) {
7398 if (p->watch_cookie)
7399 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7400 else
7401 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7402 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7403 ctx->obc->watchers.begin();
7404 i != ctx->obc->watchers.end();
7405 ++i) {
7406 if (i->first.second != entity) continue;
7407 if (p->watch_cookie &&
7408 p->watch_cookie.get() != i->first.first) continue;
7409 dout(10) << "acking notify on watch " << i->first << dendl;
7410 i->second->notify_ack(p->notify_id, p->reply_bl);
7411 }
7412 }
7413 }
7414
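// Illustrative client-side sketch (assumption: the standard librados C++
// API; not part of this file): the watch/notify traffic handled above is
// generated by calls like these. watch2() feeds ctx->watch_connects,
// notify2() feeds ctx->notifies, notify_ack() feeds ctx->notify_acks, and
// unwatch2() feeds ctx->watch_disconnects.
#if 0
#include <rados/librados.hpp>

struct ExampleWatcher : public librados::WatchCtx2 {
  librados::IoCtx& io;
  explicit ExampleWatcher(librados::IoCtx& i) : io(i) {}
  void handle_notify(uint64_t notify_id, uint64_t cookie,
                     uint64_t notifier_id, librados::bufferlist& bl) override {
    librados::bufferlist ack;                    // optional reply payload
    io.notify_ack("obj", notify_id, cookie, ack);
  }
  void handle_error(uint64_t cookie, int err) override {}
};

void watch_notify_example(librados::IoCtx& io) {
  ExampleWatcher w(io);
  uint64_t handle = 0;
  io.watch2("obj", &handle, &w);                 // register a watch
  librados::bufferlist bl, reply;
  io.notify2("obj", bl, 10000 /* ms */, &reply); // fan out to all watchers
  io.unwatch2(handle);                           // disconnect the watch
}
#endif
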
7415 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7416 {
7417 ostringstream ss;
7418 ss << "temp_" << info.pgid << "_" << get_role()
7419 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7420 hobject_t hoid = target.make_temp_hobject(ss.str());
7421 dout(20) << __func__ << " " << hoid << dendl;
7422 return hoid;
7423 }
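// e.g. for pg 2.3, role 0, monc global_id 4567 and the first temp object,
// the generated name is "temp_2.3_0_4567_1" (values hypothetical).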
7424
7425 hobject_t PrimaryLogPG::get_temp_recovery_object(
7426 const hobject_t& target,
7427 eversion_t version)
7428 {
7429 ostringstream ss;
7430 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7431 << "_" << version
7432 << "_" << info.history.same_interval_since
7433 << "_" << target.snap;
7434 // pgid + version + interval + snapid is unique, and short
7435 hobject_t hoid = target.make_temp_hobject(ss.str());
7436 dout(20) << __func__ << " " << hoid << dendl;
7437 return hoid;
7438 }
7439
7440 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7441 {
7442 assert(!ctx->ops->empty());
7443
7444 const hobject_t& soid = ctx->obs->oi.soid;
7445
7446 // valid snap context?
7447 if (!ctx->snapc.is_valid()) {
7448 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7449 return -EINVAL;
7450 }
7451
7452 // prepare the actual mutation
7453 int result = do_osd_ops(ctx, *ctx->ops);
7454 if (result < 0) {
7455 if (ctx->op->may_write() &&
7456 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7457 // need to save the error code in the pg log, to detect dup ops,
7458 // but do nothing else
7459 ctx->update_log_only = true;
7460 }
7461 return result;
7462 }
7463
7464 // read-op? write-op noop? done?
7465 if (ctx->op_t->empty() && !ctx->modify) {
7466 unstable_stats.add(ctx->delta_stats);
7467 if (ctx->op->may_write() &&
7468 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7469 ctx->update_log_only = true;
7470 }
7471 return result;
7472 }
7473
7474 // check for full
7475 if ((ctx->delta_stats.num_bytes > 0 ||
7476 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7477 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7478 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7479 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7480 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7481 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7482 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7483 << dendl;
7484 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7485 // they tried, they failed.
7486 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7487 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7488 } else {
7489 // drop request
7490 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7491 return -EAGAIN;
7492 }
7493 }
7494
7495 // clone, if necessary
7496 if (soid.snap == CEPH_NOSNAP)
7497 make_writeable(ctx);
7498
7499 finish_ctx(ctx,
7500 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7501 pg_log_entry_t::DELETE);
7502
7503 return result;
7504 }
7505
7506 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7507 {
7508 const hobject_t& soid = ctx->obs->oi.soid;
7509 dout(20) << __func__ << " " << soid << " " << ctx
7510 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7511 << dendl;
7512 utime_t now = ceph_clock_now();
7513
7514 // snapset
7515 bufferlist bss;
7516
7517 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7518 ::encode(ctx->new_snapset, bss);
7519 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7520 !ctx->new_snapset.is_legacy());
7521
7522 if (ctx->new_obs.exists) {
7523 if (!ctx->obs->exists) {
7524 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7525 hobject_t snapoid = soid.get_snapdir();
7526 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7527 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7528 ctx->at_version,
7529 ctx->snapset_obc->obs.oi.version,
7530 0, osd_reqid_t(), ctx->mtime, 0));
7531 ctx->op_t->remove(snapoid);
7532
7533 ctx->at_version.version++;
7534
7535 ctx->snapset_obc->obs.exists = false;
7536 }
7537 }
7538 } else if (!ctx->new_snapset.clones.empty() &&
7539 !ctx->cache_evict &&
7540 !ctx->new_snapset.head_exists &&
7541 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7542 // save snapset on _snap
7543 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7544 info.pgid.pool(), soid.get_namespace());
7545 dout(10) << " final snapset " << ctx->new_snapset
7546 << " in " << snapoid << dendl;
7547 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7548 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7549 ctx->at_version,
7550 eversion_t(),
7551 0, osd_reqid_t(), ctx->mtime, 0));
7552
7553 if (!ctx->snapset_obc)
7554 ctx->snapset_obc = get_object_context(snapoid, true);
7555 bool got = false;
7556 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7557 got = ctx->lock_manager.get_write_greedy(
7558 snapoid,
7559 ctx->snapset_obc,
7560 ctx->op);
7561 } else {
7562 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7563 got = ctx->lock_manager.get_lock_type(
7564 ObjectContext::RWState::RWEXCL,
7565 snapoid,
7566 ctx->snapset_obc,
7567 ctx->op);
7568 }
7569 assert(got);
7570 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7571 ctx->snapset_obc->obs.exists = true;
7572 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7573 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7574 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7575 ctx->snapset_obc->obs.oi.local_mtime = now;
7576
7577 map<string, bufferlist> attrs;
7578 bufferlist bv(sizeof(ctx->new_obs.oi));
7579 ::encode(ctx->snapset_obc->obs.oi, bv,
7580 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7581 ctx->op_t->create(snapoid);
7582 attrs[OI_ATTR].claim(bv);
7583 attrs[SS_ATTR].claim(bss);
7584 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7585 ctx->at_version.version++;
7586 }
7587 }
7588
7589 // finish and log the op.
7590 if (ctx->user_modify) {
7591 // update the user_version for any modify ops, except for the watch op
7592 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7593 /* In order for new clients and old clients to interoperate properly
7594 * when exchanging versions, we need to lower bound the user_version
7595 * (which our new clients pay proper attention to)
7596 * by the at_version (which is all the old clients can ever see). */
7597 if (ctx->at_version.version > ctx->user_at_version)
7598 ctx->user_at_version = ctx->at_version.version;
7599 ctx->new_obs.oi.user_version = ctx->user_at_version;
7600 }
7601 ctx->bytes_written = ctx->op_t->get_bytes_written();
7602
7603 if (ctx->new_obs.exists) {
7604 // on the head object
7605 ctx->new_obs.oi.version = ctx->at_version;
7606 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7607 ctx->new_obs.oi.last_reqid = ctx->reqid;
7608 if (ctx->mtime != utime_t()) {
7609 ctx->new_obs.oi.mtime = ctx->mtime;
7610 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7611 ctx->new_obs.oi.local_mtime = now;
7612 } else {
7613 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7614 }
7615
7616 map <string, bufferlist> attrs;
7617 bufferlist bv(sizeof(ctx->new_obs.oi));
7618 ::encode(ctx->new_obs.oi, bv,
7619 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7620 attrs[OI_ATTR].claim(bv);
7621
7622 if (soid.snap == CEPH_NOSNAP) {
7623 dout(10) << " final snapset " << ctx->new_snapset
7624 << " in " << soid << dendl;
7625 attrs[SS_ATTR].claim(bss);
7626 } else {
7627 dout(10) << " no snapset (this is a clone)" << dendl;
7628 }
7629 ctx->op_t->setattrs(soid, attrs);
7630 } else {
7631 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7632 }
7633
7634 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7635 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7636
7637 // append to log
7638 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7639 ctx->obs->oi.version,
7640 ctx->user_at_version, ctx->reqid,
7641 ctx->mtime, 0));
7642 if (soid.snap < CEPH_NOSNAP) {
7643 switch (log_op_type) {
7644 case pg_log_entry_t::MODIFY:
7645 case pg_log_entry_t::PROMOTE:
7646 case pg_log_entry_t::CLEAN:
7647 if (legacy_snapset) {
7648 dout(20) << __func__ << " encoding legacy_snaps "
7649 << ctx->new_obs.oi.legacy_snaps
7650 << dendl;
7651 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7652 } else {
7653 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7654 << dendl;
7655 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7656 }
7657 break;
7658 default:
7659 break;
7660 }
7661 }
7662
7663 if (!ctx->extra_reqids.empty()) {
7664 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7665 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7666 }
7667
7668 // apply new object state.
7669 ctx->obc->obs = ctx->new_obs;
7670
7671 if (soid.is_head() && !ctx->obc->obs.exists &&
7672 (!maintain_ssc || ctx->cache_evict)) {
7673 ctx->obc->ssc->exists = false;
7674 ctx->obc->ssc->snapset = SnapSet();
7675 } else {
7676 ctx->obc->ssc->exists = true;
7677 ctx->obc->ssc->snapset = ctx->new_snapset;
7678 }
7679 }
7680
7681 void PrimaryLogPG::apply_stats(
7682 const hobject_t &soid,
7683 const object_stat_sum_t &delta_stats) {
7684
7685 info.stats.stats.add(delta_stats);
7686
7687 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7688 i != backfill_targets.end();
7689 ++i) {
7690 pg_shard_t bt = *i;
7691 pg_info_t& pinfo = peer_info[bt];
7692 if (soid <= pinfo.last_backfill)
7693 pinfo.stats.stats.add(delta_stats);
7694 else if (soid <= last_backfill_started)
7695 pending_backfill_updates[soid].stats.add(delta_stats);
7696 }
7697
7698 if (is_primary() && scrubber.active) {
7699 if (soid < scrubber.start) {
7700 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7701 << "," << scrubber.end << ")" << dendl;
7702 scrub_cstat.add(delta_stats);
7703 } else {
7704 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7705 << "," << scrubber.end << ")" << dendl;
7706 }
7707 }
7708 }
7709
7710 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7711 {
7712 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7713 assert(ctx->async_reads_complete());
7714
7715 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7716 p != ctx->ops->end() && result >= 0; ++p) {
7717 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7718 result = p->rval;
7719 break;
7720 }
7721 ctx->bytes_read += p->outdata.length();
7722 }
7723 ctx->reply->claim_op_out_data(*ctx->ops);
7724 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7725
7726 MOSDOpReply *reply = ctx->reply;
7727 ctx->reply = nullptr;
7728
7729 if (result >= 0) {
7730 if (!ctx->ignore_log_op_stats) {
7731 log_op_stats(ctx);
7732 publish_stats_to_osd();
7733 }
7734
7735 // on read, return the current object version
7736 if (ctx->obs) {
7737 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7738 } else {
7739 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7740 }
7741 } else if (result == -ENOENT) {
7742 // on ENOENT, set a floor for what the next user version will be.
7743 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7744 }
7745
7746 reply->set_result(result);
7747 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7748 osd->send_message_osd_client(reply, m->get_connection());
7749 close_op_ctx(ctx);
7750 }
7751
7752 // ========================================================================
7753 // copyfrom
7754
7755 struct C_Copyfrom : public Context {
7756 PrimaryLogPGRef pg;
7757 hobject_t oid;
7758 epoch_t last_peering_reset;
7759 ceph_tid_t tid;
7760 PrimaryLogPG::CopyOpRef cop;
7761 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7762 const PrimaryLogPG::CopyOpRef& c)
7763 : pg(p), oid(o), last_peering_reset(lpr),
7764 tid(0), cop(c)
7765 {}
7766 void finish(int r) override {
7767 if (r == -ECANCELED)
7768 return;
7769 pg->lock();
7770 if (last_peering_reset == pg->get_last_peering_reset()) {
7771 pg->process_copy_chunk(oid, tid, r);
7772 }
7773 pg->unlock();
7774 }
7775 };
7776
7777 struct C_CopyFrom_AsyncReadCb : public Context {
7778 OSDOp *osd_op;
7779 object_copy_data_t reply_obj;
7780 uint64_t features;
7781 size_t len;
7782 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7783 osd_op(osd_op), features(features), len(0) {}
7784 void finish(int r) override {
7785 osd_op->rval = r;
7786 if (r < 0) {
7787 return;
7788 }
7789
7790 assert(len > 0);
7791 assert(len <= reply_obj.data.length());
7792 bufferlist bl;
7793 bl.substr_of(reply_obj.data, 0, len);
7794 reply_obj.data.swap(bl);
7795 ::encode(reply_obj, osd_op->outdata, features);
7796 }
7797 };
7798
7799 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7800 OSDOp& osd_op, ObjectContextRef &obc)
7801 {
7802 object_info_t& oi = obc->obs.oi;
7803 hobject_t& soid = oi.soid;
7804 int result = 0;
7805 object_copy_cursor_t cursor;
7806 uint64_t out_max;
7807 try {
7808 ::decode(cursor, bp);
7809 ::decode(out_max, bp);
7810 }
7811 catch (buffer::error& e) {
7812 result = -EINVAL;
7813 return result;
7814 }
7815
7816 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7817 uint64_t features = op->get_features();
7818
7819 bool async_read_started = false;
7820 object_copy_data_t _reply_obj;
7821 C_CopyFrom_AsyncReadCb *cb = NULL;
7822 if (pool.info.require_rollback()) {
7823 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7824 }
7825 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7826 // size, mtime
7827 reply_obj.size = oi.size;
7828 reply_obj.mtime = oi.mtime;
7829 assert(obc->ssc);
7830 if (soid.snap < CEPH_NOSNAP) {
7831 if (obc->ssc->snapset.is_legacy()) {
7832 reply_obj.snaps = oi.legacy_snaps;
7833 } else {
7834 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7835 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7836 reply_obj.snaps = p->second;
7837 }
7838 } else {
7839 reply_obj.snap_seq = obc->ssc->snapset.seq;
7840 }
7841 if (oi.is_data_digest()) {
7842 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7843 reply_obj.data_digest = oi.data_digest;
7844 }
7845 if (oi.is_omap_digest()) {
7846 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7847 reply_obj.omap_digest = oi.omap_digest;
7848 }
7849 reply_obj.truncate_seq = oi.truncate_seq;
7850 reply_obj.truncate_size = oi.truncate_size;
7851
7852 // attrs
7853 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7854 if (!cursor.attr_complete) {
7855 result = getattrs_maybe_cache(
7856 ctx->obc,
7857 &out_attrs,
7858 true);
7859 if (result < 0) {
7860 if (cb) {
7861 delete cb;
7862 }
7863 return result;
7864 }
7865 cursor.attr_complete = true;
7866 dout(20) << " got attrs" << dendl;
7867 }
7868
7869 int64_t left = out_max - osd_op.outdata.length();
7870
7871 // data
7872 bufferlist& bl = reply_obj.data;
7873 if (left > 0 && !cursor.data_complete) {
7874 if (cursor.data_offset < oi.size) {
7875 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7876 if (cb) {
7877 async_read_started = true;
7878 ctx->pending_async_reads.push_back(
7879 make_pair(
7880 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7881 make_pair(&bl, cb)));
7882 cb->len = max_read;
7883
7884 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7885 new ReadFinisher(osd_op));
7886 result = -EINPROGRESS;
7887
7888 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7889 } else {
7890 result = pgbackend->objects_read_sync(
7891 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7892 if (result < 0)
7893 return result;
7894 }
7895 left -= max_read;
7896 cursor.data_offset += max_read;
7897 }
7898 if (cursor.data_offset == oi.size) {
7899 cursor.data_complete = true;
7900 dout(20) << " got data" << dendl;
7901 }
7902 assert(cursor.data_offset <= oi.size);
7903 }
7904
7905 // omap
7906 uint32_t omap_keys = 0;
7907 if (!pool.info.supports_omap() || !oi.is_omap()) {
7908 cursor.omap_complete = true;
7909 } else {
7910 if (left > 0 && !cursor.omap_complete) {
7911 assert(cursor.data_complete);
7912 if (cursor.omap_offset.empty()) {
7913 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7914 &reply_obj.omap_header);
7915 }
7916 bufferlist omap_data;
7917 ObjectMap::ObjectMapIterator iter =
7918 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7919 assert(iter);
7920 iter->upper_bound(cursor.omap_offset);
7921 for (; iter->valid(); iter->next(false)) {
7922 ++omap_keys;
7923 ::encode(iter->key(), omap_data);
7924 ::encode(iter->value(), omap_data);
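// note: the +4s below account for the 4-byte length headers that
// ::encode() prepended to each key and value above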
7925 left -= iter->key().length() + 4 + iter->value().length() + 4;
7926 if (left <= 0)
7927 break;
7928 }
7929 if (omap_keys) {
7930 ::encode(omap_keys, reply_obj.omap_data);
7931 reply_obj.omap_data.claim_append(omap_data);
7932 }
7933 if (iter->valid()) {
7934 cursor.omap_offset = iter->key();
7935 } else {
7936 cursor.omap_complete = true;
7937 dout(20) << " got omap" << dendl;
7938 }
7939 }
7940 }
7941
7942 if (cursor.is_complete()) {
7943 // include reqids only in the final step. this is a bit fragile
7944 // but it works...
7945 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7946 dout(20) << " got reqids" << dendl;
7947 }
7948
7949 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7950 << " " << out_attrs.size() << " attrs"
7951 << " " << bl.length() << " bytes"
7952 << " " << reply_obj.omap_header.length() << " omap header bytes"
7953 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7954 << omap_keys << " keys"
7955 << " " << reply_obj.reqids.size() << " reqids"
7956 << dendl;
7957 reply_obj.cursor = cursor;
7958 if (!async_read_started) {
7959 ::encode(reply_obj, osd_op.outdata, features);
7960 }
7961 if (cb && !async_read_started) {
7962 delete cb;
7963 }
7964
7965 if (result > 0) {
7966 result = 0;
7967 }
7968 return result;
7969 }
7970
7971 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7972 OSDOp& osd_op)
7973 {
7974 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7975 // be careful not to modify anything else that will upset a racing
7976 // operator<<
7977 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7978 uint64_t features = m->get_features();
7979 object_copy_data_t reply_obj;
7980
7981 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7982 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7983 ::encode(reply_obj, osd_op.outdata, features);
7984 osd_op.rval = -ENOENT;
7985 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7986 reply->claim_op_out_data(m->ops);
7987 reply->set_result(-ENOENT);
7988 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7989 osd->send_message_osd_client(reply, m->get_connection());
7990 }
7991
7992 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7993 hobject_t src, object_locator_t oloc,
7994 version_t version, unsigned flags,
7995 bool mirror_snapset,
7996 unsigned src_obj_fadvise_flags,
7997 unsigned dest_obj_fadvise_flags)
7998 {
7999 const hobject_t& dest = obc->obs.oi.soid;
8000 dout(10) << __func__ << " " << dest
8001 << " from " << src << " " << oloc << " v" << version
8002 << " flags " << flags
8003 << (mirror_snapset ? " mirror_snapset" : "")
8004 << dendl;
8005
8006 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8007 src.snap == CEPH_SNAPDIR));
8008
8009 // cancel a previous in-progress copy?
8010 if (copy_ops.count(dest)) {
8011 // FIXME: if the src etc match, we could avoid restarting from the
8012 // beginning.
8013 CopyOpRef cop = copy_ops[dest];
8014 cancel_copy(cop, false);
8015 }
8016
8017 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8018 mirror_snapset, src_obj_fadvise_flags,
8019 dest_obj_fadvise_flags));
8020 copy_ops[dest] = cop;
8021 obc->start_block();
8022
8023 _copy_some(obc, cop);
8024 }
8025
8026 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8027 {
8028 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8029
8030 unsigned flags = 0;
8031 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8032 flags |= CEPH_OSD_FLAG_FLUSH;
8033 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8034 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8035 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8036 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8037 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8038 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8039 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8040 flags |= CEPH_OSD_FLAG_RWORDERED;
8041
8042 C_GatherBuilder gather(cct);
8043
8044 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8045 // list snaps too.
8046 assert(cop->src.snap == CEPH_NOSNAP);
8047 ObjectOperation op;
8048 op.list_snaps(&cop->results.snapset, NULL);
8049 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8050 CEPH_SNAPDIR, NULL,
8051 flags, gather.new_sub(), NULL);
8052 cop->objecter_tid2 = tid;
8053 }
8054
8055 ObjectOperation op;
8056 if (cop->results.user_version) {
8057 op.assert_version(cop->results.user_version);
8058 } else {
8059 // we should learn the version after the first chunk, if we didn't know
8060 // it already!
8061 assert(cop->cursor.is_initial());
8062 }
8063 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8064 &cop->results.object_size, &cop->results.mtime,
8065 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8066 &cop->results.snaps, &cop->results.snap_seq,
8067 &cop->results.flags,
8068 &cop->results.source_data_digest,
8069 &cop->results.source_omap_digest,
8070 &cop->results.reqids,
8071 &cop->results.truncate_seq,
8072 &cop->results.truncate_size,
8073 &cop->rval);
8074 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8075
8076 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8077 get_last_peering_reset(), cop);
8078 gather.set_finisher(new C_OnFinisher(fin,
8079 &osd->objecter_finisher));
8080
8081 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8082 cop->src.snap, NULL,
8083 flags,
8084 gather.new_sub(),
8085 // discover the object version if we don't know it yet
8086 cop->results.user_version ? NULL : &cop->results.user_version);
8087 fin->tid = tid;
8088 cop->objecter_tid = tid;
8089 gather.activate();
8090 }
8091
8092 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8093 {
8094 dout(10) << __func__ << " " << oid << " tid " << tid
8095 << " " << cpp_strerror(r) << dendl;
8096 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8097 if (p == copy_ops.end()) {
8098 dout(10) << __func__ << " no copy_op found" << dendl;
8099 return;
8100 }
8101 CopyOpRef cop = p->second;
8102 if (tid != cop->objecter_tid) {
8103 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8104 << " tid " << cop->objecter_tid << dendl;
8105 return;
8106 }
8107
8108 if (cop->omap_data.length() || cop->omap_header.length())
8109 cop->results.has_omap = true;
8110
8111 if (r >= 0 && !pool.info.supports_omap() &&
8112 (cop->omap_data.length() || cop->omap_header.length())) {
8113 r = -EOPNOTSUPP;
8114 }
8115 cop->objecter_tid = 0;
8116 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
8117 ObjectContextRef& cobc = cop->obc;
8118
8119 if (r < 0)
8120 goto out;
8121
8122 assert(cop->rval >= 0);
8123
8124 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8125 // verify snap hasn't been deleted
8126 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8127 while (p != cop->results.snaps.end()) {
8128 if (pool.info.is_removed_snap(*p)) {
8129 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8130 << dendl;
8131 for (vector<snapid_t>::iterator q = p + 1;
8132 q != cop->results.snaps.end();
8133 ++q)
8134 *(q - 1) = *q;
8135 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8136 } else {
8137 ++p;
8138 }
8139 }
8140 if (cop->results.snaps.empty()) {
8141 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8142 r = -ENOENT;
8143 goto out;
8144 }
8145 }
8146
8147 assert(cop->rval >= 0);
8148
8149 if (!cop->temp_cursor.data_complete) {
8150 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8151 }
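// note the chaining: each chunk's crc32c is seeded with the digest so far,
// so after the final chunk data_digest covers the whole object.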
8152 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8153 if (cop->omap_header.length()) {
8154 cop->results.omap_digest =
8155 cop->omap_header.crc32c(cop->results.omap_digest);
8156 }
8157 if (cop->omap_data.length()) {
8158 bufferlist keys;
8159 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8160 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8161 }
8162 }
8163
8164 if (!cop->temp_cursor.attr_complete) {
8165 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8166 p != cop->attrs.end();
8167 ++p) {
8168 cop->results.attrs[string("_") + p->first] = p->second;
8169 }
8170 cop->attrs.clear();
8171 }
8172
8173 if (!cop->cursor.is_complete()) {
8174 // write out what we have so far
8175 if (cop->temp_cursor.is_initial()) {
8176 assert(!cop->results.started_temp_obj);
8177 cop->results.started_temp_obj = true;
8178 cop->results.temp_oid = generate_temp_object(oid);
8179 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8180 }
8181 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8182 OpContextUPtr ctx = simple_opc_create(tempobc);
8183 if (cop->temp_cursor.is_initial()) {
8184 ctx->new_temp_oid = cop->results.temp_oid;
8185 }
8186 _write_copy_chunk(cop, ctx->op_t.get());
8187 simple_opc_submit(std::move(ctx));
8188 dout(10) << __func__ << " fetching more" << dendl;
8189 _copy_some(cobc, cop);
8190 return;
8191 }
8192
8193 // verify digests?
8194 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8195 dout(20) << __func__ << std::hex
8196 << " got digest: rx data 0x" << cop->results.data_digest
8197 << " omap 0x" << cop->results.omap_digest
8198 << ", source: data 0x" << cop->results.source_data_digest
8199 << " omap 0x" << cop->results.source_omap_digest
8200 << std::dec
8201 << " flags " << cop->results.flags
8202 << dendl;
8203 }
8204 if (cop->results.is_data_digest() &&
8205 cop->results.data_digest != cop->results.source_data_digest) {
8206 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8207 << " != source 0x" << cop->results.source_data_digest << std::dec
8208 << dendl;
8209 osd->clog->error() << info.pgid << " copy from " << cop->src
8210 << " to " << cop->obc->obs.oi.soid << std::hex
8211 << " data digest 0x" << cop->results.data_digest
8212 << " != source 0x" << cop->results.source_data_digest
8213 << std::dec;
8214 r = -EIO;
8215 goto out;
8216 }
8217 if (cop->results.is_omap_digest() &&
8218 cop->results.omap_digest != cop->results.source_omap_digest) {
8219 derr << __func__ << std::hex
8220 << " omap digest 0x" << cop->results.omap_digest
8221 << " != source 0x" << cop->results.source_omap_digest
8222 << std::dec << dendl;
8223 osd->clog->error() << info.pgid << " copy from " << cop->src
8224 << " to " << cop->obc->obs.oi.soid << std::hex
8225 << " omap digest 0x" << cop->results.omap_digest
8226 << " != source 0x" << cop->results.source_omap_digest
8227 << std::dec;
8228 r = -EIO;
8229 goto out;
8230 }
8231 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8232 derr << __func__ << " injecting copyfrom failure" << dendl;
8233 r = -EIO;
8234 goto out;
8235 }
8236
8237 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8238 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8239 ObjectState& obs = cop->obc->obs;
8240 if (cop->temp_cursor.is_initial()) {
8241 dout(20) << "fill_in_final_tx: writing "
8242 << "directly to final object" << dendl;
8243 // write directly to final object
8244 cop->results.temp_oid = obs.oi.soid;
8245 _write_copy_chunk(cop, t);
8246 } else {
8247 // finish writing to temp object, then move into place
8248 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8249 _write_copy_chunk(cop, t);
8250 t->rename(obs.oi.soid, cop->results.temp_oid);
8251 }
8252 t->setattrs(obs.oi.soid, cop->results.attrs);
8253 });
8254
8255 dout(20) << __func__ << " success; committing" << dendl;
8256
8257 out:
8258 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8259 CopyCallbackResults results(r, &cop->results);
8260 cop->cb->complete(results);
8261
8262 copy_ops.erase(cobc->obs.oi.soid);
8263 cobc->stop_block();
8264
8265 if (r < 0 && cop->results.started_temp_obj) {
8266 dout(10) << __func__ << " deleting partial temp object "
8267 << cop->results.temp_oid << dendl;
8268 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8269 OpContextUPtr ctx = simple_opc_create(tempobc);
8270 ctx->op_t->remove(cop->results.temp_oid);
8271 ctx->discard_temp_oid = cop->results.temp_oid;
8272 simple_opc_submit(std::move(ctx));
8273 }
8274
8275 // cancel and requeue proxy ops on this object
8276 if (!r) {
8277 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8278 it != proxyread_ops.end();) {
8279 if (it->second->soid == cobc->obs.oi.soid) {
8280 cancel_proxy_read((it++)->second);
8281 } else {
8282 ++it;
8283 }
8284 }
8285 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8286 it != proxywrite_ops.end();) {
8287 if (it->second->soid == cobc->obs.oi.soid) {
8288 cancel_proxy_write((it++)->second);
8289 } else {
8290 ++it;
8291 }
8292 }
8293 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8294 }
8295
8296 kick_object_context_blocked(cobc);
8297 }
8298
8299 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8300 {
8301 dout(20) << __func__ << " " << cop
8302 << " " << cop->attrs.size() << " attrs"
8303 << " " << cop->data.length() << " bytes"
8304 << " " << cop->omap_header.length() << " omap header bytes"
8305 << " " << cop->omap_data.length() << " omap data bytes"
8306 << dendl;
8307 if (!cop->temp_cursor.attr_complete) {
8308 t->create(cop->results.temp_oid);
8309 }
8310 if (!cop->temp_cursor.data_complete) {
8311 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8312 cop->cursor.data_offset);
8313 if (pool.info.requires_aligned_append() &&
8314 !cop->cursor.data_complete) {
8315 /**
8316 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
8317 * to pick it up on the next pass.
8318 */
8319 assert(cop->temp_cursor.data_offset %
8320 pool.info.required_alignment() == 0);
8321 if (cop->data.length() % pool.info.required_alignment() != 0) {
8322 uint64_t to_trim =
8323 cop->data.length() % pool.info.required_alignment();
8324 bufferlist bl;
8325 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8326 cop->data.swap(bl);
8327 cop->cursor.data_offset -= to_trim;
8328 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8329 cop->cursor.data_offset);
8330 }
8331 }
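// Worked example (illustrative): with required_alignment() == 4096 and a
// 10240-byte chunk, to_trim is 2048; the aligned 8192 bytes are written now
// and cursor.data_offset is rewound so the trailing 2048 bytes are
// re-fetched at the start of the next copy-get pass.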
8332 if (cop->data.length()) {
8333 t->write(
8334 cop->results.temp_oid,
8335 cop->temp_cursor.data_offset,
8336 cop->data.length(),
8337 cop->data,
8338 cop->dest_obj_fadvise_flags);
8339 }
8340 cop->data.clear();
8341 }
8342 if (pool.info.supports_omap()) {
8343 if (!cop->temp_cursor.omap_complete) {
8344 if (cop->omap_header.length()) {
8345 t->omap_setheader(
8346 cop->results.temp_oid,
8347 cop->omap_header);
8348 cop->omap_header.clear();
8349 }
8350 if (cop->omap_data.length()) {
8351 map<string,bufferlist> omap;
8352 bufferlist::iterator p = cop->omap_data.begin();
8353 ::decode(omap, p);
8354 t->omap_setkeys(cop->results.temp_oid, omap);
8355 cop->omap_data.clear();
8356 }
8357 }
8358 } else {
8359 assert(cop->omap_header.length() == 0);
8360 assert(cop->omap_data.length() == 0);
8361 }
8362 cop->temp_cursor = cop->cursor;
8363 }
8364
8365 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8366 {
8367 OpContext *ctx = cb->ctx;
8368 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8369
8370 ObjectState& obs = ctx->new_obs;
8371 if (obs.exists) {
8372 dout(20) << __func__ << ": exists, removing" << dendl;
8373 ctx->op_t->remove(obs.oi.soid);
8374 } else {
8375 ctx->delta_stats.num_objects++;
8376 obs.exists = true;
8377 }
8378 if (cb->is_temp_obj_used()) {
8379 ctx->discard_temp_oid = cb->results->temp_oid;
8380 }
8381 cb->results->fill_in_final_tx(ctx->op_t.get());
8382
8383 // CopyFromCallback fills this in for us
8384 obs.oi.user_version = ctx->user_at_version;
8385
8386 obs.oi.set_data_digest(cb->results->data_digest);
8387 obs.oi.set_omap_digest(cb->results->omap_digest);
8388
8389 obs.oi.truncate_seq = cb->results->truncate_seq;
8390 obs.oi.truncate_size = cb->results->truncate_size;
8391
8392 ctx->extra_reqids = cb->results->reqids;
8393
8394 // cache: clear whiteout?
8395 if (obs.oi.is_whiteout()) {
8396 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8397 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8398 --ctx->delta_stats.num_whiteouts;
8399 }
8400
8401 if (cb->results->has_omap) {
8402 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8403 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8404 } else {
8405 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8406 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8407 }
8408
8409 interval_set<uint64_t> ch;
8410 if (obs.oi.size > 0)
8411 ch.insert(0, obs.oi.size);
8412 ctx->modified_ranges.union_of(ch);
8413
8414 if (cb->get_data_size() != obs.oi.size) {
8415 ctx->delta_stats.num_bytes -= obs.oi.size;
8416 obs.oi.size = cb->get_data_size();
8417 ctx->delta_stats.num_bytes += obs.oi.size;
8418 }
8419 ctx->delta_stats.num_wr++;
8420 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8421
8422 osd->logger->inc(l_osd_copyfrom);
8423 }
8424
8425 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8426 ObjectContextRef obc)
8427 {
8428 const hobject_t& soid = obc->obs.oi.soid;
8429 dout(10) << __func__ << " " << soid << " r=" << r
8430 << " uv" << results->user_version << dendl;
8431
8432 if (r == -ECANCELED) {
8433 return;
8434 }
8435
8436 if (r != -ENOENT && soid.is_snap()) {
8437 if (results->snaps.empty()) {
8438 // we must have read "snap" content from the head object in
8439 // the base pool. use snap_seq to construct what snaps should
8440 * be for this clone (what it was before we evicted the clean
8441 // clone from this pool, and what it will be when we flush and
8442 // the clone eventually happens in the base pool).
8443 SnapSet& snapset = obc->ssc->snapset;
8444 vector<snapid_t>::iterator p = snapset.snaps.begin();
8445 while (p != snapset.snaps.end() && *p > soid.snap)
8446 ++p;
8447 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8448 results->snaps.push_back(*p);
8449 ++p;
8450 }
8451 }
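// e.g. with snapset.snaps = [10,9,8,4,3,2], soid.snap = 4 and
// results->snap_seq = 2, the loops above skip 10, 9 and 8 and collect
// [4,3]: the snaps this clone covered before it was evicted.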
8452
8453 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8454 filter_snapc(results->snaps);
8455
8456 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8457 if (results->snaps.empty()) {
8458 dout(20) << __func__
8459 << " snaps are empty, clone is invalid,"
8460 << " setting r to ENOENT" << dendl;
8461 r = -ENOENT;
8462 }
8463 }
8464
8465 if (r < 0 && results->started_temp_obj) {
8466 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8467 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8468 assert(tempobc);
8469 OpContextUPtr ctx = simple_opc_create(tempobc);
8470 ctx->op_t->remove(results->temp_oid);
8471 simple_opc_submit(std::move(ctx));
8472 results->started_temp_obj = false;
8473 }
8474
8475 if (r == -ENOENT && soid.is_snap()) {
8476 dout(10) << __func__
8477 << ": enoent while trying to promote clone, " << soid
8478 << " must have been trimmed, removing from snapset"
8479 << dendl;
8480 hobject_t head(soid.get_head());
8481 ObjectContextRef obc = get_object_context(head, false);
8482 assert(obc);
8483
8484 OpContextUPtr tctx = simple_opc_create(obc);
8485 tctx->at_version = get_next_version();
8486 filter_snapc(tctx->new_snapset.snaps);
8487 vector<snapid_t> new_clones;
8488 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8489 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8490 i != tctx->new_snapset.clones.end();
8491 ++i) {
8492 if (*i != soid.snap) {
8493 new_clones.push_back(*i);
8494 auto p = tctx->new_snapset.clone_snaps.find(*i);
8495 if (p != tctx->new_snapset.clone_snaps.end()) {
8496 new_clone_snaps[*i] = p->second;
8497 }
8498 }
8499 }
8500 tctx->new_snapset.clones.swap(new_clones);
8501 tctx->new_snapset.clone_overlap.erase(soid.snap);
8502 tctx->new_snapset.clone_size.erase(soid.snap);
8503 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8504
8505 // take RWWRITE lock for duration of our local write. ignore starvation.
8506 if (!tctx->lock_manager.take_write_lock(
8507 head,
8508 obc)) {
8509 assert(0 == "problem!");
8510 }
8511 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8512
8513 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8514
8515 simple_opc_submit(std::move(tctx));
8516 return;
8517 }
8518
8519 bool whiteout = false;
8520 if (r == -ENOENT) {
8521 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8522 dout(10) << __func__ << " whiteout " << soid << dendl;
8523 whiteout = true;
8524 }
8525
8526 if (r < 0 && !whiteout) {
8527 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8528 // pass error to everyone blocked on this object
8529 // FIXME: this is pretty sloppy, but at this point we got
8530 // something unexpected and don't have many other options.
8531 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8532 waiting_for_blocked_object.find(soid);
8533 if (blocked_iter != waiting_for_blocked_object.end()) {
8534 while (!blocked_iter->second.empty()) {
8535 osd->reply_op_error(blocked_iter->second.front(), r);
8536 blocked_iter->second.pop_front();
8537 }
8538 waiting_for_blocked_object.erase(blocked_iter);
8539 }
8540 return;
8541 }
8542
8543 osd->promote_finish(results->object_size);
8544
8545 OpContextUPtr tctx = simple_opc_create(obc);
8546 tctx->at_version = get_next_version();
8547
8548 ++tctx->delta_stats.num_objects;
8549 if (soid.snap < CEPH_NOSNAP)
8550 ++tctx->delta_stats.num_object_clones;
8551 tctx->new_obs.exists = true;
8552
8553 tctx->extra_reqids = results->reqids;
8554
8555 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8556 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8557
8558 if (whiteout) {
8559 // create a whiteout
8560 tctx->op_t->create(soid);
8561 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8562 ++tctx->delta_stats.num_whiteouts;
8563 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8564 osd->logger->inc(l_osd_tier_whiteout);
8565 } else {
8566 if (results->has_omap) {
8567 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8568 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8569 ++tctx->delta_stats.num_objects_omap;
8570 }
8571
8572 results->fill_in_final_tx(tctx->op_t.get());
8573 if (results->started_temp_obj) {
8574 tctx->discard_temp_oid = results->temp_oid;
8575 }
8576 tctx->new_obs.oi.size = results->object_size;
8577 tctx->new_obs.oi.user_version = results->user_version;
8578 // it doesn't matter whether the src object has data or omap digests
8579 if (results->object_size)
8580 tctx->new_obs.oi.set_data_digest(results->data_digest);
8581 if (results->has_omap)
8582 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8583 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8584 tctx->new_obs.oi.truncate_size = results->truncate_size;
8585
8586 if (soid.snap != CEPH_NOSNAP) {
8587 if (legacy_snapset) {
8588 tctx->new_obs.oi.legacy_snaps = results->snaps;
8589 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8590 } else {
8591 // it's already in the snapset
8592 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8593 }
8594 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8595 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8596 results->object_size);
8597 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8598
8599 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8600 } else {
8601 tctx->delta_stats.num_bytes += results->object_size;
8602 }
8603 }
8604
8605 if (results->mirror_snapset) {
8606 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8607 tctx->new_snapset.from_snap_set(
8608 results->snapset,
8609 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8610 }
8611 tctx->new_snapset.head_exists = true;
8612 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8613
8614 // take RWWRITE lock for duration of our local write. ignore starvation.
8615 if (!tctx->lock_manager.take_write_lock(
8616 obc->obs.oi.soid,
8617 obc)) {
8618 assert(0 == "problem!");
8619 }
8620 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8621
8622 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8623
8624 simple_opc_submit(std::move(tctx));
8625
8626 osd->logger->inc(l_osd_tier_promote);
8627
8628 if (agent_state &&
8629 agent_state->is_idle())
8630 agent_choose_mode();
8631 }
8632
8633 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8634 {
8635 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8636 << " from " << cop->src << " " << cop->oloc
8637 << " v" << cop->results.user_version << dendl;
8638
8639 // cancel objecter op, if we can
8640 if (cop->objecter_tid) {
8641 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8642 cop->objecter_tid = 0;
8643 if (cop->objecter_tid2) {
8644 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8645 cop->objecter_tid2 = 0;
8646 }
8647 }
8648
8649 copy_ops.erase(cop->obc->obs.oi.soid);
8650 cop->obc->stop_block();
8651
8652 kick_object_context_blocked(cop->obc);
8653 cop->results.should_requeue = requeue;
8654 CopyCallbackResults result(-ECANCELED, &cop->results);
8655 cop->cb->complete(result);
8656
8657 // There may still be an objecter callback referencing this copy op.
8658 // That callback will not need the obc since it's been canceled, and
8659 // we need the obc reference to go away prior to flush.
8660 cop->obc = ObjectContextRef();
8661 }
8662
8663 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8664 {
8665 dout(10) << __func__ << dendl;
8666 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8667 while (p != copy_ops.end()) {
8668 // requeue this op? can I queue up all of them?
8669 cancel_copy((p++)->second, requeue);
8670 }
8671 }
8672
8673
8674 // ========================================================================
8675 // flush
8676 //
8677 // Flush a dirty object in the cache tier by writing it back to the
8678 // base tier. The sequence looks like:
8679 //
8680 // * send a copy-from operation to the base tier to copy the current
8681 // version of the object
8682 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8683 // * on completion, we check if the object has been modified. if so,
8684 // just reply with -EAGAIN.
8685 // * try to take a write lock so we can clear the dirty flag. if this
8686 // fails, wait and retry
8687 // * start a repop that clears the bit.
8688 //
8689 // If we have to wait, we will retry by coming back through the
8690 // start_flush method. We check if a flush is already in progress
8691 // and, if so, try to finish it by rechecking the version and trying
8692 // to clear the dirty bit.
8693 //
8694 // In order for the cache-flush (a write op) to not block the copy-get
8695 // from reading the object, the client *must* set the SKIPRWLOCKS
8696 // flag.
8697 //
8698 // NOTE: normally writes are strictly ordered for the client, but
8699 // flushes are special in that they can be reordered with respect to
8700 // other writes. In particular, we can't have a flush request block
8701 // an update to the cache pool object!
8702
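// Illustrative client-side sketch (assumption: the standard librados C++
// API; not part of this file): a cache-try-flush issued as the NOTE above
// requires, with OPERATION_SKIPRWLOCKS set so the flush's copy-get can read
// the object without blocking behind the flush itself.
#if 0
#include <rados/librados.hpp>

int try_flush_example(librados::Rados& cluster, librados::IoCtx& cache_io,
                      const std::string& oid) {
  librados::ObjectReadOperation op;
  op.cache_try_flush();
  librados::AioCompletion *c = cluster.aio_create_completion();
  int r = cache_io.aio_operate(oid, c, &op,
                               librados::OPERATION_IGNORE_OVERLAY |
                               librados::OPERATION_SKIPRWLOCKS, nullptr);
  if (r == 0) {
    c->wait_for_complete();
    r = c->get_return_value();  // e.g. -EBUSY if the object is dirty again
  }
  c->release();
  return r;
}
#endif
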
8703 struct C_Flush : public Context {
8704 PrimaryLogPGRef pg;
8705 hobject_t oid;
8706 epoch_t last_peering_reset;
8707 ceph_tid_t tid;
8708 utime_t start;
8709 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8710 : pg(p), oid(o), last_peering_reset(lpr),
8711 tid(0), start(ceph_clock_now())
8712 {}
8713 void finish(int r) override {
8714 if (r == -ECANCELED)
8715 return;
8716 pg->lock();
8717 if (last_peering_reset == pg->get_last_peering_reset()) {
8718 pg->finish_flush(oid, tid, r);
8719 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8720 }
8721 pg->unlock();
8722 }
8723 };
8724
8725 int PrimaryLogPG::start_flush(
8726 OpRequestRef op, ObjectContextRef obc,
8727 bool blocking, hobject_t *pmissing,
8728 boost::optional<std::function<void()>> &&on_flush)
8729 {
8730 const object_info_t& oi = obc->obs.oi;
8731 const hobject_t& soid = oi.soid;
8732 dout(10) << __func__ << " " << soid
8733 << " v" << oi.version
8734 << " uv" << oi.user_version
8735 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8736 << dendl;
8737
8738 // get a filtered snapset, need to remove removed snaps
8739 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8740
8741 // verify there are no (older) dirty clones
8742 {
8743 dout(20) << " snapset " << snapset << dendl;
8744 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8745 while (p != snapset.clones.rend() && *p >= soid.snap)
8746 ++p;
8747 if (p != snapset.clones.rend()) {
8748 hobject_t next = soid;
8749 next.snap = *p;
8750 assert(next.snap < soid.snap);
8751 if (pg_log.get_missing().is_missing(next)) {
8752 dout(10) << __func__ << " missing clone is " << next << dendl;
8753 if (pmissing)
8754 *pmissing = next;
8755 return -ENOENT;
8756 }
8757 ObjectContextRef older_obc = get_object_context(next, false);
8758 if (older_obc) {
8759 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8760 << dendl;
8761 if (older_obc->obs.oi.is_dirty()) {
8762 dout(10) << __func__ << " next oldest clone is dirty: "
8763 << older_obc->obs.oi << dendl;
8764 return -EBUSY;
8765 }
8766 } else {
8767 dout(20) << __func__ << " next oldest clone " << next
8768 << " is not present; implicitly clean" << dendl;
8769 }
8770 } else {
8771 dout(20) << __func__ << " no older clones" << dendl;
8772 }
8773 }
8774
8775 if (blocking)
8776 obc->start_block();
8777
8778 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8779 if (p != flush_ops.end()) {
8780 FlushOpRef fop = p->second;
8781 if (fop->op == op) {
8782 // we couldn't take the write lock on a cache-try-flush before;
8783 // now we are trying again for the lock.
8784 return try_flush_mark_clean(fop);
8785 }
8786 if (fop->flushed_version == obc->obs.oi.user_version &&
8787 (fop->blocking || !blocking)) {
8788 // nonblocking can join anything
8789 // blocking can only join a blocking flush
8790 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8791 if (op)
8792 fop->dup_ops.push_back(op);
8793 return -EAGAIN; // clean up this ctx; op will retry later
8794 }
8795
8796 // cancel current flush since it will fail anyway, or because we
8797 // are blocking and the existing flush is nonblocking.
8798 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8799 if (fop->op)
8800 osd->reply_op_error(fop->op, -EBUSY);
8801 while (!fop->dup_ops.empty()) {
8802 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8803 fop->dup_ops.pop_front();
8804 }
8805 cancel_flush(fop, false);
8806 }
8807
8808 /**
8809 * In general, we need to send a delete and a copyfrom.
8810 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8811 * where 4 is marked as clean. To flush 10, we have to:
8812 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8813 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8814 *
8815 * There is a complicating case. Suppose there had been a clone 7
8816 * for snaps [7, 6] which has been trimmed since they no longer exist.
8817 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8818 * the delete, the snap will be promoted to 5, and the head will become
8819 * a snapdir. When the copy-from goes through, we'll end up with
8820 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8821 *
8822 * Another complication is the case where there is an interval change
8823 * after doing the delete and the flush but before marking the object
8824 * clean. We'll happily delete head and then recreate it at the same
8825 * sequence number, which works out ok.
8826 */
8827
8828 SnapContext snapc, dsnapc;
8829 if (snapset.seq != 0) {
8830 if (soid.snap == CEPH_NOSNAP) {
8831 snapc.seq = snapset.seq;
8832 snapc.snaps = snapset.snaps;
8833 } else {
8834 snapid_t min_included_snap;
8835 if (snapset.is_legacy()) {
8836 min_included_snap = oi.legacy_snaps.back();
8837 } else {
8838 auto p = snapset.clone_snaps.find(soid.snap);
8839 assert(p != snapset.clone_snaps.end());
8840 min_included_snap = p->second.back();
8841 }
8842 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8843 }
8844
8845 snapid_t prev_snapc = 0;
8846 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8847 citer != snapset.clones.rend();
8848 ++citer) {
8849 if (*citer < soid.snap) {
8850 prev_snapc = *citer;
8851 break;
8852 }
8853 }
8854
8855 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8856 }
8857
8858 object_locator_t base_oloc(soid);
8859 base_oloc.pool = pool.info.tier_of;
8860
8861 if (dsnapc.seq < snapc.seq) {
8862 ObjectOperation o;
8863 o.remove();
8864 osd->objecter->mutate(
8865 soid.oid,
8866 base_oloc,
8867 o,
8868 dsnapc,
8869 ceph::real_clock::from_ceph_timespec(oi.mtime),
8870 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8871 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8872 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8873 }
8874
8875 FlushOpRef fop(std::make_shared<FlushOp>());
8876 fop->obc = obc;
8877 fop->flushed_version = oi.user_version;
8878 fop->blocking = blocking;
8879 fop->on_flush = std::move(on_flush);
8880 fop->op = op;
8881
8882 ObjectOperation o;
8883 if (oi.is_whiteout()) {
8884 fop->removal = true;
8885 o.remove();
8886 } else {
8887 object_locator_t oloc(soid);
8888 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8889 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8890 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8891 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8892 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8893 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8894
8895 // hint that the base tier need not cache the data after this
8896 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8897 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8898 }
8899 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8900
8901 ceph_tid_t tid = osd->objecter->mutate(
8902 soid.oid, base_oloc, o, snapc,
8903 ceph::real_clock::from_ceph_timespec(oi.mtime),
8904 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8905 new C_OnFinisher(fin,
8906 &osd->objecter_finisher));
8907 /* we're under the pg lock and fin->finish() is grabbing that */
8908 fin->tid = tid;
8909 fop->objecter_tid = tid;
8910
8911 flush_ops[soid] = fop;
8912 info.stats.stats.sum.num_flush++;
8913 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8914 return -EINPROGRESS;
8915 }
8916
8917 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8918 {
8919 dout(10) << __func__ << " " << oid << " tid " << tid
8920 << " " << cpp_strerror(r) << dendl;
8921 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8922 if (p == flush_ops.end()) {
8923 dout(10) << __func__ << " no flush_op found" << dendl;
8924 return;
8925 }
8926 FlushOpRef fop = p->second;
8927 if (tid != fop->objecter_tid) {
8928 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8929 << " tid " << fop->objecter_tid << dendl;
8930 return;
8931 }
8932 ObjectContextRef obc = fop->obc;
8933 fop->objecter_tid = 0;
8934
8935 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8936 if (fop->op)
8937 osd->reply_op_error(fop->op, -EBUSY);
8938 if (fop->blocking) {
8939 obc->stop_block();
8940 kick_object_context_blocked(obc);
8941 }
8942
8943 if (!fop->dup_ops.empty()) {
8944 dout(20) << __func__ << " requeueing dups" << dendl;
8945 requeue_ops(fop->dup_ops);
8946 }
8947 if (fop->on_flush) {
8948 (*(fop->on_flush))();
8949 fop->on_flush = boost::none;
8950 }
8951 flush_ops.erase(oid);
8952 return;
8953 }
8954
8955 r = try_flush_mark_clean(fop);
8956 if (r == -EBUSY && fop->op) {
8957 osd->reply_op_error(fop->op, r);
8958 }
8959 }
8960
8961 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8962 {
8963 ObjectContextRef obc = fop->obc;
8964 const hobject_t& oid = obc->obs.oi.soid;
8965
8966 if (fop->blocking) {
8967 obc->stop_block();
8968 kick_object_context_blocked(obc);
8969 }
8970
8971 if (fop->flushed_version != obc->obs.oi.user_version ||
8972 !obc->obs.exists) {
8973 if (obc->obs.exists)
8974 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8975 << " != current " << obc->obs.oi.user_version
8976 << dendl;
8977 else
8978 dout(10) << __func__ << " object no longer exists" << dendl;
8979
8980 if (!fop->dup_ops.empty()) {
8981 dout(20) << __func__ << " requeueing dups" << dendl;
8982 requeue_ops(fop->dup_ops);
8983 }
8984 if (fop->on_flush) {
8985 (*(fop->on_flush))();
8986 fop->on_flush = boost::none;
8987 }
8988 flush_ops.erase(oid);
8989 if (fop->blocking)
8990 osd->logger->inc(l_osd_tier_flush_fail);
8991 else
8992 osd->logger->inc(l_osd_tier_try_flush_fail);
8993 return -EBUSY;
8994 }
8995
8996 if (!fop->blocking &&
8997 scrubber.write_blocked_by_scrub(oid)) {
8998 if (fop->op) {
8999 dout(10) << __func__ << " blocked by scrub" << dendl;
9000 requeue_op(fop->op);
9001 requeue_ops(fop->dup_ops);
9002 return -EAGAIN; // will retry
9003 } else {
9004 osd->logger->inc(l_osd_tier_try_flush_fail);
9005 cancel_flush(fop, false);
9006 return -ECANCELED;
9007 }
9008 }
9009
9010 // successfully flushed, can we evict this object?
9011 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9012 agent_maybe_evict(obc, true)) {
9013 osd->logger->inc(l_osd_tier_clean);
9014 if (fop->on_flush) {
9015 (*(fop->on_flush))();
9016 fop->on_flush = boost::none;
9017 }
9018 flush_ops.erase(oid);
9019 return 0;
9020 }
9021
9022 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9023 OpContextUPtr ctx = simple_opc_create(fop->obc);
9024
9025 // successfully flushed; can we clear the dirty bit?
 9026   // take the write lock manually: this ctx was created via
 9027   // simple_opc_create(), not through the normal op pipeline.
9028 if (ctx->lock_manager.get_lock_type(
9029 ObjectContext::RWState::RWWRITE,
9030 oid,
9031 obc,
9032 fop->op)) {
9033 dout(20) << __func__ << " took write lock" << dendl;
9034 } else if (fop->op) {
9035 dout(10) << __func__ << " waiting on write lock" << dendl;
9036 close_op_ctx(ctx.release());
9037 requeue_op(fop->op);
9038 requeue_ops(fop->dup_ops);
9039 return -EAGAIN; // will retry
9040 } else {
9041 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9042 close_op_ctx(ctx.release());
9043 osd->logger->inc(l_osd_tier_try_flush_fail);
9044 cancel_flush(fop, false);
9045 return -ECANCELED;
9046 }
9047
9048 if (fop->on_flush) {
9049 ctx->register_on_finish(*(fop->on_flush));
9050 fop->on_flush = boost::none;
9051 }
9052
9053 ctx->at_version = get_next_version();
9054
9055 ctx->new_obs = obc->obs;
9056 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9057 --ctx->delta_stats.num_objects_dirty;
9058
9059 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9060
9061 osd->logger->inc(l_osd_tier_clean);
9062
9063 if (!fop->dup_ops.empty() || fop->op) {
9064 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9065 list<OpRequestRef> ls;
9066 if (fop->op)
9067 ls.push_back(fop->op);
9068 ls.splice(ls.end(), fop->dup_ops);
9069 requeue_ops(ls);
9070 }
9071
9072 simple_opc_submit(std::move(ctx));
9073
9074 flush_ops.erase(oid);
9075
9076 if (fop->blocking)
9077 osd->logger->inc(l_osd_tier_flush);
9078 else
9079 osd->logger->inc(l_osd_tier_try_flush);
9080
9081 return -EINPROGRESS;
9082 }
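// Return values of try_flush_mark_clean(), per the code above:
//   -EBUSY       the object changed (or vanished) since the flush started
//   -EAGAIN      blocked by scrub or the write lock; the op was requeued
//   -ECANCELED   nothing to requeue, so the flush attempt was cancelled
//   0            flushed and then immediately evicted by the tier agent
//   -EINPROGRESS a CLEAN log entry was submitted; completion is async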
9083
9084 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9085 {
9086 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9087 << fop->objecter_tid << dendl;
9088 if (fop->objecter_tid) {
9089 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9090 fop->objecter_tid = 0;
9091 }
9092 if (fop->blocking) {
9093 fop->obc->stop_block();
9094 kick_object_context_blocked(fop->obc);
9095 }
9096 if (requeue) {
9097 if (fop->op)
9098 requeue_op(fop->op);
9099 requeue_ops(fop->dup_ops);
9100 }
9101 if (fop->on_flush) {
9102 (*(fop->on_flush))();
9103 fop->on_flush = boost::none;
9104 }
9105 flush_ops.erase(fop->obc->obs.oi.soid);
9106 }
9107
9108 void PrimaryLogPG::cancel_flush_ops(bool requeue)
9109 {
9110 dout(10) << __func__ << dendl;
9111 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9112 while (p != flush_ops.end()) {
9113 cancel_flush((p++)->second, requeue);
9114 }
9115 }
9116
9117 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9118 {
9119 if (!pool.info.allow_incomplete_clones())
9120 return true;
9121 if (is_missing_object(coid))
9122 return true;
9123 ObjectContextRef obc = get_object_context(coid, false);
9124 return obc && obc->obs.exists;
9125 }
9126
9127 // ========================================================================
9128 // rep op gather
9129
9130 class C_OSD_RepopApplied : public Context {
9131 PrimaryLogPGRef pg;
9132 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9133 public:
9134 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9135 : pg(pg), repop(repop) {}
9136 void finish(int) override {
9137 pg->repop_all_applied(repop.get());
9138 }
9139 };
9140
9141
9142 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9143 {
9144 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9145 << dendl;
9146 assert(!repop->applies_with_commit);
9147 repop->all_applied = true;
9148 if (!repop->rep_aborted) {
9149 eval_repop(repop);
9150 }
9151 }
9152
9153 class C_OSD_RepopCommit : public Context {
9154 PrimaryLogPGRef pg;
9155 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9156 public:
9157 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9158 : pg(pg), repop(repop) {}
9159 void finish(int) override {
9160 pg->repop_all_committed(repop.get());
9161 }
9162 };
9163
9164 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9165 {
9166 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9167 << dendl;
9168 repop->all_committed = true;
9169 if (repop->applies_with_commit) {
9170 assert(!repop->all_applied);
9171 repop->all_applied = true;
9172 }
9173
9174 if (!repop->rep_aborted) {
9175 if (repop->v != eversion_t()) {
9176 last_update_ondisk = repop->v;
9177 last_complete_ondisk = repop->pg_local_last_complete;
9178 }
9179 eval_repop(repop);
9180 }
9181 }
9182
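// op_applied() advances last_update_applied and, once it reaches the
// scrubber's subset_last_update (primary) or scrub_to (replica), rekicks
// the waiting scrub so it only reads data that has been applied.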
9183 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9184 {
9185 dout(10) << "op_applied version " << applied_version << dendl;
9186 if (applied_version == eversion_t())
9187 return;
9188 assert(applied_version > last_update_applied);
9189 assert(applied_version <= info.last_update);
9190 last_update_applied = applied_version;
9191 if (is_primary()) {
9192 if (scrubber.active) {
9193 if (last_update_applied >= scrubber.subset_last_update) {
9194 if (ops_blocked_by_scrub()) {
9195 requeue_scrub(true);
9196 } else {
9197 requeue_scrub(false);
9198 }
9199
9200 }
9201 } else {
9202 assert(scrubber.start == scrubber.end);
9203 }
9204 } else {
9205 if (scrubber.active_rep_scrub) {
9206 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9207 scrubber.active_rep_scrub->get_req())->scrub_to) {
9208 osd->enqueue_back(
9209 info.pgid,
9210 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9211 scrubber.active_rep_scrub = OpRequestRef();
9212 }
9213 }
9214 }
9215 }
9216
9217 void PrimaryLogPG::eval_repop(RepGather *repop)
9218 {
9219 const MOSDOp *m = NULL;
9220 if (repop->op)
9221 m = static_cast<const MOSDOp *>(repop->op->get_req());
9222
9223 if (m)
9224 dout(10) << "eval_repop " << *repop
9225 << (repop->rep_done ? " DONE" : "")
9226 << dendl;
9227 else
9228 dout(10) << "eval_repop " << *repop << " (no op)"
9229 << (repop->rep_done ? " DONE" : "")
9230 << dendl;
9231
9232 if (repop->rep_done)
9233 return;
9234
9235 // ondisk?
9236 if (repop->all_committed) {
9237 dout(10) << " commit: " << *repop << dendl;
9238 for (auto p = repop->on_committed.begin();
9239 p != repop->on_committed.end();
9240 repop->on_committed.erase(p++)) {
9241 (*p)();
9242 }
9243 // send dup commits, in order
9244 if (waiting_for_ondisk.count(repop->v)) {
9245 assert(waiting_for_ondisk.begin()->first == repop->v);
9246 for (list<pair<OpRequestRef, version_t> >::iterator i =
9247 waiting_for_ondisk[repop->v].begin();
9248 i != waiting_for_ondisk[repop->v].end();
9249 ++i) {
9250 osd->reply_op_error(i->first, repop->r, repop->v,
9251 i->second);
9252 }
9253 waiting_for_ondisk.erase(repop->v);
9254 }
9255 }
9256
9257 // applied?
9258 if (repop->all_applied) {
9259 if (repop->applies_with_commit) {
9260 assert(repop->on_applied.empty());
9261 }
9262 dout(10) << " applied: " << *repop << " " << dendl;
9263 for (auto p = repop->on_applied.begin();
9264 p != repop->on_applied.end();
9265 repop->on_applied.erase(p++)) {
9266 (*p)();
9267 }
9268 }
9269
9270 // done.
9271 if (repop->all_applied && repop->all_committed) {
9272 repop->rep_done = true;
9273
9274 publish_stats_to_osd();
9275 calc_min_last_complete_ondisk();
9276
9277 dout(10) << " removing " << *repop << dendl;
9278 assert(!repop_queue.empty());
9279 dout(20) << " q front is " << *repop_queue.front() << dendl;
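    // Repops retire strictly in submission order.  Finishing out of order
    // is tolerated only for applies_with_commit repops (they stay queued
    // until the front retires them); anything else logs both entries and
    // asserts.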
9280 if (repop_queue.front() != repop) {
9281 if (!repop->applies_with_commit) {
9282 dout(0) << " removing " << *repop << dendl;
9283 dout(0) << " q front is " << *repop_queue.front() << dendl;
9284 assert(repop_queue.front() == repop);
9285 }
9286 } else {
9287 RepGather *to_remove = nullptr;
9288 while (!repop_queue.empty() &&
9289 (to_remove = repop_queue.front())->rep_done) {
9290 repop_queue.pop_front();
9291 for (auto p = to_remove->on_success.begin();
9292 p != to_remove->on_success.end();
9293 to_remove->on_success.erase(p++)) {
9294 (*p)();
9295 }
9296 remove_repop(to_remove);
9297 }
9298 }
9299 }
9300 }
9301
9302 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9303 {
9304 FUNCTRACE();
9305 const hobject_t& soid = ctx->obs->oi.soid;
9306 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9307 << " o " << soid
9308 << dendl;
9309
9310 repop->v = ctx->at_version;
9311 if (ctx->at_version > eversion_t()) {
9312 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9313 i != actingbackfill.end();
9314 ++i) {
9315 if (*i == get_primary()) continue;
9316 pg_info_t &pinfo = peer_info[*i];
9317 // keep peer_info up to date
9318 if (pinfo.last_complete == pinfo.last_update)
9319 pinfo.last_complete = ctx->at_version;
9320 pinfo.last_update = ctx->at_version;
9321 }
9322 }
9323
9324 ctx->obc->ondisk_write_lock();
9325
9326 bool unlock_snapset_obc = false;
9327 ctx->op_t->add_obc(ctx->obc);
9328 if (ctx->clone_obc) {
9329 ctx->clone_obc->ondisk_write_lock();
9330 ctx->op_t->add_obc(ctx->clone_obc);
9331 }
9332 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9333 ctx->obc->obs.oi.soid) {
9334 ctx->snapset_obc->ondisk_write_lock();
9335 unlock_snapset_obc = true;
9336 ctx->op_t->add_obc(ctx->snapset_obc);
9337 }
9338
9339 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9340 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9341 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9342 ctx->obc,
9343 ctx->clone_obc,
9344 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9345 if (!(ctx->log.empty())) {
9346 assert(ctx->at_version >= projected_last_update);
9347 projected_last_update = ctx->at_version;
9348 }
9349 for (auto &&entry: ctx->log) {
9350 projected_log.add(entry);
9351 }
9352 pgbackend->submit_transaction(
9353 soid,
9354 ctx->delta_stats,
9355 ctx->at_version,
9356 std::move(ctx->op_t),
9357 pg_trim_to,
9358 min_last_complete_ondisk,
9359 ctx->log,
9360 ctx->updated_hset_history,
9361 onapplied_sync,
9362 on_all_applied,
9363 on_all_commit,
9364 repop->rep_tid,
9365 ctx->reqid,
9366 ctx->op);
9367 }
9368
9369 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9370 OpContext *ctx, ObjectContextRef obc,
9371 ceph_tid_t rep_tid)
9372 {
9373 if (ctx->op)
9374 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9375 else
9376 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9377
9378 RepGather *repop = new RepGather(
9379 ctx, rep_tid, info.last_complete, false);
9380
9381 repop->start = ceph_clock_now();
9382
9383 repop_queue.push_back(&repop->queue_item);
9384 repop->get();
9385
9386 osd->logger->inc(l_osd_op_wip);
9387
9388 dout(10) << __func__ << ": " << *repop << dendl;
9389 return repop;
9390 }
9391
9392 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9393 eversion_t version,
9394 int r,
9395 ObcLockManager &&manager,
9396 OpRequestRef &&op,
9397 boost::optional<std::function<void(void)> > &&on_complete)
9398 {
9399 RepGather *repop = new RepGather(
9400 std::move(manager),
9401 std::move(op),
9402 std::move(on_complete),
9403 osd->get_tid(),
9404 info.last_complete,
9405 true,
9406 r);
9407 repop->v = version;
9408
9409 repop->start = ceph_clock_now();
9410
9411 repop_queue.push_back(&repop->queue_item);
9412
9413 osd->logger->inc(l_osd_op_wip);
9414
9415 dout(10) << __func__ << ": " << *repop << dendl;
9416 return boost::intrusive_ptr<RepGather>(repop);
9417 }
9418
9419 void PrimaryLogPG::remove_repop(RepGather *repop)
9420 {
9421 dout(20) << __func__ << " " << *repop << dendl;
9422
9423 for (auto p = repop->on_finish.begin();
9424 p != repop->on_finish.end();
9425 repop->on_finish.erase(p++)) {
9426 (*p)();
9427 }
9428
9429 release_object_locks(
9430 repop->lock_manager);
9431 repop->put();
9432
9433 osd->logger->dec(l_osd_op_wip);
9434 }
9435
9436 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9437 {
9438 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9439 ceph_tid_t rep_tid = osd->get_tid();
9440 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9441 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9442 ctx->op_t.reset(new PGTransaction());
9443 ctx->mtime = ceph_clock_now();
9444 return ctx;
9445 }
9446
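// Typical internal-update pattern (a sketch; see try_flush_mark_clean()
// and handle_watch_timeout() for real uses):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   // ... mutate ctx->new_obs, ctx->op_t, ctx->log ...
//   simple_opc_submit(std::move(ctx));  // wraps it in a RepGather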
9447 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9448 {
9449 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9450 dout(20) << __func__ << " " << repop << dendl;
9451 issue_repop(repop, ctx.get());
9452 eval_repop(repop);
9453 calc_trim_to();
9454 repop->put();
9455 }
9456
9457
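// submit_log_entries() speaks two wire formats: on clusters requiring
// >= jewel it sends MOSDPGUpdateLogMissing to each peer and gathers acks
// (plus the local commit) through log_entry_update_waiting_on; on older
// clusters it falls back to appending the entries to an MOSDPGLog with no
// ack tracking.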
9458 void PrimaryLogPG::submit_log_entries(
9459 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9460 ObcLockManager &&manager,
9461 boost::optional<std::function<void(void)> > &&_on_complete,
9462 OpRequestRef op,
9463 int r)
9464 {
9465 dout(10) << __func__ << " " << entries << dendl;
9466 assert(is_primary());
9467
9468 eversion_t version;
9469 if (!entries.empty()) {
9470 assert(entries.rbegin()->version >= projected_last_update);
9471 version = projected_last_update = entries.rbegin()->version;
9472 }
9473
9474 boost::intrusive_ptr<RepGather> repop;
9475 boost::optional<std::function<void(void)> > on_complete;
9476 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9477 repop = new_repop(
9478 version,
9479 r,
9480 std::move(manager),
9481 std::move(op),
9482 std::move(_on_complete));
9483 } else {
9484 on_complete = std::move(_on_complete);
9485 }
9486
9487 pgbackend->call_write_ordered(
9488 [this, entries, repop, on_complete]() {
9489 ObjectStore::Transaction t;
9490 eversion_t old_last_update = info.last_update;
9491 merge_new_log_entries(entries, t);
9492
9493
9494 set<pg_shard_t> waiting_on;
9495 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9496 i != actingbackfill.end();
9497 ++i) {
9498 pg_shard_t peer(*i);
9499 if (peer == pg_whoami) continue;
9500 assert(peer_missing.count(peer));
9501 assert(peer_info.count(peer));
9502 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9503 assert(repop);
9504 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9505 entries,
9506 spg_t(info.pgid.pgid, i->shard),
9507 pg_whoami.shard,
9508 get_osdmap()->get_epoch(),
9509 last_peering_reset,
9510 repop->rep_tid);
9511 osd->send_message_osd_cluster(
9512 peer.osd, m, get_osdmap()->get_epoch());
9513 waiting_on.insert(peer);
9514 } else {
9515 MOSDPGLog *m = new MOSDPGLog(
9516 peer.shard, pg_whoami.shard,
9517 info.last_update.epoch,
9518 info);
9519 m->log.log = entries;
9520 m->log.tail = old_last_update;
9521 m->log.head = info.last_update;
9522 osd->send_message_osd_cluster(
9523 peer.osd, m, get_osdmap()->get_epoch());
9524 }
9525 }
9526 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9527 ceph_tid_t rep_tid = repop->rep_tid;
9528 waiting_on.insert(pg_whoami);
9529 log_entry_update_waiting_on.insert(
9530 make_pair(
9531 rep_tid,
9532 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9533 ));
9534 struct OnComplete : public Context {
9535 PrimaryLogPGRef pg;
9536 ceph_tid_t rep_tid;
9537 epoch_t epoch;
9538 OnComplete(
9539 PrimaryLogPGRef pg,
9540 ceph_tid_t rep_tid,
9541 epoch_t epoch)
9542 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9543 void finish(int) override {
9544 pg->lock();
9545 if (!pg->pg_has_reset_since(epoch)) {
9546 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9547 assert(it != pg->log_entry_update_waiting_on.end());
9548 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9549 assert(it2 != it->second.waiting_on.end());
9550 it->second.waiting_on.erase(it2);
9551 if (it->second.waiting_on.empty()) {
9552 pg->repop_all_committed(it->second.repop.get());
9553 pg->log_entry_update_waiting_on.erase(it);
9554 }
9555 }
9556 pg->unlock();
9557 }
9558 };
9559 t.register_on_commit(
9560 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9561 } else {
9562 if (on_complete) {
9563 struct OnComplete : public Context {
9564 PrimaryLogPGRef pg;
9565 std::function<void(void)> on_complete;
9566 epoch_t epoch;
9567 OnComplete(
9568 PrimaryLogPGRef pg,
 9569                 std::function<void(void)> on_complete, // by value so the move below is real
9570 epoch_t epoch)
9571 : pg(pg),
9572 on_complete(std::move(on_complete)),
9573 epoch(epoch) {}
9574 void finish(int) override {
9575 pg->lock();
9576 if (!pg->pg_has_reset_since(epoch))
9577 on_complete();
9578 pg->unlock();
9579 }
9580 };
9581 t.register_on_complete(
9582 new OnComplete{
9583 this, *on_complete, get_osdmap()->get_epoch()
9584 });
9585 }
9586 }
9587 t.register_on_applied(
9588 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9589 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9590 assert(r == 0);
9591 });
9592 }
9593
9594 void PrimaryLogPG::cancel_log_updates()
9595 {
9596 // get rid of all the LogUpdateCtx so their references to repops are
9597 // dropped
9598 log_entry_update_waiting_on.clear();
9599 }
9600
9601 // -------------------------------------------------------
9602
9603 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9604 {
9605 pair<hobject_t, ObjectContextRef> i;
9606 while (object_contexts.get_next(i.first, &i)) {
9607 ObjectContextRef obc(i.second);
9608 get_obc_watchers(obc, pg_watchers);
9609 }
9610 }
9611
9612 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9613 {
9614 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9615 obc->watchers.begin();
9616 j != obc->watchers.end();
9617 ++j) {
9618 obj_watch_item_t owi;
9619
9620 owi.obj = obc->obs.oi.soid;
9621 owi.wi.addr = j->second->get_peer_addr();
9622 owi.wi.name = j->second->get_entity();
9623 owi.wi.cookie = j->second->get_cookie();
9624 owi.wi.timeout_seconds = j->second->get_timeout();
9625
9626 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9627 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9628
9629 pg_watchers.push_back(owi);
9630 }
9631 }
9632
9633 void PrimaryLogPG::check_blacklisted_watchers()
9634 {
9635 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9636 pair<hobject_t, ObjectContextRef> i;
9637 while (object_contexts.get_next(i.first, &i))
9638 check_blacklisted_obc_watchers(i.second);
9639 }
9640
9641 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9642 {
9643 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9644 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9645 obc->watchers.begin();
9646 k != obc->watchers.end();
9647 ) {
 9648     // Advance the iterator now so handle_watch_timeout() can erase the element
9649 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9650 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9651 entity_addr_t ea = j->second->get_peer_addr();
9652 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9653 if (get_osdmap()->is_blacklisted(ea)) {
9654 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9655 assert(j->second->get_pg() == this);
9656 j->second->unregister_cb();
9657 handle_watch_timeout(j->second);
9658 }
9659 }
9660 }
9661
9662 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9663 {
9664 assert(is_active());
9665 assert((recovering.count(obc->obs.oi.soid) ||
9666 !is_missing_object(obc->obs.oi.soid)) ||
9667 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9668 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9669 pg_log_entry_t::LOST_REVERT &&
9670 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9671 obc->obs.oi.version));
9672
9673 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9674 assert(obc->watchers.empty());
9675 // populate unconnected_watchers
9676 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9677 obc->obs.oi.watchers.begin();
9678 p != obc->obs.oi.watchers.end();
9679 ++p) {
9680 utime_t expire = info.stats.last_became_active;
9681 expire += p->second.timeout_seconds;
9682 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9683 WatchRef watch(
9684 Watch::makeWatchRef(
9685 this, osd, obc, p->second.timeout_seconds, p->first.first,
9686 p->first.second, p->second.addr));
9687 watch->disconnect();
9688 obc->watchers.insert(
9689 make_pair(
9690 make_pair(p->first.first, p->first.second),
9691 watch));
9692 }
 9693   // Look for watchers from blacklisted clients and drop them
9694 check_blacklisted_obc_watchers(obc);
9695 }
9696
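// Watch rehydration, roughly: the watch_info_t entries persisted in
// object_info_t are re-created above as disconnected Watch objects whose
// expiry clock starts at last_became_active; an expiry lands in
// handle_watch_timeout() below, which removes the watcher via a synthetic
// MODIFY log entry.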
9697 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9698 {
9699 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9700 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9701
9702 if (!is_active()) {
9703 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9704 return;
9705 }
9706 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9707 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9708 watch->get_delayed_cb()
9709 );
9710 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9711 << obc->obs.oi.soid
9712 << dendl;
9713 return;
9714 }
9715
9716 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9717 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9718 << obc->obs.oi.soid
9719 << dendl;
9720 scrubber.add_callback(
 9721       watch->get_delayed_cb() // retry handle_watch_timeout() once the scrub unblocks
9722 );
9723 return;
9724 }
9725
9726 OpContextUPtr ctx = simple_opc_create(obc);
9727 ctx->at_version = get_next_version();
9728
9729 object_info_t& oi = ctx->new_obs.oi;
9730 oi.watchers.erase(make_pair(watch->get_cookie(),
9731 watch->get_entity()));
9732
9733 list<watch_disconnect_t> watch_disconnects = {
9734 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9735 };
9736 ctx->register_on_success(
9737 [this, obc, watch_disconnects]() {
9738 complete_disconnect_watches(obc, watch_disconnects);
9739 });
9740
9741
9742 PGTransaction *t = ctx->op_t.get();
9743 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9744 ctx->at_version,
9745 oi.version,
9746 0,
9747 osd_reqid_t(), ctx->mtime, 0));
9748
9749 oi.prior_version = obc->obs.oi.version;
9750 oi.version = ctx->at_version;
9751 bufferlist bl;
9752 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9753 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9754
9755 // apply new object state.
9756 ctx->obc->obs = ctx->new_obs;
9757
9758 // no ctx->delta_stats
9759 simple_opc_submit(std::move(ctx));
9760 }
9761
9762 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9763 SnapSetContext *ssc)
9764 {
9765 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9766 assert(obc->destructor_callback == NULL);
9767 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9768 obc->obs.oi = oi;
9769 obc->obs.exists = false;
9770 obc->ssc = ssc;
9771 if (ssc)
9772 register_snapset_context(ssc);
9773 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9774 if (is_active())
9775 populate_obc_watchers(obc);
9776 return obc;
9777 }
9778
9779 ObjectContextRef PrimaryLogPG::get_object_context(
9780 const hobject_t& soid,
9781 bool can_create,
9782 const map<string, bufferlist> *attrs)
9783 {
9784 assert(
9785 attrs || !pg_log.get_missing().is_missing(soid) ||
9786 // or this is a revert... see recover_primary()
9787 (pg_log.get_log().objects.count(soid) &&
9788 pg_log.get_log().objects.find(soid)->second->op ==
9789 pg_log_entry_t::LOST_REVERT));
9790 ObjectContextRef obc = object_contexts.lookup(soid);
9791 osd->logger->inc(l_osd_object_ctx_cache_total);
9792 if (obc) {
9793 osd->logger->inc(l_osd_object_ctx_cache_hit);
9794 dout(10) << __func__ << ": found obc in cache: " << obc
9795 << dendl;
9796 } else {
9797 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9798 // check disk
9799 bufferlist bv;
9800 if (attrs) {
9801 assert(attrs->count(OI_ATTR));
9802 bv = attrs->find(OI_ATTR)->second;
9803 } else {
9804 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9805 if (r < 0) {
9806 if (!can_create) {
9807 dout(10) << __func__ << ": no obc for soid "
9808 << soid << " and !can_create"
9809 << dendl;
9810 return ObjectContextRef(); // -ENOENT!
9811 }
9812
9813 dout(10) << __func__ << ": no obc for soid "
9814 << soid << " but can_create"
9815 << dendl;
9816 // new object.
9817 object_info_t oi(soid);
9818 SnapSetContext *ssc = get_snapset_context(
9819 soid, true, 0, false);
9820 assert(ssc);
9821 obc = create_object_context(oi, ssc);
9822 dout(10) << __func__ << ": " << obc << " " << soid
9823 << " " << obc->rwstate
9824 << " oi: " << obc->obs.oi
9825 << " ssc: " << obc->ssc
9826 << " snapset: " << obc->ssc->snapset << dendl;
9827 return obc;
9828 }
9829 }
9830
9831 object_info_t oi;
9832 try {
9833 bufferlist::iterator bliter = bv.begin();
9834 ::decode(oi, bliter);
9835 } catch (...) {
9836 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9837 return ObjectContextRef(); // -ENOENT!
9838 }
9839
9840 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9841
9842 obc = object_contexts.lookup_or_create(oi.soid);
9843 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9844 obc->obs.oi = oi;
9845 obc->obs.exists = true;
9846
9847 obc->ssc = get_snapset_context(
9848 soid, true,
9849 soid.has_snapset() ? attrs : 0);
9850
9851 if (is_active())
9852 populate_obc_watchers(obc);
9853
9854 if (pool.info.require_rollback()) {
9855 if (attrs) {
9856 obc->attr_cache = *attrs;
9857 } else {
9858 int r = pgbackend->objects_get_attrs(
9859 soid,
9860 &obc->attr_cache);
9861 assert(r == 0);
9862 }
9863 }
9864
9865 dout(10) << __func__ << ": creating obc from disk: " << obc
9866 << dendl;
9867 }
9868
 9869   // XXX: callers don't expect an obc without an ssc, so fail the lookup instead
9870 if (obc->ssc == NULL) {
9871 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9872 return ObjectContextRef(); // -ENOENT!
9873 }
9874
9875 dout(10) << __func__ << ": " << obc << " " << soid
9876 << " " << obc->rwstate
9877 << " oi: " << obc->obs.oi
9878 << " exists: " << (int)obc->obs.exists
9879 << " ssc: " << obc->ssc
9880 << " snapset: " << obc->ssc->snapset << dendl;
9881 return obc;
9882 }
9883
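// Object-context cache behavior, per the code above: object_contexts is
// an in-memory cache keyed by hobject_t; a miss decodes OI_ATTR from the
// backend (or from caller-supplied attrs), attaches a SnapSetContext, and
// for rollback-capable pools also primes attr_cache.  A null return means
// -ENOENT to the caller.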
9884 void PrimaryLogPG::context_registry_on_change()
9885 {
9886 pair<hobject_t, ObjectContextRef> i;
9887 while (object_contexts.get_next(i.first, &i)) {
9888 ObjectContextRef obc(i.second);
9889 if (obc) {
9890 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9891 obc->watchers.begin();
9892 j != obc->watchers.end();
9893 obc->watchers.erase(j++)) {
9894 j->second->discard();
9895 }
9896 }
9897 }
9898 }
9899
9900
9901 /*
 9902  * If we return an error and set *pmissing, then promoting that
9903 * object may help.
9904 *
9905 * If we return -EAGAIN, we will always set *pmissing to the missing
9906 * object to wait for.
9907 *
9908 * If we return an error but do not set *pmissing, then we know the
9909 * object does not exist.
9910 */
9911 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9912 ObjectContextRef *pobc,
9913 bool can_create,
9914 bool map_snapid_to_clone,
9915 hobject_t *pmissing)
9916 {
9917 FUNCTRACE();
9918 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9919 // want the head?
9920 if (oid.snap == CEPH_NOSNAP) {
9921 ObjectContextRef obc = get_object_context(oid, can_create);
9922 if (!obc) {
9923 if (pmissing)
9924 *pmissing = oid;
9925 return -ENOENT;
9926 }
9927 dout(10) << "find_object_context " << oid
9928 << " @" << oid.snap
9929 << " oi=" << obc->obs.oi
9930 << dendl;
9931 *pobc = obc;
9932
9933 return 0;
9934 }
9935
9936 hobject_t head = oid.get_head();
9937
9938 // want the snapdir?
9939 if (oid.snap == CEPH_SNAPDIR) {
9940 // return head or snapdir, whichever exists.
9941 ObjectContextRef headobc = get_object_context(head, can_create);
9942 ObjectContextRef obc = headobc;
9943 if (!obc || !obc->obs.exists)
9944 obc = get_object_context(oid, can_create);
9945 if (!obc || !obc->obs.exists) {
9946 // if we have neither, we would want to promote the head.
9947 if (pmissing)
9948 *pmissing = head;
9949 if (pobc)
9950 *pobc = headobc; // may be null
9951 return -ENOENT;
9952 }
9953 dout(10) << "find_object_context " << oid
9954 << " @" << oid.snap
9955 << " oi=" << obc->obs.oi
9956 << dendl;
9957 *pobc = obc;
9958
9959 // always populate ssc for SNAPDIR...
9960 if (!obc->ssc)
9961 obc->ssc = get_snapset_context(
9962 oid, true);
9963 return 0;
9964 }
9965
9966 // we want a snap
9967 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9968 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9969 return -ENOENT;
9970 }
9971
9972 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9973 if (!ssc || !(ssc->exists || can_create)) {
9974 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9975 if (pmissing)
9976 *pmissing = head; // start by getting the head
9977 if (ssc)
9978 put_snapset_context(ssc);
9979 return -ENOENT;
9980 }
9981
9982 if (map_snapid_to_clone) {
9983 dout(10) << "find_object_context " << oid << " @" << oid.snap
9984 << " snapset " << ssc->snapset
9985 << " map_snapid_to_clone=true" << dendl;
9986 if (oid.snap > ssc->snapset.seq) {
 9987       // must already be readable
9988 ObjectContextRef obc = get_object_context(head, false);
9989 dout(10) << "find_object_context " << oid << " @" << oid.snap
9990 << " snapset " << ssc->snapset
9991 << " maps to head" << dendl;
9992 *pobc = obc;
9993 put_snapset_context(ssc);
9994 return (obc && obc->obs.exists) ? 0 : -ENOENT;
9995 } else {
9996 vector<snapid_t>::const_iterator citer = std::find(
9997 ssc->snapset.clones.begin(),
9998 ssc->snapset.clones.end(),
9999 oid.snap);
10000 if (citer == ssc->snapset.clones.end()) {
10001 dout(10) << "find_object_context " << oid << " @" << oid.snap
10002 << " snapset " << ssc->snapset
10003 << " maps to nothing" << dendl;
10004 put_snapset_context(ssc);
10005 return -ENOENT;
10006 }
10007
10008 dout(10) << "find_object_context " << oid << " @" << oid.snap
10009 << " snapset " << ssc->snapset
10010 << " maps to " << oid << dendl;
10011
10012 if (pg_log.get_missing().is_missing(oid)) {
10013 dout(10) << "find_object_context " << oid << " @" << oid.snap
10014 << " snapset " << ssc->snapset
10015 << " " << oid << " is missing" << dendl;
10016 if (pmissing)
10017 *pmissing = oid;
10018 put_snapset_context(ssc);
10019 return -EAGAIN;
10020 }
10021
10022 ObjectContextRef obc = get_object_context(oid, false);
10023 if (!obc || !obc->obs.exists) {
10024 dout(10) << "find_object_context " << oid << " @" << oid.snap
10025 << " snapset " << ssc->snapset
10026 << " " << oid << " is not present" << dendl;
10027 if (pmissing)
10028 *pmissing = oid;
10029 put_snapset_context(ssc);
10030 return -ENOENT;
10031 }
10032 dout(10) << "find_object_context " << oid << " @" << oid.snap
10033 << " snapset " << ssc->snapset
10034 << " " << oid << " HIT" << dendl;
10035 *pobc = obc;
10036 put_snapset_context(ssc);
10037 return 0;
10038 }
10039 ceph_abort(); //unreachable
10040 }
10041
10042 dout(10) << "find_object_context " << oid << " @" << oid.snap
10043 << " snapset " << ssc->snapset << dendl;
10044
10045 // head?
10046 if (oid.snap > ssc->snapset.seq) {
10047 if (ssc->snapset.head_exists) {
10048 ObjectContextRef obc = get_object_context(head, false);
10049 dout(10) << "find_object_context " << head
10050 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10051 << " -- HIT " << obc->obs
10052 << dendl;
10053 if (!obc->ssc)
10054 obc->ssc = ssc;
10055 else {
10056 assert(ssc == obc->ssc);
10057 put_snapset_context(ssc);
10058 }
10059 *pobc = obc;
10060 return 0;
10061 }
10062 dout(10) << "find_object_context " << head
10063 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10064 << " but head dne -- DNE"
10065 << dendl;
10066 put_snapset_context(ssc);
10067 return -ENOENT;
10068 }
10069
10070 // which clone would it be?
10071 unsigned k = 0;
10072 while (k < ssc->snapset.clones.size() &&
10073 ssc->snapset.clones[k] < oid.snap)
10074 k++;
10075 if (k == ssc->snapset.clones.size()) {
10076 dout(10) << "find_object_context no clones with last >= oid.snap "
10077 << oid.snap << " -- DNE" << dendl;
10078 put_snapset_context(ssc);
10079 return -ENOENT;
10080 }
10081 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10082 info.pgid.pool(), oid.get_namespace());
10083
10084 if (pg_log.get_missing().is_missing(soid)) {
10085 dout(20) << "find_object_context " << soid << " missing, try again later"
10086 << dendl;
10087 if (pmissing)
10088 *pmissing = soid;
10089 put_snapset_context(ssc);
10090 return -EAGAIN;
10091 }
10092
10093 ObjectContextRef obc = get_object_context(soid, false);
10094 if (!obc || !obc->obs.exists) {
10095 if (pmissing)
10096 *pmissing = soid;
10097 put_snapset_context(ssc);
10098 if (is_degraded_or_backfilling_object(soid)) {
10099 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10100 return -EAGAIN;
10101 } else {
10102 dout(20) << __func__ << " missing clone " << soid << dendl;
10103 return -ENOENT;
10104 }
10105 }
10106
10107 if (!obc->ssc) {
10108 obc->ssc = ssc;
10109 } else {
10110 assert(obc->ssc == ssc);
10111 put_snapset_context(ssc);
10112 }
10113 ssc = 0;
10114
10115 // clone
10116 dout(20) << "find_object_context " << soid
10117 << " snapset " << obc->ssc->snapset
10118 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10119 << dendl;
10120 snapid_t first, last;
10121 if (obc->ssc->snapset.is_legacy()) {
10122 first = obc->obs.oi.legacy_snaps.back();
10123 last = obc->obs.oi.legacy_snaps.front();
10124 } else {
10125 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10126 assert(p != obc->ssc->snapset.clone_snaps.end());
10127 first = p->second.back();
10128 last = p->second.front();
10129 }
10130 if (first <= oid.snap) {
10131 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10132 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10133 *pobc = obc;
10134 return 0;
10135 } else {
10136 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10137 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10138 return -ENOENT;
10139 }
10140 }
10141
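// Worked example of the clone mapping above (illustrative values): with
// snapset.seq = 5 and clones = [2, 5], a read @4 selects the first clone
// >= 4, i.e. clone 5; it is a HIT only if that clone's snap interval
// [first, last] covers 4, otherwise the object did not exist at snap 4
// and the result is -ENOENT.  A read @6 (> seq) is served from the head.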
10142 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10143 {
10144 if (obc->ssc)
10145 put_snapset_context(obc->ssc);
10146 }
10147
10148 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10149 {
10150 object_info_t& oi = obc->obs.oi;
10151
10152 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10153 object_stat_sum_t stat;
10154
10155 stat.num_bytes += oi.size;
10156
10157 if (oi.soid.snap != CEPH_SNAPDIR)
10158 stat.num_objects++;
10159 if (oi.is_dirty())
10160 stat.num_objects_dirty++;
10161 if (oi.is_whiteout())
10162 stat.num_whiteouts++;
10163 if (oi.is_omap())
10164 stat.num_objects_omap++;
10165 if (oi.is_cache_pinned())
10166 stat.num_objects_pinned++;
10167
10168 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10169 stat.num_object_clones++;
10170
10171 if (!obc->ssc)
10172 obc->ssc = get_snapset_context(oi.soid, false);
10173 assert(obc->ssc);
10174
10175 // subtract off clone overlap
10176 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10177 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10178 for (interval_set<uint64_t>::const_iterator r = o.begin();
10179 r != o.end();
10180 ++r) {
10181 stat.num_bytes -= r.get_len();
10182 }
10183 }
10184 }
10185
10186 // add it in
10187 pgstat->stats.sum.add(stat);
10188 }
10189
10190 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10191 {
10192 const hobject_t& soid = obc->obs.oi.soid;
10193 if (obc->is_blocked()) {
10194 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10195 return;
10196 }
10197
10198 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10199 if (p != waiting_for_blocked_object.end()) {
10200 list<OpRequestRef>& ls = p->second;
10201 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10202 requeue_ops(ls);
10203 waiting_for_blocked_object.erase(p);
10204 }
10205
10206 map<hobject_t, ObjectContextRef>::iterator i =
10207 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10208 if (i != objects_blocked_on_snap_promotion.end()) {
10209 assert(i->second == obc);
10210 objects_blocked_on_snap_promotion.erase(i);
10211 }
10212
10213 if (obc->requeue_scrub_on_unblock) {
10214 obc->requeue_scrub_on_unblock = false;
10215 requeue_scrub();
10216 }
10217 }
10218
10219 SnapSetContext *PrimaryLogPG::get_snapset_context(
10220 const hobject_t& oid,
10221 bool can_create,
10222 const map<string, bufferlist> *attrs,
10223 bool oid_existed)
10224 {
10225 Mutex::Locker l(snapset_contexts_lock);
10226 SnapSetContext *ssc;
10227 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10228 oid.get_snapdir());
10229 if (p != snapset_contexts.end()) {
10230 if (can_create || p->second->exists) {
10231 ssc = p->second;
10232 } else {
10233 return NULL;
10234 }
10235 } else {
10236 bufferlist bv;
10237 if (!attrs) {
10238 int r = -ENOENT;
10239 if (!(oid.is_head() && !oid_existed))
10240 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10241 if (r < 0) {
10242 // try _snapset
10243 if (!(oid.is_snapdir() && !oid_existed))
10244 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10245 if (r < 0 && !can_create)
10246 return NULL;
10247 }
10248 } else {
10249 assert(attrs->count(SS_ATTR));
10250 bv = attrs->find(SS_ATTR)->second;
10251 }
10252 ssc = new SnapSetContext(oid.get_snapdir());
10253 _register_snapset_context(ssc);
10254 if (bv.length()) {
10255 bufferlist::iterator bvp = bv.begin();
10256 try {
10257 ssc->snapset.decode(bvp);
10258 } catch (buffer::error& e) {
10259 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10260 return NULL;
10261 }
10262 ssc->exists = true;
10263 } else {
10264 ssc->exists = false;
10265 }
10266 }
10267 assert(ssc);
10268 ssc->ref++;
10269 return ssc;
10270 }
10271
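// SnapSetContext refcounting contract, mirroring the call sites in this
// file: every successful get_snapset_context() must be balanced by a
// put_snapset_context(), unless ownership passes to an obc (obc->ssc), in
// which case object_context_destructor_callback() does the put.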
10272 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10273 {
10274 Mutex::Locker l(snapset_contexts_lock);
10275 --ssc->ref;
10276 if (ssc->ref == 0) {
10277 if (ssc->registered)
10278 snapset_contexts.erase(ssc->oid);
10279 delete ssc;
10280 }
10281 }
10282
10283 /** pull - request object from a peer
10284 */
10285
10286 /*
10287 * Return values:
10288 * NONE - didn't pull anything
10289 * YES - pulled what the caller wanted
10290 * OTHER - needed to pull something else first (_head or _snapdir)
10291 */
10292 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10293
10294 int PrimaryLogPG::recover_missing(
10295 const hobject_t &soid, eversion_t v,
10296 int priority,
10297 PGBackend::RecoveryHandle *h)
10298 {
10299 if (missing_loc.is_unfound(soid)) {
10300 dout(7) << "pull " << soid
10301 << " v " << v
10302 << " but it is unfound" << dendl;
10303 return PULL_NONE;
10304 }
10305
10306 if (missing_loc.is_deleted(soid)) {
10307 start_recovery_op(soid);
10308 assert(!recovering.count(soid));
10309 recovering.insert(make_pair(soid, ObjectContextRef()));
10310 epoch_t cur_epoch = get_osdmap()->get_epoch();
10311 remove_missing_object(soid, v, new FunctionContext(
10312 [=](int) {
10313 lock();
10314 if (!pg_has_reset_since(cur_epoch)) {
10315 bool object_missing = false;
10316 for (const auto& shard : actingbackfill) {
10317 if (shard == pg_whoami)
10318 continue;
10319 if (peer_missing[shard].is_missing(soid)) {
10320 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10321 object_missing = true;
10322 break;
10323 }
10324 }
10325 if (!object_missing) {
10326 object_stat_sum_t stat_diff;
10327 stat_diff.num_objects_recovered = 1;
10328 on_global_recover(soid, stat_diff, true);
10329 } else {
10330 auto recovery_handle = pgbackend->open_recovery_op();
10331 pgbackend->recover_delete_object(soid, v, recovery_handle);
10332 pgbackend->run_recovery_op(recovery_handle, priority);
10333 }
10334 }
10335 unlock();
10336 }));
10337 return PULL_YES;
10338 }
10339
 10340   // is this a snapped object?  if so, consult the snapset; we may not need the entire object!
10341 ObjectContextRef obc;
10342 ObjectContextRef head_obc;
10343 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10344 // do we have the head and/or snapdir?
10345 hobject_t head = soid.get_head();
10346 if (pg_log.get_missing().is_missing(head)) {
10347 if (recovering.count(head)) {
10348 dout(10) << " missing but already recovering head " << head << dendl;
10349 return PULL_NONE;
10350 } else {
10351 int r = recover_missing(
10352 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10353 h);
10354 if (r != PULL_NONE)
10355 return PULL_OTHER;
10356 return PULL_NONE;
10357 }
10358 }
10359 head = soid.get_snapdir();
10360 if (pg_log.get_missing().is_missing(head)) {
10361 if (recovering.count(head)) {
10362 dout(10) << " missing but already recovering snapdir " << head << dendl;
10363 return PULL_NONE;
10364 } else {
10365 int r = recover_missing(
10366 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10367 h);
10368 if (r != PULL_NONE)
10369 return PULL_OTHER;
10370 return PULL_NONE;
10371 }
10372 }
10373
10374 // we must have one or the other
10375 head_obc = get_object_context(
10376 soid.get_head(),
10377 false,
10378 0);
10379 if (!head_obc)
10380 head_obc = get_object_context(
10381 soid.get_snapdir(),
10382 false,
10383 0);
10384 assert(head_obc);
10385 }
10386 start_recovery_op(soid);
10387 assert(!recovering.count(soid));
10388 recovering.insert(make_pair(soid, obc));
10389 int r = pgbackend->recover_object(
10390 soid,
10391 v,
10392 head_obc,
10393 obc,
10394 h);
 10395   // This is only a pull, which should not return an error
10396 assert(r >= 0);
10397 return PULL_YES;
10398 }
10399
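// recover_missing() handles three shapes, per the code above: an object
// whose authoritative state is a delete (replicate the delete locally,
// then finish globally or push the delete to replicas that still have
// it); a clone whose head/snapdir must be pulled first (PULL_OTHER); and
// a plain pull through pgbackend->recover_object() (PULL_YES).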
10400 void PrimaryLogPG::send_remove_op(
10401 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10402 {
10403 ceph_tid_t tid = osd->get_tid();
10404 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10405
10406 dout(10) << "send_remove_op " << oid << " from osd." << peer
10407 << " tid " << tid << dendl;
10408
10409 MOSDSubOp *subop = new MOSDSubOp(
10410 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10411 oid, CEPH_OSD_FLAG_ACK,
10412 get_osdmap()->get_epoch(), tid, v);
10413 subop->ops = vector<OSDOp>(1);
10414 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10415
10416 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10417 }
10418
10419 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10420 eversion_t v, Context *on_complete)
10421 {
10422 dout(20) << __func__ << " " << soid << " " << v << dendl;
10423 assert(on_complete != nullptr);
10424 // delete locally
10425 ObjectStore::Transaction t;
10426 remove_snap_mapped_object(t, soid);
10427
10428 ObjectRecoveryInfo recovery_info;
10429 recovery_info.soid = soid;
10430 recovery_info.version = v;
10431
10432 epoch_t cur_epoch = get_osdmap()->get_epoch();
10433 t.register_on_complete(new FunctionContext(
10434 [=](int) {
10435 lock();
10436 if (!pg_has_reset_since(cur_epoch)) {
10437 ObjectStore::Transaction t2;
10438 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10439 t2.register_on_complete(on_complete);
10440 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10441 assert(r == 0);
10442 unlock();
10443 } else {
10444 unlock();
10445 on_complete->complete(-EAGAIN);
10446 }
10447 }));
10448 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10449 assert(r == 0);
10450 }
10451
10452 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10453 {
10454 dout(10) << "finish_degraded_object " << oid << dendl;
10455 if (callbacks_for_degraded_object.count(oid)) {
10456 list<Context*> contexts;
10457 contexts.swap(callbacks_for_degraded_object[oid]);
10458 callbacks_for_degraded_object.erase(oid);
10459 for (list<Context*>::iterator i = contexts.begin();
10460 i != contexts.end();
10461 ++i) {
10462 (*i)->complete(0);
10463 }
10464 }
10465 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10466 oid.get_head());
10467 if (i != objects_blocked_on_degraded_snap.end() &&
10468 i->second == oid.snap)
10469 objects_blocked_on_degraded_snap.erase(i);
10470 }
10471
10472 void PrimaryLogPG::_committed_pushed_object(
10473 epoch_t epoch, eversion_t last_complete)
10474 {
10475 lock();
10476 if (!pg_has_reset_since(epoch)) {
10477 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10478 last_complete_ondisk = last_complete;
10479
10480 if (last_complete_ondisk == info.last_update) {
10481 if (!is_primary()) {
 10482         // We are a replica or backfill target, and fully up to date;
 10483         // tell the primary!
10484 osd->send_message_osd_cluster(
10485 get_primary().osd,
10486 new MOSDPGTrim(
10487 get_osdmap()->get_epoch(),
10488 spg_t(info.pgid.pgid, get_primary().shard),
10489 last_complete_ondisk),
10490 get_osdmap()->get_epoch());
10491 } else {
10492 calc_min_last_complete_ondisk();
10493 }
10494 }
10495
10496 } else {
10497 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10498 }
10499
10500 unlock();
10501 }
10502
10503 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10504 {
10505 lock();
10506 dout(20) << __func__ << dendl;
10507 if (obc) {
10508 dout(20) << "obc = " << *obc << dendl;
10509 }
10510 assert(active_pushes >= 1);
10511 --active_pushes;
10512
10513 // requeue an active chunky scrub waiting on recovery ops
10514 if (!deleting && active_pushes == 0
10515 && scrubber.is_chunky_scrub_active()) {
10516 if (ops_blocked_by_scrub()) {
10517 requeue_scrub(true);
10518 } else {
10519 requeue_scrub(false);
10520 }
10521 }
10522 unlock();
10523 }
10524
10525 void PrimaryLogPG::_applied_recovered_object_replica()
10526 {
10527 lock();
10528 dout(20) << __func__ << dendl;
10529 assert(active_pushes >= 1);
10530 --active_pushes;
10531
10532 // requeue an active chunky scrub waiting on recovery ops
10533 if (!deleting && active_pushes == 0 &&
10534 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10535 scrubber.active_rep_scrub->get_req())->chunky) {
10536 osd->enqueue_back(
10537 info.pgid,
10538 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10539 scrubber.active_rep_scrub = OpRequestRef();
10540 }
10541 unlock();
10542 }
10543
10544 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10545 {
10546 dout(10) << "got missing " << oid << " v " << v << dendl;
10547 pg_log.recover_got(oid, v, info);
10548 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10549 dout(10) << "last_complete now " << info.last_complete
10550 << " log.complete_to " << pg_log.get_log().complete_to->version
10551 << dendl;
10552 } else {
10553 dout(10) << "last_complete now " << info.last_complete
10554 << " log.complete_to at end" << dendl;
 10555     // the assert below does not hold in the repair case.
10556 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10557 assert(info.last_complete == info.last_update);
10558 }
10559 }
10560
10561 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10562 {
10563 list<pg_shard_t> fl = { pg_whoami };
10564 failed_push(fl, soid);
10565 }
10566
10567 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10568 {
10569 dout(20) << __func__ << ": " << soid << dendl;
10570 assert(recovering.count(soid));
10571 auto obc = recovering[soid];
10572 if (obc) {
10573 list<OpRequestRef> blocked_ops;
10574 obc->drop_recovery_read(&blocked_ops);
10575 requeue_ops(blocked_ops);
10576 }
10577 recovering.erase(soid);
10578 for (auto&& i : from)
10579 missing_loc.remove_location(soid, i);
10580 dout(0) << __func__ << " " << soid << " from shard " << from
10581 << ", reps on " << missing_loc.get_locations(soid)
10582 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
 10583   finish_recovery_op(soid);  // close out this attempt
10584 }
10585
10586 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10587 {
10588 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10589 assert(m->get_type() == MSG_OSD_SUBOP);
10590 dout(7) << "sub_op_remove " << m->poid << dendl;
10591
10592 op->mark_started();
10593
10594 ObjectStore::Transaction t;
10595 remove_snap_mapped_object(t, m->poid);
10596 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10597 assert(r == 0);
10598 }
10599
10600 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10601 {
10602 eversion_t v;
10603 pg_missing_item pmi;
10604 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10605 assert(is_missing);
10606 v = pmi.have;
10607 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10608
10609 assert(!actingbackfill.empty());
10610 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10611 i != actingbackfill.end();
10612 ++i) {
10613 if (*i == get_primary()) continue;
10614 pg_shard_t peer = *i;
10615 if (!peer_missing[peer].is_missing(oid)) {
10616 continue;
10617 }
10618 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10619 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10620 if (h > v)
10621 v = h;
10622 }
10623
10624 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10625 return v;
10626 }
10627
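// Example (illustrative versions): if the primary's missing entry records
// have = 10'5 while peers report have = 10'7 and 10'3 for the same
// object, pick_newest_available() returns 10'7 -- the newest version any
// shard in actingbackfill still has, which LOST_REVERT can revert to.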
10628 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10629 {
10630 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10631 op->get_req());
10632 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10633 ObjectStore::Transaction t;
10634 append_log_entries_update_missing(m->entries, t);
10635
10636 Context *complete = new FunctionContext(
10637 [=](int) {
10638 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10639 op->get_req());
10640 lock();
10641 if (!pg_has_reset_since(msg->get_epoch())) {
10642 MOSDPGUpdateLogMissingReply *reply =
10643 new MOSDPGUpdateLogMissingReply(
10644 spg_t(info.pgid.pgid, primary_shard().shard),
10645 pg_whoami.shard,
10646 msg->get_epoch(),
10647 msg->min_epoch,
10648 msg->get_tid());
10649 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10650 msg->get_connection()->send_message(reply);
10651 }
10652 unlock();
10653 });
10654
10655 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10656 t.register_on_commit(complete);
10657 } else {
10658 /* Hack to work around the fact that ReplicatedBackend sends
10659 * ack+commit if commit happens first
10660 *
10661 * This behavior is no longer necessary, but we preserve it so old
10662 * primaries can keep their repops in order */
10663 if (pool.info.ec_pool()) {
10664 t.register_on_complete(complete);
10665 } else {
10666 t.register_on_commit(complete);
10667 }
10668 }
10669 t.register_on_applied(
10670 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10671 int tr = osd->store->queue_transaction(
10672 osr.get(),
10673 std::move(t),
10674 nullptr);
10675 assert(tr == 0);
10676 }
10677
10678 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10679 {
10680 const MOSDPGUpdateLogMissingReply *m =
10681 static_cast<const MOSDPGUpdateLogMissingReply*>(
10682 op->get_req());
10683 dout(20) << __func__ << " got reply from "
10684 << m->get_from() << dendl;
10685
10686 auto it = log_entry_update_waiting_on.find(m->get_tid());
10687 if (it != log_entry_update_waiting_on.end()) {
10688 if (it->second.waiting_on.count(m->get_from())) {
10689 it->second.waiting_on.erase(m->get_from());
10690 } else {
10691 osd->clog->error()
10692 << info.pgid << " got reply "
10693 << *m << " from shard we are not waiting for "
10694 << m->get_from();
10695 }
10696
10697 if (it->second.waiting_on.empty()) {
10698 repop_all_committed(it->second.repop.get());
10699 log_entry_update_waiting_on.erase(it);
10700 }
10701 } else {
10702 osd->clog->error()
10703 << info.pgid << " got reply "
10704 << *m << " on unknown tid " << m->get_tid();
10705 }
10706 }
10707
10708 /* Mark all unfound objects as lost.
10709 */
10710 void PrimaryLogPG::mark_all_unfound_lost(
10711 int what,
10712 ConnectionRef con,
10713 ceph_tid_t tid)
10714 {
10715 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10716 list<hobject_t> oids;
10717
10718 dout(30) << __func__ << ": log before:\n";
10719 pg_log.get_log().print(*_dout);
10720 *_dout << dendl;
10721
10722 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10723
10724 utime_t mtime = ceph_clock_now();
10725 map<hobject_t, pg_missing_item>::const_iterator m =
10726 missing_loc.get_needs_recovery().begin();
10727 map<hobject_t, pg_missing_item>::const_iterator mend =
10728 missing_loc.get_needs_recovery().end();
10729
10730 ObcLockManager manager;
10731 eversion_t v = get_next_version();
10732 v.epoch = get_osdmap()->get_epoch();
10733 uint64_t num_unfound = missing_loc.num_unfound();
10734 while (m != mend) {
10735 const hobject_t &oid(m->first);
10736 if (!missing_loc.is_unfound(oid)) {
10737 // We only care about unfound objects
10738 ++m;
10739 continue;
10740 }
10741
10742 ObjectContextRef obc;
10743 eversion_t prev;
10744
10745 switch (what) {
10746 case pg_log_entry_t::LOST_MARK:
10747 assert(0 == "actually, not implemented yet!");
10748 break;
10749
10750 case pg_log_entry_t::LOST_REVERT:
10751 prev = pick_newest_available(oid);
10752 if (prev > eversion_t()) {
10753 // log it
10754 pg_log_entry_t e(
10755 pg_log_entry_t::LOST_REVERT, oid, v,
10756 m->second.need, 0, osd_reqid_t(), mtime, 0);
10757 e.reverting_to = prev;
10758 e.mark_unrollbackable();
10759 log_entries.push_back(e);
10760 dout(10) << e << dendl;
10761
10762 // we are now missing the new version; recovery code will sort it out.
10763 ++v.version;
10764 ++m;
10765 break;
10766 }
10767
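      // no older version to revert to: deliberately fall through and
      // record a LOST_DELETE instead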
10768 case pg_log_entry_t::LOST_DELETE:
10769 {
10770 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10771 0, osd_reqid_t(), mtime, 0);
10772 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10773 if (pool.info.require_rollback()) {
10774 e.mod_desc.try_rmobject(v.version);
10775 } else {
10776 e.mark_unrollbackable();
10777 }
10778 } // otherwise, just do what we used to do
10779 dout(10) << e << dendl;
10780 log_entries.push_back(e);
10781 oids.push_back(oid);
10782
10783 ++v.version;
10784 ++m;
10785 }
10786 break;
10787
10788 default:
10789 ceph_abort();
10790 }
10791 }
10792
10793 info.stats.stats_invalid = true;
10794
10795 submit_log_entries(
10796 log_entries,
10797 std::move(manager),
10798 boost::optional<std::function<void(void)> >(
10799 [this, oids, con, num_unfound, tid]() {
10800 if (perform_deletes_during_peering()) {
10801 for (auto oid : oids) {
10802 // clear old locations - merge_new_log_entries will have
10803 // handled rebuilding missing_loc for each of these
10804 // objects if we have the RECOVERY_DELETES flag
10805 missing_loc.recovered(oid);
10806 }
10807 }
10808
10809 for (auto& p : waiting_for_unreadable_object) {
10810 release_backoffs(p.first);
10811 }
10812 requeue_object_waiters(waiting_for_unreadable_object);
10813 queue_recovery();
10814
10815 stringstream ss;
10816 ss << "pg has " << num_unfound
 10817 	 << " objects unfound and apparently lost; marking";
10818 string rs = ss.str();
10819 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10820 osd->clog->info() << rs;
10821 if (con) {
10822 MCommandReply *reply = new MCommandReply(0, rs);
10823 reply->set_tid(tid);
10824 con->send_message(reply);
10825 }
10826 }),
10827 OpRequestRef());
10828 }
10829
10830 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10831 {
10832 assert(repop_queue.empty());
10833 }
10834
10835 /*
10836 * pg status change notification
10837 */
10838
10839 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10840 {
10841 list<OpRequestRef> rq;
10842
10843 // apply all repops
10844 while (!repop_queue.empty()) {
10845 RepGather *repop = repop_queue.front();
10846 repop_queue.pop_front();
10847 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10848 repop->rep_aborted = true;
10849 repop->on_applied.clear();
10850 repop->on_committed.clear();
10851 repop->on_success.clear();
10852
10853 if (requeue) {
10854 if (repop->op) {
10855 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10856 rq.push_back(repop->op);
10857 repop->op = OpRequestRef();
10858 }
10859
10860 // also requeue any dups, interleaved into position
10861 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10862 waiting_for_ondisk.find(repop->v);
10863 if (p != waiting_for_ondisk.end()) {
10864 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10865 for (list<pair<OpRequestRef, version_t> >::iterator i =
10866 p->second.begin();
10867 i != p->second.end();
10868 ++i) {
10869 rq.push_back(i->first);
10870 }
10871 waiting_for_ondisk.erase(p);
10872 }
10873 }
10874
10875 remove_repop(repop);
10876 }
10877
10878 assert(repop_queue.empty());
10879
10880 if (requeue) {
10881 requeue_ops(rq);
10882 if (!waiting_for_ondisk.empty()) {
10883 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10884 waiting_for_ondisk.begin();
10885 i != waiting_for_ondisk.end();
10886 ++i) {
10887 for (list<pair<OpRequestRef, version_t> >::iterator j =
10888 i->second.begin();
10889 j != i->second.end();
10890 ++j) {
10891 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10892 << i->first << dendl;
10893 }
10894 }
10895 assert(waiting_for_ondisk.empty());
10896 }
10897 }
10898
10899 waiting_for_ondisk.clear();
10900 }
10901
10902 void PrimaryLogPG::on_flushed()
10903 {
10904 assert(flushes_in_progress > 0);
10905 flushes_in_progress--;
10906 if (flushes_in_progress == 0) {
10907 requeue_ops(waiting_for_peered);
10908 }
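// if we are no longer a peered primary, the flush must have released
// every object context; dump any stragglers before asserting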
10909 if (!is_peered() || !is_primary()) {
10910 pair<hobject_t, ObjectContextRef> i;
10911 while (object_contexts.get_next(i.first, &i)) {
10912 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10913 }
10914 assert(object_contexts.empty());
10915 }
10916 pgbackend->on_flushed();
10917 }
10918
10919 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10920 {
10921 dout(10) << "on_removal" << dendl;
10922
10923 // adjust info to backfill
10924 info.set_last_backfill(hobject_t());
10925 pg_log.reset_backfill();
10926 dirty_info = true;
10927
10928
10929 // clear log
10930 PGLogEntryHandler rollbacker{this, t};
10931 pg_log.roll_forward(&rollbacker);
10932
10933 write_if_dirty(*t);
10934
10935 if (!deleting)
10936 on_shutdown();
10937 }
10938
10939 void PrimaryLogPG::clear_async_reads()
10940 {
10941 dout(10) << __func__ << dendl;
10942 for(auto& i : in_progress_async_reads) {
10943 dout(10) << "clear ctx: "
10944 << "OpRequestRef " << i.first
10945 << " OpContext " << i.second
10946 << dendl;
10947 close_op_ctx(i.second);
10948 }
10949 }
10950
10951 void PrimaryLogPG::on_shutdown()
10952 {
10953 dout(10) << "on_shutdown" << dendl;
10954
10955 // remove from queues
10956 osd->pg_stat_queue_dequeue(this);
10957 osd->peering_wq.dequeue(this);
10958
10959 // handles queue races
10960 deleting = true;
10961
10962 if (recovery_queued) {
10963 recovery_queued = false;
10964 osd->clear_queued_recovery(this);
10965 }
10966
10967 clear_scrub_reserved();
10968 scrub_clear_state();
10969
10970 unreg_next_scrub();
10971 cancel_copy_ops(false);
10972 cancel_flush_ops(false);
10973 cancel_proxy_ops(false);
10974 apply_and_flush_repops(false);
10975 cancel_log_updates();
10976 // we must remove PGRefs, so do this prior to the release_backoffs() callers
10977 clear_backoffs();
10978 // clean up snap trim references
10979 snap_trimmer_machine.process_event(Reset());
10980
10981 pgbackend->on_change();
10982
10983 context_registry_on_change();
10984 object_contexts.clear();
10985
10986 clear_async_reads();
10987
10988 osd->remote_reserver.cancel_reservation(info.pgid);
10989 osd->local_reserver.cancel_reservation(info.pgid);
10990
10991 clear_primary_state();
10992 cancel_recovery();
10993 }
10994
10995 void PrimaryLogPG::on_activate()
10996 {
10997 // all clean?
10998 if (needs_recovery()) {
10999 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11000 queue_peering_event(
11001 CephPeeringEvtRef(
11002 std::make_shared<CephPeeringEvt>(
11003 get_osdmap()->get_epoch(),
11004 get_osdmap()->get_epoch(),
11005 DoRecovery())));
11006 } else if (needs_backfill()) {
11007 dout(10) << "activate queueing backfill" << dendl;
11008 queue_peering_event(
11009 CephPeeringEvtRef(
11010 std::make_shared<CephPeeringEvt>(
11011 get_osdmap()->get_epoch(),
11012 get_osdmap()->get_epoch(),
11013 RequestBackfill())));
11014 } else {
11015 dout(10) << "activate all replicas clean, no recovery" << dendl;
11016 eio_errors_to_process = false;
11017 queue_peering_event(
11018 CephPeeringEvtRef(
11019 std::make_shared<CephPeeringEvt>(
11020 get_osdmap()->get_epoch(),
11021 get_osdmap()->get_epoch(),
11022 AllReplicasRecovered())));
11023 }
11024
11025 publish_stats_to_osd();
11026
11027 if (!backfill_targets.empty()) {
11028 last_backfill_started = earliest_backfill();
11029 new_backfill = true;
11030 assert(!last_backfill_started.is_max());
11031 dout(5) << "on activate: bft=" << backfill_targets
11032 << " from " << last_backfill_started << dendl;
11033 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11034 i != backfill_targets.end();
11035 ++i) {
11036 dout(5) << "target shard " << *i
11037 << " from " << peer_info[*i].last_backfill
11038 << dendl;
11039 }
11040 }
11041
11042 hit_set_setup();
11043 agent_setup();
11044 }
11045
11046 void PrimaryLogPG::_on_new_interval()
11047 {
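// if RECOVERY_DELETES was enabled after our missing set was built, the
// set lacks delete entries; rebuild it from the store so it matches the
// flag (the assert below checks that they agree)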
11048 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11049 if (!pg_log.get_missing().may_include_deletes &&
11050 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11051 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11052 }
11053 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11054 }
11055
11056 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11057 {
11058 dout(10) << "on_change" << dendl;
11059
11060 if (hit_set && hit_set->insert_count() == 0) {
11061 dout(20) << " discarding empty hit_set" << dendl;
11062 hit_set_clear();
11063 }
11064
11065 if (recovery_queued) {
11066 recovery_queued = false;
11067 osd->clear_queued_recovery(this);
11068 }
11069
11070 // requeue everything in the reverse of the order in which it
11071 // should be reexamined.
11072 requeue_ops(waiting_for_peered);
11073 requeue_ops(waiting_for_active);
11074
11075 clear_scrub_reserved();
11076
11077 cancel_copy_ops(is_primary());
11078 cancel_flush_ops(is_primary());
11079 cancel_proxy_ops(is_primary());
11080
11081 // requeue object waiters
11082 for (auto& p : waiting_for_unreadable_object) {
11083 release_backoffs(p.first);
11084 }
11085 if (is_primary()) {
11086 requeue_object_waiters(waiting_for_unreadable_object);
11087 } else {
11088 waiting_for_unreadable_object.clear();
11089 }
11090 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11091 p != waiting_for_degraded_object.end();
11092 waiting_for_degraded_object.erase(p++)) {
11093 release_backoffs(p->first);
11094 if (is_primary())
11095 requeue_ops(p->second);
11096 else
11097 p->second.clear();
11098 finish_degraded_object(p->first);
11099 }
11100
11101 // requeues waiting_for_scrub
11102 scrub_clear_state();
11103
11104 for (auto p = waiting_for_blocked_object.begin();
11105 p != waiting_for_blocked_object.end();
11106 waiting_for_blocked_object.erase(p++)) {
11107 if (is_primary())
11108 requeue_ops(p->second);
11109 else
11110 p->second.clear();
11111 }
11112 for (auto i = callbacks_for_degraded_object.begin();
11113 i != callbacks_for_degraded_object.end();
11114 ) {
11115 finish_degraded_object((i++)->first);
11116 }
11117 assert(callbacks_for_degraded_object.empty());
11118
11119 if (is_primary()) {
11120 requeue_ops(waiting_for_cache_not_full);
11121 } else {
11122 waiting_for_cache_not_full.clear();
11123 }
11124 objects_blocked_on_cache_full.clear();
11125
11126 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11127 in_progress_async_reads.begin();
11128 i != in_progress_async_reads.end();
11129 in_progress_async_reads.erase(i++)) {
11130 close_op_ctx(i->second);
11131 if (is_primary())
11132 requeue_op(i->first);
11133 }
11134
11135 // this will requeue ops we were working on but didn't finish, and
11136 // any dups
11137 apply_and_flush_repops(is_primary());
11138 cancel_log_updates();
11139
11140 // do this *after* apply_and_flush_repops so that we catch any newly
11141 // registered watches.
11142 context_registry_on_change();
11143
11144 pgbackend->on_change_cleanup(t);
11145 scrubber.cleanup_store(t);
11146 pgbackend->on_change();
11147
11148 // clear snap_trimmer state
11149 snap_trimmer_machine.process_event(Reset());
11150
11151 debug_op_order.clear();
11152 unstable_stats.clear();
11153
11154 // we don't want to cache object_contexts through the interval change
11155 // NOTE: we actually assert that all currently live references are dead
11156 // by the time the flush for the next interval completes.
11157 object_contexts.clear();
11158
11159 // should have been cleared above by finishing all of the degraded objects
11160 assert(objects_blocked_on_degraded_snap.empty());
11161 }
11162
11163 void PrimaryLogPG::on_role_change()
11164 {
11165 dout(10) << "on_role_change" << dendl;
11166 if (get_role() != 0 && hit_set) {
11167 dout(10) << " clearing hit set" << dendl;
11168 hit_set_clear();
11169 }
11170 }
11171
11172 void PrimaryLogPG::on_pool_change()
11173 {
11174 dout(10) << __func__ << dendl;
11175 // requeue cache full waiters just in case the cache_mode is
11176 // changing away from writeback mode. note that if we are not
11177 // active the normal requeuing machinery is sufficient (and properly
11178 // ordered).
11179 if (is_active() &&
11180 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11181 !waiting_for_cache_not_full.empty()) {
11182 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11183 << dendl;
11184 requeue_ops(waiting_for_cache_not_full);
11185 objects_blocked_on_cache_full.clear();
11186 }
11187 hit_set_setup();
11188 agent_setup();
11189 }
11190
11191 // clear state. called on recovery completion AND cancellation.
11192 void PrimaryLogPG::_clear_recovery_state()
11193 {
11194 missing_loc.clear();
11195 #ifdef DEBUG_RECOVERY_OIDS
11196 recovering_oids.clear();
11197 #endif
11198 last_backfill_started = hobject_t();
11199 set<hobject_t>::iterator i = backfills_in_flight.begin();
11200 while (i != backfills_in_flight.end()) {
11201 assert(recovering.count(*i));
11202 backfills_in_flight.erase(i++);
11203 }
11204
11205 list<OpRequestRef> blocked_ops;
11206 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11207 i != recovering.end();
11208 recovering.erase(i++)) {
11209 if (i->second) {
11210 i->second->drop_recovery_read(&blocked_ops);
11211 requeue_ops(blocked_ops);
11212 }
11213 }
11214 assert(backfills_in_flight.empty());
11215 pending_backfill_updates.clear();
11216 assert(recovering.empty());
11217 pgbackend->clear_recovery_state();
11218 }
11219
11220 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11221 {
11222 dout(20) << __func__ << ": " << soid << dendl;
11223 assert(recovering.count(soid));
11224 ObjectContextRef obc = recovering[soid];
11225 if (obc) {
11226 list<OpRequestRef> blocked_ops;
11227 obc->drop_recovery_read(&blocked_ops);
11228 requeue_ops(blocked_ops);
11229 }
11230 recovering.erase(soid);
11231 finish_recovery_op(soid);
11232 release_backoffs(soid);
11233 if (waiting_for_degraded_object.count(soid)) {
11234 dout(20) << " kicking degraded waiters on " << soid << dendl;
11235 requeue_ops(waiting_for_degraded_object[soid]);
11236 waiting_for_degraded_object.erase(soid);
11237 }
11238 if (waiting_for_unreadable_object.count(soid)) {
11239 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11240 requeue_ops(waiting_for_unreadable_object[soid]);
11241 waiting_for_unreadable_object.erase(soid);
11242 }
11243 if (is_missing_object(soid))
11244 pg_log.set_last_requested(0); // get recover_primary to start over
11245 finish_degraded_object(soid);
11246 }
11247
11248 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11249 {
11250 /*
11251 * check that any peers we are planning to pull (or are currently
11252 * pulling) objects from are dealt with.
11253 */
11254 missing_loc.check_recovery_sources(osdmap);
11255 pgbackend->check_recovery_sources(osdmap);
11256
11257 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11258 i != peer_log_requested.end();
11259 ) {
11260 if (!osdmap->is_up(i->osd)) {
11261 dout(10) << "peer_log_requested removing " << *i << dendl;
11262 peer_log_requested.erase(i++);
11263 } else {
11264 ++i;
11265 }
11266 }
11267
11268 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11269 i != peer_missing_requested.end();
11270 ) {
11271 if (!osdmap->is_up(i->osd)) {
11272 dout(10) << "peer_missing_requested removing " << *i << dendl;
11273 peer_missing_requested.erase(i++);
11274 } else {
11275 ++i;
11276 }
11277 }
11278 }
11279
11280 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11281 {
11282 set<pg_shard_t> now_down;
11283 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11284 p != missing_loc_sources.end();
11285 ) {
11286 if (osdmap->is_up(p->osd)) {
11287 ++p;
11288 continue;
11289 }
11290 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11291 now_down.insert(*p);
11292 missing_loc_sources.erase(p++);
11293 }
11294
11295 if (now_down.empty()) {
11296 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11297 } else {
11298 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11299 << missing_loc_sources << dendl;
11300
11301 // filter missing_loc
11302 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11303 while (p != missing_loc.end()) {
11304 set<pg_shard_t>::iterator q = p->second.begin();
11305 while (q != p->second.end()) {
11306 if (now_down.count(*q))
11307 p->second.erase(q++);
11308 else
11309 ++q;
11310 }
11311 if (p->second.empty())
11312 missing_loc.erase(p++);
11313 else
11314 ++p;
11315 }
11316 }
11317 }
11318
11319
11320 bool PrimaryLogPG::start_recovery_ops(
11321 uint64_t max,
11322 ThreadPool::TPHandle &handle,
11323 uint64_t *ops_started)
11324 {
11325 uint64_t& started = *ops_started;
11326 started = 0;
11327 bool work_in_progress = false;
11328 assert(is_primary());
11329
11330 if (!state_test(PG_STATE_RECOVERING) &&
11331 !state_test(PG_STATE_BACKFILLING)) {
11332 /* TODO: I think this case is broken and will make do_recovery()
11333 * unhappy since we're returning false */
11334 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11335 return false;
11336 }
11337
11338 const auto &missing = pg_log.get_missing();
11339
11340 unsigned int num_missing = missing.num_missing();
11341 uint64_t num_unfound = get_num_unfound();
11342
11343 if (num_missing == 0) {
11344 info.last_complete = info.last_update;
11345 }
11346
11347 if (num_missing == num_unfound) {
11348 // All of the missing objects we have are unfound.
11349 // Recover the replicas.
11350 started = recover_replicas(max, handle);
11351 }
11352 if (!started) {
11353 // We still have missing objects that we should grab from replicas.
11354 started += recover_primary(max, handle);
11355 }
11356 if (!started && num_unfound != get_num_unfound()) {
11357 // second chance to recover replicas
11358 started = recover_replicas(max, handle);
11359 }
11360
11361 if (started)
11362 work_in_progress = true;
11363
11364 bool deferred_backfill = false;
11365 if (recovering.empty() &&
11366 state_test(PG_STATE_BACKFILLING) &&
11367 !backfill_targets.empty() && started < max &&
11368 missing.num_missing() == 0 &&
11369 waiting_on_backfill.empty()) {
11370 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11371 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11372 deferred_backfill = true;
11373 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11374 !is_degraded()) {
11375 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11376 deferred_backfill = true;
11377 } else if (!backfill_reserved) {
11378 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11379 if (!backfill_reserving) {
11380 dout(10) << "queueing RequestBackfill" << dendl;
11381 backfill_reserving = true;
11382 queue_peering_event(
11383 CephPeeringEvtRef(
11384 std::make_shared<CephPeeringEvt>(
11385 get_osdmap()->get_epoch(),
11386 get_osdmap()->get_epoch(),
11387 RequestBackfill())));
11388 }
11389 deferred_backfill = true;
11390 } else {
11391 started += recover_backfill(max - started, handle, &work_in_progress);
11392 }
11393 }
11394
11395 dout(10) << " started " << started << dendl;
11396 osd->logger->inc(l_osd_rop, started);
11397
11398 if (!recovering.empty() ||
11399 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11400 return work_in_progress;
11401
11402 assert(recovering.empty());
11403 assert(recovery_ops_active == 0);
11404
11405 dout(10) << __func__ << " needs_recovery: "
11406 << missing_loc.get_needs_recovery()
11407 << dendl;
11408 dout(10) << __func__ << " missing_loc: "
11409 << missing_loc.get_missing_locs()
11410 << dendl;
11411 int unfound = get_num_unfound();
11412 if (unfound) {
11413 dout(10) << " still have " << unfound << " unfound" << dendl;
11414 return work_in_progress;
11415 }
11416
11417 if (missing.num_missing() > 0) {
11418 // this shouldn't happen!
11419 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11420 << missing.num_missing() << ": " << missing.get_items();
11421 return work_in_progress;
11422 }
11423
11424 if (needs_recovery()) {
11425 // this shouldn't happen!
11426 // We already checked num_missing() so we must have missing replicas
11427 osd->clog->error() << info.pgid
11428 << " Unexpected Error: recovery ending with missing replicas";
11429 return work_in_progress;
11430 }
11431
11432 if (state_test(PG_STATE_RECOVERING)) {
11433 state_clear(PG_STATE_RECOVERING);
11434 state_clear(PG_STATE_FORCED_RECOVERY);
11435 if (get_osdmap()->get_pg_size(info.pgid.pgid) <= acting.size()) {
11436 state_clear(PG_STATE_DEGRADED);
11437 }
11438 if (needs_backfill()) {
11439 dout(10) << "recovery done, queuing backfill" << dendl;
11440 queue_peering_event(
11441 CephPeeringEvtRef(
11442 std::make_shared<CephPeeringEvt>(
11443 get_osdmap()->get_epoch(),
11444 get_osdmap()->get_epoch(),
11445 RequestBackfill())));
11446 } else {
11447 dout(10) << "recovery done, no backfill" << dendl;
11448 eio_errors_to_process = false;
11449 state_clear(PG_STATE_FORCED_BACKFILL);
11450 queue_peering_event(
11451 CephPeeringEvtRef(
11452 std::make_shared<CephPeeringEvt>(
11453 get_osdmap()->get_epoch(),
11454 get_osdmap()->get_epoch(),
11455 AllReplicasRecovered())));
11456 }
11457 } else { // backfilling
11458 state_clear(PG_STATE_BACKFILLING);
11459 state_clear(PG_STATE_FORCED_BACKFILL);
11460 state_clear(PG_STATE_FORCED_RECOVERY);
11461 dout(10) << "recovery done, backfill done" << dendl;
11462 eio_errors_to_process = false;
11463 queue_peering_event(
11464 CephPeeringEvtRef(
11465 std::make_shared<CephPeeringEvt>(
11466 get_osdmap()->get_epoch(),
11467 get_osdmap()->get_epoch(),
11468 Backfilled())));
11469 }
11470
11471 return false;
11472 }
11473
11474 /**
11475 * start up to max recovery ops on objects missing on the primary.
11476 * returns the number of ops started.
11477 */
11478 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11479 {
11480 assert(is_primary());
11481
11482 const auto &missing = pg_log.get_missing();
11483
11484 dout(10) << "recover_primary recovering " << recovering.size()
11485 << " in pg" << dendl;
11486 dout(10) << "recover_primary " << missing << dendl;
11487 dout(25) << "recover_primary " << missing.get_items() << dendl;
11488
11489 // look at log!
11490 pg_log_entry_t *latest = 0;
11491 unsigned started = 0;
11492 int skipped = 0;
11493
11494 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11495 map<version_t, hobject_t>::const_iterator p =
11496 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11497 while (p != missing.get_rmissing().end()) {
11498 handle.reset_tp_timeout();
11499 hobject_t soid;
11500 version_t v = p->first;
11501
11502 if (pg_log.get_log().objects.count(p->second)) {
11503 latest = pg_log.get_log().objects.find(p->second)->second;
11504 assert(latest->is_update() || latest->is_delete());
11505 soid = latest->soid;
11506 } else {
11507 latest = 0;
11508 soid = p->second;
11509 }
11510 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11511 ++p;
11512
11513 hobject_t head = soid.get_head();
11514
11515 eversion_t need = item.need;
11516
11517 dout(10) << "recover_primary "
11518 << soid << " " << item.need
11519 << (missing.is_missing(soid) ? " (missing)":"")
11520 << (missing.is_missing(head) ? " (missing head)":"")
11521 << (recovering.count(soid) ? " (recovering)":"")
11522 << (recovering.count(head) ? " (recovering head)":"")
11523 << dendl;
11524
11525 if (latest) {
11526 switch (latest->op) {
11527 case pg_log_entry_t::CLONE:
11528 /*
11529 * Handling for this special case removed for now, until we
11530 * can correctly construct an accurate SnapSet from the old
11531 * one.
11532 */
11533 break;
11534
11535 case pg_log_entry_t::LOST_REVERT:
11536 {
11537 if (item.have == latest->reverting_to) {
11538 ObjectContextRef obc = get_object_context(soid, true);
11539
11540 if (obc->obs.oi.version == latest->version) {
11541 // I'm already reverting
11542 dout(10) << " already reverting " << soid << dendl;
11543 } else {
11544 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11545 obc->ondisk_write_lock();
11546 obc->obs.oi.version = latest->version;
11547
11548 ObjectStore::Transaction t;
11549 bufferlist b2;
11550 obc->obs.oi.encode(
11551 b2,
11552 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11553 assert(!pool.info.require_rollback());
11554 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11555
11556 recover_got(soid, latest->version);
11557 missing_loc.add_location(soid, pg_whoami);
11558
11559 ++active_pushes;
11560
11561 osd->store->queue_transaction(osr.get(), std::move(t),
11562 new C_OSD_AppliedRecoveredObject(this, obc),
11563 new C_OSD_CommittedPushedObject(
11564 this,
11565 get_osdmap()->get_epoch(),
11566 info.last_complete),
11567 new C_OSD_OndiskWriteUnlock(obc));
11568 continue;
11569 }
11570 } else {
11571 /*
11572 * Pull the old version of the object. Update missing_loc here to have the location
11573 * of the version we want.
11574 *
11575 * This doesn't use the usual missing_loc paths, but that's okay:
11576 * - if we have it locally, we hit the case above, and go from there.
11577 * - if we don't, we always pass through this case during recovery and set up the location
11578 * properly.
11579 * - this way we don't need to mangle the missing code to be general about needing an old
11580 * version...
11581 */
11582 eversion_t alternate_need = latest->reverting_to;
11583 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11584
11585 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11586 p != peer_missing.end();
11587 ++p)
11588 if (p->second.is_missing(soid, need) &&
11589 p->second.get_items().at(soid).have == alternate_need) {
11590 missing_loc.add_location(soid, p->first);
11591 }
11592 dout(10) << " will pull " << alternate_need << " or " << need
11593 << " from one of " << missing_loc.get_locations(soid)
11594 << dendl;
11595 }
11596 }
11597 break;
11598 }
11599 }
11600
11601 if (!recovering.count(soid)) {
11602 if (recovering.count(head)) {
11603 ++skipped;
11604 } else {
11605 int r = recover_missing(
11606 soid, need, get_recovery_op_priority(), h);
11607 switch (r) {
11608 case PULL_YES:
11609 ++started;
11610 break;
11611 case PULL_OTHER:
11612 ++started;
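// deliberate fall-through: PULL_OTHER pulled a different object (e.g.
// the head), so this one also counts as skipped and last_requested
// will not advance past it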
11613 case PULL_NONE:
11614 ++skipped;
11615 break;
11616 default:
11617 ceph_abort();
11618 }
11619 if (started >= max)
11620 break;
11621 }
11622 }
11623
11624 // only advance last_requested if we haven't skipped anything
11625 if (!skipped)
11626 pg_log.set_last_requested(v);
11627 }
11628
11629 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11630 return started;
11631 }
11632
11633 bool PrimaryLogPG::primary_error(
11634 const hobject_t& soid, eversion_t v)
11635 {
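// the primary's copy is bad: record it in our own missing set, drop
// ourselves as a location, and check whether any other shard still has
// the needed version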
11636 pg_log.missing_add(soid, v, eversion_t());
11637 pg_log.set_last_requested(0);
11638 missing_loc.remove_location(soid, pg_whoami);
11639 bool uhoh = true;
11640 assert(!actingbackfill.empty());
11641 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11642 i != actingbackfill.end();
11643 ++i) {
11644 if (*i == get_primary()) continue;
11645 pg_shard_t peer = *i;
11646 if (!peer_missing[peer].is_missing(soid, v)) {
11647 missing_loc.add_location(soid, peer);
11648 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11649 << ", there should be a copy on shard " << peer << dendl;
11650 uhoh = false;
11651 }
11652 }
11653 if (uhoh)
11654 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11655 else
11656 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11657 << ", will try copies on " << missing_loc.get_locations(soid);
11658 return uhoh;
11659 }
11660
11661 int PrimaryLogPG::prep_object_replica_deletes(
11662 const hobject_t& soid, eversion_t v,
11663 PGBackend::RecoveryHandle *h)
11664 {
11665 assert(is_primary());
11666 dout(10) << __func__ << ": on " << soid << dendl;
11667
11668 start_recovery_op(soid);
11669 assert(!recovering.count(soid));
11670 recovering.insert(make_pair(soid, ObjectContextRef()));
11671
11672 pgbackend->recover_delete_object(soid, v, h);
11673 return 1;
11674 }
11675
11676 int PrimaryLogPG::prep_object_replica_pushes(
11677 const hobject_t& soid, eversion_t v,
11678 PGBackend::RecoveryHandle *h)
11679 {
11680 assert(is_primary());
11681 dout(10) << __func__ << ": on " << soid << dendl;
11682
11683 // NOTE: we know we will get a valid oloc off of disk here.
11684 ObjectContextRef obc = get_object_context(soid, false);
11685 if (!obc) {
11686 primary_error(soid, v);
11687 return 0;
11688 }
11689
11690 if (!obc->get_recovery_read()) {
11691 dout(20) << "recovery delayed on " << soid
11692 << "; could not get rw_manager lock" << dendl;
11693 return 0;
11694 } else {
11695 dout(20) << "recovery got recovery read lock on " << soid
11696 << dendl;
11697 }
11698
11699 start_recovery_op(soid);
11700 assert(!recovering.count(soid));
11701 recovering.insert(make_pair(soid, obc));
11702
11703 /* We need this in case there is an in-progress write on the object. In fact,
11704 * the only possible write is an update to the xattr due to a lost_revert --
11705 * a client write would be blocked since the object is degraded.
11706 * In almost all cases, therefore, this lock should be uncontended.
11707 */
11708 obc->ondisk_read_lock();
11709 int r = pgbackend->recover_object(
11710 soid,
11711 v,
11712 ObjectContextRef(),
11713 obc, // has snapset context
11714 h);
11715 obc->ondisk_read_unlock();
11716 if (r < 0) {
11717 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11718 primary_failed(soid);
11719 primary_error(soid, v);
11720 return 0;
11721 }
11722 return 1;
11723 }
11724
11725 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11726 {
11727 dout(10) << __func__ << "(" << max << ")" << dendl;
11728 uint64_t started = 0;
11729
11730 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11731
11732 // this is FAR from an optimal recovery order. pretty lame, really.
11733 assert(!actingbackfill.empty());
11734 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11735 i != actingbackfill.end();
11736 ++i) {
11737 if (*i == get_primary()) continue;
11738 pg_shard_t peer = *i;
11739 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11740 assert(pm != peer_missing.end());
11741 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11742 assert(pi != peer_info.end());
11743 size_t m_sz = pm->second.num_missing();
11744
11745 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11746 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11747
11748 // oldest first!
11749 const pg_missing_t &m(pm->second);
11750 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11751 p != m.get_rmissing().end() && started < max;
11752 ++p) {
11753 handle.reset_tp_timeout();
11754 const hobject_t soid(p->second);
11755
11756 if (missing_loc.is_unfound(soid)) {
11757 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11758 continue;
11759 }
11760
11761 if (soid > pi->second.last_backfill) {
11762 if (!recovering.count(soid)) {
11763 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11764 derr << __func__ << ": object added to missing set for backfill, but "
11765 << "is not in recovering, error!" << dendl;
11766 ceph_abort();
11767 }
11768 continue;
11769 }
11770
11771 if (recovering.count(soid)) {
11772 dout(10) << __func__ << ": already recovering " << soid << dendl;
11773 continue;
11774 }
11775
11776 if (missing_loc.is_deleted(soid)) {
11777 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11778 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11779 started += prep_object_replica_deletes(soid, r->second.need, h);
11780 continue;
11781 }
11782
11783 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11784 dout(10) << __func__ << ": " << soid.get_head()
11785 << " still missing on primary" << dendl;
11786 continue;
11787 }
11788
11789 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11790 dout(10) << __func__ << ": " << soid.get_snapdir()
11791 << " still missing on primary" << dendl;
11792 continue;
11793 }
11794
11795 if (pg_log.get_missing().is_missing(soid)) {
11796 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11797 continue;
11798 }
11799
11800 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11801 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11802 started += prep_object_replica_pushes(soid, r->second.need,
11803 h);
11804 }
11805 }
11806
11807 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11808 return started;
11809 }
11810
11811 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11812 {
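// the least-advanced peer interval bounds what has been backfilled on
// every target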
11813 hobject_t e = hobject_t::get_max();
11814 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11815 i != backfill_targets.end();
11816 ++i) {
11817 pg_shard_t peer = *i;
11818 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11819 peer_backfill_info.find(peer);
11820 assert(iter != peer_backfill_info.end());
11821 if (iter->second.begin < e)
11822 e = iter->second.begin;
11823 }
11824 return e;
11825 }
11826
11827 bool PrimaryLogPG::all_peer_done() const
11828 {
11829 // Primary hasn't got any more objects
11830 assert(backfill_info.empty());
11831
11832 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11833 i != backfill_targets.end();
11834 ++i) {
11835 pg_shard_t bt = *i;
11836 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11837 peer_backfill_info.find(bt);
11838 assert(piter != peer_backfill_info.end());
11839 const BackfillInterval& pbi = piter->second;
11840 // See if peer has more to process
11841 if (!pbi.extends_to_end() || !pbi.empty())
11842 return false;
11843 }
11844 return true;
11845 }
11846
11847 /**
11848 * recover_backfill
11849 *
11850 * Invariants:
11851 *
11852 * backfilled: fully pushed to replica or present in replica's missing set (both
11853 * our copy and theirs).
11854 *
11855 * All objects on a backfill_target in
11856 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11857 * objects have been actually deleted and all logically-valid objects are replicated.
11858 * There may be PG objects in this interval yet to be backfilled.
11859 *
11860 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11861 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11862 *
11863 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11864 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11865 * interval remain on the backfill target.
11866 *
11867 * For a backfill target, all objects <= peer_info[target].last_backfill
11868 * have been backfilled to target
11869 *
11870 * There *MAY* be missing/outdated objects between last_backfill_started and
11871 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11872 * io created objects since the last scan. For this reason, we call
11873 * update_range() again before continuing backfill.
11874 */
11875 uint64_t PrimaryLogPG::recover_backfill(
11876 uint64_t max,
11877 ThreadPool::TPHandle &handle, bool *work_started)
11878 {
11879 dout(10) << "recover_backfill (" << max << ")"
11880 << " bft=" << backfill_targets
11881 << " last_backfill_started " << last_backfill_started
11882 << (new_backfill ? " new_backfill":"")
11883 << dendl;
11884 assert(!backfill_targets.empty());
11885
11886 // Initialize from prior backfill state
11887 if (new_backfill) {
11888 // on_activate() was called prior to getting here
11889 assert(last_backfill_started == earliest_backfill());
11890 new_backfill = false;
11891
11892 // initialize BackfillIntervals
11893 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11894 i != backfill_targets.end();
11895 ++i) {
11896 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11897 }
11898 backfill_info.reset(last_backfill_started);
11899
11900 backfills_in_flight.clear();
11901 pending_backfill_updates.clear();
11902 }
11903
11904 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11905 i != backfill_targets.end();
11906 ++i) {
11907 dout(10) << "peer osd." << *i
11908 << " info " << peer_info[*i]
11909 << " interval " << peer_backfill_info[*i].begin
11910 << "-" << peer_backfill_info[*i].end
11911 << " " << peer_backfill_info[*i].objects.size() << " objects"
11912 << dendl;
11913 }
11914
11915 // update our local interval to cope with recent changes
11916 backfill_info.begin = last_backfill_started;
11917 update_range(&backfill_info, handle);
11918
11919 unsigned ops = 0;
11920 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11921 set<hobject_t> add_to_stat;
11922
11923 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11924 i != backfill_targets.end();
11925 ++i) {
11926 peer_backfill_info[*i].trim_to(
11927 std::max(peer_info[*i].last_backfill, last_backfill_started));
11928 }
11929 backfill_info.trim_to(last_backfill_started);
11930
11931 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
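// main scan loop: whichever interval (ours or a peer's) lags furthest
// behind drives the next step -- rescan it, queue removal of a stray
// object on peers, or push the authoritative version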
11932 while (ops < max) {
11933 if (backfill_info.begin <= earliest_peer_backfill() &&
11934 !backfill_info.extends_to_end() && backfill_info.empty()) {
11935 hobject_t next = backfill_info.end;
11936 backfill_info.reset(next);
11937 backfill_info.end = hobject_t::get_max();
11938 update_range(&backfill_info, handle);
11939 backfill_info.trim();
11940 }
11941
11942 dout(20) << " my backfill interval " << backfill_info << dendl;
11943
11944 bool sent_scan = false;
11945 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11946 i != backfill_targets.end();
11947 ++i) {
11948 pg_shard_t bt = *i;
11949 BackfillInterval& pbi = peer_backfill_info[bt];
11950
11951 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11952 if (pbi.begin <= backfill_info.begin &&
11953 !pbi.extends_to_end() && pbi.empty()) {
11954 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11955 epoch_t e = get_osdmap()->get_epoch();
11956 MOSDPGScan *m = new MOSDPGScan(
11957 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11958 spg_t(info.pgid.pgid, bt.shard),
11959 pbi.end, hobject_t());
11960 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11961 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11962 waiting_on_backfill.insert(bt);
11963 sent_scan = true;
11964 }
11965 }
11966
11967 // Count simultaneous scans as a single op and let those complete
11968 if (sent_scan) {
11969 ops++;
11970 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11971 break;
11972 }
11973
11974 if (backfill_info.empty() && all_peer_done()) {
11975 dout(10) << " reached end for both local and all peers" << dendl;
11976 break;
11977 }
11978
11979 // Get object within set of peers to operate on and
11980 // the set of targets for which that object applies.
11981 hobject_t check = earliest_peer_backfill();
11982
11983 if (check < backfill_info.begin) {
11984
11985 set<pg_shard_t> check_targets;
11986 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11987 i != backfill_targets.end();
11988 ++i) {
11989 pg_shard_t bt = *i;
11990 BackfillInterval& pbi = peer_backfill_info[bt];
11991 if (pbi.begin == check)
11992 check_targets.insert(bt);
11993 }
11994 assert(!check_targets.empty());
11995
11996 dout(20) << " BACKFILL removing " << check
11997 << " from peers " << check_targets << dendl;
11998 for (set<pg_shard_t>::iterator i = check_targets.begin();
11999 i != check_targets.end();
12000 ++i) {
12001 pg_shard_t bt = *i;
12002 BackfillInterval& pbi = peer_backfill_info[bt];
12003 assert(pbi.begin == check);
12004
12005 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12006 pbi.pop_front();
12007 }
12008
12009 /* This requires a bit of explanation. We compare head against
12010 * last_backfill to determine whether to send an operation
12011 * to the replica. A single write operation can touch up to three
12012 * objects: head, the snapdir, and a new clone which sorts closer to
12013 * head than any existing clone. If last_backfill points at a clone,
12014 * the transaction won't be sent and all 3 must lie on the right side
12015 * of the line (i.e., we'll backfill them later). If last_backfill
12016 * points at snapdir, it sorts greater than head, so we send the
12017 * transaction which is correct because all three must lie to the left
12018 * of the line.
12019 *
12020 * If it points at head, we have a bit of an issue. If head actually
12021 * exists, no problem, because any transaction which touches snapdir
12022 * must end up creating it (and deleting head), so sending the
12023 * operation won't pose a problem -- we'll end up having to scan it,
12024 * but it'll end up being the right version so we won't bother to
12025 * rebackfill it. However, if head doesn't exist, any write on head
12026 * will remove snapdir. For a replicated pool, this isn't a problem,
12027 * ENOENT on remove isn't an issue, and the object is in the backfill future anyway.
12028 * It only poses a problem for EC pools, because we never just delete
12029 * an object, we rename it into a rollback object. That operation
12030 * will end up crashing the osd with ENOENT. Tolerating the failure
12031 * wouldn't work either, even if snapdir exists, we'd be creating a
12032 * rollback object past the last_backfill line which wouldn't get
12033 * cleaned up (no rollback objects past the last_backfill line is an
12034 * existing important invariant). Thus, let's avoid the whole issue
12035 * by just not updating last_backfill_started here if head doesn't
12036 * exist and snapdir does. We aren't using up a recovery count here,
12037 * so we're going to recover snapdir immediately anyway. We'll only
12038 * fail "backward" if we fail to get the rw lock and that just means
12039 * we'll re-process this section of the hash space again.
12040 *
12041 * I'm choosing this hack here because the really "correct" answer is
12042 * going to be to unify snapdir and head into a single object (a
12043 * snapdir is really just a confusing way to talk about head existing
12044 * as a whiteout), but doing that is going to be a somewhat larger
12045 * undertaking.
12046 *
12047 * @see http://tracker.ceph.com/issues/17668
12048 */
12049 if (!(check.is_head() &&
12050 backfill_info.begin.is_snapdir() &&
12051 check == backfill_info.begin.get_head()))
12052 last_backfill_started = check;
12053
12054 // Don't increment ops here: deletions are cheap and, unlike
12055 // real recovery_ops, are not replied to; we also can't
12056 // increment ops without requeueing ourselves
12057 // for recovery.
12058 } else {
12059 eversion_t& obj_v = backfill_info.objects.begin()->second;
12060
12061 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12062 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12063 i != backfill_targets.end();
12064 ++i) {
12065 pg_shard_t bt = *i;
12066 BackfillInterval& pbi = peer_backfill_info[bt];
12067 // Find all check peers that have the wrong version
12068 if (check == backfill_info.begin && check == pbi.begin) {
12069 if (pbi.objects.begin()->second != obj_v) {
12070 need_ver_targs.push_back(bt);
12071 } else {
12072 keep_ver_targs.push_back(bt);
12073 }
12074 } else {
12075 pg_info_t& pinfo = peer_info[bt];
12076
12077 // Only include peers whose backfill line we've caught up to;
12078 // otherwise, they only appear to be missing this object
12079 // because their pbi.begin > backfill_info.begin.
12080 if (backfill_info.begin > pinfo.last_backfill)
12081 missing_targs.push_back(bt);
12082 else
12083 skip_targs.push_back(bt);
12084 }
12085 }
12086
12087 if (!keep_ver_targs.empty()) {
12088 // These peers have version obj_v
12089 dout(20) << " BACKFILL keeping " << check
12090 << " with ver " << obj_v
12091 << " on peers " << keep_ver_targs << dendl;
12092 //assert(!waiting_for_degraded_object.count(check));
12093 }
12094 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12095 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12096 assert(obc);
12097 if (obc->get_recovery_read()) {
12098 if (!need_ver_targs.empty()) {
12099 dout(20) << " BACKFILL replacing " << check
12100 << " with ver " << obj_v
12101 << " to peers " << need_ver_targs << dendl;
12102 }
12103 if (!missing_targs.empty()) {
12104 dout(20) << " BACKFILL pushing " << backfill_info.begin
12105 << " with ver " << obj_v
12106 << " to peers " << missing_targs << dendl;
12107 }
12108 vector<pg_shard_t> all_push = need_ver_targs;
12109 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12110
12111 handle.reset_tp_timeout();
12112 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12113 if (r < 0) {
12114 *work_started = true;
12115 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12116 break;
12117 }
12118 ops++;
12119 } else {
12120 *work_started = true;
12121 dout(20) << "backfill blocking on " << backfill_info.begin
12122 << "; could not get rw_manager lock" << dendl;
12123 break;
12124 }
12125 }
12126 dout(20) << "need_ver_targs=" << need_ver_targs
12127 << " keep_ver_targs=" << keep_ver_targs << dendl;
12128 dout(20) << "backfill_targets=" << backfill_targets
12129 << " missing_targs=" << missing_targs
12130 << " skip_targs=" << skip_targs << dendl;
12131
12132 last_backfill_started = backfill_info.begin;
12133 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12134 backfill_info.pop_front();
12135 vector<pg_shard_t> check_targets = need_ver_targs;
12136 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12137 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12138 i != check_targets.end();
12139 ++i) {
12140 pg_shard_t bt = *i;
12141 BackfillInterval& pbi = peer_backfill_info[bt];
12142 pbi.pop_front();
12143 }
12144 }
12145 }
12146
12147 hobject_t backfill_pos =
12148 std::min(backfill_info.begin, earliest_peer_backfill());
12149
12150 for (set<hobject_t>::iterator i = add_to_stat.begin();
12151 i != add_to_stat.end();
12152 ++i) {
12153 ObjectContextRef obc = get_object_context(*i, false);
12154 assert(obc);
12155 pg_stat_t stat;
12156 add_object_context_to_pg_stat(obc, &stat);
12157 pending_backfill_updates[*i] = stat;
12158 }
12159 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12160 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12161 for (unsigned i = 0; i < to_remove.size(); ++i) {
12162 handle.reset_tp_timeout();
12163 const hobject_t& oid = to_remove[i].get<0>();
12164 eversion_t v = to_remove[i].get<1>();
12165 pg_shard_t peer = to_remove[i].get<2>();
12166 MOSDPGBackfillRemove *m;
12167 auto it = reqs.find(peer);
12168 if (it != reqs.end()) {
12169 m = it->second;
12170 } else {
12171 m = reqs[peer] = new MOSDPGBackfillRemove(
12172 spg_t(info.pgid.pgid, peer.shard),
12173 get_osdmap()->get_epoch());
12174 }
12175 m->ls.push_back(make_pair(oid, v));
12176
12177 if (oid <= last_backfill_started)
12178 pending_backfill_updates[oid]; // add empty stat!
12179 }
12180 for (auto p : reqs) {
12181 osd->send_message_osd_cluster(p.first.osd, p.second,
12182 get_osdmap()->get_epoch());
12183 }
12184 } else {
12185 // for jewel targets
12186 for (unsigned i = 0; i < to_remove.size(); ++i) {
12187 handle.reset_tp_timeout();
12188
12189 // ordered before any subsequent updates
12190 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12191 to_remove[i].get<2>());
12192
12193 if (to_remove[i].get<0>() <= last_backfill_started)
12194 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12195 }
12196 }
12197
12198 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12199
12200 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12201 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12202 i != backfills_in_flight.end();
12203 ++i) {
12204 dout(20) << *i << " is still in flight" << dendl;
12205 }
12206
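// last_backfill may only advance past objects that have completed on
// every target; the earliest in-flight backfill pins the completion
// point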
12207 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12208 backfill_pos : *(backfills_in_flight.begin());
12209 hobject_t new_last_backfill = earliest_backfill();
12210 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12211 for (map<hobject_t, pg_stat_t>::iterator i =
12212 pending_backfill_updates.begin();
12213 i != pending_backfill_updates.end() &&
12214 i->first < next_backfill_to_complete;
12215 pending_backfill_updates.erase(i++)) {
12216 dout(20) << " pending_backfill_update " << i->first << dendl;
12217 assert(i->first > new_last_backfill);
12218 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12219 j != backfill_targets.end();
12220 ++j) {
12221 pg_shard_t bt = *j;
12222 pg_info_t& pinfo = peer_info[bt];
12223 // Add stats to all peers that were missing the object
12224 if (i->first > pinfo.last_backfill)
12225 pinfo.stats.add(i->second);
12226 }
12227 new_last_backfill = i->first;
12228 }
12229 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12230
12231 assert(!pending_backfill_updates.empty() ||
12232 new_last_backfill == last_backfill_started);
12233 if (pending_backfill_updates.empty() &&
12234 backfill_pos.is_max()) {
12235 assert(backfills_in_flight.empty());
12236 new_last_backfill = backfill_pos;
12237 last_backfill_started = backfill_pos;
12238 }
12239 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12240
12241 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12242 // all the backfill targets. Otherwise, we will move last_backfill up on
12243 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12244 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12245 i != backfill_targets.end();
12246 ++i) {
12247 pg_shard_t bt = *i;
12248 pg_info_t& pinfo = peer_info[bt];
12249
12250 if (new_last_backfill > pinfo.last_backfill) {
12251 pinfo.set_last_backfill(new_last_backfill);
12252 epoch_t e = get_osdmap()->get_epoch();
12253 MOSDPGBackfill *m = NULL;
12254 if (pinfo.last_backfill.is_max()) {
12255 m = new MOSDPGBackfill(
12256 MOSDPGBackfill::OP_BACKFILL_FINISH,
12257 e,
12258 last_peering_reset,
12259 spg_t(info.pgid.pgid, bt.shard));
12260 // Use default priority here, must match sub_op priority
12261 /* pinfo.stats might be wrong if we did log-based recovery on the
12262 * backfilled portion in addition to continuing backfill.
12263 */
12264 pinfo.stats = info.stats;
12265 start_recovery_op(hobject_t::get_max());
12266 } else {
12267 m = new MOSDPGBackfill(
12268 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12269 e,
12270 last_peering_reset,
12271 spg_t(info.pgid.pgid, bt.shard));
12272 // Use default priority here, must match sub_op priority
12273 }
12274 m->last_backfill = pinfo.last_backfill;
12275 m->stats = pinfo.stats;
12276 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12277 dout(10) << " peer " << bt
12278 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12279 << " / " << info.stats.stats.sum.num_objects << dendl;
12280 }
12281 }
12282
12283 if (ops)
12284 *work_started = true;
12285 return ops;
12286 }
12287
12288 int PrimaryLogPG::prep_backfill_object_push(
12289 hobject_t oid, eversion_t v,
12290 ObjectContextRef obc,
12291 vector<pg_shard_t> peers,
12292 PGBackend::RecoveryHandle *h)
12293 {
12294 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12295 assert(!peers.empty());
12296
12297 backfills_in_flight.insert(oid);
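// mark the object missing on every target so client ops on it block
// until the push completes; the zero need/have versions below serve
// only as a placeholder, the real version arrives with the push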
12298 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12299 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12300 assert(bpm != peer_missing.end());
12301 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12302 }
12303
12304 assert(!recovering.count(oid));
12305
12306 start_recovery_op(oid);
12307 recovering.insert(make_pair(oid, obc));
12308
12309 // We need to take the read_lock here in order to flush in-progress writes
12310 obc->ondisk_read_lock();
12311 int r = pgbackend->recover_object(
12312 oid,
12313 v,
12314 ObjectContextRef(),
12315 obc,
12316 h);
12317 obc->ondisk_read_unlock();
12318 if (r < 0) {
12319 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12320 primary_failed(oid);
12321 primary_error(oid, v);
12322 backfills_in_flight.erase(oid);
12323 missing_loc.add_missing(oid, v, eversion_t());
12324 }
12325 return r;
12326 }
12327
12328 void PrimaryLogPG::update_range(
12329 BackfillInterval *bi,
12330 ThreadPool::TPHandle &handle)
12331 {
12332 int local_min = cct->_conf->osd_backfill_scan_min;
12333 int local_max = cct->_conf->osd_backfill_scan_max;
12334
12335 if (bi->version < info.log_tail) {
12336 dout(10) << __func__ << ": bi is old, rescanning local backfill_info"
12337 << dendl;
12338 if (last_update_applied >= info.log_tail) {
12339 bi->version = last_update_applied;
12340 } else {
12341 osr->flush();
12342 bi->version = info.last_update;
12343 }
12344 scan_range(local_min, local_max, bi, handle);
12345 }
12346
12347 if (bi->version >= projected_last_update) {
12348 dout(10) << __func__ << ": bi is current" << dendl;
12349 assert(bi->version == projected_last_update);
12350 } else if (bi->version >= info.log_tail) {
12351 if (pg_log.get_log().empty() && projected_log.empty()) {
12352 /* Because we don't move log_tail on split, the log might be
12353 * empty even if log_tail != last_update. However, the only
12354 * way to get here with an empty log is if log_tail is actually
12355 * eversion_t(), because otherwise the entry which changed
12356 * last_update since the last scan would have to be present.
12357 */
12358 assert(bi->version == eversion_t());
12359 return;
12360 }
12361
12362 dout(10) << __func__ << ": bi is old (" << bi->version
12363 << ") can be updated with log to projected_last_update "
12364 << projected_last_update << dendl;
12365
12366 auto func = [&](const pg_log_entry_t &e) {
12367 dout(10) << __func__ << ": updating from version " << e.version
12368 << dendl;
12369 const hobject_t &soid = e.soid;
12370 if (soid >= bi->begin &&
12371 soid < bi->end) {
12372 if (e.is_update()) {
12373 dout(10) << __func__ << ": " << e.soid << " updated to version "
12374 << e.version << dendl;
12375 bi->objects.erase(e.soid);
12376 bi->objects.insert(
12377 make_pair(
12378 e.soid,
12379 e.version));
12380 } else if (e.is_delete()) {
12381 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12382 bi->objects.erase(e.soid);
12383 }
12384 }
12385 };
12386 dout(10) << "scanning pg log first" << dendl;
12387 pg_log.get_log().scan_log_after(bi->version, func);
12388 dout(10) << "scanning projected log" << dendl;
12389 projected_log.scan_log_after(bi->version, func);
12390 bi->version = projected_last_update;
12391 } else {
12392 assert(0 == "scan_range should have raised bi->version past log_tail");
12393 }
12394 }
12395
12396 void PrimaryLogPG::scan_range(
12397 int min, int max, BackfillInterval *bi,
12398 ThreadPool::TPHandle &handle)
12399 {
12400 assert(is_locked());
12401 dout(10) << "scan_range from " << bi->begin << dendl;
12402 bi->clear_objects();
12403
12404 vector<hobject_t> ls;
12405 ls.reserve(max);
12406 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12407 assert(r >= 0);
12408 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12409 dout(20) << ls << dendl;
12410
12411 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12412 handle.reset_tp_timeout();
12413 ObjectContextRef obc;
12414 if (is_primary())
12415 obc = object_contexts.lookup(*p);
12416 if (obc) {
12417 bi->objects[*p] = obc->obs.oi.version;
12418 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12419 } else {
12420 bufferlist bl;
12421 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12422
12423 /* If the object does not exist here, it must have been removed
12424 * between the collection_list_partial and here. This can happen
12425 * for the first item in the range, which is usually last_backfill.
12426 */
12427 if (r == -ENOENT)
12428 continue;
12429
12430 assert(r >= 0);
12431 object_info_t oi(bl);
12432 bi->objects[*p] = oi.version;
12433 dout(20) << " " << *p << " " << oi.version << dendl;
12434 }
12435 }
12436 }
12437
12438
12439 /** check_local
12440 *
12441 * verifies that stray objects have been deleted
12442 */
12443 void PrimaryLogPG::check_local()
12444 {
12445 dout(10) << __func__ << dendl;
12446
12447 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12448
12449 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12450 return;
12451
12452 // just scan the log.
12453 set<hobject_t> did;
12454 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12455 p != pg_log.get_log().log.rend();
12456 ++p) {
12457 if (did.count(p->soid))
12458 continue;
12459 did.insert(p->soid);
12460
12461 if (p->is_delete() && !is_missing_object(p->soid)) {
12462 dout(10) << " checking " << p->soid
12463 << " at " << p->version << dendl;
12464 struct stat st;
12465 int r = osd->store->stat(
12466 ch,
12467 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12468 &st);
12469 if (r != -ENOENT) {
12470 derr << __func__ << " " << p->soid << " exists, but should have been "
12471 << "deleted" << dendl;
12472 assert(0 == "erroneously present object");
12473 }
12474 } else {
12475 // ignore old(+missing) objects
12476 }
12477 }
12478 }
12479
12480
12481
12482 // ===========================
12483 // hit sets
12484
12485 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12486 {
12487 ostringstream ss;
12488 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12489 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12490 info.pgid.ps(), info.pgid.pool(),
12491 cct->_conf->osd_hit_set_namespace);
12492 dout(20) << __func__ << " " << hoid << dendl;
12493 return hoid;
12494 }
12495
12496 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12497 utime_t end,
12498 bool using_gmt)
12499 {
12500 ostringstream ss;
12501 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12502 if (using_gmt) {
12503 start.gmtime(ss) << "_";
12504 end.gmtime(ss);
12505 } else {
12506 start.localtime(ss) << "_";
12507 end.localtime(ss);
12508 }
12509 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12510 info.pgid.ps(), info.pgid.pool(),
12511 cct->_conf->osd_hit_set_namespace);
12512 dout(20) << __func__ << " " << hoid << dendl;
12513 return hoid;
12514 }
12515
12516 void PrimaryLogPG::hit_set_clear()
12517 {
12518 dout(20) << __func__ << dendl;
12519 hit_set.reset();
12520 hit_set_start_stamp = utime_t();
12521 }
12522
12523 void PrimaryLogPG::hit_set_setup()
12524 {
12525 if (!is_active() ||
12526 !is_primary()) {
12527 hit_set_clear();
12528 return;
12529 }
12530
12531 if (is_active() && is_primary() &&
12532 (!pool.info.hit_set_count ||
12533 !pool.info.hit_set_period ||
12534 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12535 hit_set_clear();
12536
12537 // only primary is allowed to remove all the hit set objects
12538 hit_set_remove_all();
12539 return;
12540 }
12541
12542 // FIXME: discard any previous data for now
12543 hit_set_create();
12544
12545 // include any writes we know about from the pg log. this doesn't
12546 // capture reads, but it is better than nothing!
12547 hit_set_apply_log();
12548 }
12549
12550 void PrimaryLogPG::hit_set_remove_all()
12551 {
12552 // If any archives are degraded we skip this
12553 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12554 p != info.hit_set.history.end();
12555 ++p) {
12556 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12557
12558 // Once we hit a degraded object just skip
12559 if (is_degraded_or_backfilling_object(aoid))
12560 return;
12561 if (scrubber.write_blocked_by_scrub(aoid))
12562 return;
12563 }
12564
12565 if (!info.hit_set.history.empty()) {
12566 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12567 assert(p != info.hit_set.history.rend());
12568 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12569 assert(!is_degraded_or_backfilling_object(oid));
12570 ObjectContextRef obc = get_object_context(oid, false);
12571 assert(obc);
12572
12573 OpContextUPtr ctx = simple_opc_create(obc);
12574 ctx->at_version = get_next_version();
12575 ctx->updated_hset_history = info.hit_set;
12576 utime_t now = ceph_clock_now();
12577 ctx->mtime = now;
12578 hit_set_trim(ctx, 0);
12579 simple_opc_submit(std::move(ctx));
12580 }
12581
12582 info.hit_set = pg_hit_set_history_t();
12583 if (agent_state) {
12584 agent_state->discard_hit_sets();
12585 }
12586 }
12587
12588 void PrimaryLogPG::hit_set_create()
12589 {
12590 utime_t now = ceph_clock_now();
12591 // make a copy of the params to modify
12592 HitSet::Params params(pool.info.hit_set_params);
12593
12594 dout(20) << __func__ << " " << params << dendl;
12595 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12596 BloomHitSet::Params *p =
12597 static_cast<BloomHitSet::Params*>(params.impl.get());
12598
12599 // convert false positive rate so it holds up across the full period
12600 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12601 if (p->get_fpp() <= 0.0)
12602 p->set_fpp(.01); // fpp cannot be zero!
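// Worked example (illustrative values, not defaults): with a configured
// fpp of 0.05 and hit_set_count = 4, each individual bloom filter is
// built with fpp = 0.0125; by a union bound across the 4 sets covering
// the period, the combined false-positive chance stays near
// 4 * 0.0125 = 0.05, i.e. roughly the configured rate.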
12603
12604 // if we don't have a specified size, estimate target size based on the
12605 // previous bin!
12606 if (p->target_size == 0 && hit_set) {
12607 utime_t dur = now - hit_set_start_stamp;
12608 unsigned unique = hit_set->approx_unique_insert_count();
12609 dout(20) << __func__ << " previous set had approx " << unique
12610 << " unique items over " << dur << " seconds" << dendl;
12611 p->target_size = (double)unique * (double)pool.info.hit_set_period
12612 / (double)dur;
12613 }
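// Worked example (illustrative values): if the previous set saw ~1500
// unique inserts over a 600s lifetime and hit_set_period is 1200s, the
// estimate is target_size = 1500 * 1200 / 600 = 3000 entries, before
// the min/max clamps below are applied.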
12614 if (p->target_size <
12615 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12616 p->target_size = cct->_conf->osd_hit_set_min_size;
12617
12618 if (p->target_size
12619 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12620 p->target_size = cct->_conf->osd_hit_set_max_size;
12621
12622 p->seed = now.sec();
12623
12624 dout(10) << __func__ << " target_size " << p->target_size
12625 << " fpp " << p->get_fpp() << dendl;
12626 }
12627 hit_set.reset(new HitSet(params));
12628 hit_set_start_stamp = now;
12629 }
12630
12631 /**
12632 * apply log entries to set
12633 *
12634 * this would only happen after peering, to at least capture writes
12635 * during an interval that was potentially lost.
12636 */
12637 bool PrimaryLogPG::hit_set_apply_log()
12638 {
12639 if (!hit_set)
12640 return false;
12641
12642 eversion_t to = info.last_update;
12643 eversion_t from = info.hit_set.current_last_update;
12644 if (to <= from) {
12645 dout(20) << __func__ << " no update" << dendl;
12646 return false;
12647 }
12648
12649 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12650 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12651 while (p != pg_log.get_log().log.rend() && p->version > to)
12652 ++p;
12653 while (p != pg_log.get_log().log.rend() && p->version > from) {
12654 hit_set->insert(p->soid);
12655 ++p;
12656 }
12657
12658 return true;
12659 }
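// Illustrative trace: with from = current_last_update = 10'20 and
// to = last_update = 10'25, the reverse walk skips nothing (no entries
// are newer than 10'25) and inserts the objects touched by entries
// 10'25 down to 10'21 -- the half-open interval (from, to].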
12660
12661 void PrimaryLogPG::hit_set_persist()
12662 {
12663 dout(10) << __func__ << dendl;
12664 bufferlist bl;
12665 unsigned max = pool.info.hit_set_count;
12666
12667 utime_t now = ceph_clock_now();
12668 hobject_t oid;
12669
12670 // If any archives are degraded we skip this persist request
12671 // account for the additional entry being added below
12672 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12673 p != info.hit_set.history.end();
12674 ++p) {
12675 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12676
12677 // Once we hit a degraded object just skip this persist request
12678 if (is_degraded_or_backfilling_object(aoid))
12679 return;
12680 if (scrubber.write_blocked_by_scrub(aoid))
12681 return;
12682 }
12683
12684 // If backfill is in progress and we could possibly overlap with the
12685 // hit_set_* objects, back off. Since these all have
12686 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12687 // look just at that. This is necessary because our transactions
12688 // may include a modify of the new hit_set *and* a delete of the
12689 // old one, and this may span the backfill boundary.
12690 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12691 p != backfill_targets.end();
12692 ++p) {
12693 assert(peer_info.count(*p));
12694 const pg_info_t& pi = peer_info[*p];
12695 if (pi.last_backfill == hobject_t() ||
12696 pi.last_backfill.get_hash() == info.pgid.ps()) {
12697 dout(10) << __func__ << " backfill target osd." << *p
12698 << " last_backfill has not progressed past pgid ps"
12699 << dendl;
12700 return;
12701 }
12702 }
12703
12704
12705 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12706 new_hset.begin = hit_set_start_stamp;
12707 new_hset.end = now;
12708 oid = get_hit_set_archive_object(
12709 new_hset.begin,
12710 new_hset.end,
12711 new_hset.using_gmt);
12712
12713 // If the current object is degraded we skip this persist request
12714 if (scrubber.write_blocked_by_scrub(oid))
12715 return;
12716
12717 hit_set->seal();
12718 ::encode(*hit_set, bl);
12719 dout(20) << __func__ << " archive " << oid << dendl;
12720
12721 if (agent_state) {
12722 agent_state->add_hit_set(new_hset.begin, hit_set);
12723 uint32_t size = agent_state->hit_set_map.size();
12724 if (size >= pool.info.hit_set_count) {
12725 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12726 }
12727 hit_set_in_memory_trim(size);
12728 }
12729
12730 ObjectContextRef obc = get_object_context(oid, true);
12731 OpContextUPtr ctx = simple_opc_create(obc);
12732
12733 ctx->at_version = get_next_version();
12734 ctx->updated_hset_history = info.hit_set;
12735 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12736
12737 updated_hit_set_hist.current_last_update = info.last_update;
12738 new_hset.version = ctx->at_version;
12739
12740 updated_hit_set_hist.history.push_back(new_hset);
12741 hit_set_create();
12742
12743 // fabricate an object_info_t and SnapSet
12744 obc->obs.oi.version = ctx->at_version;
12745 obc->obs.oi.mtime = now;
12746 obc->obs.oi.size = bl.length();
12747 obc->obs.exists = true;
12748 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12749
12750 ctx->new_obs = obc->obs;
12751
12752 obc->ssc->snapset.head_exists = true;
12753 ctx->new_snapset = obc->ssc->snapset;
12754
12755 ctx->delta_stats.num_objects++;
12756 ctx->delta_stats.num_objects_hit_set_archive++;
12757 ctx->delta_stats.num_bytes += bl.length();
12758 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12759
12760 bufferlist bss;
12761 ::encode(ctx->new_snapset, bss);
12762 bufferlist boi(sizeof(ctx->new_obs.oi));
12763 ::encode(ctx->new_obs.oi, boi,
12764 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12765
12766 ctx->op_t->create(oid);
12767 if (bl.length()) {
12768 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12769 }
12770 map <string, bufferlist> attrs;
12771 attrs[OI_ATTR].claim(boi);
12772 attrs[SS_ATTR].claim(bss);
12773 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12774 ctx->log.push_back(
12775 pg_log_entry_t(
12776 pg_log_entry_t::MODIFY,
12777 oid,
12778 ctx->at_version,
12779 eversion_t(),
12780 0,
12781 osd_reqid_t(),
12782 ctx->mtime,
12783 0)
12784 );
12785
12786 hit_set_trim(ctx, max);
12787
12788 simple_opc_submit(std::move(ctx));
12789 }
12790
12791 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12792 {
12793 assert(ctx->updated_hset_history);
12794 pg_hit_set_history_t &updated_hit_set_hist =
12795 *(ctx->updated_hset_history);
12796 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12797 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12798 assert(p != updated_hit_set_hist.history.end());
12799 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12800
12801 assert(!is_degraded_or_backfilling_object(oid));
12802
12803 dout(20) << __func__ << " removing " << oid << dendl;
12804 ++ctx->at_version.version;
12805 ctx->log.push_back(
12806 pg_log_entry_t(pg_log_entry_t::DELETE,
12807 oid,
12808 ctx->at_version,
12809 p->version,
12810 0,
12811 osd_reqid_t(),
12812 ctx->mtime,
12813 0));
12814
12815 ctx->op_t->remove(oid);
12816 updated_hit_set_hist.history.pop_front();
12817
12818 ObjectContextRef obc = get_object_context(oid, false);
12819 assert(obc);
12820 --ctx->delta_stats.num_objects;
12821 --ctx->delta_stats.num_objects_hit_set_archive;
12822 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12823 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12824 }
12825 }
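// Example: if history holds 5 archives and max = hit_set_count = 3, the
// two oldest archives are removed, each via a DELETE log entry at a
// freshly bumped at_version, and the hit_set_archive object/byte stats
// are decremented to match.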
12826
12827 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12828 {
12829 while (agent_state->hit_set_map.size() > max_in_memory) {
12830 agent_state->remove_oldest_hit_set();
12831 }
12832 }
12833
12834
12835 // =======================================
12836 // cache agent
12837
12838 void PrimaryLogPG::agent_setup()
12839 {
12840 assert(is_locked());
12841 if (!is_active() ||
12842 !is_primary() ||
12843 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12844 pool.info.tier_of < 0 ||
12845 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12846 agent_clear();
12847 return;
12848 }
12849 if (!agent_state) {
12850 agent_state.reset(new TierAgentState);
12851
12852 // choose random starting position
12853 agent_state->position = hobject_t();
12854 agent_state->position.pool = info.pgid.pool();
12855 agent_state->position.set_hash(pool.info.get_random_pg_position(
12856 info.pgid.pgid,
12857 rand()));
12858 agent_state->start = agent_state->position;
12859
12860 dout(10) << __func__ << " allocated new state, position "
12861 << agent_state->position << dendl;
12862 } else {
12863 dout(10) << __func__ << " keeping existing state" << dendl;
12864 }
12865
12866 if (info.stats.stats_invalid) {
12867 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12868 }
12869
12870 agent_choose_mode();
12871 }
12872
12873 void PrimaryLogPG::agent_clear()
12874 {
12875 agent_stop();
12876 agent_state.reset(NULL);
12877 }
12878
12879 // Return false if we delayed (no objects were operated on over a full pass of the object hash space)
12880 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12881 {
12882 lock();
12883 if (!agent_state) {
12884 dout(10) << __func__ << " no agent state, stopping" << dendl;
12885 unlock();
12886 return true;
12887 }
12888
12889 assert(!deleting);
12890
12891 if (agent_state->is_idle()) {
12892 dout(10) << __func__ << " idle, stopping" << dendl;
12893 unlock();
12894 return true;
12895 }
12896
12897 osd->logger->inc(l_osd_agent_wake);
12898
12899 dout(10) << __func__
12900 << " max " << start_max
12901 << ", flush " << agent_state->get_flush_mode_name()
12902 << ", evict " << agent_state->get_evict_mode_name()
12903 << ", pos " << agent_state->position
12904 << dendl;
12905 assert(is_primary());
12906 assert(is_active());
12907
12908 agent_load_hit_sets();
12909
12910 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12911 assert(base_pool);
12912
12913 int ls_min = 1;
12914 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12915
12916 // list some objects. this conveniently lists clones (oldest to
12917 // newest) before heads... the same order we want to flush in.
12918 //
12919 // NOTE: do not flush the Sequencer. we will assume that the
12920 // listing we get back is imprecise.
12921 vector<hobject_t> ls;
12922 hobject_t next;
12923 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12924 &ls, &next);
12925 assert(r >= 0);
12926 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12927 int started = 0;
12928 for (vector<hobject_t>::iterator p = ls.begin();
12929 p != ls.end();
12930 ++p) {
12931 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12932 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12933 osd->logger->inc(l_osd_agent_skip);
12934 continue;
12935 }
12936 if (is_degraded_or_backfilling_object(*p)) {
12937 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12938 osd->logger->inc(l_osd_agent_skip);
12939 continue;
12940 }
12941 if (is_missing_object(p->get_head())) {
12942 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12943 osd->logger->inc(l_osd_agent_skip);
12944 continue;
12945 }
12946 ObjectContextRef obc = get_object_context(*p, false, NULL);
12947 if (!obc) {
12948 // we didn't flush; we may miss something here.
12949 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12950 osd->logger->inc(l_osd_agent_skip);
12951 continue;
12952 }
12953 if (!obc->obs.exists) {
12954 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12955 osd->logger->inc(l_osd_agent_skip);
12956 continue;
12957 }
12958 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12959 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12960 osd->logger->inc(l_osd_agent_skip);
12961 continue;
12962 }
12963 if (obc->is_blocked()) {
12964 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12965 osd->logger->inc(l_osd_agent_skip);
12966 continue;
12967 }
12968 if (obc->is_request_pending()) {
12969 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12970 osd->logger->inc(l_osd_agent_skip);
12971 continue;
12972 }
12973
12974 // be careful flushing omap to an EC pool.
12975 if (!base_pool->supports_omap() &&
12976 obc->obs.oi.is_omap()) {
12977 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12978 osd->logger->inc(l_osd_agent_skip);
12979 continue;
12980 }
12981
12982 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12983 agent_maybe_evict(obc, false))
12984 ++started;
12985 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12986 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12987 ++started;
12988 --agent_flush_quota;
12989 }
12990 if (started >= start_max) {
12991 // If finishing early, set "next" to the next object
12992 if (++p != ls.end())
12993 next = *p;
12994 break;
12995 }
12996 }
12997
12998 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12999 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13000 agent_state->hist_age = 0;
13001 agent_state->temp_hist.decay();
13002 }
13003
13004 // Total objects operated on so far
13005 int total_started = agent_state->started + started;
13006 bool need_delay = false;
13007
13008 dout(20) << __func__ << " start pos " << agent_state->position
13009 << " next start pos " << next
13010 << " started " << total_started << dendl;
13011
13012 // See if we've made a full pass over the object hash space
13013 // This might check at most ls_max objects a second time to notice that
13014 // we've checked every object at least once.
13015 if (agent_state->position < agent_state->start &&
13016 next >= agent_state->start) {
13017 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13018 if (total_started == 0)
13019 need_delay = true;
13020 else
13021 total_started = 0;
13022 agent_state->start = next;
13023 }
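// Illustrative wrap check: after the scan position wraps past the end of
// the hash space, position < start; once next reaches or passes start,
// the whole space has been covered exactly once.  If nothing was started
// over that full pass, delay rather than spin.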
13024 agent_state->started = total_started;
13025
13026 // See if we are starting from the beginning
13027 if (next.is_max())
13028 agent_state->position = hobject_t();
13029 else
13030 agent_state->position = next;
13031
13032 // Discard old in memory HitSets
13033 hit_set_in_memory_trim(pool.info.hit_set_count);
13034
13035 if (need_delay) {
13036 assert(agent_state->delaying == false);
13037 agent_delay();
13038 unlock();
13039 return false;
13040 }
13041 agent_choose_mode();
13042 unlock();
13043 return true;
13044 }
13045
13046 void PrimaryLogPG::agent_load_hit_sets()
13047 {
13048 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13049 return;
13050 }
13051
13052 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13053 dout(10) << __func__ << dendl;
13054 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13055 p != info.hit_set.history.end(); ++p) {
13056 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13057 dout(10) << __func__ << " loading " << p->begin << "-"
13058 << p->end << dendl;
13059 if (!pool.info.is_replicated()) {
13060 // FIXME: EC not supported here yet
13061 derr << __func__ << " on non-replicated pool" << dendl;
13062 break;
13063 }
13064
13065 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13066 if (is_unreadable_object(oid)) {
13067 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13068 break;
13069 }
13070
13071 ObjectContextRef obc = get_object_context(oid, false);
13072 if (!obc) {
13073 derr << __func__ << ": could not load hitset " << oid << dendl;
13074 break;
13075 }
13076
13077 bufferlist bl;
13078 {
13079 obc->ondisk_read_lock();
13080 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13081 assert(r >= 0);
13082 obc->ondisk_read_unlock();
13083 }
13084 HitSetRef hs(new HitSet);
13085 bufferlist::iterator pbl = bl.begin();
13086 ::decode(*hs, pbl);
13087 agent_state->add_hit_set(p->begin.sec(), hs);
13088 }
13089 }
13090 }
13091 }
13092
13093 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13094 {
13095 if (!obc->obs.oi.is_dirty()) {
13096 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13097 osd->logger->inc(l_osd_agent_skip);
13098 return false;
13099 }
13100 if (obc->obs.oi.is_cache_pinned()) {
13101 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13102 osd->logger->inc(l_osd_agent_skip);
13103 return false;
13104 }
13105
13106 utime_t now = ceph_clock_now();
13107 utime_t ob_local_mtime;
13108 if (obc->obs.oi.local_mtime != utime_t()) {
13109 ob_local_mtime = obc->obs.oi.local_mtime;
13110 } else {
13111 ob_local_mtime = obc->obs.oi.mtime;
13112 }
13113 bool evict_mode_full =
13114 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13115 if (!evict_mode_full &&
13116 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13117 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13118 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13119 osd->logger->inc(l_osd_agent_skip);
13120 return false;
13121 }
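// Worked example (illustrative values): with cache_min_flush_age = 600s,
// a head object last written 150s ago satisfies mtime + 600 > now and is
// skipped as too young; clones (snap != CEPH_NOSNAP) are immutable and
// are considered for flushing regardless of age.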
13122
13123 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13124 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13125 osd->logger->inc(l_osd_agent_skip);
13126 return false;
13127 }
13128
13129 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13130
13131 // FIXME: flush anything dirty, regardless of what distribution of
13132 // ages we expect.
13133
13134 hobject_t oid = obc->obs.oi.soid;
13135 osd->agent_start_op(oid);
13136 // no need to capture a pg ref, can't outlive fop or ctx
13137 std::function<void()> on_flush = [this, oid]() {
13138 osd->agent_finish_op(oid);
13139 };
13140
13141 int result = start_flush(
13142 OpRequestRef(), obc, false, NULL,
13143 on_flush);
13144 if (result != -EINPROGRESS) {
13145 on_flush();
13146 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13147 << " with " << result << dendl;
13148 osd->logger->inc(l_osd_agent_skip);
13149 return false;
13150 }
13151
13152 osd->logger->inc(l_osd_agent_flush);
13153 return true;
13154 }
13155
13156 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13157 {
13158 const hobject_t& soid = obc->obs.oi.soid;
13159 if (!after_flush && obc->obs.oi.is_dirty()) {
13160 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13161 return false;
13162 }
13163 if (!obc->obs.oi.watchers.empty()) {
13164 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13165 return false;
13166 }
13167 if (obc->is_blocked()) {
13168 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13169 return false;
13170 }
13171 if (obc->obs.oi.is_cache_pinned()) {
13172 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13173 return false;
13174 }
13175
13176 if (soid.snap == CEPH_NOSNAP) {
13177 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13178 if (result < 0) {
13179 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13180 return false;
13181 }
13182 }
13183
13184 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13185 // is this object older than cache_min_evict_age?
13186 utime_t now = ceph_clock_now();
13187 utime_t ob_local_mtime;
13188 if (obc->obs.oi.local_mtime != utime_t()) {
13189 ob_local_mtime = obc->obs.oi.local_mtime;
13190 } else {
13191 ob_local_mtime = obc->obs.oi.mtime;
13192 }
13193 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13194 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13195 osd->logger->inc(l_osd_agent_skip);
13196 return false;
13197 }
13198 // is this object old and/or cold enough?
13199 int temp = 0;
13200 uint64_t temp_upper = 0, temp_lower = 0;
13201 if (hit_set)
13202 agent_estimate_temp(soid, &temp);
13203 agent_state->temp_hist.add(temp);
13204 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13205
13206 dout(20) << __func__
13207 << " temp " << temp
13208 << " pos " << temp_lower << "-" << temp_upper
13209 << ", evict_effort " << agent_state->evict_effort
13210 << dendl;
13211 dout(30) << "agent_state:\n";
13212 Formatter *f = Formatter::create("");
13213 f->open_object_section("agent_state");
13214 agent_state->dump(f);
13215 f->close_section();
13216 f->flush(*_dout);
13217 delete f;
13218 *_dout << dendl;
13219
13220 if (1000000 - temp_upper >= agent_state->evict_effort)
13221 return false;
13222 }
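// Illustrative numbers for the threshold above: with temp_upper = 980000
// and evict_effort = 100000, 1000000 - 980000 = 20000 < 100000, so we
// fall through and evict; with temp_upper = 600000 the gap is 400000 >=
// 100000 and the object is kept.  Only histogram positions within
// evict_effort of the top proceed to eviction.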
13223
13224 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13225 OpContextUPtr ctx = simple_opc_create(obc);
13226
13227 if (!ctx->lock_manager.get_lock_type(
13228 ObjectContext::RWState::RWWRITE,
13229 obc->obs.oi.soid,
13230 obc,
13231 OpRequestRef())) {
13232 close_op_ctx(ctx.release());
13233 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13234 return false;
13235 }
13236
13237 osd->agent_start_evict_op();
13238 ctx->register_on_finish(
13239 [this]() {
13240 osd->agent_finish_evict_op();
13241 });
13242
13243 ctx->at_version = get_next_version();
13244 assert(ctx->new_obs.exists);
13245 int r = _delete_oid(ctx.get(), true, false);
13246 if (obc->obs.oi.is_omap())
13247 ctx->delta_stats.num_objects_omap--;
13248 ctx->delta_stats.num_evict++;
13249 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13250 if (obc->obs.oi.is_dirty())
13251 --ctx->delta_stats.num_objects_dirty;
13252 assert(r == 0);
13253 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13254 simple_opc_submit(std::move(ctx));
13255 osd->logger->inc(l_osd_tier_evict);
13256 osd->logger->inc(l_osd_agent_evict);
13257 return true;
13258 }
13259
13260 void PrimaryLogPG::agent_stop()
13261 {
13262 dout(20) << __func__ << dendl;
13263 if (agent_state && !agent_state->is_idle()) {
13264 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13265 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13266 osd->agent_disable_pg(this, agent_state->evict_effort);
13267 }
13268 }
13269
13270 void PrimaryLogPG::agent_delay()
13271 {
13272 dout(20) << __func__ << dendl;
13273 if (agent_state && !agent_state->is_idle()) {
13274 assert(agent_state->delaying == false);
13275 agent_state->delaying = true;
13276 osd->agent_disable_pg(this, agent_state->evict_effort);
13277 }
13278 }
13279
13280 void PrimaryLogPG::agent_choose_mode_restart()
13281 {
13282 dout(20) << __func__ << dendl;
13283 lock();
13284 if (agent_state && agent_state->delaying) {
13285 agent_state->delaying = false;
13286 agent_choose_mode(true);
13287 }
13288 unlock();
13289 }
13290
13291 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13292 {
13293 bool requeued = false;
13294 // Let delay play out
13295 if (agent_state->delaying) {
13296 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13297 return requeued;
13298 }
13299
13300 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13301 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13302 unsigned evict_effort = 0;
13303
13304 if (info.stats.stats_invalid) {
13305 // idle; stats can't be trusted until we scrub.
13306 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13307 goto skip_calc;
13308 }
13309
13310 {
13311 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13312 assert(divisor > 0);
13313
13314 // adjust (effective) user objects down based on the number
13315 // of HitSet objects, which should not count toward our total since
13316 // they cannot be flushed.
13317 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13318
13319 // also exclude omap objects if ec backing pool
13320 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13321 assert(base_pool);
13322 if (!base_pool->supports_omap())
13323 unflushable += info.stats.stats.sum.num_objects_omap;
13324
13325 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13326 if (num_user_objects > unflushable)
13327 num_user_objects -= unflushable;
13328 else
13329 num_user_objects = 0;
13330
13331 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13332 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13333 num_user_bytes -= unflushable_bytes;
13334 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13335 num_user_bytes += num_overhead_bytes;
13336
13337 // also reduce the num_dirty by num_objects_omap
13338 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13339 if (!base_pool->supports_omap()) {
13340 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13341 num_dirty -= info.stats.stats.sum.num_objects_omap;
13342 else
13343 num_dirty = 0;
13344 }
13345
13346 dout(10) << __func__
13347 << " flush_mode: "
13348 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13349 << " evict_mode: "
13350 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13351 << " num_objects: " << info.stats.stats.sum.num_objects
13352 << " num_bytes: " << info.stats.stats.sum.num_bytes
13353 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13354 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13355 << " num_dirty: " << num_dirty
13356 << " num_user_objects: " << num_user_objects
13357 << " num_user_bytes: " << num_user_bytes
13358 << " num_overhead_bytes: " << num_overhead_bytes
13359 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13360 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13361 << dendl;
13362
13363 // get dirty, full ratios
13364 uint64_t dirty_micro = 0;
13365 uint64_t full_micro = 0;
13366 if (pool.info.target_max_bytes && num_user_objects > 0) {
13367 uint64_t avg_size = num_user_bytes / num_user_objects;
13368 dirty_micro =
13369 num_dirty * avg_size * 1000000 /
13370 MAX(pool.info.target_max_bytes / divisor, 1);
13371 full_micro =
13372 num_user_objects * avg_size * 1000000 /
13373 MAX(pool.info.target_max_bytes / divisor, 1);
13374 }
13375 if (pool.info.target_max_objects > 0) {
13376 uint64_t dirty_objects_micro =
13377 num_dirty * 1000000 /
13378 MAX(pool.info.target_max_objects / divisor, 1);
13379 if (dirty_objects_micro > dirty_micro)
13380 dirty_micro = dirty_objects_micro;
13381 uint64_t full_objects_micro =
13382 num_user_objects * 1000000 /
13383 MAX(pool.info.target_max_objects / divisor, 1);
13384 if (full_objects_micro > full_micro)
13385 full_micro = full_objects_micro;
13386 }
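// Worked example (illustrative values): target_max_objects = 1000000
// and divisor = 128 give a per-PG budget of ~7812 objects.  With
// num_user_objects = 6000 and num_dirty = 2500:
//   full_micro  = 6000 * 1000000 / 7812 ~= 768000  (0.768 full)
//   dirty_micro = 2500 * 1000000 / 7812 ~= 320000  (0.32 dirty)
// The byte-based estimates above are computed the same way, and the
// larger of the two candidates wins for each ratio.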
13387 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13388 << " full " << ((float)full_micro / 1000000.0)
13389 << dendl;
13390
13391 // flush mode
13392 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13393 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13394 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13395 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13396 flush_target += flush_slop;
13397 flush_high_target += flush_slop;
13398 } else {
13399 flush_target -= MIN(flush_target, flush_slop);
13400 flush_high_target -= MIN(flush_high_target, flush_slop);
13401 }
13402
13403 if (dirty_micro > flush_high_target) {
13404 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13405 } else if (dirty_micro > flush_target) {
13406 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13407 }
13408
13409 // evict mode
13410 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13411 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13412 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13413 evict_target += evict_slop;
13414 else
13415 evict_target -= MIN(evict_target, evict_slop);
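// The slop provides hysteresis.  Example (illustrative values): with
// cache_target_full_ratio = 0.8 (evict_target = 800000) and
// osd_agent_slop = 0.02, evict_slop = 16000: an idle agent starts
// evicting only once full_micro exceeds 816000, and a running agent
// keeps going until it drops below 784000, avoiding mode flapping.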
13416
13417 if (full_micro > 1000000) {
13418 // evict anything clean
13419 evict_mode = TierAgentState::EVICT_MODE_FULL;
13420 evict_effort = 1000000;
13421 } else if (full_micro > evict_target) {
13422 // set effort in [0..1] range based on where we are between evict_target and 100%
13423 evict_mode = TierAgentState::EVICT_MODE_SOME;
13424 uint64_t over = full_micro - evict_target;
13425 uint64_t span = 1000000 - evict_target;
13426 evict_effort = MAX(over * 1000000 / span,
13427 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13428
13429 // quantize effort to avoid too much reordering in the agent_queue.
13430 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13431 assert(inc > 0);
13432 uint64_t was = evict_effort;
13433 evict_effort -= evict_effort % inc;
13434 if (evict_effort < inc)
13435 evict_effort = inc;
13436 assert(evict_effort >= inc && evict_effort <= 1000000);
13437 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13438 }
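// Worked example (illustrative values): evict_target = 800000 and
// full_micro = 860000 give over = 60000 and span = 200000, so the raw
// effort is 60000 * 1000000 / 200000 = 300000.  With
// osd_agent_quantize_effort = 0.1 the quantum is 100000; 300000 is
// already a multiple, while a raw 437000 would round down to 400000.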
13439 }
13440
13441 skip_calc:
13442 bool old_idle = agent_state->is_idle();
13443 if (flush_mode != agent_state->flush_mode) {
13444 dout(5) << __func__ << " flush_mode "
13445 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13446 << " -> "
13447 << TierAgentState::get_flush_mode_name(flush_mode)
13448 << dendl;
13449 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13450 osd->agent_inc_high_count();
13451 info.stats.stats.sum.num_flush_mode_high = 1;
13452 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13453 info.stats.stats.sum.num_flush_mode_low = 1;
13454 }
13455 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13456 osd->agent_dec_high_count();
13457 info.stats.stats.sum.num_flush_mode_high = 0;
13458 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13459 info.stats.stats.sum.num_flush_mode_low = 0;
13460 }
13461 agent_state->flush_mode = flush_mode;
13462 }
13463 if (evict_mode != agent_state->evict_mode) {
13464 dout(5) << __func__ << " evict_mode "
13465 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13466 << " -> "
13467 << TierAgentState::get_evict_mode_name(evict_mode)
13468 << dendl;
13469 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13470 is_active()) {
13471 if (op)
13472 requeue_op(op);
13473 requeue_ops(waiting_for_active);
13474 requeue_ops(waiting_for_scrub);
13475 requeue_ops(waiting_for_cache_not_full);
13476 objects_blocked_on_cache_full.clear();
13477 requeued = true;
13478 }
13479 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13480 info.stats.stats.sum.num_evict_mode_some = 1;
13481 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13482 info.stats.stats.sum.num_evict_mode_full = 1;
13483 }
13484 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13485 info.stats.stats.sum.num_evict_mode_some = 0;
13486 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13487 info.stats.stats.sum.num_evict_mode_full = 0;
13488 }
13489 agent_state->evict_mode = evict_mode;
13490 }
13491 uint64_t old_effort = agent_state->evict_effort;
13492 if (evict_effort != agent_state->evict_effort) {
13493 dout(5) << __func__ << " evict_effort "
13494 << ((float)agent_state->evict_effort / 1000000.0)
13495 << " -> "
13496 << ((float)evict_effort / 1000000.0)
13497 << dendl;
13498 agent_state->evict_effort = evict_effort;
13499 }
13500
13501 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13502 // (including flush). This is probably fine (they should be
13503 // correlated) but it is not precisely correct.
13504 if (agent_state->is_idle()) {
13505 if (!restart && !old_idle) {
13506 osd->agent_disable_pg(this, old_effort);
13507 }
13508 } else {
13509 if (restart || old_idle) {
13510 osd->agent_enable_pg(this, agent_state->evict_effort);
13511 } else if (old_effort != agent_state->evict_effort) {
13512 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13513 }
13514 }
13515 return requeued;
13516 }
13517
13518 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13519 {
13520 assert(hit_set);
13521 assert(temp);
13522 *temp = 0;
13523 if (hit_set->contains(oid))
13524 *temp = 1000000;
13525 unsigned i = 0;
13526 int last_n = pool.info.hit_set_search_last_n;
13527 for (map<time_t,HitSetRef>::reverse_iterator p =
13528 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13529 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13530 if (p->second->contains(oid)) {
13531 *temp += pool.info.get_grade(i);
13532 --last_n;
13533 }
13534 }
13535 }
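// Example of the grading above (illustrative): an object present in the
// current hit set starts at temp = 1000000; if it also appears in the
// two most recent archived sets it accrues get_grade(0) and
// get_grade(1), with hit_set_search_last_n bounding how many archived
// hits may contribute.  An object seen nowhere keeps temp = 0.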
13536
13537 // Dup op detection
13538
13539 bool PrimaryLogPG::already_complete(eversion_t v)
13540 {
13541 dout(20) << __func__ << ": " << v << dendl;
13542 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13543 !i.end();
13544 ++i) {
13545 dout(20) << __func__ << ": " << **i << dendl;
13546 // skip copy from temp object ops
13547 if ((*i)->v == eversion_t()) {
13548 dout(20) << __func__ << ": " << **i
13549 << " version is empty" << dendl;
13550 continue;
13551 }
13552 if ((*i)->v > v) {
13553 dout(20) << __func__ << ": " << **i
13554 << " (*i)->v past v" << dendl;
13555 break;
13556 }
13557 if (!(*i)->all_committed) {
13558 dout(20) << __func__ << ": " << **i
13559 << " not committed, returning false"
13560 << dendl;
13561 return false;
13562 }
13563 }
13564 dout(20) << __func__ << ": returning true" << dendl;
13565 return true;
13566 }
13567
13568 bool PrimaryLogPG::already_ack(eversion_t v)
13569 {
13570 dout(20) << __func__ << ": " << v << dendl;
13571 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13572 !i.end();
13573 ++i) {
13574 // skip copy from temp object ops
13575 if ((*i)->v == eversion_t()) {
13576 dout(20) << __func__ << ": " << **i
13577 << " version is empty" << dendl;
13578 continue;
13579 }
13580 if ((*i)->v > v) {
13581 dout(20) << __func__ << ": " << **i
13582 << " (*i)->v past v" << dendl;
13583 break;
13584 }
13585 if (!(*i)->all_applied) {
13586 dout(20) << __func__ << ": " << **i
13587 << " not applied, returning false"
13588 << dendl;
13589 return false;
13590 }
13591 }
13592 dout(20) << __func__ << ": returning true" << dendl;
13593 return true;
13594 }
13595
13596
13597 // ==========================================================================================
13598 // SCRUB
13599
13600
13601 bool PrimaryLogPG::_range_available_for_scrub(
13602 const hobject_t &begin, const hobject_t &end)
13603 {
13604 pair<hobject_t, ObjectContextRef> next;
13605 next.second = object_contexts.lookup(begin);
13606 next.first = begin;
13607 bool more = true;
13608 while (more && next.first < end) {
13609 if (next.second && next.second->is_blocked()) {
13610 next.second->requeue_scrub_on_unblock = true;
13611 dout(10) << __func__ << ": scrub delayed, "
13612 << next.first << " is blocked"
13613 << dendl;
13614 return false;
13615 }
13616 more = object_contexts.get_next(next.first, &next);
13617 }
13618 return true;
13619 }
13620
13621 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13622 const vector<snapid_t>::reverse_iterator &curclone) {
13623 return snapset && curclone != snapset.get().clones.rend();
13624 }
13625
13626 void PrimaryLogPG::log_missing(unsigned missing,
13627 const boost::optional<hobject_t> &head,
13628 LogChannelRef clog,
13629 const spg_t &pgid,
13630 const char *func,
13631 const char *mode,
13632 bool allow_incomplete_clones)
13633 {
13634 assert(head);
13635 if (allow_incomplete_clones) {
13636 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13637 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13638 } else {
13639 clog->info() << mode << " " << pgid << " " << head.get()
13640 << " " << missing << " missing clone(s)";
13641 }
13642 }
13643
13644 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13645 const boost::optional<SnapSet> &snapset,
13646 LogChannelRef clog,
13647 const spg_t &pgid,
13648 const char *mode,
13649 bool allow_incomplete_clones,
13650 boost::optional<snapid_t> target,
13651 vector<snapid_t>::reverse_iterator *curclone,
13652 inconsistent_snapset_wrapper &e)
13653 {
13654 assert(head);
13655 assert(snapset);
13656 unsigned missing = 0;
13657
13658 // NOTE: clones are in descending order, thus **curclone > target test here
13659 hobject_t next_clone(head.get());
13660 while (doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13661 ++missing;
13662 // it is okay to be missing one or more clones in a cache tier.
13663 // skip higher-numbered clones in the list.
13664 if (!allow_incomplete_clones) {
13665 next_clone.snap = **curclone;
13666 clog->error() << mode << " " << pgid << " " << head.get()
13667 << " expected clone " << next_clone << " " << missing
13668 << " missing";
13669 ++scrubber.shallow_errors;
13670 e.set_clone_missing(next_clone.snap);
13671 }
13672 // Clones are descending
13673 ++(*curclone);
13674 }
13675 return missing;
13676 }
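// Example: with snapset.clones = [1, 2, 4, 6] the reverse iterator
// yields 6, 4, 2, 1.  A call with target = 4 flags clone 6 as missing
// (logging a clog error unless incomplete clones are allowed), advances
// *curclone to 4, and returns missing = 1.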
13677
13678 /*
13679 * Validate consistency of the object info and snap sets.
13680 *
13681 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13682 * the comparison of the objects is against multiple snapset.clones. There are
13683 * multiple clone lists and in between lists we expect head or snapdir.
13684 *
13685 * Example
13686 *
13687 * objects expected
13688 * ======= =======
13689 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13690 * obj2 head head/snapdir, head ok
13691 * [SnapSet clones 6 4 2 1]
13692 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13693 * obj2 snap 6 obj2 snap 6, match
13694 * obj2 snap 4 obj2 snap 4, match
13695 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13696 * [Snapset clones 3 1]
13697 * obj3 snap 3 obj3 snap 3 match
13698 * obj3 snap 1 obj3 snap 1 match
13699 * obj4 snapdir head/snapdir, snapdir ok
13700 * [Snapset clones 4]
13701 * EOL obj4 snap 4, (expected)
13702 */
13703 void PrimaryLogPG::scrub_snapshot_metadata(
13704 ScrubMap &scrubmap,
13705 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13706 {
13707 dout(10) << __func__ << dendl;
13708
13709 coll_t c(info.pgid);
13710 bool repair = state_test(PG_STATE_REPAIR);
13711 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13712 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13713 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13714
13715 /// snapsets to repair
13716 map<hobject_t,SnapSet> snapset_to_repair;
13717
13718 // traverse in reverse order.
13719 boost::optional<hobject_t> head;
13720 boost::optional<SnapSet> snapset; // If set, head (above) will be set too
13721 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13722 unsigned missing = 0;
13723 inconsistent_snapset_wrapper soid_error, head_error;
13724
13725 bufferlist last_data;
13726
13727 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13728 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13729 const hobject_t& soid = p->first;
13730 soid_error = inconsistent_snapset_wrapper{soid};
13731 object_stat_sum_t stat;
13732 boost::optional<object_info_t> oi;
13733
13734 if (!soid.is_snapdir())
13735 stat.num_objects++;
13736
13737 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13738 stat.num_objects_hit_set_archive++;
13739
13740 if (soid.is_snap()) {
13741 // it's a clone
13742 stat.num_object_clones++;
13743 }
13744
13745 // basic checks.
13746 if (p->second.attrs.count(OI_ATTR) == 0) {
13747 oi = boost::none;
13748 osd->clog->error() << mode << " " << info.pgid << " " << soid
13749 << " no '" << OI_ATTR << "' attr";
13750 ++scrubber.shallow_errors;
13751 soid_error.set_oi_attr_missing();
13752 } else {
13753 bufferlist bv;
13754 bv.push_back(p->second.attrs[OI_ATTR]);
13755 try {
13756 oi = object_info_t(); // Initialize optional<> before decode into it
13757 oi.get().decode(bv);
13758 } catch (buffer::error& e) {
13759 oi = boost::none;
13760 osd->clog->error() << mode << " " << info.pgid << " " << soid
13761 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13762 ++scrubber.shallow_errors;
13763 soid_error.set_oi_attr_corrupted();
13764 soid_error.set_oi_attr_missing(); // Not available too
13765 }
13766 }
13767
13768 if (oi) {
13769 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13770 osd->clog->error() << mode << " " << info.pgid << " " << soid
13771 << " on disk size (" << p->second.size
13772 << ") does not match object info size ("
13773 << oi->size << ") adjusted for ondisk to ("
13774 << pgbackend->be_get_ondisk_size(oi->size)
13775 << ")";
13776 soid_error.set_size_mismatch();
13777 ++scrubber.shallow_errors;
13778 }
13779
13780 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13781
13782 // A clone num_bytes will be added later when we have snapset
13783 if (!soid.is_snap()) {
13784 stat.num_bytes += oi->size;
13785 }
13786 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13787 stat.num_bytes_hit_set_archive += oi->size;
13788
13789 if (!soid.is_snapdir()) {
13790 if (oi->is_dirty())
13791 ++stat.num_objects_dirty;
13792 if (oi->is_whiteout())
13793 ++stat.num_whiteouts;
13794 if (oi->is_omap())
13795 ++stat.num_objects_omap;
13796 if (oi->is_cache_pinned())
13797 ++stat.num_objects_pinned;
13798 }
13799 } else {
13800 // pessimistic assumption that this object might contain a
13801 // legacy SnapSet
13802 stat.num_legacy_snapsets++;
13803 }
13804
13805 // Check for any problems while processing clones
13806 if (doing_clones(snapset, curclone)) {
13807 boost::optional<snapid_t> target;
13808 // Expecting an object with snap for current head
13809 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13810
13811 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13812 << soid << " while processing " << head.get() << dendl;
13813
13814 target = all_clones;
13815 } else {
13816 assert(soid.is_snap());
13817 target = soid.snap;
13818 }
13819
13820 // Log any clones we were expecting to be there up to target
13821 // This will set missing, but will be a no-op if soid.snap == *curclone.
13822 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13823 pool.info.allow_incomplete_clones(), target, &curclone,
13824 head_error);
13825 }
13826 bool expected;
13827 // Check doing_clones() again in case we ran process_clones_to()
13828 if (doing_clones(snapset, curclone)) {
13829 // A head/snapdir would have processed all clones above
13830 // or all greater than *curclone.
13831 assert(soid.is_snap() && *curclone <= soid.snap);
13832
13833 // After processing above clone snap should match the expected curclone
13834 expected = (*curclone == soid.snap);
13835 } else {
13836 // If we aren't doing clones any longer, then expecting head/snapdir
13837 expected = soid.has_snapset();
13838 }
13839 if (!expected) {
13840 // If we couldn't read the head's snapset, just ignore clones
13841 if (head && !snapset) {
13842 osd->clog->error() << mode << " " << info.pgid << " " << soid
13843 << " clone ignored due to missing snapset";
13844 } else {
13845 osd->clog->error() << mode << " " << info.pgid << " " << soid
13846 << " is an unexpected clone";
13847 }
13848 ++scrubber.shallow_errors;
13849 soid_error.set_headless();
13850 scrubber.store->add_snap_error(pool.id, soid_error);
13851 if (head && soid.get_head() == head->get_head())
13852 head_error.set_clone(soid.snap);
13853 continue;
13854 }
13855
13856 // new snapset?
13857 if (soid.has_snapset()) {
13858
13859 if (missing) {
13860 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13861 pool.info.allow_incomplete_clones());
13862 }
13863
13864 // Save previous head error information
13865 if (head && head_error.errors)
13866 scrubber.store->add_snap_error(pool.id, head_error);
13867 // Set this as a new head object
13868 head = soid;
13869 missing = 0;
13870 head_error = soid_error;
13871
13872 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13873
13874 if (p->second.attrs.count(SS_ATTR) == 0) {
13875 osd->clog->error() << mode << " " << info.pgid << " " << soid
13876 << " no '" << SS_ATTR << "' attr";
13877 ++scrubber.shallow_errors;
13878 snapset = boost::none;
13879 head_error.set_ss_attr_missing();
13880 } else {
13881 bufferlist bl;
13882 bl.push_back(p->second.attrs[SS_ATTR]);
13883 bufferlist::iterator blp = bl.begin();
13884 try {
13885 snapset = SnapSet(); // Initialize optional<> before decoding into it
13886 ::decode(snapset.get(), blp);
13887 } catch (buffer::error& e) {
13888 snapset = boost::none;
13889 osd->clog->error() << mode << " " << info.pgid << " " << soid
13890 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13891 ++scrubber.shallow_errors;
13892 head_error.set_ss_attr_corrupted();
13893 }
13894 }
13895
13896 if (snapset) {
13897 // what will be next?
13898 curclone = snapset->clones.rbegin();
13899
13900 if (!snapset->clones.empty()) {
13901 dout(20) << " snapset " << snapset.get() << dendl;
13902 if (snapset->seq == 0) {
13903 osd->clog->error() << mode << " " << info.pgid << " " << soid
13904 << " snaps.seq not set";
13905 ++scrubber.shallow_errors;
13906 head_error.set_snapset_mismatch();
13907 }
13908 }
13909
13910 if (soid.is_head() && !snapset->head_exists) {
13911 osd->clog->error() << mode << " " << info.pgid << " " << soid
13912 << " snapset.head_exists=false, but head exists";
13913 ++scrubber.shallow_errors;
13914 head_error.set_head_mismatch();
13915 // Fix head_exists locally so is_legacy() returns correctly
13916 snapset->head_exists = true;
13917 }
13918 if (soid.is_snapdir() && snapset->head_exists) {
13919 osd->clog->error() << mode << " " << info.pgid << " " << soid
13920 << " snapset.head_exists=true, but snapdir exists";
13921 ++scrubber.shallow_errors;
13922 head_error.set_head_mismatch();
13923 // For symmetry fix this too, but probably doesn't matter
13924 snapset->head_exists = false;
13925 }
13926
13927 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13928 if (soid.is_snapdir()) {
13929 dout(10) << " will move snapset to head from " << soid << dendl;
13930 snapset_to_repair[soid.get_head()] = *snapset;
13931 } else if (snapset->is_legacy()) {
13932 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13933 << dendl;
13934 snapset_to_repair[soid.get_head()] = *snapset;
13935 }
13936 } else {
13937 stat.num_legacy_snapsets++;
13938 }
13939 } else {
13940 // pessimistic assumption that this object might contain a
13941 // legacy SnapSet
13942 stat.num_legacy_snapsets++;
13943 }
13944 } else {
13945 assert(soid.is_snap());
13946 assert(head);
13947 assert(snapset);
13948 assert(soid.snap == *curclone);
13949
13950 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13951
13952 if (snapset->clone_size.count(soid.snap) == 0) {
13953 osd->clog->error() << mode << " " << info.pgid << " " << soid
13954 << " is missing in clone_size";
13955 ++scrubber.shallow_errors;
13956 soid_error.set_size_mismatch();
13957 } else {
13958 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13959 osd->clog->error() << mode << " " << info.pgid << " " << soid
13960 << " size " << oi->size << " != clone_size "
13961 << snapset->clone_size[*curclone];
13962 ++scrubber.shallow_errors;
13963 soid_error.set_size_mismatch();
13964 }
13965
13966 if (snapset->clone_overlap.count(soid.snap) == 0) {
13967 osd->clog->error() << mode << " " << info.pgid << " " << soid
13968 << " is missing in clone_overlap";
13969 ++scrubber.shallow_errors;
13970 soid_error.set_size_mismatch();
13971 } else {
13972 // This checking is based on get_clone_bytes(). The first 2 asserts
13973 // can't happen because we know we have a clone_size and
13974 // a clone_overlap. Now we check that the interval_set won't
13975 // cause the last assert.
13976 uint64_t size = snapset->clone_size.find(soid.snap)->second;
13977 const interval_set<uint64_t> &overlap =
13978 snapset->clone_overlap.find(soid.snap)->second;
13979 bool bad_interval_set = false;
13980 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13981 i != overlap.end(); ++i) {
13982 if (size < i.get_len()) {
13983 bad_interval_set = true;
13984 break;
13985 }
13986 size -= i.get_len();
13987 }
13988
13989 if (bad_interval_set) {
13990 osd->clog->error() << mode << " " << info.pgid << " " << soid
13991 << " bad interval_set in clone_overlap";
13992 ++scrubber.shallow_errors;
13993 soid_error.set_size_mismatch();
13994 } else {
13995 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13996 }
13997 }
13998 }
13999
14000 // migrate legacy_snaps to snapset?
14001 auto p = snapset_to_repair.find(soid.get_head());
14002 if (p != snapset_to_repair.end()) {
14003 if (!oi || oi->legacy_snaps.empty()) {
14004 osd->clog->error() << mode << " " << info.pgid << " " << soid
14005 << " has no oi or legacy_snaps; cannot convert "
14006 << *snapset;
14007 ++scrubber.shallow_errors;
14008 } else {
14009 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14010 << " to snapset " << p->second << dendl;
14011 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14012 }
14013 }
14014
14015 // what's next?
14016 ++curclone;
14017 if (soid_error.errors)
14018 scrubber.store->add_snap_error(pool.id, soid_error);
14019 }
14020
14021 scrub_cstat.add(stat);
14022 }
14023
14024 if (doing_clones(snapset, curclone)) {
14025 dout(10) << __func__ << " " << mode << " " << info.pgid
14026 << " No more objects while processing " << head.get() << dendl;
14027
14028 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14029 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14030 head_error);
14031 }
14032 // There could be missing clones found by the test above, or even
14033 // from before dropping out of the loop for the last head.
14034 if (missing) {
14035 log_missing(missing, head, osd->clog, info.pgid, __func__,
14036 mode, pool.info.allow_incomplete_clones());
14037 }
14038 if (head && head_error.errors)
14039 scrubber.store->add_snap_error(pool.id, head_error);
14040
14041 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14042 missing_digest.begin();
14043 p != missing_digest.end();
14044 ++p) {
14045 if (p->first.is_snapdir())
14046 continue;
14047 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14048 ObjectContextRef obc = get_object_context(p->first, false);
14049 if (!obc) {
14050 osd->clog->error() << info.pgid << " " << mode
14051 << " cannot get object context for object "
14052 << p->first;
14053 continue;
14054 } else if (obc->obs.oi.soid != p->first) {
14055 osd->clog->error() << info.pgid << " " << mode
14056 << " object " << p->first
14057 << " has a valid oi attr with a mismatched name, "
14058 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14059 continue;
14060 }
14061 OpContextUPtr ctx = simple_opc_create(obc);
14062 ctx->at_version = get_next_version();
14063 ctx->mtime = utime_t(); // do not update mtime
14064 ctx->new_obs.oi.set_data_digest(p->second.first);
14065 ctx->new_obs.oi.set_omap_digest(p->second.second);
14066 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14067
14068 ctx->register_on_success(
14069 [this]() {
14070 dout(20) << "updating scrub digest" << dendl;
14071 if (--scrubber.num_digest_updates_pending == 0) {
14072 requeue_scrub();
14073 }
14074 });
14075
14076 simple_opc_submit(std::move(ctx));
14077 ++scrubber.num_digest_updates_pending;
14078 }
14079 for (auto& p : snapset_to_repair) {
14080 // cache pools may not have the clones, which means we won't know
14081 // what snaps they have. fake out the clone_snaps entries anyway (with
14082 // blank snap lists).
14083 p.second.head_exists = true;
14084 if (pool.info.allow_incomplete_clones()) {
14085 for (auto s : p.second.clones) {
14086 if (p.second.clone_snaps.count(s) == 0) {
14087 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14088 << s << dendl;
14089 p.second.clone_snaps[s];
14090 }
14091 }
14092 }
14093 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14094 p.second.is_legacy()) {
14095 // this happens if we encounter other errors above, like a missing
14096 // or extra clone.
14097 dout(10) << __func__ << " not writing snapset to " << p.first
14098 << " snapset " << p.second << " clones " << p.second.clones
14099 << "; didn't convert fully" << dendl;
14100 scrub_cstat.sum.num_legacy_snapsets++;
14101 continue;
14102 }
14103 dout(10) << __func__ << " writing snapset to " << p.first
14104 << " " << p.second << dendl;
14105 ObjectContextRef obc = get_object_context(p.first, true);
14106 if (!obc) {
14107 osd->clog->error() << info.pgid << " " << mode
14108 << " cannot get object context for object "
14109 << p.first;
14110 continue;
14111 } else if (obc->obs.oi.soid != p.first) {
14112 osd->clog->error() << info.pgid << " " << mode
14113 << " object " << p.first
14114 << " has a valid oi attr with a mismatched name, "
14115 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14116 continue;
14117 }
14118 ObjectContextRef snapset_obc;
14119 if (!obc->obs.exists) {
14120 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14121 if (!snapset_obc) {
14122 osd->clog->error() << info.pgid << " " << mode
14123 << " cannot get object context for "
14124 << p.first.get_snapdir();
14125 continue;
14126 }
14127 }
14128 OpContextUPtr ctx = simple_opc_create(obc);
14129 PGTransaction *t = ctx->op_t.get();
14130 ctx->snapset_obc = snapset_obc;
14131 ctx->at_version = get_next_version();
14132 ctx->mtime = utime_t(); // do not update mtime
14133 ctx->new_snapset = p.second;
14134 if (!ctx->new_obs.exists) {
14135 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14136 ctx->new_obs.exists = true;
14137 ctx->new_snapset.head_exists = true;
14138 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14139 ++ctx->delta_stats.num_whiteouts;
14140 ++ctx->delta_stats.num_objects;
14141 t->create(p.first);
14142 if (p.first < scrubber.start) {
14143 dout(20) << __func__ << " kludging around update outside of scrub range"
14144 << dendl;
14145 } else {
14146 scrub_cstat.add(ctx->delta_stats);
14147 }
14148 }
14149 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14150 assert(!ctx->new_snapset.is_legacy());
14151 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14152 ctx->register_on_success(
14153 [this]() {
14154 dout(20) << "updating snapset" << dendl;
14155 if (--scrubber.num_digest_updates_pending == 0) {
14156 requeue_scrub();
14157 }
14158 });
14159
14160 simple_opc_submit(std::move(ctx));
14161 ++scrubber.num_digest_updates_pending;
14162 }
14163
14164 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14165 }
14166
14167 void PrimaryLogPG::_scrub_clear_state()
14168 {
14169 scrub_cstat = object_stat_collection_t();
14170 }
14171
14172 void PrimaryLogPG::_scrub_finish()
14173 {
14174 bool repair = state_test(PG_STATE_REPAIR);
14175 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14176 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14177
14178 if (info.stats.stats_invalid) {
14179 info.stats.stats = scrub_cstat;
14180 info.stats.stats_invalid = false;
14181
14182 if (agent_state)
14183 agent_choose_mode();
14184 }
14185
14186 dout(10) << mode << " got "
14187 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14188 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14189 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14190 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14191 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14192 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14193 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14194 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14195 << dendl;
14196
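// Compare the stats gathered by scrub against the PG's accounted stats;
// counters flagged *_stats_invalid are known-stale and are skipped.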
14197 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14198 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14199 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14200 !info.stats.dirty_stats_invalid) ||
14201 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14202 !info.stats.omap_stats_invalid) ||
14203 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14204 !info.stats.pin_stats_invalid) ||
14205 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14206 !info.stats.hitset_stats_invalid) ||
14207 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14208 !info.stats.hitset_bytes_stats_invalid) ||
14209 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14210 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14211 osd->clog->error() << info.pgid << " " << mode
14212 << " stat mismatch, got "
14213 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14214 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14215 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14216 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14217 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14218 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14219 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14220 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14221 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14222 ++scrubber.shallow_errors;
14223
14224 if (repair) {
14225 ++scrubber.fixed;
14226 info.stats.stats = scrub_cstat;
14227 info.stats.dirty_stats_invalid = false;
14228 info.stats.omap_stats_invalid = false;
14229 info.stats.hitset_stats_invalid = false;
14230 info.stats.hitset_bytes_stats_invalid = false;
14231 publish_stats_to_osd();
14232 share_pg_info();
14233 }
14234 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14235 info.stats.stats.sum.num_legacy_snapsets) {
14236 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14237 << " from " << info.stats.stats.sum.num_legacy_snapsets
14238 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14239 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14240 publish_stats_to_osd();
14241 share_pg_info();
14242 }
14243 // Drop cached object contexts so subsequent ops re-read the repaired on-disk state
14244 if (repair)
14245 object_contexts.clear();
14246 }
14247
14248 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14249 {
14250 return osd->check_osdmap_full(missing_on);
14251 }
14252
14253 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14254 {
14255 // Only supports replicated pools
14256 assert(!pool.info.require_rollback());
14257 assert(is_primary());
14258
14259 dout(10) << __func__ << " " << soid
14260 << " peers osd.{" << actingbackfill << "}" << dendl;
14261
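// Repair needs a stable, fully recovered acting set; if the PG isn't
// clean yet, park the op until it is.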
14262 if (!is_clean()) {
14263 block_for_clean(soid, op);
14264 return -EAGAIN;
14265 }
14266
14267 assert(!pg_log.get_missing().is_missing(soid));
14268 bufferlist bv;
14269 object_info_t oi;
14270 eversion_t v;
14271 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14272 if (r < 0) {
14273 // Getting the attr failed; leave v default-constructed and try to repair without a version.
14274 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14275 << soid << " error=" << r << dendl;
14276 } else try {
14277 bufferlist::iterator bliter = bv.begin();
14278 ::decode(oi, bliter);
14279 v = oi.version;
14280 } catch (...) {
14281 // Leave v as default constructed. This will fail when sent to older OSDs, but
14282 // not much worse than failing here.
14283 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14284 }
14285
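// Mark the primary's copy missing so recovery pulls an authoritative
// copy from a replica (at version v, if we managed to decode it).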
14286 missing_loc.add_missing(soid, v, eversion_t());
14287 if (primary_error(soid, v)) {
14288 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14289 // XXX: If we knew that there is no down osd which could include this
14290 // object, it would be nice if we could return EIO here.
14291 // If a "never fail" flag was available, that could be used
14292 // for rbd to NOT return EIO until object marked lost.
14293
14294 // Drop through to save this op in case an osd comes up with the object.
14295 }
14296
14297 // Restart the op once the object becomes readable again
14298 waiting_for_unreadable_object[soid].push_back(op);
14299 op->mark_delayed("waiting for missing object");
14300
14301 if (!eio_errors_to_process) {
14302 eio_errors_to_process = true;
14303 assert(is_clean());
14304 queue_peering_event(
14305 CephPeeringEvtRef(
14306 std::make_shared<CephPeeringEvt>(
14307 get_osdmap()->get_epoch(),
14308 get_osdmap()->get_epoch(),
14309 DoRecovery())));
14310 } else {
14311 // A prior error must have already cleared clean state and queued recovery
14312 // or a map change has triggered re-peering.
14313 // We deliberately do not kick recovery inline via maybe_kick_recovery(soid).
14314 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14315 }
14316
14317 return -EAGAIN;
14318 }
14319
14320 /*---SnapTrimmer Logging---*/
14321 #undef dout_prefix
14322 #define dout_prefix *_dout << pg->gen_prefix()
14323
14324 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14325 {
14326 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14327 }
14328
14329 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14330 {
14331 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14332 }
14333
14334 /*---SnapTrimmer states---*/
14335 #undef dout_prefix
14336 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14337 << "SnapTrimmer state<" << get_state_name() << ">: ")
14338
14339 /* NotTrimming */
14340 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14341 : my_base(ctx),
14342 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14343 {
14344 context< SnapTrimmer >().log_enter(state_name);
14345 }
14346
14347 void PrimaryLogPG::NotTrimming::exit()
14348 {
14349 context< SnapTrimmer >().log_exit(state_name, enter_time);
14350 }
14351
14352 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14353 {
14354 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14355 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14356
14357 if (!(pg->is_primary() && pg->is_active())) {
14358 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14359 return discard_event();
14360 }
14361 if (!pg->is_clean() ||
14362 pg->snap_trimq.empty()) {
14363 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14364 return discard_event();
14365 }
14366 if (pg->scrubber.active) {
14367 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14368 return transit< WaitScrub >();
14369 } else {
14370 return transit< Trimming >();
14371 }
14372 }
14373
14374 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14375 {
14376 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14377 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14378
14379 pending = nullptr;
14380 if (!context< SnapTrimmer >().can_trim()) {
14381 post_event(KickTrim());
14382 return transit< NotTrimming >();
14383 }
14384
14385 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14386 ldout(pg->cct, 10) << "WaitReservation: trimming "
14387 << pg->snap_trimq.range_start()
14388 << dendl;
14389 return transit< AwaitAsyncWork >();
14390 }
14391
14392 /* AwaitAsyncWork */
14393 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14394 : my_base(ctx),
14395 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14396 {
14397 auto *pg = context< SnapTrimmer >().pg;
14398 context< SnapTrimmer >().log_enter(state_name);
14399 pg->osd->queue_for_snap_trim(pg);
14400 pg->state_set(PG_STATE_SNAPTRIM);
14401 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14402 pg->publish_stats_to_osd();
14403 }
14404
14405 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14406 {
14407 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14408 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14409 auto &in_flight = context<Trimming>().in_flight;
14410 assert(in_flight.empty());
14411
14412 assert(pg->is_primary() && pg->is_active());
14413 if (!context< SnapTrimmer >().can_trim()) {
14414 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14415 post_event(KickTrim());
14416 return transit< NotTrimming >();
14417 }
14418
14419 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14420
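// Ask the snap mapper for up to osd_pg_max_concurrent_snap_trims objects
// still mapped to this snap; -ENOENT means the snap is fully trimmed.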
14421 vector<hobject_t> to_trim;
14422 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14423 to_trim.reserve(max);
14424 int r = pg->snap_mapper.get_next_objects_to_trim(
14425 snap_to_trim,
14426 max,
14427 &to_trim);
14428 if (r != 0 && r != -ENOENT) {
14429 lderr(pg->cct) << "get_next_objects_to_trim returned "
14430 << cpp_strerror(r) << dendl;
14431 assert(0 == "get_next_objects_to_trim returned an invalid code");
14432 } else if (r == -ENOENT) {
14433 // Done!
14434 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14435
14436 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14437 << " to purged_snaps"
14438 << dendl;
14439 pg->info.purged_snaps.insert(snap_to_trim);
14440 pg->snap_trimq.erase(snap_to_trim);
14441 ldout(pg->cct, 10) << "purged_snaps now "
14442 << pg->info.purged_snaps << ", snap_trimq now "
14443 << pg->snap_trimq << dendl;
14444
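// Persist the updated purged_snaps in the PG info before notifying peers.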
14445 ObjectStore::Transaction t;
14446 pg->dirty_big_info = true;
14447 pg->write_if_dirty(t);
14448 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14449 assert(tr == 0);
14450
14451 pg->share_pg_info();
14452 post_event(KickTrim());
14453 return transit< NotTrimming >();
14454 }
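// r == 0 implies at least one object; exhaustion is reported as -ENOENT above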
14455 assert(!to_trim.empty());
14456
14457 for (auto &&object: to_trim) {
14458 // Start a trim repop for this object
14459 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14460 OpContextUPtr ctx;
14461 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14462 if (error) {
14463 if (error == -ENOLCK) {
14464 ldout(pg->cct, 10) << "could not get write lock on obj "
14465 << object << dendl;
14466 } else {
14467 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14468 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14469 }
14470 if (!in_flight.empty()) {
14471 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14472 return transit< WaitRepops >();
14473 }
14474 if (error == -ENOLCK) {
14475 ldout(pg->cct, 10) << "waiting for it to clear"
14476 << dendl;
14477 return transit< WaitRWLock >();
14478 } else {
14479 return transit< NotTrimming >();
14480 }
14481 }
14482
14483 in_flight.insert(object);
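// When the last in-flight repop completes, kick the state machine:
// Reset on a trim error (retry this snap), RepopsComplete otherwise.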
14484 ctx->register_on_success(
14485 [pg, object, &in_flight]() {
14486 assert(in_flight.find(object) != in_flight.end());
14487 in_flight.erase(object);
14488 if (in_flight.empty()) {
14489 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14490 pg->snap_trimmer_machine.process_event(Reset());
14491 } else {
14492 pg->snap_trimmer_machine.process_event(RepopsComplete());
14493 }
14494 }
14495 });
14496
14497 pg->simple_opc_submit(std::move(ctx));
14498 }
14499
14500 return transit< WaitRepops >();
14501 }
14502
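// Attr helpers: the write-side variants below simply record the change on
// the transaction; only the read-side variants consult the obc attr cache,
// which require_rollback (erasure-coded) pools keep authoritative.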
14503 void PrimaryLogPG::setattr_maybe_cache(
14504 ObjectContextRef obc,
14505 OpContext *op,
14506 PGTransaction *t,
14507 const string &key,
14508 bufferlist &val)
14509 {
14510 t->setattr(obc->obs.oi.soid, key, val);
14511 }
14512
14513 void PrimaryLogPG::setattrs_maybe_cache(
14514 ObjectContextRef obc,
14515 OpContext *op,
14516 PGTransaction *t,
14517 map<string, bufferlist> &attrs)
14518 {
14519 t->setattrs(obc->obs.oi.soid, attrs);
14520 }
14521
14522 void PrimaryLogPG::rmattr_maybe_cache(
14523 ObjectContextRef obc,
14524 OpContext *op,
14525 PGTransaction *t,
14526 const string &key)
14527 {
14528 t->rmattr(obc->obs.oi.soid, key);
14529 }
14530
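// For require_rollback pools the cached attrs are authoritative: a cache
// miss means the attr does not exist (-ENODATA).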
14531 int PrimaryLogPG::getattr_maybe_cache(
14532 ObjectContextRef obc,
14533 const string &key,
14534 bufferlist *val)
14535 {
14536 if (pool.info.require_rollback()) {
14537 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14538 if (i != obc->attr_cache.end()) {
14539 if (val)
14540 *val = i->second;
14541 return 0;
14542 } else {
14543 return -ENODATA;
14544 }
14545 }
14546 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14547 }
14548
14549 int PrimaryLogPG::getattrs_maybe_cache(
14550 ObjectContextRef obc,
14551 map<string, bufferlist> *out,
14552 bool user_only)
14553 {
14554 int r = 0;
14555 if (pool.info.require_rollback()) {
14556 if (out)
14557 *out = obc->attr_cache;
14558 } else {
14559 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14560 }
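// User xattrs are stored with a leading '_'; strip the prefix and drop
// internal attrs (e.g. object_info, snapset) when user_only is set.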
14561 if (out && user_only) {
14562 map<string, bufferlist> tmp;
14563 for (map<string, bufferlist>::iterator i = out->begin();
14564 i != out->end();
14565 ++i) {
14566 if (i->first.size() > 1 && i->first[0] == '_')
14567 tmp[i->first.substr(1)].claim(i->second);
14568 }
14569 tmp.swap(*out);
14570 }
14571 return r;
14572 }
14573
14574 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14575 return osd->check_failsafe_full(ss);
14576 }
14577
14578 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14579 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14580
14581 #ifdef PG_DEBUG_REFS
14582 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14583 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14584 #endif
14585
14586 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14587 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }