// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}
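
// "Blessing" a completion wraps it so that, when it finally fires, it first
// takes the PG lock and verifies that the PG has not gone through an
// interval change (pg_has_reset_since) since the wrapper was created; a
// stale completion is dropped instead of running against state it no longer
// matches. A hedged usage sketch (C_MyCleanup is a hypothetical Context
// subclass, not something defined in this file):
//
//   Context *cb = bless_context(new C_MyCleanup(/* ... */));
//   t->register_on_applied(cb);  // safe even if the PG resets first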

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};
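
// The copy-from flow ties these two pieces together: start_copy() is handed
// a CopyFromCallback, which re-queues the op context once the object data
// has been fetched, and a CopyFromFinisher then commits the copied data via
// finish_copyfrom(). A hedged sketch of how the COPY_FROM op wires them up
// (argument list abbreviated; see do_osd_ops for the real call):
//
//   CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
//   ctx->op_finishers[ctx->current_osd_subop_num].reset(
//     new CopyFromFinisher(cb));
//   start_copy(cb, ctx->obc, src, src_oloc, src_version, flags, /* ... */);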

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    bool error = false;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      if (p != recovery_info.ss.clone_snaps.end()) {
        snaps.insert(p->second.begin(), p->second.end());
      } else {
        derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
        error = true;
      }
    }
    if (!error) {
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    }
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}
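
// In other words, an object counts as degraded or backfilling while (a)
// someone is already waiting on it, (b) the primary itself is missing it,
// (c) any shard in actingbackfill is missing it, or (d) it falls inside the
// window a backfill target is actively copying: at or past that peer's
// last_backfill but not past last_backfill_started, with a push in flight.
// Writes to such objects must wait so that replicas never diverge
// mid-recovery.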

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
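
// The trigger threshold is osd_max_pg_log_entries *
// osd_force_recovery_pg_log_entries_factor. As a hedged example with
// hypothetical values: osd_max_pg_log_entries = 10000 and a factor of 1.3
// would force recovery of the oldest missing object once the log holds
// roughly 13000 entries, before the log grows so long that a lagging peer
// would have to fall back to backfill instead of log-based recovery.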

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}
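
// The "plain" filter is thus an exact byte-for-byte match: an object passes
// only if the named xattr exists and its value is identical in both length
// and content to the value the client encoded into the filter parameters.
// Prefix or substring matching would require a custom object-class filter
// (see get_pgls_filter below).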

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
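
// The filter type string selects the implementation: "parent" and "plain"
// are the two built-ins, and anything of the form "<class>.<filter>" is
// resolved through the object-class handler. As a hedged illustration, a
// client requesting a hypothetical filter "myclass.bigobjects" would cause
// this code to open_class("myclass") and then look up the "bigobjects"
// filter that class registered when it was loaded.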


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}
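
// These handlers back the per-PG admin commands. A usage sketch (CLI form;
// pgid and offset are placeholders, exact output depends on the cluster):
//
//   ceph pg <pgid> query
//   ceph pg <pgid> list_missing [<offset-json>]
//   ceph pg <pgid> mark_unfound_lost revert|delete
//
// "query" dumps state and peering info, "list_missing" pages through
// unfound objects starting after the supplied offset, and
// "mark_unfound_lost" either reverts unfound objects to a prior version or
// deletes them (revert is refused for EC pools, as above).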

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}
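
// Both listing paths above merge two sorted streams: the objects actually
// present on disk (sentries, from objects_list_partial) and the objects
// this replica is still missing (from the PG log). Merging in hobject sort
// order means a listing taken during recovery still reports objects that
// exist logically but have not been pushed here yet, and the returned
// handle lets the client resume exactly where the previous page ended.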

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
                             cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
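
// Trimming is bounded on both sides: never past "limit" (the older of
// min_last_complete_ondisk and the log's can_rollback_to), never more than
// osd_pg_log_trim_max entries at once, and not at all if fewer than
// osd_pg_log_trim_min entries would go. A hedged worked example with
// hypothetical values: target 3000, log size 10000, trim_max 10000 and
// trim_min 100 give num_to_trim = 7000, clamped earlier if the limit
// version is reached while walking from the oldest entry.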

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
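
// The acked range is the intersection of the client's reported range and
// this PG's current bounds; the PG may have split since the backoff was
// sent, so clamping [begin, end) avoids acknowledging an interval that now
// belongs to a sibling PG.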

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}
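
// do_request is the common admission path: ops are gated, in order, on
// (1) OSDMap freshness, (2) session backoffs, (3) peering state, and
// (4) in-flight flushes, and only then dispatched by message type. The
// waiting_for_map queue is checked first even when our map is new enough,
// so that an op never overtakes an earlier op from the same source that is
// still waiting for a map.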

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
1855 void PrimaryLogPG::do_op(OpRequestRef& op)
1856 {
1857 FUNCTRACE();
1858 // NOTE: take a non-const pointer here; we must be careful not to
1859 // change anything that will break other reads on m (operator<<).
1860 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1861 assert(m->get_type() == CEPH_MSG_OSD_OP);
1862 if (m->finish_decode()) {
1863 op->reset_desc(); // for TrackedOp
1864 m->clear_payload();
1865 }
1866
1867 dout(20) << __func__ << ": op " << *m << dendl;
1868
1869 hobject_t head = m->get_hobj();
1870 head.snap = CEPH_NOSNAP;
1871
1872 if (!info.pgid.pgid.contains(
1873 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1874 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1875 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1876 << std::hex << head.get_hash() << std::dec << dendl;
1877 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1878 << " op " << *m;
1879 assert(!cct->_conf->osd_debug_misdirected_ops);
1880 return;
1881 }
1882
1883 bool can_backoff =
1884 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1885 SessionRef session;
1886 if (can_backoff) {
1887 session = static_cast<Session*>(m->get_connection()->get_priv());
1888 if (!session.get()) {
1889 dout(10) << __func__ << " no session" << dendl;
1890 return;
1891 }
1892 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1893
1894 if (session->check_backoff(cct, info.pgid, head, m)) {
1895 return;
1896 }
1897 }
1898
1899 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1900 // not implemented.
1901 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1902 osd->reply_op_error(op, -EINVAL);
1903 return;
1904 }
1905
1906 if (op->rmw_flags == 0) {
1907 int r = osd->osd->init_op_flags(op);
1908 if (r) {
1909 osd->reply_op_error(op, r);
1910 return;
1911 }
1912 }
1913
1914 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1915 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1916 op->may_read() &&
1917 !(op->may_write() || op->may_cache())) {
1918 // balanced reads; any replica will do
1919 if (!(is_primary() || is_replica())) {
1920 osd->handle_misdirected_op(this, op);
1921 return;
1922 }
1923 } else {
1924 // normal case; must be primary
1925 if (!is_primary()) {
1926 osd->handle_misdirected_op(this, op);
1927 return;
1928 }
1929 }
1930
1931 if (!op_has_sufficient_caps(op)) {
1932 osd->reply_op_error(op, -EPERM);
1933 return;
1934 }
1935
1936 if (op->includes_pg_op()) {
1937 return do_pg_op(op);
1938 }
1939
1940 // object name too long?
1941 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1942 dout(4) << "do_op name is longer than "
1943 << cct->_conf->osd_max_object_name_len
1944 << " bytes" << dendl;
1945 osd->reply_op_error(op, -ENAMETOOLONG);
1946 return;
1947 }
1948 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1949 dout(4) << "do_op locator is longer than "
1950 << cct->_conf->osd_max_object_name_len
1951 << " bytes" << dendl;
1952 osd->reply_op_error(op, -ENAMETOOLONG);
1953 return;
1954 }
1955 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1956 dout(4) << "do_op namespace is longer than "
1957 << cct->_conf->osd_max_object_namespace_len
1958 << " bytes" << dendl;
1959 osd->reply_op_error(op, -ENAMETOOLONG);
1960 return;
1961 }
1962
1963 if (int r = osd->store->validate_hobject_key(head)) {
1964 dout(4) << "do_op object " << head << " invalid for backing store: "
1965 << r << dendl;
1966 osd->reply_op_error(op, r);
1967 return;
1968 }
1969
1970 // blacklisted?
1971 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1972 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1973 osd->reply_op_error(op, -EBLACKLISTED);
1974 return;
1975 }
1976
1977 // order this op as a write?
1978 bool write_ordered = op->rwordered();
1979
1980 // discard due to cluster full transition? (we discard any op that
1981 // originates before the cluster or pool is marked full; the client
1982 // will resend after the full flag is removed or if they expect the
1983 // op to succeed despite being full). The exceptions are FULL_FORCE
1984 // and FULL_TRY ops, which bypass all full checks anyway, so there is
1985 // no reason to discard them. If this op isn't write-ordered, we skip
1986 // this check.
1987 // FIXME: we exclude mds writes for now.
1988 if (write_ordered && !(m->get_source().is_mds() ||
1989 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1990 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1991 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1992 dout(10) << __func__ << " discarding op sent before full " << m << " "
1993 << *m << dendl;
1994 return;
1995 }
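
// A client-side sketch of the exception noted above: a librados user can opt
// into FULL_TRY so its ops are not discarded (or blocked) across a full
// transition. set_osdmap_full_try() is the librados call as of Luminous;
// treat the exact name and availability as an assumption to verify against
// your librados.hpp.
#if 0
#include <rados/librados.hpp>

int write_despite_full(librados::IoCtx& ioctx)
{
  ioctx.set_osdmap_full_try();        // tag subsequent ops with FULL_TRY
  librados::bufferlist bl;
  bl.append("payload");
  return ioctx.write_full("example_object", bl);  // may proceed even when full
}
#endif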
1996 // The MDS should have stopped writing before this point.
1997 // We can't allow the OSD to become non-startable even if the MDS
1998 // is still writing as part of file removals.
1999 ostringstream ss;
2000 if (write_ordered && osd->check_failsafe_full(ss)) {
2001 dout(10) << __func__ << " fail-safe full check failed, dropping request"
2002 << ss.str()
2003 << dendl;
2004 return;
2005 }
2006 int64_t poolid = get_pgid().pool();
2007 if (op->may_write()) {
2008
2009 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2010 if (!pi) {
2011 return;
2012 }
2013
2014 // invalid?
2015 if (m->get_snapid() != CEPH_NOSNAP) {
2016 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2017 osd->reply_op_error(op, -EINVAL);
2018 return;
2019 }
2020
2021 // too big?
2022 if (cct->_conf->osd_max_write_size &&
2023 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2024 // journal can't hold commit!
2025 derr << "do_op msg data len " << m->get_data_len()
2026 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2027 << " on " << *m << dendl;
2028 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2029 return;
2030 }
2031 }
2032
2033 dout(10) << "do_op " << *m
2034 << (op->may_write() ? " may_write" : "")
2035 << (op->may_read() ? " may_read" : "")
2036 << (op->may_cache() ? " may_cache" : "")
2037 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2038 << " flags " << ceph_osd_flag_string(m->get_flags())
2039 << dendl;
2040
2041 // missing object?
2042 if (is_unreadable_object(head)) {
2043 if (!is_primary()) {
2044 osd->reply_op_error(op, -EAGAIN);
2045 return;
2046 }
2047 if (can_backoff &&
2048 (g_conf->osd_backoff_on_degraded ||
2049 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2050 add_backoff(session, head, head);
2051 maybe_kick_recovery(head);
2052 } else {
2053 wait_for_unreadable_object(head, op);
2054 }
2055 return;
2056 }
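
// The backoff-versus-queue choice above, reduced to a pure function. A
// backoff can only be sent when the client advertised
// CEPH_FEATURE_RADOS_BACKOFF; otherwise the op has to wait on the OSD. This
// is a simplified sketch of the branch logic, not the real interface.
#if 0
enum class MissingAction { SendBackoff, QueueLocally };

MissingAction on_unreadable(bool can_backoff,
                            bool backoff_on_degraded,  // osd_backoff_on_degraded
                            bool backoff_on_unfound,   // osd_backoff_on_unfound
                            bool object_unfound)
{
  if (can_backoff &&
      (backoff_on_degraded || (backoff_on_unfound && object_unfound)))
    return MissingAction::SendBackoff;  // push the wait back to the client
  return MissingAction::QueueLocally;   // hold the op until recovery catches up
}
#endif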
2057
2058 // degraded object?
2059 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2060 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2061 add_backoff(session, head, head);
2062 maybe_kick_recovery(head);
2063 } else {
2064 wait_for_degraded_object(head, op);
2065 }
2066 return;
2067 }
2068
2069 if (write_ordered && scrubber.is_chunky_scrub_active() &&
2070 write_blocked_by_scrub(head)) {
2071 dout(20) << __func__ << ": waiting for scrub" << dendl;
2072 waiting_for_scrub.push_back(op);
2073 op->mark_delayed("waiting for scrub");
2074 return;
2075 }
2076
2077 // blocked on snap?
2078 map<hobject_t, snapid_t>::iterator blocked_iter =
2079 objects_blocked_on_degraded_snap.find(head);
2080 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2081 hobject_t to_wait_on(head);
2082 to_wait_on.snap = blocked_iter->second;
2083 wait_for_degraded_object(to_wait_on, op);
2084 return;
2085 }
2086 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2087 objects_blocked_on_snap_promotion.find(head);
2088 if (write_ordered &&
2089 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2090 wait_for_blocked_object(
2091 blocked_snap_promote_iter->second->obs.oi.soid,
2092 op);
2093 return;
2094 }
2095 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2096 block_write_on_full_cache(head, op);
2097 return;
2098 }
2099
2100 // missing snapdir?
2101 hobject_t snapdir = head.get_snapdir();
2102
2103 if (is_unreadable_object(snapdir)) {
2104 wait_for_unreadable_object(snapdir, op);
2105 return;
2106 }
2107
2108 // degraded object?
2109 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2110 wait_for_degraded_object(snapdir, op);
2111 return;
2112 }
2113
2114 // dup/resent?
2115 if (op->may_write() || op->may_cache()) {
2116 // warning: we will get back *a* request for this reqid, but not
2117 // necessarily the most recent. this happens with flush and
2118 // promote ops, but we can't possibly have both in our log where
2119 // the original request is still not stable on disk, so for our
2120 // purposes here it doesn't matter which one we get.
2121 eversion_t version;
2122 version_t user_version;
2123 int return_code = 0;
2124 bool got = check_in_progress_op(
2125 m->get_reqid(), &version, &user_version, &return_code);
2126 if (got) {
2127 dout(3) << __func__ << " dup " << m->get_reqid()
2128 << " version " << version << dendl;
2129 if (already_complete(version)) {
2130 osd->reply_op_error(op, return_code, version, user_version);
2131 } else {
2132 dout(10) << " waiting for " << version << " to commit" << dendl;
2133 // always queue ondisk waiters, so that we can requeue if needed
2134 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2135 op->mark_delayed("waiting for ondisk");
2136 }
2137 return;
2138 }
2139 }
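
// A standalone sketch of the dup check above: completed (or in-flight) ops
// are indexed by client reqid; a hit either replies immediately with the
// recorded result or waits for that version to commit. Reqid/Completed are
// hypothetical simplifications of osd_reqid_t and the pg log entry.
#if 0
#include <map>
#include <optional>
#include <tuple>

struct Reqid {
  long client, tid;
  bool operator<(const Reqid& o) const {
    return std::tie(client, tid) < std::tie(o.client, o.tid);
  }
};
struct Completed { unsigned version; int result; };

std::optional<Completed> check_dup(const std::map<Reqid, Completed>& log,
                                   const Reqid& r)
{
  auto it = log.find(r);
  if (it == log.end())
    return std::nullopt;  // fresh request: execute it
  return it->second;      // dup: reply (or wait) with the recorded result
}
#endif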
2140
2141 ObjectContextRef obc;
2142 bool can_create = op->may_write() || op->may_cache();
2143 hobject_t missing_oid;
2144 const hobject_t& oid = m->get_hobj();
2145
2146 // io blocked on obc?
2147 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2148 maybe_await_blocked_snapset(oid, op)) {
2149 return;
2150 }
2151
2152 int r = find_object_context(
2153 oid, &obc, can_create,
2154 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2155 &missing_oid);
2156
2157 if (r == -EAGAIN) {
2158 // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
2159 // we have to wait for the object.
2160 if (is_primary()) {
2161 // missing the specific snap we need; requeue and wait.
2162 assert(!op->may_write()); // only happens on a read/cache
2163 wait_for_unreadable_object(missing_oid, op);
2164 return;
2165 }
2166 } else if (r == 0) {
2167 if (is_unreadable_object(obc->obs.oi.soid)) {
2168 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2169 << " is unreadable, waiting" << dendl;
2170 wait_for_unreadable_object(obc->obs.oi.soid, op);
2171 return;
2172 }
2173
2174 // degraded object? (the check above was for head; this could be a clone)
2175 if (write_ordered &&
2176 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2177 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2178 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2179 << " is degraded, waiting" << dendl;
2180 wait_for_degraded_object(obc->obs.oi.soid, op);
2181 return;
2182 }
2183 }
2184
2185 bool in_hit_set = false;
2186 if (hit_set) {
2187 if (obc.get()) {
2188 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2189 in_hit_set = true;
2190 } else {
2191 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2192 in_hit_set = true;
2193 }
2194 if (!op->hitset_inserted) {
2195 hit_set->insert(oid);
2196 op->hitset_inserted = true;
2197 if (hit_set->is_full() ||
2198 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2199 hit_set_persist();
2200 }
2201 }
2202 }
2203
2204 if (agent_state) {
2205 if (agent_choose_mode(false, op))
2206 return;
2207 }
2208
2209 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2210 if (maybe_handle_manifest(op,
2211 write_ordered,
2212 obc))
2213 return;
2214 }
2215
2216 if (maybe_handle_cache(op,
2217 write_ordered,
2218 obc,
2219 r,
2220 missing_oid,
2221 false,
2222 in_hit_set))
2223 return;
2224
2225 if (r && (r != -ENOENT || !obc)) {
2226 // copy the reqids for copy get on ENOENT
2227 if (r == -ENOENT &&
2228 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2229 fill_in_copy_get_noent(op, oid, m->ops[0]);
2230 return;
2231 }
2232 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2233 if (op->may_write() &&
2234 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2235 record_write_error(op, oid, nullptr, r);
2236 } else {
2237 osd->reply_op_error(op, r);
2238 }
2239 return;
2240 }
2241
2242 // make sure locator is consistent
2243 object_locator_t oloc(obc->obs.oi.soid);
2244 if (m->get_object_locator() != oloc) {
2245 dout(10) << " provided locator " << m->get_object_locator()
2246 << " != object's " << obc->obs.oi.soid << dendl;
2247 osd->clog->warn() << "bad locator " << m->get_object_locator()
2248 << " on object " << oloc
2249 << " op " << *m;
2250 }
2251
2252 // io blocked on obc?
2253 if (obc->is_blocked() &&
2254 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2255 wait_for_blocked_object(obc->obs.oi.soid, op);
2256 return;
2257 }
2258
2259 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2260
2261 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2262 OSDOp& osd_op = *p;
2263
2264 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2265 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2266 m->get_snapid() != CEPH_SNAPDIR) {
2267 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2268 osd->reply_op_error(op, -EINVAL);
2269 return;
2270 }
2271 }
2272
2273 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2274
2275 if (!obc->obs.exists)
2276 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2277
2278 /* Due to obc caching, we might have a cached non-existent snapset_obc
2279 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2280 * do_op pipeline make decisions based on whether snapset_obc is
2281 * populated.
2282 */
2283 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2284 ctx->snapset_obc = ObjectContextRef();
2285
2286 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2287 dout(20) << __func__ << ": skipping rw locks" << dendl;
2288 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2289 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2290
2291 // verify there is in fact a flush in progress
2292 // FIXME: we could make this a stronger test.
2293 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2294 if (p == flush_ops.end()) {
2295 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2296 reply_ctx(ctx, -EINVAL);
2297 return;
2298 }
2299 } else if (!get_rw_locks(write_ordered, ctx)) {
2300 dout(20) << __func__ << " waiting for rw locks " << dendl;
2301 op->mark_delayed("waiting for rw locks");
2302 close_op_ctx(ctx);
2303 return;
2304 }
2305 dout(20) << __func__ << " obc " << *obc << dendl;
2306
2307 if (r) {
2308 dout(20) << __func__ << " returned an error: " << r << dendl;
2309 close_op_ctx(ctx);
2310 if (op->may_write() &&
2311 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2312 record_write_error(op, oid, nullptr, r);
2313 } else {
2314 osd->reply_op_error(op, r);
2315 }
2316 return;
2317 }
2318
2319 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2320 ctx->ignore_cache = true;
2321 }
2322
2323 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2324 // This object is lost. Reading from it returns an error.
2325 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2326 << " is lost" << dendl;
2327 reply_ctx(ctx, -ENFILE);
2328 return;
2329 }
2330 if (!op->may_write() &&
2331 !op->may_cache() &&
2332 (!obc->obs.exists ||
2333 ((m->get_snapid() != CEPH_SNAPDIR) &&
2334 obc->obs.oi.is_whiteout()))) {
2335 // copy the reqids for copy get on ENOENT
2336 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2337 fill_in_copy_get_noent(op, oid, m->ops[0]);
2338 close_op_ctx(ctx);
2339 return;
2340 }
2341 reply_ctx(ctx, -ENOENT);
2342 return;
2343 }
2344
2345 op->mark_started();
2346
2347 execute_ctx(ctx);
2348 utime_t prepare_latency = ceph_clock_now();
2349 prepare_latency -= op->get_dequeued_time();
2350 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2351 if (op->may_read() && op->may_write()) {
2352 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2353 } else if (op->may_read()) {
2354 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2355 } else if (op->may_write() || op->may_cache()) {
2356 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2357 }
2358
2359 // force recovery of the oldest missing object if too many logs
2360 maybe_force_recovery();
2361 }
2362
2363 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2364 OpRequestRef op,
2365 bool write_ordered,
2366 ObjectContextRef obc)
2367 {
2368 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2369 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2370 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2371 return cache_result_t::NOOP;
2372 }
2373
2374 if (obc)
2375 dout(10) << __func__ << " " << obc->obs.oi << " "
2376 << (obc->obs.exists ? "exists" : "DNE")
2377 << dendl;
2378
2379 // if it is write-ordered and blocked, stop now
2380 if (obc.get() && obc->is_blocked() && write_ordered) {
2381 // we're already doing something with this object
2382 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2383 return cache_result_t::NOOP;
2384 }
2385
2386 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2387 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2388 OSDOp& osd_op = *p;
2389 ceph_osd_op& op = osd_op.op;
2390 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2391 return cache_result_t::NOOP;
2392 }
2393 }
2394
2395 switch (obc->obs.oi.manifest.type) {
2396 case object_manifest_t::TYPE_REDIRECT:
2397 if (op->may_write() || write_ordered) {
2398 do_proxy_write(op, obc->obs.oi.soid, obc);
2399 } else {
2400 do_proxy_read(op, obc);
2401 }
2402 return cache_result_t::HANDLED_PROXY;
2403 case object_manifest_t::TYPE_CHUNKED:
2404 default:
2405 assert(0 == "unrecognized manifest type");
2406 }
2407
2408 return cache_result_t::NOOP;
2409 }
2410
2411 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2412 MOSDOpReply *orig_reply, int r)
2413 {
2414 dout(20) << __func__ << " r=" << r << dendl;
2415 assert(op->may_write());
2416 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2417 mempool::osd_pglog::list<pg_log_entry_t> entries;
2418 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2419 get_next_version(), eversion_t(), 0,
2420 reqid, utime_t(), r));
2421
2422 struct OnComplete {
2423 PrimaryLogPG *pg;
2424 OpRequestRef op;
2425 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2426 int r;
2427 OnComplete(
2428 PrimaryLogPG *pg,
2429 OpRequestRef op,
2430 MOSDOpReply *orig_reply,
2431 int r)
2432 : pg(pg), op(op),
2433 orig_reply(orig_reply, false /* take over ref */), r(r)
2434 {}
2435 void operator()() {
2436 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2437 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2438 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2439 MOSDOpReply *reply = orig_reply.detach();
2440 if (reply == nullptr) {
2441 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2442 flags, true);
2443 }
2444 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2445 pg->osd->send_message_osd_client(reply, m->get_connection());
2446 }
2447 };
2448
2449 ObcLockManager lock_manager;
2450 submit_log_entries(
2451 entries,
2452 std::move(lock_manager),
2453 boost::optional<std::function<void(void)> >(
2454 OnComplete(this, op, orig_reply, r)),
2455 op,
2456 r);
2457 }
2458
2459 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2460 OpRequestRef op,
2461 bool write_ordered,
2462 ObjectContextRef obc,
2463 int r, hobject_t missing_oid,
2464 bool must_promote,
2465 bool in_hit_set,
2466 ObjectContextRef *promote_obc)
2467 {
2468 // return quickly if caching is not enabled
2469 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2470 return cache_result_t::NOOP;
2471
2472 if (op &&
2473 op->get_req() &&
2474 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2475 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2476 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2477 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2478 return cache_result_t::NOOP;
2479 }
2480
2481 must_promote = must_promote || op->need_promote();
2482
2483 if (obc)
2484 dout(25) << __func__ << " " << obc->obs.oi << " "
2485 << (obc->obs.exists ? "exists" : "DNE")
2486 << " missing_oid " << missing_oid
2487 << " must_promote " << (int)must_promote
2488 << " in_hit_set " << (int)in_hit_set
2489 << dendl;
2490 else
2491 dout(25) << __func__ << " (no obc)"
2492 << " missing_oid " << missing_oid
2493 << " must_promote " << (int)must_promote
2494 << " in_hit_set " << (int)in_hit_set
2495 << dendl;
2496
2497 // if it is write-ordered and blocked, stop now
2498 if (obc.get() && obc->is_blocked() && write_ordered) {
2499 // we're already doing something with this object
2500 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2501 return cache_result_t::NOOP;
2502 }
2503
2504 if (r == -ENOENT && missing_oid == hobject_t()) {
2505 // we know this object is logically absent (e.g., an undefined clone)
2506 return cache_result_t::NOOP;
2507 }
2508
2509 if (obc.get() && obc->obs.exists) {
2510 osd->logger->inc(l_osd_op_cache_hit);
2511 return cache_result_t::NOOP;
2512 }
2513 if (!is_primary()) {
2514 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2515 osd->reply_op_error(op, -EAGAIN);
2516 return cache_result_t::REPLIED_WITH_EAGAIN;
2517 }
2518
2519 if (missing_oid == hobject_t() && obc.get()) {
2520 missing_oid = obc->obs.oi.soid;
2521 }
2522
2523 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2524 const object_locator_t oloc = m->get_object_locator();
2525
2526 if (op->need_skip_handle_cache()) {
2527 return cache_result_t::NOOP;
2528 }
2529
2530 // older versions do not proxy the feature bits.
2531 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2532 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2533 OpRequestRef promote_op;
2534
2535 switch (pool.info.cache_mode) {
2536 case pg_pool_t::CACHEMODE_WRITEBACK:
2537 if (agent_state &&
2538 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2539 if (!op->may_write() && !op->may_cache() &&
2540 !write_ordered && !must_promote) {
2541 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2542 do_proxy_read(op);
2543 return cache_result_t::HANDLED_PROXY;
2544 }
2545 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2546 block_write_on_full_cache(missing_oid, op);
2547 return cache_result_t::BLOCKED_FULL;
2548 }
2549
2550 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2551 promote_object(obc, missing_oid, oloc, op, promote_obc);
2552 return cache_result_t::BLOCKED_PROMOTE;
2553 }
2554
2555 if (op->may_write() || op->may_cache()) {
2556 if (can_proxy_write) {
2557 do_proxy_write(op, missing_oid);
2558 } else {
2559 // promote if can't proxy the write
2560 promote_object(obc, missing_oid, oloc, op, promote_obc);
2561 return cache_result_t::BLOCKED_PROMOTE;
2562 }
2563
2564 // Promote too?
2565 if (!op->need_skip_promote() &&
2566 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2567 pool.info.min_write_recency_for_promote,
2568 OpRequestRef(),
2569 promote_obc)) {
2570 return cache_result_t::BLOCKED_PROMOTE;
2571 }
2572 return cache_result_t::HANDLED_PROXY;
2573 } else {
2574 do_proxy_read(op);
2575
2576 // Avoid duplicate promotion
2577 if (obc.get() && obc->is_blocked()) {
2578 if (promote_obc)
2579 *promote_obc = obc;
2580 return cache_result_t::BLOCKED_PROMOTE;
2581 }
2582
2583 // Promote too?
2584 if (!op->need_skip_promote()) {
2585 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2586 pool.info.min_read_recency_for_promote,
2587 promote_op, promote_obc);
2588 }
2589
2590 return cache_result_t::HANDLED_PROXY;
2591 }
2592 assert(0 == "unreachable");
2593 return cache_result_t::NOOP;
2594
2595 case pg_pool_t::CACHEMODE_FORWARD:
2596 // FIXME: this mode allows requests to be reordered.
2597 do_cache_redirect(op);
2598 return cache_result_t::HANDLED_REDIRECT;
2599
2600 case pg_pool_t::CACHEMODE_READONLY:
2601 // TODO: clean this case up
2602 if (!obc.get() && r == -ENOENT) {
2603 // we don't have the object and the op is a read
2604 promote_object(obc, missing_oid, oloc, op, promote_obc);
2605 return cache_result_t::BLOCKED_PROMOTE;
2606 }
2607 if (!r) { // it must be a write
2608 do_cache_redirect(op);
2609 return cache_result_t::HANDLED_REDIRECT;
2610 }
2611 // crap, there was a failure of some kind
2612 return cache_result_t::NOOP;
2613
2614 case pg_pool_t::CACHEMODE_READFORWARD:
2615 // Do writeback to the cache tier for writes
2616 if (op->may_write() || write_ordered || must_promote) {
2617 if (agent_state &&
2618 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2619 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2620 block_write_on_full_cache(missing_oid, op);
2621 return cache_result_t::BLOCKED_FULL;
2622 }
2623 promote_object(obc, missing_oid, oloc, op, promote_obc);
2624 return cache_result_t::BLOCKED_PROMOTE;
2625 }
2626
2627 // This is a read; forward (redirect) it to the base tier.
2628 do_cache_redirect(op);
2629 return cache_result_t::HANDLED_REDIRECT;
2630
2631 case pg_pool_t::CACHEMODE_PROXY:
2632 if (!must_promote) {
2633 if (op->may_write() || op->may_cache() || write_ordered) {
2634 if (can_proxy_write) {
2635 do_proxy_write(op, missing_oid);
2636 return cache_result_t::HANDLED_PROXY;
2637 }
2638 } else {
2639 do_proxy_read(op);
2640 return cache_result_t::HANDLED_PROXY;
2641 }
2642 }
2643 // ugh, we're forced to promote.
2644 if (agent_state &&
2645 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2646 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2647 block_write_on_full_cache(missing_oid, op);
2648 return cache_result_t::BLOCKED_FULL;
2649 }
2650 promote_object(obc, missing_oid, oloc, op, promote_obc);
2651 return cache_result_t::BLOCKED_PROMOTE;
2652
2653 case pg_pool_t::CACHEMODE_READPROXY:
2654 // Do writeback to the cache tier for writes
2655 if (op->may_write() || write_ordered || must_promote) {
2656 if (agent_state &&
2657 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2658 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2659 block_write_on_full_cache(missing_oid, op);
2660 return cache_result_t::BLOCKED_FULL;
2661 }
2662 promote_object(obc, missing_oid, oloc, op, promote_obc);
2663 return cache_result_t::BLOCKED_PROMOTE;
2664 }
2665
2666 // This is a read; proxy it to the base tier.
2667 do_proxy_read(op);
2668 return cache_result_t::HANDLED_PROXY;
2669
2670 default:
2671 assert(0 == "unrecognized cache_mode");
2672 }
2673 return cache_result_t::NOOP;
2674 }
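
// The per-mode handling above, condensed to a miss-dispatch table for the
// common cases (no must_promote, cache not full, proxying supported). Modes
// and actions are hypothetical simplified enums; WRITEBACK and PROXY may also
// kick off an asynchronous promote in the real code.
#if 0
enum class Mode { Writeback, Forward, Readonly, Readforward, Proxy, Readproxy };
enum class Action { Proxy, Redirect, BlockOnPromote };

Action miss_action(Mode mode, bool is_write)
{
  switch (mode) {
  case Mode::Writeback:   return Action::Proxy;  // plus possible async promote
  case Mode::Forward:     return Action::Redirect;
  case Mode::Readonly:    return is_write ? Action::Redirect
                                          : Action::BlockOnPromote;
  case Mode::Readforward: return is_write ? Action::BlockOnPromote
                                          : Action::Redirect;
  case Mode::Proxy:       return Action::Proxy;
  case Mode::Readproxy:   return is_write ? Action::BlockOnPromote
                                          : Action::Proxy;
  }
  return Action::BlockOnPromote;  // unreachable with a valid mode
}
#endif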
2675
2676 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2677 const hobject_t& missing_oid,
2678 const object_locator_t& oloc,
2679 bool in_hit_set,
2680 uint32_t recency,
2681 OpRequestRef promote_op,
2682 ObjectContextRef *promote_obc)
2683 {
2684 dout(20) << __func__ << " missing_oid " << missing_oid
2685 << " in_hit_set " << in_hit_set << dendl;
2686
2687 switch (recency) {
2688 case 0:
2689 break;
2690 case 1:
2691 // Check if in the current hit set
2692 if (in_hit_set) {
2693 break;
2694 } else {
2695 // not promoting
2696 return false;
2697 }
2698 break;
2699 default:
2700 {
2701 unsigned count = (int)in_hit_set;
2702 if (count) {
2703 // Check if in other hit sets
2704 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2705 for (map<time_t,HitSetRef>::reverse_iterator itor =
2706 agent_state->hit_set_map.rbegin();
2707 itor != agent_state->hit_set_map.rend();
2708 ++itor) {
2709 if (!itor->second->contains(oid)) {
2710 break;
2711 }
2712 ++count;
2713 if (count >= recency) {
2714 break;
2715 }
2716 }
2717 }
2718 if (count >= recency) {
2719 break;
2720 }
2721 return false; // not promoting
2722 }
2723 break;
2724 }
2725
2726 if (osd->promote_throttle()) {
2727 dout(10) << __func__ << " promote throttled" << dendl;
2728 return false;
2729 }
2730 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2731 return true;
2732 }
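
// The recency rule above as a pure function: recency 0 promotes on any
// access; recency 1 requires a hit in the current hit set; higher values
// additionally require an unbroken streak through the most recent archived
// hit sets, newest first. The vector<bool> input is a hypothetical stand-in
// for membership tests against agent_state->hit_set_map.
#if 0
#include <vector>

bool recent_enough(bool in_current_hit_set,
                   const std::vector<bool>& in_archived_newest_first,
                   unsigned recency)
{
  if (recency == 0)
    return true;                 // promote unconditionally
  if (!in_current_hit_set)
    return false;
  unsigned count = 1;
  for (bool hit : in_archived_newest_first) {
    if (!hit)
      break;                     // streak broken: stop counting
    if (++count >= recency)
      break;
  }
  return count >= recency;
}
#endif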
2733
2734 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2735 {
2736 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2737 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2738 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2739 get_osdmap()->get_epoch(), flags, false);
2740 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2741 reply->set_redirect(redir);
2742 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2743 << op << dendl;
2744 m->get_connection()->send_message(reply);
2745 return;
2746 }
2747
2748 struct C_ProxyRead : public Context {
2749 PrimaryLogPGRef pg;
2750 hobject_t oid;
2751 epoch_t last_peering_reset;
2752 ceph_tid_t tid;
2753 PrimaryLogPG::ProxyReadOpRef prdop;
2754 utime_t start;
2755 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2756 const PrimaryLogPG::ProxyReadOpRef& prd)
2757 : pg(p), oid(o), last_peering_reset(lpr),
2758 tid(0), prdop(prd), start(ceph_clock_now())
2759 {}
2760 void finish(int r) override {
2761 if (prdop->canceled)
2762 return;
2763 pg->lock();
2764 if (prdop->canceled) {
2765 pg->unlock();
2766 return;
2767 }
2768 if (last_peering_reset == pg->get_last_peering_reset()) {
2769 pg->finish_proxy_read(oid, tid, r);
2770 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2771 }
2772 pg->unlock();
2773 }
2774 };
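
// The cancellation pattern used by the completion above, in isolation: check
// `canceled` once without the lock as a fast path, then recheck under the PG
// lock, since cancel_proxy_read() can race with the first check. Sketched
// with std::mutex standing in for the PG lock.
#if 0
#include <mutex>

struct CancelableCompletion {
  std::mutex& pg_lock;
  const bool& canceled;

  template <typename F>
  void finish(F&& apply) {
    if (canceled)
      return;                           // unlocked fast path
    std::lock_guard<std::mutex> l(pg_lock);
    if (canceled)
      return;                           // recheck now that we hold the lock
    apply();                            // safe: cancellation can't race us here
  }
};
#endif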
2775
2776 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2777 {
2778 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2779 // stash the result in the request's OSDOp vector
2780 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2781 object_locator_t oloc;
2782 hobject_t soid;
2783 /* extensible tier */
2784 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2785 switch (obc->obs.oi.manifest.type) {
2786 case object_manifest_t::TYPE_REDIRECT:
2787 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2788 soid = obc->obs.oi.manifest.redirect_target;
2789 break;
2790 case object_manifest_t::TYPE_CHUNKED:
2791 default:
2792 assert(0 == "unrecognized manifest type");
2793 }
2794 } else {
2795 /* proxy */
2796 soid = m->get_hobj();
2797 oloc = object_locator_t(m->get_object_locator());
2798 oloc.pool = pool.info.tier_of;
2799 }
2800 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2801
2802 // pass through some original flags that make sense.
2803 // - leave out redirection and balancing flags since we are
2804 // already proxying through the primary
2805 // - leave off read/write/exec flags that are derived from the op
2806 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2807 CEPH_OSD_FLAG_ORDERSNAP |
2808 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2809 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2810
2811 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2812
2813 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2814
2815 ObjectOperation obj_op;
2816 obj_op.dup(prdop->ops);
2817
2818 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2819 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2820 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2821 ceph_osd_op op = obj_op.ops[i].op;
2822 switch (op.op) {
2823 case CEPH_OSD_OP_READ:
2824 case CEPH_OSD_OP_SYNC_READ:
2825 case CEPH_OSD_OP_SPARSE_READ:
2826 case CEPH_OSD_OP_CHECKSUM:
2827 case CEPH_OSD_OP_CMPEXT:
2828 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2829 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2830 }
2831 }
2832 }
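
// The flag rewrite above in isolation: under a non-full writeback tier,
// proxied reads are tagged FADVISE_SEQUENTIAL and stripped of
// FADVISE_DONTNEED/NOCACHE so the base tier keeps the data cacheable for the
// promote that may follow. Bit positions here are hypothetical.
#if 0
#include <cstdint>

constexpr std::uint32_t FADVISE_SEQUENTIAL = 1u << 0;
constexpr std::uint32_t FADVISE_DONTNEED   = 1u << 1;
constexpr std::uint32_t FADVISE_NOCACHE    = 1u << 2;

constexpr std::uint32_t adjust_proxy_read_flags(std::uint32_t flags)
{
  return (flags | FADVISE_SEQUENTIAL) &
         ~(FADVISE_DONTNEED | FADVISE_NOCACHE);
}

static_assert(adjust_proxy_read_flags(FADVISE_DONTNEED) == FADVISE_SEQUENTIAL,
              "DONTNEED is cleared and SEQUENTIAL set");
#endif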
2833
2834 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2835 prdop);
2836 ceph_tid_t tid = osd->objecter->read(
2837 soid.oid, oloc, obj_op,
2838 m->get_snapid(), NULL,
2839 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2840 &prdop->user_version,
2841 &prdop->data_offset,
2842 m->get_features());
2843 fin->tid = tid;
2844 prdop->objecter_tid = tid;
2845 proxyread_ops[tid] = prdop;
2846 in_progress_proxy_ops[soid].push_back(op);
2847 }
2848
2849 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2850 {
2851 dout(10) << __func__ << " " << oid << " tid " << tid
2852 << " " << cpp_strerror(r) << dendl;
2853
2854 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2855 if (p == proxyread_ops.end()) {
2856 dout(10) << __func__ << " no proxyread_op found" << dendl;
2857 return;
2858 }
2859 ProxyReadOpRef prdop = p->second;
2860 if (tid != prdop->objecter_tid) {
2861 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2862 << " tid " << prdop->objecter_tid << dendl;
2863 return;
2864 }
2865 if (oid != prdop->soid) {
2866 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2867 << " soid " << prdop->soid << dendl;
2868 return;
2869 }
2870 proxyread_ops.erase(tid);
2871
2872 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2873 if (q == in_progress_proxy_ops.end()) {
2874 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2875 return;
2876 }
2877 assert(q->second.size());
2878 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2879 q->second.end(),
2880 prdop->op);
2881 assert(it != q->second.end());
2882 OpRequestRef op = *it;
2883 q->second.erase(it);
2884 if (q->second.size() == 0) {
2885 in_progress_proxy_ops.erase(oid);
2886 }
2887
2888 osd->logger->inc(l_osd_tier_proxy_read);
2889
2890 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2891 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2892 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2893 ctx->user_at_version = prdop->user_version;
2894 ctx->data_off = prdop->data_offset;
2895 ctx->ignore_log_op_stats = true;
2896 complete_read_ctx(r, ctx);
2897 }
2898
2899 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2900 {
2901 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2902 if (p == in_progress_proxy_ops.end())
2903 return;
2904
2905 list<OpRequestRef>& ls = p->second;
2906 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2907 requeue_ops(ls);
2908 in_progress_proxy_ops.erase(p);
2909 }
2910
2911 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
2912 vector<ceph_tid_t> *tids)
2913 {
2914 dout(10) << __func__ << " " << prdop->soid << dendl;
2915 prdop->canceled = true;
2916
2917 // cancel objecter op, if we can
2918 if (prdop->objecter_tid) {
2919 tids->push_back(prdop->objecter_tid);
2920 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2921 prdop->ops[i].outdata.clear();
2922 }
2923 proxyread_ops.erase(prdop->objecter_tid);
2924 prdop->objecter_tid = 0;
2925 }
2926 }
2927
2928 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
2929 {
2930 dout(10) << __func__ << dendl;
2931
2932 // cancel proxy reads
2933 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2934 while (p != proxyread_ops.end()) {
2935 cancel_proxy_read((p++)->second, tids);
2936 }
2937
2938 // cancel proxy writes
2939 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2940 while (q != proxywrite_ops.end()) {
2941 cancel_proxy_write((q++)->second, tids);
2942 }
2943
2944 if (requeue) {
2945 map<hobject_t, list<OpRequestRef>>::iterator p =
2946 in_progress_proxy_ops.begin();
2947 while (p != in_progress_proxy_ops.end()) {
2948 list<OpRequestRef>& ls = p->second;
2949 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2950 << " requests" << dendl;
2951 requeue_ops(ls);
2952 in_progress_proxy_ops.erase(p++);
2953 }
2954 } else {
2955 in_progress_proxy_ops.clear();
2956 }
2957 }
2958
2959 struct C_ProxyWrite_Commit : public Context {
2960 PrimaryLogPGRef pg;
2961 hobject_t oid;
2962 epoch_t last_peering_reset;
2963 ceph_tid_t tid;
2964 PrimaryLogPG::ProxyWriteOpRef pwop;
2965 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2966 const PrimaryLogPG::ProxyWriteOpRef& pw)
2967 : pg(p), oid(o), last_peering_reset(lpr),
2968 tid(0), pwop(pw)
2969 {}
2970 void finish(int r) override {
2971 if (pwop->canceled)
2972 return;
2973 pg->lock();
2974 if (pwop->canceled) {
2975 pg->unlock();
2976 return;
2977 }
2978 if (last_peering_reset == pg->get_last_peering_reset()) {
2979 pg->finish_proxy_write(oid, tid, r);
2980 }
2981 pg->unlock();
2982 }
2983 };
2984
2985 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2986 {
2987 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2988 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2989 object_locator_t oloc;
2990 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2991 hobject_t soid;
2992 /* extensible tier */
2993 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2994 switch (obc->obs.oi.manifest.type) {
2995 case object_manifest_t::TYPE_REDIRECT:
2996 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2997 soid = obc->obs.oi.manifest.redirect_target;
2998 break;
2999 case object_manifest_t::TYPE_CHUNKED:
3000 default:
3001 assert(0 == "unrecognized manifest type");
3002 }
3003 } else {
3004 /* proxy */
3005 soid = m->get_hobj();
3006 oloc = object_locator_t(m->get_object_locator());
3007 oloc.pool = pool.info.tier_of;
3008 }
3009
3010 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3011 if (!(op->may_write() || op->may_cache())) {
3012 flags |= CEPH_OSD_FLAG_RWORDERED;
3013 }
3014 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3015
3016 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3017 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3018 pwop->mtime = m->get_mtime();
3019
3020 ObjectOperation obj_op;
3021 obj_op.dup(pwop->ops);
3022
3023 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3024 this, soid, get_last_peering_reset(), pwop);
3025 ceph_tid_t tid = osd->objecter->mutate(
3026 soid.oid, oloc, obj_op, snapc,
3027 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3028 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3029 &pwop->user_version, pwop->reqid);
3030 fin->tid = tid;
3031 pwop->objecter_tid = tid;
3032 proxywrite_ops[tid] = pwop;
3033 in_progress_proxy_ops[soid].push_back(op);
3034 }
3035
3036 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3037 {
3038 dout(10) << __func__ << " " << oid << " tid " << tid
3039 << " " << cpp_strerror(r) << dendl;
3040
3041 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3042 if (p == proxywrite_ops.end()) {
3043 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3044 return;
3045 }
3046 ProxyWriteOpRef pwop = p->second;
3047 assert(tid == pwop->objecter_tid);
3048 assert(oid == pwop->soid);
3049
3050 proxywrite_ops.erase(tid);
3051
3052 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3053 if (q == in_progress_proxy_ops.end()) {
3054 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3055 delete pwop->ctx;
3056 pwop->ctx = NULL;
3057 return;
3058 }
3059 list<OpRequestRef>& in_progress_op = q->second;
3060 assert(in_progress_op.size());
3061 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3062 in_progress_op.end(),
3063 pwop->op);
3064 assert(it != in_progress_op.end());
3065 in_progress_op.erase(it);
3066 if (in_progress_op.size() == 0) {
3067 in_progress_proxy_ops.erase(oid);
3068 }
3069
3070 osd->logger->inc(l_osd_tier_proxy_write);
3071
3072 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3073 assert(m != NULL);
3074
3075 if (!pwop->sent_reply) {
3076 // send commit.
3077 MOSDOpReply *reply = pwop->ctx->reply;
3078 if (reply)
3079 pwop->ctx->reply = NULL;
3080 else {
3081 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3082 reply->set_reply_versions(eversion_t(), pwop->user_version);
3083 }
3084 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3085 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3086 osd->send_message_osd_client(reply, m->get_connection());
3087 pwop->sent_reply = true;
3088 pwop->ctx->op->mark_commit_sent();
3089 }
3090
3091 delete pwop->ctx;
3092 pwop->ctx = NULL;
3093 }
3094
3095 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3096 vector<ceph_tid_t> *tids)
3097 {
3098 dout(10) << __func__ << " " << pwop->soid << dendl;
3099 pwop->canceled = true;
3100
3101 // cancel objecter op, if we can
3102 if (pwop->objecter_tid) {
3103 tids->push_back(pwop->objecter_tid);
3104 delete pwop->ctx;
3105 pwop->ctx = NULL;
3106 proxywrite_ops.erase(pwop->objecter_tid);
3107 pwop->objecter_tid = 0;
3108 }
3109 }
3110
3111 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3112 ObjectContextRef obc;
3113 PrimaryLogPG *pg;
3114 utime_t start;
3115 public:
3116 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3117 : obc(obc_),
3118 pg(pg_),
3119 start(ceph_clock_now()) {}
3120
3121 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3122 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3123 int r = results.get<0>();
3124 pg->finish_promote(r, results_data, obc);
3125 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3126 }
3127 };
3128
3129 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3130 const hobject_t& missing_oid,
3131 const object_locator_t& oloc,
3132 OpRequestRef op,
3133 ObjectContextRef *promote_obc)
3134 {
3135 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3136 assert(hoid != hobject_t());
3137 if (write_blocked_by_scrub(hoid)) {
3138 dout(10) << __func__ << " " << hoid
3139 << " blocked by scrub" << dendl;
3140 if (op) {
3141 waiting_for_scrub.push_back(op);
3142 op->mark_delayed("waiting for scrub");
3143 dout(10) << __func__ << " " << hoid
3144 << " placing op in waiting_for_scrub" << dendl;
3145 } else {
3146 dout(10) << __func__ << " " << hoid
3147 << " no op, dropping on the floor" << dendl;
3148 }
3149 return;
3150 }
3151 if (!obc) { // we need to create an ObjectContext
3152 assert(missing_oid != hobject_t());
3153 obc = get_object_context(missing_oid, true);
3154 }
3155 if (promote_obc)
3156 *promote_obc = obc;
3157
3158 /*
3159 * If there are in-flight proxy reads for this object, skip DONTNEED on
3160 * the promote's source read so the base tier keeps the data cached.
3161 */
3162 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3163 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3164 if (q == in_progress_proxy_ops.end()) {
3165 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3166 }
3167
3168 PromoteCallback *cb = new PromoteCallback(obc, this);
3169 object_locator_t my_oloc = oloc;
3170 my_oloc.pool = pool.info.tier_of;
3171
3172 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3173 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3174 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3175 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3176 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3177 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3178 src_fadvise_flags, 0);
3179
3180 assert(obc->is_blocked());
3181
3182 if (op)
3183 wait_for_blocked_object(obc->obs.oi.soid, op);
3184 info.stats.stats.sum.num_promote++;
3185 }
3186
3187 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3188 {
3189 FUNCTRACE();
3190 dout(10) << __func__ << " " << ctx << dendl;
3191 ctx->reset_obs(ctx->obc);
3192 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3193 OpRequestRef op = ctx->op;
3194 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3195 ObjectContextRef obc = ctx->obc;
3196 const hobject_t& soid = obc->obs.oi.soid;
3197
3198 // this method must be idempotent since we may call it several times
3199 // before we finally apply the resulting transaction.
3200 ctx->op_t.reset(new PGTransaction);
3201
3202 if (op->may_write() || op->may_cache()) {
3203 // snap
3204 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3205 pool.info.is_pool_snaps_mode()) {
3206 // use pool's snapc
3207 ctx->snapc = pool.snapc;
3208 } else {
3209 // client specified snapc
3210 ctx->snapc.seq = m->get_snap_seq();
3211 ctx->snapc.snaps = m->get_snaps();
3212 filter_snapc(ctx->snapc.snaps);
3213 }
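
// A client-side sketch of where the "client specified snapc" branch above
// gets its data: with self-managed snapshots, a librados user attaches a
// snap context (seq plus snap ids, newest first) to its writes; in pool-snaps
// mode the pool's snapc is used instead, as in the first branch. The exact
// librados calls are an assumption to verify against your headers.
#if 0
#include <rados/librados.hpp>
#include <vector>

int write_with_snap_context(librados::IoCtx& ioctx)
{
  librados::snap_t snapid = 0;
  int r = ioctx.selfmanaged_snap_create(&snapid);   // allocate a snap id
  if (r < 0)
    return r;
  std::vector<librados::snap_t> snaps = {snapid};   // newest first
  r = ioctx.selfmanaged_snap_set_write_ctx(snapid, snaps);
  if (r < 0)
    return r;
  librados::bufferlist bl;
  bl.append("data");
  return ioctx.write_full("example_object", bl);    // write carries the snapc
}
#endif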
3214 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3215 ctx->snapc.seq < obc->ssc->snapset.seq) {
3216 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3217 << " < snapset seq " << obc->ssc->snapset.seq
3218 << " on " << obc->obs.oi.soid << dendl;
3219 reply_ctx(ctx, -EOLDSNAPC);
3220 return;
3221 }
3222
3223 // version
3224 ctx->at_version = get_next_version();
3225 ctx->mtime = m->get_mtime();
3226
3227 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3228 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3229 << " snapc " << ctx->snapc
3230 << " snapset " << obc->ssc->snapset
3231 << dendl;
3232 } else {
3233 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3234 << " ov " << obc->obs.oi.version
3235 << dendl;
3236 }
3237
3238 if (!ctx->user_at_version)
3239 ctx->user_at_version = obc->obs.oi.user_version;
3240 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3241
3242 if (op->may_read()) {
3243 dout(10) << " taking ondisk_read_lock" << dendl;
3244 obc->ondisk_read_lock();
3245 }
3246
3247 {
3248 #ifdef WITH_LTTNG
3249 osd_reqid_t reqid = ctx->op->get_reqid();
3250 #endif
3251 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3252 reqid.name._num, reqid.tid, reqid.inc);
3253 }
3254
3255 int result = prepare_transaction(ctx);
3256
3257 {
3258 #ifdef WITH_LTTNG
3259 osd_reqid_t reqid = ctx->op->get_reqid();
3260 #endif
3261 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3262 reqid.name._num, reqid.tid, reqid.inc);
3263 }
3264
3265 if (op->may_read()) {
3266 dout(10) << " dropping ondisk_read_lock" << dendl;
3267 obc->ondisk_read_unlock();
3268 }
3269
3270 bool pending_async_reads = !ctx->pending_async_reads.empty();
3271 if (result == -EINPROGRESS || pending_async_reads) {
3272 // come back later.
3273 if (pending_async_reads) {
3274 in_progress_async_reads.push_back(make_pair(op, ctx));
3275 ctx->start_async_reads(this);
3276 }
3277 return;
3278 }
3279
3280 if (result == -EAGAIN) {
3281 // clean up after the ctx
3282 close_op_ctx(ctx);
3283 return;
3284 }
3285
3286 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3287 // prepare the reply
3288 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3289 successful_write);
3290
3291 // Write operations aren't allowed to return a data payload because
3292 // we can't do so reliably. If the client has to resend the request
3293 // and it has already been applied, we will return 0 with no
3294 // payload. Non-deterministic behavior is no good. However, it is
3295 // possible to construct an operation that does a read, does a guard
3296 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3297 // with the write, or fail the CMPXATTR guard and return the read value.
3298 if (successful_write) {
3299 // write. normalize the result code.
3300 dout(20) << " zeroing write result code " << result << dendl;
3301 result = 0;
3302 }
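
// A client-side sketch of the guard-then-write combination the comment above
// allows: a CMPXATTR guard in the same compound op as the write. If the
// guard fails, the op returns the comparison error (typically -ECANCELED)
// rather than a write result. Method names follow librados.hpp; treat them
// as assumptions to verify.
#if 0
#include <rados/librados.hpp>

int guarded_write(librados::IoCtx& ioctx)
{
  librados::ObjectWriteOperation op;
  librados::bufferlist expected, data;
  expected.append("v1");
  data.append("new contents");
  op.cmpxattr("version", LIBRADOS_CMPXATTR_OP_EQ, expected);  // guard first
  op.write_full(data);                                        // then write
  return ioctx.operate("example_object", &op);
}
#endif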
3303 ctx->reply->set_result(result);
3304
3305 // read or error?
3306 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3307 // finish side-effects
3308 if (result >= 0)
3309 do_osd_op_effects(ctx, m->get_connection());
3310
3311 complete_read_ctx(result, ctx);
3312 return;
3313 }
3314
3315 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3316
3317 assert(op->may_write() || op->may_cache());
3318
3319 // trim log?
3320 calc_trim_to();
3321
3322 // verify that we are doing this in order?
3323 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3324 !pool.info.is_tier() && !pool.info.has_tiers()) {
3325 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3326 ceph_tid_t t = m->get_tid();
3327 client_t n = m->get_source().num();
3328 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3329 if (p == cm.end()) {
3330 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3331 cm[n] = t;
3332 } else {
3333 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3334 if (p->second > t) {
3335 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3336 assert(0 == "out of order op");
3337 }
3338 p->second = t;
3339 }
3340 }
3341
3342 if (ctx->update_log_only) {
3343 if (result >= 0)
3344 do_osd_op_effects(ctx, m->get_connection());
3345
3346 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3347 // save just what we need from ctx
3348 MOSDOpReply *reply = ctx->reply;
3349 ctx->reply = nullptr;
3350 reply->claim_op_out_data(*ctx->ops);
3351 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3352 close_op_ctx(ctx);
3353
3354 if (result == -ENOENT) {
3355 reply->set_enoent_reply_versions(info.last_update,
3356 info.last_user_version);
3357 }
3358 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3359 // append to pg log for dup detection - don't save buffers for now
3360 record_write_error(op, soid, reply, result);
3361 return;
3362 }
3363
3364 // no need to capture a PG ref; repop cancel will handle that
3365 // we can capture ctx by pointer, since it's owned by the repop
3366 ctx->register_on_commit(
3367 [m, ctx, this](){
3368 if (ctx->op)
3369 log_op_stats(
3370 ctx);
3371
3372 if (m && !ctx->sent_reply) {
3373 MOSDOpReply *reply = ctx->reply;
3374 if (reply)
3375 ctx->reply = nullptr;
3376 else {
3377 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3378 reply->set_reply_versions(ctx->at_version,
3379 ctx->user_at_version);
3380 }
3381 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3382 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3383 osd->send_message_osd_client(reply, m->get_connection());
3384 ctx->sent_reply = true;
3385 ctx->op->mark_commit_sent();
3386 }
3387 });
3388 ctx->register_on_success(
3389 [ctx, this]() {
3390 do_osd_op_effects(
3391 ctx,
3392 ctx->op ? ctx->op->get_req()->get_connection() :
3393 ConnectionRef());
3394 });
3395 ctx->register_on_finish(
3396 [ctx, this]() {
3397 delete ctx;
3398 });
3399
3400 // issue replica writes
3401 ceph_tid_t rep_tid = osd->get_tid();
3402
3403 RepGather *repop = new_repop(ctx, obc, rep_tid);
3404
3405 issue_repop(repop, ctx);
3406 eval_repop(repop);
3407 repop->put();
3408 }
3409
3410 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3411 release_object_locks(ctx->lock_manager);
3412
3413 ctx->op_t.reset();
3414
3415 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3416 ctx->on_finish.erase(p++)) {
3417 (*p)();
3418 }
3419 delete ctx;
3420 }
3421
3422 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3423 {
3424 if (ctx->op)
3425 osd->reply_op_error(ctx->op, r);
3426 close_op_ctx(ctx);
3427 }
3428
3429 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3430 {
3431 if (ctx->op)
3432 osd->reply_op_error(ctx->op, r, v, uv);
3433 close_op_ctx(ctx);
3434 }
3435
3436 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3437 {
3438 OpRequestRef op = ctx->op;
3439 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3440
3441 utime_t now = ceph_clock_now();
3442 utime_t latency = now;
3443 latency -= ctx->op->get_req()->get_recv_stamp();
3444 utime_t process_latency = now;
3445 process_latency -= ctx->op->get_dequeued_time();
3446
3447 uint64_t inb = ctx->bytes_written;
3448 uint64_t outb = ctx->bytes_read;
3449
3450 osd->logger->inc(l_osd_op);
3451
3452 osd->logger->inc(l_osd_op_outb, outb);
3453 osd->logger->inc(l_osd_op_inb, inb);
3454 osd->logger->tinc(l_osd_op_lat, latency);
3455 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3456
3457 if (op->may_read() && op->may_write()) {
3458 osd->logger->inc(l_osd_op_rw);
3459 osd->logger->inc(l_osd_op_rw_inb, inb);
3460 osd->logger->inc(l_osd_op_rw_outb, outb);
3461 osd->logger->tinc(l_osd_op_rw_lat, latency);
3462 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3463 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3464 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3465 } else if (op->may_read()) {
3466 osd->logger->inc(l_osd_op_r);
3467 osd->logger->inc(l_osd_op_r_outb, outb);
3468 osd->logger->tinc(l_osd_op_r_lat, latency);
3469 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3470 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3471 } else if (op->may_write() || op->may_cache()) {
3472 osd->logger->inc(l_osd_op_w);
3473 osd->logger->inc(l_osd_op_w_inb, inb);
3474 osd->logger->tinc(l_osd_op_w_lat, latency);
3475 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3476 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3477 } else
3478 ceph_abort();
3479
3480 dout(15) << "log_op_stats " << *m
3481 << " inb " << inb
3482 << " outb " << outb
3483 << " lat " << latency << dendl;
3484 }
3485
3486 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3487 {
3488 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3489 assert(have_same_or_newer_map(m->map_epoch));
3490 assert(m->get_type() == MSG_OSD_SUBOP);
3491 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3492
3493 if (!is_peered()) {
3494 waiting_for_peered.push_back(op);
3495 op->mark_delayed("waiting for active");
3496 return;
3497 }
3498
3499 const OSDOp *first = NULL;
3500 if (m->ops.size() >= 1) {
3501 first = &m->ops[0];
3502 }
3503
3504 if (first) {
3505 switch (first->op.op) {
3506 case CEPH_OSD_OP_DELETE:
3507 sub_op_remove(op);
3508 return;
3509 case CEPH_OSD_OP_SCRUB_RESERVE:
3510 handle_scrub_reserve_request(op);
3511 return;
3512 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3513 handle_scrub_reserve_release(op);
3514 return;
3515 case CEPH_OSD_OP_SCRUB_MAP:
3516 sub_op_scrub_map(op);
3517 return;
3518 }
3519 }
3520 }
3521
3522 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3523 {
3524 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3525 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3526 if (r->ops.size() >= 1) {
3527 const OSDOp& first = r->ops[0];
3528 switch (first.op.op) {
3529 case CEPH_OSD_OP_SCRUB_RESERVE:
3530 {
3531 pg_shard_t from = r->from;
3532 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3533 bool reserved;
3534 ::decode(reserved, p);
3535 if (reserved) {
3536 handle_scrub_reserve_grant(op, from);
3537 } else {
3538 handle_scrub_reserve_reject(op, from);
3539 }
3540 }
3541 return;
3542 }
3543 }
3544 }
3545
3546 void PrimaryLogPG::do_scan(
3547 OpRequestRef op,
3548 ThreadPool::TPHandle &handle)
3549 {
3550 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3551 assert(m->get_type() == MSG_OSD_PG_SCAN);
3552 dout(10) << "do_scan " << *m << dendl;
3553
3554 op->mark_started();
3555
3556 switch (m->op) {
3557 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3558 {
3559 ostringstream ss;
3560 if (osd->check_backfill_full(ss)) {
3561 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3562 queue_peering_event(
3563 CephPeeringEvtRef(
3564 std::make_shared<CephPeeringEvt>(
3565 get_osdmap()->get_epoch(),
3566 get_osdmap()->get_epoch(),
3567 BackfillTooFull())));
3568 return;
3569 }
3570
3571 BackfillInterval bi;
3572 bi.begin = m->begin;
3573 // No need to flush; there won't be any in-progress writes occurring
3574 // past m->begin
3575 scan_range(
3576 cct->_conf->osd_backfill_scan_min,
3577 cct->_conf->osd_backfill_scan_max,
3578 &bi,
3579 handle);
3580 MOSDPGScan *reply = new MOSDPGScan(
3581 MOSDPGScan::OP_SCAN_DIGEST,
3582 pg_whoami,
3583 get_osdmap()->get_epoch(), m->query_epoch,
3584 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3585 ::encode(bi.objects, reply->get_data());
3586 osd->send_message_osd_cluster(reply, m->get_connection());
3587 }
3588 break;
3589
3590 case MOSDPGScan::OP_SCAN_DIGEST:
3591 {
3592 pg_shard_t from = m->from;
3593
3594 // Check that from is in backfill_targets vector
3595 assert(is_backfill_targets(from));
3596
3597 BackfillInterval& bi = peer_backfill_info[from];
3598 bi.begin = m->begin;
3599 bi.end = m->end;
3600 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3601
3602 // take care to preserve ordering!
3603 bi.clear_objects();
3604 ::decode_noclear(bi.objects, p);
3605
3606 if (waiting_on_backfill.erase(from)) {
3607 if (waiting_on_backfill.empty()) {
3608 assert(peer_backfill_info.size() == backfill_targets.size());
3609 finish_recovery_op(hobject_t::get_max());
3610 }
3611 } else {
3612 // we canceled backfill for a while due to a too-full condition, and
3613 // this is an extra response from a peer that was not too full
3614 }
3615 }
3616 break;
3617 }
3618 }
3619
3620 void PrimaryLogPG::do_backfill(OpRequestRef op)
3621 {
3622 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3623 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3624 dout(10) << "do_backfill " << *m << dendl;
3625
3626 op->mark_started();
3627
3628 switch (m->op) {
3629 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3630 {
3631 assert(cct->_conf->osd_kill_backfill_at != 1);
3632
3633 MOSDPGBackfill *reply = new MOSDPGBackfill(
3634 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3635 get_osdmap()->get_epoch(),
3636 m->query_epoch,
3637 spg_t(info.pgid.pgid, get_primary().shard));
3638 reply->set_priority(get_recovery_op_priority());
3639 osd->send_message_osd_cluster(reply, m->get_connection());
3640 queue_peering_event(
3641 CephPeeringEvtRef(
3642 std::make_shared<CephPeeringEvt>(
3643 get_osdmap()->get_epoch(),
3644 get_osdmap()->get_epoch(),
3645 RecoveryDone())));
3646 }
3647 // fall-thru
3648
3649 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3650 {
3651 assert(cct->_conf->osd_kill_backfill_at != 2);
3652
3653 info.set_last_backfill(m->last_backfill);
3654 info.stats = m->stats;
3655
3656 ObjectStore::Transaction t;
3657 dirty_info = true;
3658 write_if_dirty(t);
3659 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3660 assert(tr == 0);
3661 }
3662 break;
3663
3664 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3665 {
3666 assert(is_primary());
3667 assert(cct->_conf->osd_kill_backfill_at != 3);
3668 finish_recovery_op(hobject_t::get_max());
3669 }
3670 break;
3671 }
3672 }
3673
3674 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3675 {
3676 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3677 op->get_req());
3678 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3679 dout(7) << __func__ << " " << m->ls << dendl;
3680
3681 op->mark_started();
3682
3683 ObjectStore::Transaction t;
3684 for (auto& p : m->ls) {
3685 remove_snap_mapped_object(t, p.first);
3686 }
3687 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3688 assert(r == 0);
3689 }
3690
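// (editor's note) A minimal sketch of how a caller might drive trim_object();
// illustrative only — the surrounding snap-trimmer plumbing and the
// simple_opc_submit() hand-off are assumed rather than shown:
//
//   PrimaryLogPG::OpContextUPtr ctx;
//   int r = trim_object(true, coid, &ctx);   // coid: the clone being trimmed
//   if (r == -ENOLCK) {
//     // locks unavailable; requeue and retry later
//   } else if (r == 0) {
//     simple_opc_submit(std::move(ctx));     // submit the prepared txn
//   }  // other negative values indicate inconsistencies needing repair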
3691 int PrimaryLogPG::trim_object(
3692 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3693 {
3694 *ctxp = NULL;
3695 // load clone info
3696 bufferlist bl;
3697 ObjectContextRef obc = get_object_context(coid, false, NULL);
3698 if (!obc || !obc->ssc || !obc->ssc->exists) {
3699 osd->clog->error() << __func__ << ": Cannot trim " << coid
3700 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3701 return -ENOENT;
3702 }
3703
3704 hobject_t snapoid(
3705 coid.oid, coid.get_key(),
3706 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3707 info.pgid.pool(), coid.get_namespace());
3708 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3709 if (!snapset_obc) {
3710 osd->clog->error() << __func__ << ": Cannot trim " << coid
3711 << " repair needed, no snapset obc for " << snapoid;
3712 return -ENOENT;
3713 }
3714
3715 SnapSet& snapset = obc->ssc->snapset;
3716
3717 bool legacy = snapset.is_legacy() ||
3718 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3719
3720 object_info_t &coi = obc->obs.oi;
3721 set<snapid_t> old_snaps;
3722 if (legacy) {
3723 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3724 } else {
3725 auto p = snapset.clone_snaps.find(coid.snap);
3726 if (p == snapset.clone_snaps.end()) {
3727 osd->clog->error() << "No clone_snaps in snapset " << snapset
3728 << " for object " << coid << "\n";
3729 return -ENOENT;
3730 }
3731 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3732 snapset.clone_snaps[coid.snap].end());
3733 }
3734 if (old_snaps.empty()) {
3735 osd->clog->error() << "No object info snaps for object " << coid;
3736 return -ENOENT;
3737 }
3738
3739 dout(10) << coid << " old_snaps " << old_snaps
3740 << " old snapset " << snapset << dendl;
3741 if (snapset.seq == 0) {
3742 osd->clog->error() << "No snapset.seq for object " << coid;
3743 return -ENOENT;
3744 }
3745
3746 set<snapid_t> new_snaps;
3747 for (set<snapid_t>::iterator i = old_snaps.begin();
3748 i != old_snaps.end();
3749 ++i) {
3750 if (!pool.info.is_removed_snap(*i))
3751 new_snaps.insert(*i);
3752 }
3753
3754 vector<snapid_t>::iterator p = snapset.clones.end();
3755
3756 if (new_snaps.empty()) {
3757 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3758 if (p == snapset.clones.end()) {
3759 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3760 return -ENOENT;
3761 }
3762 }
3763
3764 OpContextUPtr ctx = simple_opc_create(obc);
3765 ctx->snapset_obc = snapset_obc;
3766
3767 if (!ctx->lock_manager.get_snaptrimmer_write(
3768 coid,
3769 obc,
3770 first)) {
3771 close_op_ctx(ctx.release());
3772 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3773 return -ENOLCK;
3774 }
3775
3776 if (!ctx->lock_manager.get_snaptrimmer_write(
3777 snapoid,
3778 snapset_obc,
3779 first)) {
3780 close_op_ctx(ctx.release());
3781 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3782 return -ENOLCK;
3783 }
3784
3785 ctx->at_version = get_next_version();
3786
3787 PGTransaction *t = ctx->op_t.get();
3788
3789 if (new_snaps.empty()) {
3790 // remove clone
3791 dout(10) << coid << " snaps " << old_snaps << " -> "
3792 << new_snaps << " ... deleting" << dendl;
3793
3794 // ...from snapset
3795 assert(p != snapset.clones.end());
3796
3797 snapid_t last = coid.snap;
3798 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3799
3800 if (p != snapset.clones.begin()) {
3801 // not the oldest... merge overlap into next older clone
3802 vector<snapid_t>::iterator n = p - 1;
3803 hobject_t prev_coid = coid;
3804 prev_coid.snap = *n;
3805 bool adjust_prev_bytes = is_present_clone(prev_coid);
3806
3807 if (adjust_prev_bytes)
3808 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3809
3810 snapset.clone_overlap[*n].intersection_of(
3811 snapset.clone_overlap[*p]);
3812
3813 if (adjust_prev_bytes)
3814 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3815 }
3816 ctx->delta_stats.num_objects--;
3817 if (coi.is_dirty())
3818 ctx->delta_stats.num_objects_dirty--;
3819 if (coi.is_omap())
3820 ctx->delta_stats.num_objects_omap--;
3821 if (coi.is_whiteout()) {
3822 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3823 ctx->delta_stats.num_whiteouts--;
3824 }
3825 ctx->delta_stats.num_object_clones--;
3826 if (coi.is_cache_pinned())
3827 ctx->delta_stats.num_objects_pinned--;
3828 obc->obs.exists = false;
3829
3830 snapset.clones.erase(p);
3831 snapset.clone_overlap.erase(last);
3832 snapset.clone_size.erase(last);
3833 snapset.clone_snaps.erase(last);
3834
3835 ctx->log.push_back(
3836 pg_log_entry_t(
3837 pg_log_entry_t::DELETE,
3838 coid,
3839 ctx->at_version,
3840 ctx->obs->oi.version,
3841 0,
3842 osd_reqid_t(),
3843 ctx->mtime,
3844 0)
3845 );
3846 t->remove(coid);
3847 t->update_snaps(
3848 coid,
3849 old_snaps,
3850 new_snaps);
3851
3852 coi = object_info_t(coid);
3853
3854 ctx->at_version.version++;
3855 } else {
3856 // save adjusted snaps for this object
3857 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3858 if (legacy) {
3859 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3860 } else {
3861 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3862 new_snaps.rend());
3863 // we still do a 'modify' event on this object just to trigger a
3864 // snapmapper.update ... :(
3865 }
3866
3867 coi.prior_version = coi.version;
3868 coi.version = ctx->at_version;
3869 bl.clear();
3870 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3871 t->setattr(coid, OI_ATTR, bl);
3872
3873 ctx->log.push_back(
3874 pg_log_entry_t(
3875 pg_log_entry_t::MODIFY,
3876 coid,
3877 coi.version,
3878 coi.prior_version,
3879 0,
3880 osd_reqid_t(),
3881 ctx->mtime,
3882 0)
3883 );
3884 ctx->at_version.version++;
3885
3886 t->update_snaps(
3887 coid,
3888 old_snaps,
3889 new_snaps);
3890 }
3891
3892 // save head snapset
3893 dout(10) << coid << " new snapset " << snapset << " on "
3894 << snapset_obc->obs.oi << dendl;
3895 if (snapset.clones.empty() &&
3896 (!snapset.head_exists ||
3897 (snapset_obc->obs.oi.is_whiteout() &&
3898 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3899 !snapset_obc->obs.oi.is_cache_pinned()))) {
3900 // NOTE: this arguably constitutes minor interference with the
3901 // tiering agent if this is a cache tier since a snap trim event
3902 // is effectively evicting a whiteout we might otherwise want to
3903 // keep around.
3904 dout(10) << coid << " removing " << snapoid << dendl;
3905 ctx->log.push_back(
3906 pg_log_entry_t(
3907 pg_log_entry_t::DELETE,
3908 snapoid,
3909 ctx->at_version,
3910 ctx->snapset_obc->obs.oi.version,
3911 0,
3912 osd_reqid_t(),
3913 ctx->mtime,
3914 0)
3915 );
3916 if (snapoid.is_head()) {
3917 derr << "removing snap head" << dendl;
3918 object_info_t& oi = ctx->snapset_obc->obs.oi;
3919 ctx->delta_stats.num_objects--;
3920 if (oi.is_dirty()) {
3921 ctx->delta_stats.num_objects_dirty--;
3922 }
3923 if (oi.is_omap())
3924 ctx->delta_stats.num_objects_omap--;
3925 if (oi.is_whiteout()) {
3926 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3927 ctx->delta_stats.num_whiteouts--;
3928 }
3929 if (oi.is_cache_pinned()) {
3930 ctx->delta_stats.num_objects_pinned--;
3931 }
3932 }
3933 ctx->snapset_obc->obs.exists = false;
3934 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3935 t->remove(snapoid);
3936 } else {
3937 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3938 snapset.filter(pool.info);
3939 dout(10) << coid << " writing updated snapset on " << snapoid
3940 << ", snapset is " << snapset << dendl;
3941 ctx->log.push_back(
3942 pg_log_entry_t(
3943 pg_log_entry_t::MODIFY,
3944 snapoid,
3945 ctx->at_version,
3946 ctx->snapset_obc->obs.oi.version,
3947 0,
3948 osd_reqid_t(),
3949 ctx->mtime,
3950 0)
3951 );
3952
3953 ctx->snapset_obc->obs.oi.prior_version =
3954 ctx->snapset_obc->obs.oi.version;
3955 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3956
3957 map <string, bufferlist> attrs;
3958 bl.clear();
3959 ::encode(snapset, bl);
3960 attrs[SS_ATTR].claim(bl);
3961
3962 bl.clear();
3963 ::encode(ctx->snapset_obc->obs.oi, bl,
3964 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3965 attrs[OI_ATTR].claim(bl);
3966 t->setattrs(snapoid, attrs);
3967 }
3968
3969 *ctxp = std::move(ctx);
3970 return 0;
3971 }
3972
3973 void PrimaryLogPG::kick_snap_trim()
3974 {
3975 assert(is_active());
3976 assert(is_primary());
3977 if (is_clean() && !snap_trimq.empty()) {
3978 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3979 snap_trimmer_machine.process_event(KickTrim());
3980 }
3981 }
3982
3983 void PrimaryLogPG::snap_trimmer_scrub_complete()
3984 {
3985 if (is_primary() && is_active() && is_clean()) {
3986 assert(!snap_trimq.empty());
3987 snap_trimmer_machine.process_event(ScrubComplete());
3988 }
3989 }
3990
3991 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3992 {
3993 if (deleting || pg_has_reset_since(queued)) {
3994 return;
3995 }
3996
3997 assert(is_primary());
3998
3999 dout(10) << "snap_trimmer posting" << dendl;
4000 snap_trimmer_machine.process_event(DoSnapWork());
4001 dout(10) << "snap_trimmer complete" << dendl;
4002 return;
4003 }
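// (editor's note) The snap-trim state machine above is driven by three
// events: KickTrim (the PG is clean and snap_trimq is non-empty),
// ScrubComplete (a scrub finished while trimming waited on it), and
// DoSnapWork (queued trim work is executed by snap_trimmer()).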
4004
4005 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4006 {
4007 __u64 v2;
4008
4009 string v2s(xattr.c_str(), xattr.length());
4010 if (v2s.length())
4011 v2 = strtoull(v2s.c_str(), NULL, 10);
4012 else
4013 v2 = 0;
4014
4015 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4016
4017 switch (op) {
4018 case CEPH_OSD_CMPXATTR_OP_EQ:
4019 return (v1 == v2);
4020 case CEPH_OSD_CMPXATTR_OP_NE:
4021 return (v1 != v2);
4022 case CEPH_OSD_CMPXATTR_OP_GT:
4023 return (v1 > v2);
4024 case CEPH_OSD_CMPXATTR_OP_GTE:
4025 return (v1 >= v2);
4026 case CEPH_OSD_CMPXATTR_OP_LT:
4027 return (v1 < v2);
4028 case CEPH_OSD_CMPXATTR_OP_LTE:
4029 return (v1 <= v2);
4030 default:
4031 return -EINVAL;
4032 }
4033 }
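// (editor's note) The stored xattr is parsed as a decimal string (an empty
// xattr compares as 0). For example, with an xattr containing "42" and
// v1 = 42:
//
//   do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GTE, 42, xattr);  // returns 1
//   do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GT,  42, xattr);  // returns 0
//
// An unrecognized op yields -EINVAL.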
4034
4035 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4036 {
4037 string v2s(xattr.c_str(), xattr.length());
4038
4039 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4040
4041 switch (op) {
4042 case CEPH_OSD_CMPXATTR_OP_EQ:
4043 return (v1s.compare(v2s) == 0);
4044 case CEPH_OSD_CMPXATTR_OP_NE:
4045 return (v1s.compare(v2s) != 0);
4046 case CEPH_OSD_CMPXATTR_OP_GT:
4047 return (v1s.compare(v2s) > 0);
4048 case CEPH_OSD_CMPXATTR_OP_GTE:
4049 return (v1s.compare(v2s) >= 0);
4050 case CEPH_OSD_CMPXATTR_OP_LT:
4051 return (v1s.compare(v2s) < 0);
4052 case CEPH_OSD_CMPXATTR_OP_LTE:
4053 return (v1s.compare(v2s) <= 0);
4054 default:
4055 return -EINVAL;
4056 }
4057 }
4058
4059 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4060 {
4061 ceph_osd_op& op = osd_op.op;
4062 vector<OSDOp> write_ops(1);
4063 OSDOp& write_op = write_ops[0];
4064 uint64_t write_length = op.writesame.length;
4065 int result = 0;
4066
4067 if (!write_length)
4068 return 0;
4069
4070 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4071 return -EINVAL;
4072
4073 if (op.writesame.data_length != osd_op.indata.length()) {
4074 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4075 return -EINVAL;
4076 }
4077
4078 while (write_length) {
4079 write_op.indata.append(osd_op.indata);
4080 write_length -= op.writesame.data_length;
4081 }
4082
4083 write_op.op.op = CEPH_OSD_OP_WRITE;
4084 write_op.op.extent.offset = op.writesame.offset;
4085 write_op.op.extent.length = op.writesame.length;
4086 result = do_osd_ops(ctx, write_ops);
4087 if (result < 0)
4088 derr << "do_writesame do_osd_ops failed " << result << dendl;
4089
4090 return result;
4091 }
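// (editor's note) do_writesame() expands the request into a single ordinary
// CEPH_OSD_OP_WRITE whose payload repeats the input pattern. With
// illustrative values offset = 0, length = 4096, data_length = 512 and a
// 512-byte indata buffer, the result is one 4096-byte write built from eight
// copies of the pattern. length must be a multiple of data_length and
// data_length must equal indata.length(), else -EINVAL; length == 0 is a
// no-op returning 0.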
4092
4093 // ========================================================================
4094 // low level osd ops
4095
4096 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4097 {
4098 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4099 bufferlist header, vals;
4100 int r = _get_tmap(ctx, &header, &vals);
4101 if (r < 0) {
4102 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4103 r = 0;
4104 return r;
4105 }
4106
4107 vector<OSDOp> ops(3);
4108
4109 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4110 ops[0].op.extent.offset = 0;
4111 ops[0].op.extent.length = 0;
4112
4113 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4114 ops[1].indata.claim(header);
4115
4116 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4117 ops[2].indata.claim(vals);
4118
4119 return do_osd_ops(ctx, ops);
4120 }
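// (editor's note) The conversion above rewrites a TMAP object in place as
// three ops on the same object: truncate to 0 (drop the old TMAP bytes),
// OMAPSETHEADER with the decoded header, and OMAPSETVALS with the decoded
// key/value map; -ENODATA is tolerated when CEPH_OSD_TMAP2OMAP_NULLOK is set.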
4121
4122 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4123 bufferlist& bl)
4124 {
4125 // decode
4126 bufferlist header;
4127 map<string, bufferlist> m;
4128 if (bl.length()) {
4129 bufferlist::iterator p = bl.begin();
4130 ::decode(header, p);
4131 ::decode(m, p);
4132 assert(p.end());
4133 }
4134
4135 // do the update(s)
4136 while (!bp.end()) {
4137 __u8 op;
4138 string key;
4139 ::decode(op, bp);
4140
4141 switch (op) {
4142 case CEPH_OSD_TMAP_SET: // insert key
4143 {
4144 ::decode(key, bp);
4145 bufferlist data;
4146 ::decode(data, bp);
4147 m[key] = data;
4148 }
4149 break;
4150 case CEPH_OSD_TMAP_RM: // remove key
4151 ::decode(key, bp);
4152 if (!m.count(key)) {
4153 return -ENOENT;
4154 }
4155 m.erase(key);
4156 break;
4157 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4158 ::decode(key, bp);
4159 m.erase(key);
4160 break;
4161 case CEPH_OSD_TMAP_HDR: // update header
4162 {
4163 ::decode(header, bp);
4164 }
4165 break;
4166 default:
4167 return -EINVAL;
4168 }
4169 }
4170
4171 // reencode
4172 bufferlist obl;
4173 ::encode(header, obl);
4174 ::encode(m, obl);
4175
4176 // write it out
4177 vector<OSDOp> nops(1);
4178 OSDOp& newop = nops[0];
4179 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4180 newop.op.extent.offset = 0;
4181 newop.op.extent.length = obl.length();
4182 newop.indata = obl;
4183 do_osd_ops(ctx, nops);
4184 osd_op.outdata.claim(newop.outdata);
4185 return 0;
4186 }
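// (editor's note) A hedged sketch of the TMAP update payload decoded by
// do_tmapup_slow()/do_tmapup(): a sequence of opcode-tagged records, e.g.
//
//   bufferlist bp;
//   ::encode((__u8)CEPH_OSD_TMAP_SET, bp);   // insert or overwrite a key
//   ::encode(string("mykey"), bp);           // key name (illustrative)
//   ::encode(value_bl, bp);                  // value bufferlist
//   ::encode((__u8)CEPH_OSD_TMAP_RM, bp);    // remove a key (no value)
//   ::encode(string("oldkey"), bp);
//
// The object itself stores the encoded header bufferlist followed by the
// map<string, bufferlist> of keys, as decoded at the top of do_tmapup_slow().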
4187
4188 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4189 {
4190 bufferlist::iterator orig_bp = bp;
4191 int result = 0;
4192 if (bp.end()) {
4193 dout(10) << "tmapup is a no-op" << dendl;
4194 } else {
4195 // read the whole object
4196 vector<OSDOp> nops(1);
4197 OSDOp& newop = nops[0];
4198 newop.op.op = CEPH_OSD_OP_READ;
4199 newop.op.extent.offset = 0;
4200 newop.op.extent.length = 0;
4201 result = do_osd_ops(ctx, nops);
4202
4203 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4204
4205 dout(30) << " starting is \n";
4206 newop.outdata.hexdump(*_dout);
4207 *_dout << dendl;
4208
4209 bufferlist::iterator ip = newop.outdata.begin();
4210 bufferlist obl;
4211
4212 dout(30) << "the update command is: \n";
4213 osd_op.indata.hexdump(*_dout);
4214 *_dout << dendl;
4215
4216 // header
4217 bufferlist header;
4218 __u32 nkeys = 0;
4219 if (newop.outdata.length()) {
4220 ::decode(header, ip);
4221 ::decode(nkeys, ip);
4222 }
4223 dout(10) << "tmapup header " << header.length() << dendl;
4224
4225 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4226 ++bp;
4227 ::decode(header, bp);
4228 dout(10) << "tmapup new header " << header.length() << dendl;
4229 }
4230
4231 ::encode(header, obl);
4232
4233 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4234
4235 // update keys
4236 bufferlist newkeydata;
4237 string nextkey, last_in_key;
4238 bufferlist nextval;
4239 bool have_next = false;
4240 if (!ip.end()) {
4241 have_next = true;
4242 ::decode(nextkey, ip);
4243 ::decode(nextval, ip);
4244 }
4245 while (!bp.end() && !result) {
4246 __u8 op;
4247 string key;
4248 try {
4249 ::decode(op, bp);
4250 ::decode(key, bp);
4251 }
4252 catch (buffer::error& e) {
4253 return -EINVAL;
4254 }
4255 if (key < last_in_key) {
4256 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4257 << "', falling back to an inefficient (unsorted) update" << dendl;
4258 bp = orig_bp;
4259 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4260 }
4261 last_in_key = key;
4262
4263 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4264
4265 // skip existing intervening keys
4266 bool key_exists = false;
4267 while (have_next && !key_exists) {
4268 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4269 if (nextkey > key)
4270 break;
4271 if (nextkey < key) {
4272 // copy untouched.
4273 ::encode(nextkey, newkeydata);
4274 ::encode(nextval, newkeydata);
4275 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4276 } else {
4277 // don't copy; discard the old value and stop.
4278 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4279 key_exists = true;
4280 nkeys--;
4281 }
4282 if (!ip.end()) {
4283 ::decode(nextkey, ip);
4284 ::decode(nextval, ip);
4285 } else {
4286 have_next = false;
4287 }
4288 }
4289
4290 if (op == CEPH_OSD_TMAP_SET) {
4291 bufferlist val;
4292 try {
4293 ::decode(val, bp);
4294 }
4295 catch (buffer::error& e) {
4296 return -EINVAL;
4297 }
4298 ::encode(key, newkeydata);
4299 ::encode(val, newkeydata);
4300 dout(20) << " set " << key << " " << val.length() << dendl;
4301 nkeys++;
4302 } else if (op == CEPH_OSD_TMAP_CREATE) {
4303 if (key_exists) {
4304 return -EEXIST;
4305 }
4306 bufferlist val;
4307 try {
4308 ::decode(val, bp);
4309 }
4310 catch (buffer::error& e) {
4311 return -EINVAL;
4312 }
4313 ::encode(key, newkeydata);
4314 ::encode(val, newkeydata);
4315 dout(20) << " create " << key << " " << val.length() << dendl;
4316 nkeys++;
4317 } else if (op == CEPH_OSD_TMAP_RM) {
4318 // do nothing.
4319 if (!key_exists) {
4320 return -ENOENT;
4321 }
4322 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4323 // do nothing
4324 } else {
4325 dout(10) << " invalid tmap op " << (int)op << dendl;
4326 return -EINVAL;
4327 }
4328 }
4329
4330 // copy remaining
4331 if (have_next) {
4332 ::encode(nextkey, newkeydata);
4333 ::encode(nextval, newkeydata);
4334 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4335 }
4336 if (!ip.end()) {
4337 bufferlist rest;
4338 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4339 dout(20) << " keep trailing " << rest.length()
4340 << " at " << newkeydata.length() << dendl;
4341 newkeydata.claim_append(rest);
4342 }
4343
4344 // encode final key count + key data
4345 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4346 ::encode(nkeys, obl);
4347 obl.claim_append(newkeydata);
4348
4349 if (0) {
4350 dout(30) << " final is \n";
4351 obl.hexdump(*_dout);
4352 *_dout << dendl;
4353
4354 // sanity check
4355 bufferlist::iterator tp = obl.begin();
4356 bufferlist h;
4357 ::decode(h, tp);
4358 map<string,bufferlist> d;
4359 ::decode(d, tp);
4360 assert(tp.end());
4361 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4362 }
4363
4364 // write it out
4365 if (!result) {
4366 dout(20) << "tmapput write " << obl.length() << dendl;
4367 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4368 newop.op.extent.offset = 0;
4369 newop.op.extent.length = obl.length();
4370 newop.indata = obl;
4371 do_osd_ops(ctx, nops);
4372 osd_op.outdata.claim(newop.outdata);
4373 }
4374 }
4375 return result;
4376 }
4377
4378 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4379 {
4380 if (offset >= max ||
4381 length > max ||
4382 offset + length > max)
4383 return -EFBIG;
4384
4385 return 0;
4386 }
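// (editor's note) Illustration, assuming max is the configured maximum
// object size (as at this function's typical call sites):
// check_offset_and_length(max - 1, 2, max) returns -EFBIG because the
// extent would end past the limit; fully in-range extents return 0.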
4387
4388 struct FillInVerifyExtent : public Context {
4389 ceph_le64 *r;
4390 int32_t *rval;
4391 bufferlist *outdatap;
4392 boost::optional<uint32_t> maybe_crc;
4393 uint64_t size;
4394 OSDService *osd;
4395 hobject_t soid;
4396 __le32 flags;
4397 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4398 boost::optional<uint32_t> mc, uint64_t size,
4399 OSDService *osd, hobject_t soid, __le32 flags) :
4400 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4401 size(size), osd(osd), soid(soid), flags(flags) {}
4402 void finish(int len) override {
4403 *r = len;
4404 if (len < 0) {
4405 *rval = len;
4406 return;
4407 }
4408 *rval = 0;
4409
4410 // whole object? can we verify the checksum?
4411 if (maybe_crc && *r == size) {
4412 uint32_t crc = outdatap->crc32c(-1);
4413 if (maybe_crc != crc) {
4414 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4415 << " != expected 0x" << *maybe_crc
4416 << std::dec << " on " << soid;
4417 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4418 *rval = -EIO;
4419 *r = 0;
4420 }
4421 }
4422 }
4423 }
4424 };
4425
4426 struct ToSparseReadResult : public Context {
4427 int* result;
4428 bufferlist* data_bl;
4429 uint64_t data_offset;
4430 ceph_le64* len;
4431 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4432 ceph_le64* len)
4433 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4434 void finish(int r) override {
4435 if (r < 0) {
4436 *result = r;
4437 return;
4438 }
4439 *result = 0;
4440 *len = r;
4441 bufferlist outdata;
4442 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4443 ::encode(extents, outdata);
4444 ::encode_destructively(*data_bl, outdata);
4445 data_bl->swap(outdata);
4446 }
4447 };
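// (editor's note) ToSparseReadResult repackages a plain read as a
// sparse-read reply: an extent map with a single entry followed by the data.
// A hypothetical client-side decode of the resulting buffer:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   map<uint64_t, uint64_t> extents;
//   ::decode(extents, p);        // {{data_offset, bytes_read}}
//   bufferlist data;
//   ::decode(data, p);           // the payload itself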
4448
4449 template<typename V>
4450 static string list_keys(const map<string, V>& m) {
4451 string s;
4452 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4453 if (!s.empty()) {
4454 s.push_back(',');
4455 }
4456 s.append(itr->first);
4457 }
4458 return s;
4459 }
4460
4461 template<typename T>
4462 static string list_entries(const T& m) {
4463 string s;
4464 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4465 if (!s.empty()) {
4466 s.push_back(',');
4467 }
4468 s.append(*itr);
4469 }
4470 return s;
4471 }
4472
4473 void PrimaryLogPG::maybe_create_new_object(
4474 OpContext *ctx,
4475 bool ignore_transaction)
4476 {
4477 ObjectState& obs = ctx->new_obs;
4478 if (!obs.exists) {
4479 ctx->delta_stats.num_objects++;
4480 obs.exists = true;
4481 assert(!obs.oi.is_whiteout());
4482 obs.oi.new_object();
4483 if (!ignore_transaction)
4484 ctx->op_t->create(obs.oi.soid);
4485 } else if (obs.oi.is_whiteout()) {
4486 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4487 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4488 --ctx->delta_stats.num_whiteouts;
4489 }
4490 }
4491
4492 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4493 OSDOp& osd_op;
4494
4495 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4496 }
4497
4498 int execute() override {
4499 return osd_op.rval;
4500 }
4501 };
4502
4503 struct C_ChecksumRead : public Context {
4504 PrimaryLogPG *primary_log_pg;
4505 OSDOp &osd_op;
4506 Checksummer::CSumType csum_type;
4507 bufferlist init_value_bl;
4508 ceph_le64 read_length;
4509 bufferlist read_bl;
4510 Context *fill_extent_ctx;
4511
4512 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4513 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4514 boost::optional<uint32_t> maybe_crc, uint64_t size,
4515 OSDService *osd, hobject_t soid, __le32 flags)
4516 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4517 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4518 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4519 &read_bl, maybe_crc, size,
4520 osd, soid, flags)) {
4521 }
4522 ~C_ChecksumRead() override {
4523 delete fill_extent_ctx;
4524 }
4525
4526 void finish(int r) override {
4527 fill_extent_ctx->complete(r);
4528 fill_extent_ctx = nullptr;
4529
4530 if (osd_op.rval >= 0) {
4531 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4532 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4533 &init_value_bl_it, read_bl);
4534 }
4535 }
4536 };
4537
4538 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4539 bufferlist::iterator *bl_it)
4540 {
4541 dout(20) << __func__ << dendl;
4542 bool skip_data_digest =
4543 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4544 g_conf->osd_distrust_data_digest;
4545
4546 auto& op = osd_op.op;
4547 if (op.checksum.chunk_size > 0) {
4548 if (op.checksum.length == 0) {
4549 dout(10) << __func__ << ": length required when chunk size provided"
4550 << dendl;
4551 return -EINVAL;
4552 }
4553 if (op.checksum.length % op.checksum.chunk_size != 0) {
4554 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4555 return -EINVAL;
4556 }
4557 }
4558
4559 auto& oi = ctx->new_obs.oi;
4560 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4561 // zeroed offset+length implies checksum whole object
4562 op.checksum.length = oi.size;
4563 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4564 return -EOVERFLOW;
4565 }
4566
4567 Checksummer::CSumType csum_type;
4568 switch (op.checksum.type) {
4569 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4570 csum_type = Checksummer::CSUM_XXHASH32;
4571 break;
4572 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4573 csum_type = Checksummer::CSUM_XXHASH64;
4574 break;
4575 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4576 csum_type = Checksummer::CSUM_CRC32C;
4577 break;
4578 default:
4579 dout(10) << __func__ << ": unknown crc type ("
4580 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4581 return -EINVAL;
4582 }
4583
4584 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4585 if (bl_it->get_remaining() < csum_init_value_size) {
4586 dout(10) << __func__ << ": init value not provided" << dendl;
4587 return -EINVAL;
4588 }
4589
4590 bufferlist init_value_bl;
4591 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4592 csum_init_value_size);
4593 bl_it->advance(csum_init_value_size);
4594
4595 if (pool.info.require_rollback() && op.checksum.length > 0) {
4596 // If there is a data digest and it is possible we are reading the
4597 // entire object, pass the digest.
4598 boost::optional<uint32_t> maybe_crc;
4599 if (!skip_data_digest &&
4600 oi.is_data_digest() && op.checksum.offset == 0 &&
4601 op.checksum.length >= oi.size) {
4602 maybe_crc = oi.data_digest;
4603 }
4604
4605 // async read
4606 auto& soid = oi.soid;
4607 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4608 std::move(init_value_bl), maybe_crc,
4609 oi.size, osd, soid, op.flags);
4610
4611 ctx->pending_async_reads.push_back({
4612 {op.checksum.offset, op.checksum.length, op.flags},
4613 {&checksum_ctx->read_bl, checksum_ctx}});
4614
4615 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4616 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4617 new ReadFinisher(osd_op));
4618 return -EINPROGRESS;
4619 }
4620
4621 // sync read
4622 std::vector<OSDOp> read_ops(1);
4623 auto& read_op = read_ops[0];
4624 if (op.checksum.length > 0) {
4625 read_op.op.op = CEPH_OSD_OP_READ;
4626 read_op.op.flags = op.flags;
4627 read_op.op.extent.offset = op.checksum.offset;
4628 read_op.op.extent.length = op.checksum.length;
4629 read_op.op.extent.truncate_size = 0;
4630 read_op.op.extent.truncate_seq = 0;
4631
4632 int r = do_osd_ops(ctx, read_ops);
4633 if (r < 0) {
4634 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4635 return r;
4636 }
4637 }
4638
4639 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4640 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4641 read_op.outdata);
4642 }
4643
4644 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4645 Checksummer::CSumType csum_type,
4646 bufferlist::iterator *init_value_bl_it,
4647 const bufferlist &read_bl) {
4648 dout(20) << __func__ << dendl;
4649
4650 auto& op = osd_op.op;
4651
4652 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4653 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4654 << op.checksum.length << dendl;
4655 return -EINVAL;
4656 }
4657
4658 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4659 op.checksum.chunk_size : read_bl.length());
4660 uint32_t csum_count = (csum_chunk_size > 0 ?
4661 read_bl.length() / csum_chunk_size : 0);
4662
4663 bufferlist csum;
4664 bufferptr csum_data;
4665 if (csum_count > 0) {
4666 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4667 csum_data = buffer::create(csum_value_size * csum_count);
4668 csum_data.zero();
4669 csum.append(csum_data);
4670
4671 switch (csum_type) {
4672 case Checksummer::CSUM_XXHASH32:
4673 {
4674 Checksummer::xxhash32::init_value_t init_value;
4675 ::decode(init_value, *init_value_bl_it);
4676 Checksummer::calculate<Checksummer::xxhash32>(
4677 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4678 &csum_data);
4679 }
4680 break;
4681 case Checksummer::CSUM_XXHASH64:
4682 {
4683 Checksummer::xxhash64::init_value_t init_value;
4684 ::decode(init_value, *init_value_bl_it);
4685 Checksummer::calculate<Checksummer::xxhash64>(
4686 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4687 &csum_data);
4688 }
4689 break;
4690 case Checksummer::CSUM_CRC32C:
4691 {
4692 Checksummer::crc32c::init_value_t init_value;
4693 ::decode(init_value, *init_value_bl_it);
4694 Checksummer::calculate<Checksummer::crc32c>(
4695 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4696 &csum_data);
4697 }
4698 break;
4699 default:
4700 break;
4701 }
4702 }
4703
4704 ::encode(csum_count, osd_op.outdata);
4705 osd_op.outdata.claim_append(csum);
4706 return 0;
4707 }
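// (editor's note) The reply format produced above: a __u32 csum_count
// followed by csum_count packed checksum values, each
// Checksummer::get_csum_value_size(csum_type) bytes wide. For example, a
// CRC32C request over a 16 KiB extent with chunk_size = 4 KiB yields
// csum_count = 4 and four 4-byte values; with chunk_size = 0 the whole
// extent is treated as one chunk.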
4708
4709 struct C_ExtentCmpRead : public Context {
4710 PrimaryLogPG *primary_log_pg;
4711 OSDOp &osd_op;
4712 ceph_le64 read_length;
4713 bufferlist read_bl;
4714 Context *fill_extent_ctx;
4715
4716 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4717 boost::optional<uint32_t> maybe_crc, uint64_t size,
4718 OSDService *osd, hobject_t soid, __le32 flags)
4719 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4720 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4721 &read_bl, maybe_crc, size,
4722 osd, soid, flags)) {
4723 }
4724 ~C_ExtentCmpRead() override {
4725 delete fill_extent_ctx;
4726 }
4727
4728 void finish(int r) override {
4729 if (r == -ENOENT) {
4730 osd_op.rval = 0;
4731 read_bl.clear();
4732 delete fill_extent_ctx;
4733 } else {
4734 fill_extent_ctx->complete(r);
4735 }
4736 fill_extent_ctx = nullptr;
4737
4738 if (osd_op.rval >= 0) {
4739 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4740 }
4741 }
4742 };
4743
4744 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4745 {
4746 dout(20) << __func__ << dendl;
4747 ceph_osd_op& op = osd_op.op;
4748 bool skip_data_digest =
4749 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4750 g_conf->osd_distrust_data_digest;
4751
4752 auto& oi = ctx->new_obs.oi;
4753 uint64_t size = oi.size;
4754 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4755 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4756 size = op.extent.truncate_size;
4757 }
4758
4759 if (op.extent.offset >= size) {
4760 op.extent.length = 0;
4761 } else if (op.extent.offset + op.extent.length > size) {
4762 op.extent.length = size - op.extent.offset;
4763 }
4764
4765 if (op.extent.length == 0) {
4766 dout(20) << __func__ << " zero length extent" << dendl;
4767 return finish_extent_cmp(osd_op, bufferlist{});
4768 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4769 dout(20) << __func__ << " object DNE" << dendl;
4770 return finish_extent_cmp(osd_op, {});
4771 } else if (pool.info.require_rollback()) {
4772 // If there is a data digest and it is possible we are reading the
4773 // entire object, pass the digest.
4774 boost::optional<uint32_t> maybe_crc;
4775 if (!skip_data_digest &&
4776 oi.is_data_digest() && op.checksum.offset == 0 &&
4777 op.checksum.length >= oi.size) {
4778 maybe_crc = oi.data_digest;
4779 }
4780
4781 // async read
4782 auto& soid = oi.soid;
4783 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4784 osd, soid, op.flags);
4785 ctx->pending_async_reads.push_back({
4786 {op.extent.offset, op.extent.length, op.flags},
4787 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4788
4789 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4790
4791 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4792 new ReadFinisher(osd_op));
4793 return -EINPROGRESS;
4794 }
4795
4796 // sync read
4797 vector<OSDOp> read_ops(1);
4798 OSDOp& read_op = read_ops[0];
4799
4800 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4801 read_op.op.extent.offset = op.extent.offset;
4802 read_op.op.extent.length = op.extent.length;
4803 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4804 read_op.op.extent.truncate_size = op.extent.truncate_size;
4805
4806 int result = do_osd_ops(ctx, read_ops);
4807 if (result < 0) {
4808 derr << __func__ << " failed " << result << dendl;
4809 return result;
4810 }
4811 return finish_extent_cmp(osd_op, read_op.outdata);
4812 }
4813
4814 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4815 {
4816 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4817 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4818 if (osd_op.indata[idx] != read_byte) {
4819 return (-MAX_ERRNO - idx);
4820 }
4821 }
4822
4823 return 0;
4824 }
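// (editor's note) On mismatch, finish_extent_cmp() encodes the offset of the
// first differing byte into the error value as -MAX_ERRNO - idx; a caller
// could recover it as (illustrative only, where rval is the op's return
// value):
//
//   uint64_t mismatch_off = static_cast<uint64_t>(-rval - MAX_ERRNO);
//
// Bytes beyond the end of the read buffer compare against zero.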
4825
4826 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4827 dout(20) << __func__ << dendl;
4828 auto& op = osd_op.op;
4829 auto& oi = ctx->new_obs.oi;
4830 auto& soid = oi.soid;
4831 __u32 seq = oi.truncate_seq;
4832 uint64_t size = oi.size;
4833 bool trimmed_read = false;
4834 bool skip_data_digest =
4835 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4836 g_conf->osd_distrust_data_digest;
4837
4838 // are we beyond truncate_size?
4839 if ( (seq < op.extent.truncate_seq) &&
4840 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4841 size = op.extent.truncate_size;
4842
4843 if (op.extent.length == 0) // length of zero means read the whole object
4844 op.extent.length = size;
4845
4846 if (op.extent.offset >= size) {
4847 op.extent.length = 0;
4848 trimmed_read = true;
4849 } else if (op.extent.offset + op.extent.length > size) {
4850 op.extent.length = size - op.extent.offset;
4851 trimmed_read = true;
4852 }
4853
4854 // read into a buffer
4855 int result = 0;
4856 if (trimmed_read && op.extent.length == 0) {
4857 // the read was trimmed to zero, so nothing should be done here; note
4858 // that a read of 0 bytes is *not* a no-op (it means "read the whole
4859 // object"), which is why the trimmed_read flag is needed
4860 } else if (pool.info.require_rollback()) {
4861 boost::optional<uint32_t> maybe_crc;
4862 // If there is a data digest and it is possible we are reading the
4863 // entire object, pass the digest. FillInVerifyExtent will
4864 // check the oi.size again.
4865 if (!skip_data_digest &&
4866 oi.is_data_digest() && op.extent.offset == 0 &&
4867 op.extent.length >= oi.size)
4868 maybe_crc = oi.data_digest;
4869 ctx->pending_async_reads.push_back(
4870 make_pair(
4871 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4872 make_pair(&osd_op.outdata,
4873 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4874 &osd_op.outdata, maybe_crc, oi.size,
4875 osd, soid, op.flags))));
4876 dout(10) << " async_read noted for " << soid << dendl;
4877
4878 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4879 new ReadFinisher(osd_op));
4880 } else {
4881 int r = pgbackend->objects_read_sync(
4882 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4883 // whole object? can we verify the checksum?
4884 if (!skip_data_digest && r >= 0 && op.extent.offset == 0 &&
4885 (uint64_t)r == oi.size && oi.is_data_digest()) {
4886 uint32_t crc = osd_op.outdata.crc32c(-1);
4887 if (oi.data_digest != crc) {
4888 osd->clog->error() << info.pgid << std::hex
4889 << " full-object read crc 0x" << crc
4890 << " != expected 0x" << oi.data_digest
4891 << std::dec << " on " << soid;
4892 r = -EIO; // try repair later
4893 }
4894 }
4895 if (r == -EIO) {
4896 r = rep_repair_primary_object(soid, ctx->op);
4897 }
4898 if (r >= 0)
4899 op.extent.length = r;
4900 else {
4901 result = r;
4902 op.extent.length = 0;
4903 }
4904 dout(10) << " read got " << r << " / " << op.extent.length
4905 << " bytes from obj " << soid << dendl;
4906 }
4907
4908 // XXX for an async read, op.extent.length is the requested length;
4909 // on error it is changed to 0 after the error comes back.
4910 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4911 ctx->delta_stats.num_rd++;
4912 return result;
4913 }
4914
4915 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4916 dout(20) << __func__ << dendl;
4917 auto& op = osd_op.op;
4918 auto& oi = ctx->new_obs.oi;
4919 auto& soid = oi.soid;
4920 bool skip_data_digest =
4921 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4922 g_conf->osd_distrust_data_digest;
4923
4924 if (op.extent.truncate_seq) {
4925 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4926 return -EINVAL;
4927 }
4928
4929 ++ctx->num_read;
4930 if (pool.info.ec_pool()) {
4931 // translate sparse read to a normal one if not supported
4932 uint64_t offset = op.extent.offset;
4933 uint64_t length = op.extent.length;
4934 if (offset > oi.size) {
4935 length = 0;
4936 } else if (offset + length > oi.size) {
4937 length = oi.size - offset;
4938 }
4939
4940 if (length > 0) {
4941 ctx->pending_async_reads.push_back(
4942 make_pair(
4943 boost::make_tuple(offset, length, op.flags),
4944 make_pair(
4945 &osd_op.outdata,
4946 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4947 &op.extent.length))));
4948 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4949
4950 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4951 new ReadFinisher(osd_op));
4952 } else {
4953 dout(10) << " sparse read ended up empty for " << soid << dendl;
4954 map<uint64_t, uint64_t> extents;
4955 ::encode(extents, osd_op.outdata);
4956 }
4957 } else {
4958 // read into a buffer
4959 map<uint64_t, uint64_t> m;
4960 uint32_t total_read = 0;
4961 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4962 info.pgid.shard),
4963 op.extent.offset, op.extent.length, m);
4964 if (r < 0) {
4965 return r;
4966 }
4967
4968 map<uint64_t, uint64_t>::iterator miter;
4969 bufferlist data_bl;
4970 uint64_t last = op.extent.offset;
4971 for (miter = m.begin(); miter != m.end(); ++miter) {
4972 // verify hole?
4973 if (cct->_conf->osd_verify_sparse_read_holes &&
4974 last < miter->first) {
4975 bufferlist t;
4976 uint64_t len = miter->first - last;
4977 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4978 if (r < 0) {
4979 osd->clog->error() << coll << " " << soid
4980 << " sparse-read failed to read: "
4981 << r;
4982 } else if (!t.is_zero()) {
4983 osd->clog->error() << coll << " " << soid
4984 << " sparse-read found data in hole "
4985 << last << "~" << len;
4986 }
4987 }
4988
4989 bufferlist tmpbl;
4990 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4991 op.flags, &tmpbl);
4992 if (r == -EIO) {
4993 r = rep_repair_primary_object(soid, ctx->op);
4994 }
4995 if (r < 0) {
4996 return r;
4997 }
4998
4999 // this usually happens when we get an extent that exceeds the actual
5000 // file size
5001 if (r < (int)miter->second)
5002 miter->second = r;
5003 total_read += r;
5004 dout(10) << "sparse-read " << miter->first << "@" << miter->second
5005 << dendl;
5006 data_bl.claim_append(tmpbl);
5007 last = miter->first + r;
5008 }
5009
5010 if (r < 0) {
5011 return r;
5012 }
5013
5014 // verify trailing hole?
5015 if (cct->_conf->osd_verify_sparse_read_holes) {
5016 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
5017 if (last < end) {
5018 bufferlist t;
5019 uint64_t len = end - last;
5020 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5021 if (r < 0) {
5022 osd->clog->error() << coll << " " << soid
5023 << " sparse-read failed to read: " << r;
5024 } else if (!t.is_zero()) {
5025 osd->clog->error() << coll << " " << soid
5026 << " sparse-read found data in hole "
5027 << last << "~" << len;
5028 }
5029 }
5030 }
5031
5032 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5033 // and while few objects may be fully written at first, with continued use
5034 // more and more whole objects exist, so verifying the full-object digest
5035 // on sparse reads makes sense.
5036 if (!skip_data_digest &&
5037 total_read == oi.size && oi.is_data_digest()) {
5038 uint32_t crc = data_bl.crc32c(-1);
5039 if (oi.data_digest != crc) {
5040 osd->clog->error() << info.pgid << std::hex
5041 << " full-object read crc 0x" << crc
5042 << " != expected 0x" << oi.data_digest
5043 << std::dec << " on " << soid;
5044 r = rep_repair_primary_object(soid, ctx->op);
5045 if (r < 0) {
5046 return r;
5047 }
5048 }
5049 }
5050
5051 op.extent.length = total_read;
5052
5053 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5054 ::encode_destructively(data_bl, osd_op.outdata);
5055
5056 dout(10) << " sparse_read got " << total_read << " bytes from object "
5057 << soid << dendl;
5058 }
5059
5060 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5061 ctx->delta_stats.num_rd++;
5062 return 0;
5063 }
5064
5065 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5066 {
5067 int result = 0;
5068 SnapSetContext *ssc = ctx->obc->ssc;
5069 ObjectState& obs = ctx->new_obs;
5070 object_info_t& oi = obs.oi;
5071 const hobject_t& soid = oi.soid;
5072 bool skip_data_digest =
5073 osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest;
5074
5075 PGTransaction* t = ctx->op_t.get();
5076
5077 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5078
5079 ctx->current_osd_subop_num = 0;
5080 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5081 OSDOp& osd_op = *p;
5082 ceph_osd_op& op = osd_op.op;
5083
5084 OpFinisher* op_finisher = nullptr;
5085 {
5086 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5087 if (op_finisher_it != ctx->op_finishers.end()) {
5088 op_finisher = op_finisher_it->second.get();
5089 }
5090 }
5091
5092 // TODO: check endianness (__le32 vs uint32_t, etc.)
5093 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5094 // but the code in this function seems to treat them as native-endian. What should the
5095 // tracepoints do?
5096 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5097
5098 dout(10) << "do_osd_op " << osd_op << dendl;
5099
5100 bufferlist::iterator bp = osd_op.indata.begin();
5101
5102 // user-visible modification?
5103 switch (op.op) {
5104 // non user-visible modifications
5105 case CEPH_OSD_OP_WATCH:
5106 case CEPH_OSD_OP_CACHE_EVICT:
5107 case CEPH_OSD_OP_CACHE_FLUSH:
5108 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5109 case CEPH_OSD_OP_UNDIRTY:
5110 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5111 case CEPH_OSD_OP_CACHE_PIN:
5112 case CEPH_OSD_OP_CACHE_UNPIN:
5113 case CEPH_OSD_OP_SET_REDIRECT:
5114 break;
5115 default:
5116 if (op.op & CEPH_OSD_OP_MODE_WR)
5117 ctx->user_modify = true;
5118 }
5119
5120 // munge -1 truncate to 0 truncate
5121 if (ceph_osd_op_uses_extent(op.op) &&
5122 op.extent.truncate_seq == 1 &&
5123 op.extent.truncate_size == (-1ULL)) {
5124 op.extent.truncate_size = 0;
5125 op.extent.truncate_seq = 0;
5126 }
5127
5128 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5129 if (op.op == CEPH_OSD_OP_ZERO &&
5130 obs.exists &&
5131 op.extent.offset < cct->_conf->osd_max_object_size &&
5132 op.extent.length >= 1 &&
5133 op.extent.length <= cct->_conf->osd_max_object_size &&
5134 op.extent.offset + op.extent.length >= oi.size) {
5135 if (op.extent.offset >= oi.size) {
5136 // no-op
5137 goto fail;
5138 }
5139 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5140 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5141 op.op = CEPH_OSD_OP_TRUNCATE;
5142 }
5143
5144 switch (op.op) {
5145
5146 // --- READS ---
5147
5148 case CEPH_OSD_OP_CMPEXT:
5149 ++ctx->num_read;
5150 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5151 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5152 op.extent.length, op.extent.truncate_size,
5153 op.extent.truncate_seq);
5154
5155 if (op_finisher == nullptr) {
5156 result = do_extent_cmp(ctx, osd_op);
5157 } else {
5158 result = op_finisher->execute();
5159 }
5160 break;
5161
5162 case CEPH_OSD_OP_SYNC_READ:
5163 if (pool.info.require_rollback()) {
5164 result = -EOPNOTSUPP;
5165 break;
5166 }
5167 // fall through
5168 case CEPH_OSD_OP_READ:
5169 ++ctx->num_read;
5170 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5171 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5172 op.extent.length, op.extent.truncate_size,
5173 op.extent.truncate_seq);
5174 if (op_finisher == nullptr) {
5175 if (!ctx->data_off) {
5176 ctx->data_off = op.extent.offset;
5177 }
5178 result = do_read(ctx, osd_op);
5179 } else {
5180 result = op_finisher->execute();
5181 }
5182 break;
5183
5184 case CEPH_OSD_OP_CHECKSUM:
5185 ++ctx->num_read;
5186 {
5187 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5188 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5189 op.checksum.offset, op.checksum.length,
5190 op.checksum.chunk_size);
5191
5192 if (op_finisher == nullptr) {
5193 result = do_checksum(ctx, osd_op, &bp);
5194 } else {
5195 result = op_finisher->execute();
5196 }
5197 }
5198 break;
5199
5200 /* map extents */
5201 case CEPH_OSD_OP_MAPEXT:
5202 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5203 if (pool.info.require_rollback()) {
5204 result = -EOPNOTSUPP;
5205 break;
5206 }
5207 ++ctx->num_read;
5208 {
5209 // read into a buffer
5210 bufferlist bl;
5211 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5212 info.pgid.shard),
5213 op.extent.offset, op.extent.length, bl);
5214 osd_op.outdata.claim(bl);
5215 if (r < 0)
5216 result = r;
5217 else
5218 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5219 ctx->delta_stats.num_rd++;
5220 dout(10) << " map_extents done on object " << soid << dendl;
5221 }
5222 break;
5223
5224 /* map extents */
5225 case CEPH_OSD_OP_SPARSE_READ:
5226 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5227 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5228 op.extent.length, op.extent.truncate_size,
5229 op.extent.truncate_seq);
5230 if (op_finisher == nullptr) {
5231 result = do_sparse_read(ctx, osd_op);
5232 } else {
5233 result = op_finisher->execute();
5234 }
5235 break;
5236
5237 case CEPH_OSD_OP_CALL:
5238 {
5239 string cname, mname;
5240 bufferlist indata;
5241 try {
5242 bp.copy(op.cls.class_len, cname);
5243 bp.copy(op.cls.method_len, mname);
5244 bp.copy(op.cls.indata_len, indata);
5245 } catch (buffer::error& e) {
5246 dout(10) << "call unable to decode class + method + indata" << dendl;
5247 dout(30) << "in dump: ";
5248 osd_op.indata.hexdump(*_dout);
5249 *_dout << dendl;
5250 result = -EINVAL;
5251 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5252 break;
5253 }
5254 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5255
5256 ClassHandler::ClassData *cls;
5257 result = osd->class_handler->open_class(cname, &cls);
5258 assert(result == 0); // init_op_flags() already verified this works.
5259
5260 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5261 if (!method) {
5262 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5263 result = -EOPNOTSUPP;
5264 break;
5265 }
5266
5267 int flags = method->get_flags();
5268 if (flags & CLS_METHOD_WR)
5269 ctx->user_modify = true;
5270
5271 bufferlist outdata;
5272 dout(10) << "call method " << cname << "." << mname << dendl;
5273 int prev_rd = ctx->num_read;
5274 int prev_wr = ctx->num_write;
5275 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5276
5277 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5278 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5279 result = -EIO;
5280 break;
5281 }
5282 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5283 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5284 result = -EIO;
5285 break;
5286 }
5287
5288 dout(10) << "method called response length=" << outdata.length() << dendl;
5289 op.extent.length = outdata.length();
5290 osd_op.outdata.claim_append(outdata);
5291 dout(30) << "out dump: ";
5292 osd_op.outdata.hexdump(*_dout);
5293 *_dout << dendl;
5294 }
5295 break;
5296
5297 case CEPH_OSD_OP_STAT:
5298 // note: stat does not require RD
5299 {
5300 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5301
5302 if (obs.exists && !oi.is_whiteout()) {
5303 ::encode(oi.size, osd_op.outdata);
5304 ::encode(oi.mtime, osd_op.outdata);
5305 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5306 } else {
5307 result = -ENOENT;
5308 dout(10) << "stat oi object does not exist" << dendl;
5309 }
5310
5311 ctx->delta_stats.num_rd++;
5312 }
5313 break;
5314
5315 case CEPH_OSD_OP_ISDIRTY:
5316 ++ctx->num_read;
5317 {
5318 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5319 bool is_dirty = obs.oi.is_dirty();
5320 ::encode(is_dirty, osd_op.outdata);
5321 ctx->delta_stats.num_rd++;
5322 result = 0;
5323 }
5324 break;
5325
5326 case CEPH_OSD_OP_UNDIRTY:
5327 ++ctx->num_write;
5328 {
5329 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5330 if (oi.is_dirty()) {
5331 ctx->undirty = true; // see make_writeable()
5332 ctx->modify = true;
5333 ctx->delta_stats.num_wr++;
5334 }
5335 result = 0;
5336 }
5337 break;
5338
5339 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5340 ++ctx->num_write;
5341 {
5342 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5343 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5344 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5345 result = -EINVAL;
5346 break;
5347 }
5348 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5349 result = -EINVAL;
5350 break;
5351 }
5352 if (!obs.exists) {
5353 result = 0;
5354 break;
5355 }
5356 if (oi.is_cache_pinned()) {
5357 dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
5358 result = -EPERM;
5359 break;
5360 }
5361 if (oi.is_dirty()) {
5362 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5363 if (result == -EINPROGRESS)
5364 result = -EAGAIN;
5365 } else {
5366 result = 0;
5367 }
5368 }
5369 break;
5370
5371 case CEPH_OSD_OP_CACHE_FLUSH:
5372 ++ctx->num_write;
5373 {
5374 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5375 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5376 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5377 result = -EINVAL;
5378 break;
5379 }
5380 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5381 result = -EINVAL;
5382 break;
5383 }
5384 if (!obs.exists) {
5385 result = 0;
5386 break;
5387 }
5388 if (oi.is_cache_pinned()) {
5389 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
5390 result = -EPERM;
5391 break;
5392 }
5393 hobject_t missing;
5394 if (oi.is_dirty()) {
5395 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5396 if (result == -EINPROGRESS)
5397 result = -EAGAIN;
5398 } else {
5399 result = 0;
5400 }
5401 // Check for the special return value indicating that 'missing' was set
5402 if (result == -ENOENT) {
5403 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5404 assert(!missing.is_min());
5405 wait_for_unreadable_object(missing, ctx->op);
5406 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5407 result = -EAGAIN;
5408 }
5409 }
5410 break;
5411
5412 case CEPH_OSD_OP_CACHE_EVICT:
5413 ++ctx->num_write;
5414 {
5415 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5416 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5417 result = -EINVAL;
5418 break;
5419 }
5420 if (!obs.exists) {
5421 result = 0;
5422 break;
5423 }
5424 if (oi.is_cache_pinned()) {
5425 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
5426 result = -EPERM;
5427 break;
5428 }
5429 if (oi.is_dirty()) {
5430 result = -EBUSY;
5431 break;
5432 }
5433 if (!oi.watchers.empty()) {
5434 result = -EBUSY;
5435 break;
5436 }
5437 if (soid.snap == CEPH_NOSNAP) {
5438 result = _verify_no_head_clones(soid, ssc->snapset);
5439 if (result < 0)
5440 break;
5441 }
5442 result = _delete_oid(ctx, true, false);
5443 if (result >= 0) {
5444 // mark that this is a cache eviction to avoid triggering normal
5445 // make_writeable() clone or snapdir object creation in finish_ctx()
5446 ctx->cache_evict = true;
5447 }
5448 osd->logger->inc(l_osd_tier_evict);
5449 }
5450 break;
5451
5452 case CEPH_OSD_OP_GETXATTR:
5453 ++ctx->num_read;
5454 {
5455 string aname;
5456 bp.copy(op.xattr.name_len, aname);
5457 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5458 string name = "_" + aname;
5459 int r = getattr_maybe_cache(
5460 ctx->obc,
5461 name,
5462 &(osd_op.outdata));
5463 if (r >= 0) {
5464 op.xattr.value_len = osd_op.outdata.length();
5465 result = 0;
5466 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5467 } else
5468 result = r;
5469
5470 ctx->delta_stats.num_rd++;
5471 }
5472 break;
5473
5474 case CEPH_OSD_OP_GETXATTRS:
5475 ++ctx->num_read;
5476 {
5477 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5478 map<string, bufferlist> out;
5479 result = getattrs_maybe_cache(
5480 ctx->obc,
5481 &out);
5482
5483 bufferlist bl;
5484 ::encode(out, bl);
5485 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5486 ctx->delta_stats.num_rd++;
5487 osd_op.outdata.claim_append(bl);
5488 }
5489 break;
5490
5491 case CEPH_OSD_OP_CMPXATTR:
5492 ++ctx->num_read;
5493 {
5494 string aname;
5495 bp.copy(op.xattr.name_len, aname);
5496 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5497 string name = "_" + aname;
5498 name[op.xattr.name_len + 1] = 0;
5499
5500 bufferlist xattr;
5501 result = getattr_maybe_cache(
5502 ctx->obc,
5503 name,
5504 &xattr);
5505 if (result < 0 && result != -EEXIST && result != -ENODATA)
5506 break;
5507
5508 ctx->delta_stats.num_rd++;
5509 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5510
5511 switch (op.xattr.cmp_mode) {
5512 case CEPH_OSD_CMPXATTR_MODE_STRING:
5513 {
5514 string val;
5515 bp.copy(op.xattr.value_len, val);
5516 val[op.xattr.value_len] = 0;
5517 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5518 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5519 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5520 }
5521 break;
5522
5523 case CEPH_OSD_CMPXATTR_MODE_U64:
5524 {
5525 uint64_t u64val;
5526 try {
5527 ::decode(u64val, bp);
5528 }
5529 catch (buffer::error& e) {
5530 result = -EINVAL;
5531 goto fail;
5532 }
5533 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5534 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5535 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5536 }
5537 break;
5538
5539 default:
5540 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5541 result = -EINVAL;
5542 }
5543
5544 if (!result) {
5545 dout(10) << "comparison returned false" << dendl;
5546 result = -ECANCELED;
5547 break;
5548 }
5549 if (result < 0) {
5550 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5551 break;
5552 }
5553
5554 dout(10) << "comparison returned true" << dendl;
5555 }
5556 break;
5557
5558 case CEPH_OSD_OP_ASSERT_VER:
5559 ++ctx->num_read;
5560 {
5561 uint64_t ver = op.assert_ver.ver;
5562 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5563 if (!ver)
5564 result = -EINVAL;
5565 else if (ver < oi.user_version)
5566 result = -ERANGE;
5567 else if (ver > oi.user_version)
5568 result = -EOVERFLOW;
5569 }
5570 break;
5571
5572 case CEPH_OSD_OP_LIST_WATCHERS:
5573 ++ctx->num_read;
5574 {
5575 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5576 obj_list_watch_response_t resp;
5577
5578 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5579 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5580 ++oi_iter) {
5581 dout(20) << "key cookie=" << oi_iter->first.first
5582 << " entity=" << oi_iter->first.second << " "
5583 << oi_iter->second << dendl;
5584 assert(oi_iter->first.first == oi_iter->second.cookie);
5585 assert(oi_iter->first.second.is_client());
5586
5587 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5588 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5589 resp.entries.push_back(wi);
5590 }
5591
5592 resp.encode(osd_op.outdata, ctx->get_features());
5593 result = 0;
5594
5595 ctx->delta_stats.num_rd++;
5596 break;
5597 }
5598
5599 case CEPH_OSD_OP_LIST_SNAPS:
5600 ++ctx->num_read;
5601 {
5602 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5603 obj_list_snap_response_t resp;
5604
5605 if (!ssc) {
5606 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5607 }
5608 assert(ssc);
5609
5610 int clonecount = ssc->snapset.clones.size();
5611 if (ssc->snapset.head_exists)
5612 clonecount++;
5613 resp.clones.reserve(clonecount);
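// Build one clone_info entry per clone (oldest first), each carrying its
// snap ids, its data overlap with the next newer version, and its size;
// the head, if present, is appended last with cloneid CEPH_NOSNAP.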
5614 for (auto clone_iter = ssc->snapset.clones.begin();
5615 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5616 clone_info ci;
5617 ci.cloneid = *clone_iter;
5618
5619 hobject_t clone_oid = soid;
5620 clone_oid.snap = *clone_iter;
5621
5622 if (!ssc->snapset.is_legacy()) {
5623 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5624 if (p == ssc->snapset.clone_snaps.end()) {
5625 osd->clog->error() << "osd." << osd->whoami
5626 << ": inconsistent clone_snaps found for oid "
5627 << soid << " clone " << *clone_iter
5628 << " snapset " << ssc->snapset;
5629 result = -EINVAL;
5630 break;
5631 }
5632 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5633 ci.snaps.push_back(*q);
5634 }
5635 } else {
5636 /* No need to take a lock here. We are only inspecting state cached
5637 * in the ObjectContext, so we aren't performing an actual read unless
5638 * the clone obc is not already loaded (in which case, it cannot have
5639 * an in-progress write). We also do not risk exposing uncommitted
5640 * state since we do have a read lock on the head object or snapdir,
5641 * which we would have to write lock in order to make user-visible
5642 * modifications to the snapshot state (snap trim related mutations
5643 * are not user visible).
5644 */
5645 if (is_missing_object(clone_oid)) {
5646 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5647 wait_for_unreadable_object(clone_oid, ctx->op);
5648 result = -EAGAIN;
5649 break;
5650 }
5651
5652 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5653 if (!clone_obc) {
5654 if (maybe_handle_cache(
5655 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5656 // promoting the clone
5657 result = -EAGAIN;
5658 } else {
5659 osd->clog->error() << "osd." << osd->whoami
5660 << ": missing clone " << clone_oid
5661 << " for oid "
5662 << soid;
5663 // should not happen
5664 result = -ENOENT;
5665 }
5666 break;
5667 }
5668 for (vector<snapid_t>::reverse_iterator p =
5669 clone_obc->obs.oi.legacy_snaps.rbegin();
5670 p != clone_obc->obs.oi.legacy_snaps.rend();
5671 ++p) {
5672 ci.snaps.push_back(*p);
5673 }
5674 }
5675
5676 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5677
5678 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5679 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5680 if (coi == ssc->snapset.clone_overlap.end()) {
5681 osd->clog->error() << "osd." << osd->whoami
5682 << ": inconsistent clone_overlap found for oid "
5683 << soid << " clone " << *clone_iter;
5684 result = -EINVAL;
5685 break;
5686 }
5687 const interval_set<uint64_t> &o = coi->second;
5688 ci.overlap.reserve(o.num_intervals());
5689 for (interval_set<uint64_t>::const_iterator r = o.begin();
5690 r != o.end(); ++r) {
5691 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5692 r.get_len()));
5693 }
5694
5695 map<snapid_t, uint64_t>::const_iterator si;
5696 si = ssc->snapset.clone_size.find(ci.cloneid);
5697 if (si == ssc->snapset.clone_size.end()) {
5698 osd->clog->error() << "osd." << osd->whoami
5699 << ": inconsistent clone_size found for oid "
5700 << soid << " clone " << *clone_iter;
5701 result = -EINVAL;
5702 break;
5703 }
5704 ci.size = si->second;
5705
5706 resp.clones.push_back(ci);
5707 }
5708 if (result < 0) {
5709 break;
5710 }
5711 if (ssc->snapset.head_exists &&
5712 !ctx->obc->obs.oi.is_whiteout()) {
5713 assert(obs.exists);
5714 clone_info ci;
5715 ci.cloneid = CEPH_NOSNAP;
5716
5717 // Size for HEAD is oi.size
5718 ci.size = oi.size;
5719
5720 resp.clones.push_back(ci);
5721 }
5722 resp.seq = ssc->snapset.seq;
5723
5724 resp.encode(osd_op.outdata);
5725 result = 0;
5726
5727 ctx->delta_stats.num_rd++;
5728 break;
5729 }
5730
5731 case CEPH_OSD_OP_NOTIFY:
5732 ++ctx->num_read;
5733 {
5734 uint32_t timeout;
5735 bufferlist bl;
5736
5737 try {
5738 uint32_t ver; // obsolete
5739 ::decode(ver, bp);
5740 ::decode(timeout, bp);
5741 ::decode(bl, bp);
5742 } catch (const buffer::error &e) {
5743 timeout = 0;
5744 }
5745 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5746 if (!timeout)
5747 timeout = cct->_conf->osd_default_notify_timeout;
5748
5749 notify_info_t n;
5750 n.timeout = timeout;
5751 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5752 n.cookie = op.watch.cookie;
5753 n.bl = bl;
5754 ctx->notifies.push_back(n);
5755
5756 // return our unique notify id to the client
5757 ::encode(n.notify_id, osd_op.outdata);
5758 }
5759 break;
5760
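// NOTIFY_ACK is the companion to NOTIFY: the client echoes the notify_id
// (and optionally the watch cookie and a reply payload) so the OSD can
// complete the in-flight notify. Older clients encode nothing in the
// payload and pass the notify_id in op.watch.cookie instead.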
5761 case CEPH_OSD_OP_NOTIFY_ACK:
5762 ++ctx->num_read;
5763 {
5764 try {
5765 uint64_t notify_id = 0;
5766 uint64_t watch_cookie = 0;
5767 ::decode(notify_id, bp);
5768 ::decode(watch_cookie, bp);
5769 bufferlist reply_bl;
5770 if (!bp.end()) {
5771 ::decode(reply_bl, bp);
5772 }
5773 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5774 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5775 ctx->notify_acks.push_back(ack);
5776 } catch (const buffer::error &e) {
5777 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5778 OpContext::NotifyAck ack(
5779 // op.watch.cookie is actually the notify_id for historical reasons
5780 op.watch.cookie
5781 );
5782 ctx->notify_acks.push_back(ack);
5783 }
5784 }
5785 break;
5786
5787 case CEPH_OSD_OP_SETALLOCHINT:
5788 ++ctx->num_write;
5789 {
5790 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5791 maybe_create_new_object(ctx);
5792 oi.expected_object_size = op.alloc_hint.expected_object_size;
5793 oi.expected_write_size = op.alloc_hint.expected_write_size;
5794 oi.alloc_hint_flags = op.alloc_hint.flags;
5795 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5796 op.alloc_hint.expected_write_size,
5797 op.alloc_hint.flags);
5798 ctx->delta_stats.num_wr++;
5799 result = 0;
5800 }
5801 break;
5802
5803
5804 // --- WRITES ---
5805
5806 // -- object data --
5807
5808 case CEPH_OSD_OP_WRITE:
5809 ++ctx->num_write;
5810 { // write
5811 __u32 seq = oi.truncate_seq;
5812 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5813 if (op.extent.length != osd_op.indata.length()) {
5814 result = -EINVAL;
5815 break;
5816 }
5817
5818 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5819 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5820
5821 if (pool.info.requires_aligned_append() &&
5822 (op.extent.offset % pool.info.required_alignment() != 0)) {
5823 result = -EOPNOTSUPP;
5824 break;
5825 }
5826
5827 if (!obs.exists) {
5828 if (pool.info.requires_aligned_append() && op.extent.offset) {
5829 result = -EOPNOTSUPP;
5830 break;
5831 }
5832 } else if (op.extent.offset != oi.size &&
5833 pool.info.requires_aligned_append()) {
5834 result = -EOPNOTSUPP;
5835 break;
5836 }
5837
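// Reconcile the write with any racing truncate using truncate_seq.
// A worked example (illustrative): if the object was truncated with
// seq 5 and a write stamped with seq 4 arrives afterwards, the write is
// clipped so it cannot resurrect bytes past the truncated size;
// conversely, a write stamped with seq 6 applies its truncate first and
// then writes.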
5838 if (seq && (seq > op.extent.truncate_seq) &&
5839 (op.extent.offset + op.extent.length > oi.size)) {
5840 // old write, arrived after trimtrunc
5841 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5842 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5843 << ", adjusting write length to " << op.extent.length << dendl;
5844 bufferlist t;
5845 t.substr_of(osd_op.indata, 0, op.extent.length);
5846 osd_op.indata.swap(t);
5847 }
5848 if (op.extent.truncate_seq > seq) {
5849 // write arrives before trimtrunc
5850 if (obs.exists && !oi.is_whiteout()) {
5851 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5852 << ", truncating to " << op.extent.truncate_size << dendl;
5853 t->truncate(soid, op.extent.truncate_size);
5854 oi.truncate_seq = op.extent.truncate_seq;
5855 oi.truncate_size = op.extent.truncate_size;
5856 if (op.extent.truncate_size != oi.size) {
5857 ctx->delta_stats.num_bytes -= oi.size;
5858 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5859 oi.size = op.extent.truncate_size;
5860 }
5861 } else {
5862 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5863 << ", but object is new" << dendl;
5864 oi.truncate_seq = op.extent.truncate_seq;
5865 oi.truncate_size = op.extent.truncate_size;
5866 }
5867 }
5868 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5869 if (result < 0)
5870 break;
5871
5872 maybe_create_new_object(ctx);
5873
5874 if (op.extent.length == 0) {
5875 if (op.extent.offset > oi.size) {
5876 t->truncate(
5877 soid, op.extent.offset);
5878 } else {
5879 t->nop(soid);
5880 }
5881 } else {
5882 t->write(
5883 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5884 }
5885
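// Maintain the whole-object data digest where we can do so cheaply: a
// full overwrite establishes a fresh crc32c, an append to a
// digest-bearing object extends the running crc, and any other partial
// write invalidates the digest.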
5886 if (op.extent.offset == 0 && op.extent.length >= oi.size
5887 && !skip_data_digest) {
5888 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5889 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
5890 if (skip_data_digest) {
5891 obs.oi.clear_data_digest();
5892 } else {
5893 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5894 }
5895 } else {
5896 obs.oi.clear_data_digest();
5897 }
5898 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5899 op.extent.offset, op.extent.length);
5900
5901 }
5902 break;
5903
5904 case CEPH_OSD_OP_WRITEFULL:
5905 ++ctx->num_write;
5906 { // write full object
5907 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5908
5909 if (op.extent.length != osd_op.indata.length()) {
5910 result = -EINVAL;
5911 break;
5912 }
5913 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5914 if (result < 0)
5915 break;
5916
5917 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5918 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5919
5920 maybe_create_new_object(ctx);
5921 if (pool.info.require_rollback()) {
5922 t->truncate(soid, 0);
5923 } else if (obs.exists && op.extent.length < oi.size) {
5924 t->truncate(soid, op.extent.length);
5925 }
5926 if (op.extent.length) {
5927 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5928 }
5929 if (!skip_data_digest) {
5930 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5931 } else {
5932 obs.oi.clear_data_digest();
5933 }
5934
5935 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5936 0, op.extent.length, true);
5937 }
5938 break;
5939
5940 case CEPH_OSD_OP_WRITESAME:
5941 ++ctx->num_write;
5942 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5943 result = do_writesame(ctx, osd_op);
5944 break;
5945
5946 case CEPH_OSD_OP_ROLLBACK :
5947 ++ctx->num_write;
5948 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5949 result = _rollback_to(ctx, op);
5950 break;
5951
5952 case CEPH_OSD_OP_ZERO:
5953 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5954 if (pool.info.requires_aligned_append()) {
5955 result = -EOPNOTSUPP;
5956 break;
5957 }
5958 ++ctx->num_write;
5959 { // zero
5960 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5961 if (result < 0)
5962 break;
5963 assert(op.extent.length);
5964 if (obs.exists && !oi.is_whiteout()) {
5965 t->zero(soid, op.extent.offset, op.extent.length);
5966 interval_set<uint64_t> ch;
5967 ch.insert(op.extent.offset, op.extent.length);
5968 ctx->modified_ranges.union_of(ch);
5969 ctx->delta_stats.num_wr++;
5970 oi.clear_data_digest();
5971 } else {
5972 // no-op
5973 }
5974 }
5975 break;
5976 case CEPH_OSD_OP_CREATE:
5977 ++ctx->num_write;
5978 {
5979 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5980 int flags = le32_to_cpu(op.flags);
5981 if (obs.exists && !oi.is_whiteout() &&
5982 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5983 result = -EEXIST; /* this is an exclusive create */
5984 } else {
5985 if (osd_op.indata.length()) {
5986 bufferlist::iterator p = osd_op.indata.begin();
5987 string category;
5988 try {
5989 ::decode(category, p);
5990 }
5991 catch (buffer::error& e) {
5992 result = -EINVAL;
5993 goto fail;
5994 }
5995 // category is no longer implemented.
5996 }
5997 if (result >= 0) {
5998 maybe_create_new_object(ctx);
5999 t->nop(soid);
6000 }
6001 }
6002 }
6003 break;
6004
6005 case CEPH_OSD_OP_TRIMTRUNC:
6006 op.extent.offset = op.extent.truncate_size;
6007 // fall through
6008
6009 case CEPH_OSD_OP_TRUNCATE:
6010 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6011 if (pool.info.requires_aligned_append()) {
6012 result = -EOPNOTSUPP;
6013 break;
6014 }
6015 ++ctx->num_write;
6016 {
6017 // truncate
6018 if (!obs.exists || oi.is_whiteout()) {
6019 dout(10) << " object dne, truncate is a no-op" << dendl;
6020 break;
6021 }
6022
6023 if (op.extent.offset > cct->_conf->osd_max_object_size) {
6024 result = -EFBIG;
6025 break;
6026 }
6027
6028 if (op.extent.truncate_seq) {
6029 assert(op.extent.offset == op.extent.truncate_size);
6030 if (op.extent.truncate_seq <= oi.truncate_seq) {
6031 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6032 << ", no-op" << dendl;
6033 break; // old
6034 }
6035 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6036 << ", truncating" << dendl;
6037 oi.truncate_seq = op.extent.truncate_seq;
6038 oi.truncate_size = op.extent.truncate_size;
6039 }
6040
6041 maybe_create_new_object(ctx);
6042 t->truncate(soid, op.extent.offset);
6043 if (oi.size > op.extent.offset) {
6044 interval_set<uint64_t> trim;
6045 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6046 ctx->modified_ranges.union_of(trim);
6047 }
6048 if (op.extent.offset != oi.size) {
6049 ctx->delta_stats.num_bytes -= oi.size;
6050 ctx->delta_stats.num_bytes += op.extent.offset;
6051 oi.size = op.extent.offset;
6052 }
6053 ctx->delta_stats.num_wr++;
6054 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6055
6056 oi.clear_data_digest();
6057 }
6058 break;
6059
6060 case CEPH_OSD_OP_DELETE:
6061 ++ctx->num_write;
6062 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6063 {
6064 result = _delete_oid(ctx, false, ctx->ignore_cache);
6065 }
6066 break;
6067
6068 case CEPH_OSD_OP_WATCH:
6069 ++ctx->num_write;
6070 {
6071 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6072 op.watch.cookie, op.watch.op);
6073 if (!obs.exists) {
6074 result = -ENOENT;
6075 break;
6076 }
6077 uint64_t cookie = op.watch.cookie;
6078 entity_name_t entity = ctx->reqid.name;
6079 ObjectContextRef obc = ctx->obc;
6080
6081 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6082 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6083 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6084 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6085 dout(10) << "watch: peer_addr="
6086 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6087
6088 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6089 if (op.watch.timeout != 0) {
6090 timeout = op.watch.timeout;
6091 }
6092
6093 watch_info_t w(cookie, timeout,
6094 ctx->op->get_req()->get_connection()->get_peer_addr());
6095 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6096 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6097 if (oi.watchers.count(make_pair(cookie, entity))) {
6098 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6099 } else {
6100 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6101 oi.watchers[make_pair(cookie, entity)] = w;
6102 t->nop(soid); // make sure we update the object_info on disk!
6103 }
6104 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6105 ctx->watch_connects.push_back(make_pair(w, will_ping));
6106 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6107 if (!oi.watchers.count(make_pair(cookie, entity))) {
6108 result = -ENOTCONN;
6109 break;
6110 }
6111 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6112 ctx->watch_connects.push_back(make_pair(w, true));
6113 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6114 /* Note: WATCH with PING doesn't cause may_write() to return true,
6115 * so if there is nothing else in the transaction, this is going
6116 * to run do_osd_op_effects, but not write out a log entry */
6117 if (!oi.watchers.count(make_pair(cookie, entity))) {
6118 result = -ENOTCONN;
6119 break;
6120 }
6121 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6122 obc->watchers.find(make_pair(cookie, entity));
6123 if (p == obc->watchers.end() ||
6124 !p->second->is_connected()) {
6125 // client needs to reconnect
6126 result = -ETIMEDOUT;
6127 break;
6128 }
6129 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6130 p->second->got_ping(ceph_clock_now());
6131 result = 0;
6132 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6133 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6134 oi.watchers.find(make_pair(cookie, entity));
6135 if (oi_iter != oi.watchers.end()) {
6136 dout(10) << " removed watch " << oi_iter->second << " by "
6137 << entity << dendl;
6138 oi.watchers.erase(oi_iter);
6139 t->nop(soid); // update oi on disk
6140 ctx->watch_disconnects.push_back(
6141 watch_disconnect_t(cookie, entity, false));
6142 } else {
6143 dout(10) << " can't remove: no watch by " << entity << dendl;
6144 }
6145 }
6146 }
6147 break;
6148
6149 case CEPH_OSD_OP_CACHE_PIN:
6150 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6151 if ((!pool.info.is_tier() ||
6152 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6153 result = -EINVAL;
6154 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6155 break;
6156 }
6157 ++ctx->num_write;
6158 {
6159 if (!obs.exists || oi.is_whiteout()) {
6160 result = -ENOENT;
6161 break;
6162 }
6163
6164 if (!oi.is_cache_pinned()) {
6165 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6166 ctx->modify = true;
6167 ctx->delta_stats.num_objects_pinned++;
6168 ctx->delta_stats.num_wr++;
6169 }
6170 result = 0;
6171 }
6172 break;
6173
6174 case CEPH_OSD_OP_CACHE_UNPIN:
6175 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6176 if ((!pool.info.is_tier() ||
6177 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6178 result = -EINVAL;
6179 dout(10) << " unpin object is only allowed on the cache tier " << dendl;
6180 break;
6181 }
6182 ++ctx->num_write;
6183 {
6184 if (!obs.exists || oi.is_whiteout()) {
6185 result = -ENOENT;
6186 break;
6187 }
6188
6189 if (oi.is_cache_pinned()) {
6190 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6191 ctx->modify = true;
6192 ctx->delta_stats.num_objects_pinned--;
6193 ctx->delta_stats.num_wr++;
6194 }
6195 result = 0;
6196 }
6197 break;
6198
6199 case CEPH_OSD_OP_SET_REDIRECT:
6200 ++ctx->num_write;
6201 {
6202 if (pool.info.is_tier()) {
6203 result = -EINVAL;
6204 break;
6205 }
6206 if (!obs.exists) {
6207 result = -ENOENT;
6208 break;
6209 }
6210 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6211 result = -EOPNOTSUPP;
6212 break;
6213 }
6214
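// SET_REDIRECT converts this object into a zero-length manifest stub:
// the target (decoded below) is recorded in the object_info manifest,
// and the local data, omap, and xattrs are discarded.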
6215 object_t target_name;
6216 object_locator_t target_oloc;
6217 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6218 version_t target_version = op.copy_from.src_version;
6219 try {
6220 ::decode(target_name, bp);
6221 ::decode(target_oloc, bp);
6222 }
6223 catch (buffer::error& e) {
6224 result = -EINVAL;
6225 goto fail;
6226 }
6227 pg_t raw_pg;
6228 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6229 hobject_t target(target_name, target_oloc.key, target_snapid,
6230 raw_pg.ps(), raw_pg.pool(),
6231 target_oloc.nspace);
6232 if (target == soid) {
6233 dout(20) << " set-redirect self is invalid" << dendl;
6234 result = -EINVAL;
6235 break;
6236 }
6237 oi.set_flag(object_info_t::FLAG_MANIFEST);
6238 oi.manifest.redirect_target = target;
6239 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6240 t->truncate(soid, 0);
6241 if (oi.is_omap() && pool.info.supports_omap()) {
6242 t->omap_clear(soid);
6243 obs.oi.clear_omap_digest();
6244 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6245 }
6246 ctx->delta_stats.num_bytes -= oi.size;
6247 oi.size = 0;
6248 oi.new_object();
6249 oi.user_version = target_version;
6250 ctx->user_at_version = target_version;
6251 /* rm_attrs */
6252 map<string,bufferlist> rmattrs;
6253 result = getattrs_maybe_cache(ctx->obc,
6254 &rmattrs);
6255 if (result < 0) {
6256 return result;
6257 }
6258 map<string, bufferlist>::iterator iter;
6259 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6260 const string& name = iter->first;
6261 t->rmattr(soid, name);
6262 }
6263 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6264 }
6265
6266 break;
6267
6268 // -- object attrs --
6269
6270 case CEPH_OSD_OP_SETXATTR:
6271 ++ctx->num_write;
6272 {
6273 if (cct->_conf->osd_max_attr_size > 0 &&
6274 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6275 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6276 result = -EFBIG;
6277 break;
6278 }
6279 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6280 cct->_conf->osd_max_attr_name_len);
6281 if (op.xattr.name_len > max_name_len) {
6282 result = -ENAMETOOLONG;
6283 break;
6284 }
6285 maybe_create_new_object(ctx);
6286 string aname;
6287 bp.copy(op.xattr.name_len, aname);
6288 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6289 string name = "_" + aname;
6290 bufferlist bl;
6291 bp.copy(op.xattr.value_len, bl);
6292 t->setattr(soid, name, bl);
6293 ctx->delta_stats.num_wr++;
6294 }
6295 break;
6296
6297 case CEPH_OSD_OP_RMXATTR:
6298 ++ctx->num_write;
6299 {
6300 string aname;
6301 bp.copy(op.xattr.name_len, aname);
6302 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6303 if (!obs.exists || oi.is_whiteout()) {
6304 result = -ENOENT;
6305 break;
6306 }
6307 string name = "_" + aname;
6308 t->rmattr(soid, name);
6309 ctx->delta_stats.num_wr++;
6310 }
6311 break;
6312
6313
6314 // -- fancy writers --
6315 case CEPH_OSD_OP_APPEND:
6316 {
6317 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6318 // just do it inline; this works because we are happy to execute
6319 // fancy op on replicas as well.
6320 vector<OSDOp> nops(1);
6321 OSDOp& newop = nops[0];
6322 newop.op.op = CEPH_OSD_OP_WRITE;
6323 newop.op.extent.offset = oi.size;
6324 newop.op.extent.length = op.extent.length;
6325 newop.op.extent.truncate_seq = oi.truncate_seq;
6326 newop.indata = osd_op.indata;
6327 result = do_osd_ops(ctx, nops);
6328 osd_op.outdata.claim(newop.outdata);
6329 }
6330 break;
6331
6332 case CEPH_OSD_OP_STARTSYNC:
6333 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6334 t->nop(soid);
6335 break;
6336
6337
6338 // -- trivial map --
6339 case CEPH_OSD_OP_TMAPGET:
6340 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6341 if (pool.info.require_rollback()) {
6342 result = -EOPNOTSUPP;
6343 break;
6344 }
6345 {
6346 vector<OSDOp> nops(1);
6347 OSDOp& newop = nops[0];
6348 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6349 newop.op.extent.offset = 0;
6350 newop.op.extent.length = 0;
6351 do_osd_ops(ctx, nops);
6352 osd_op.outdata.claim(newop.outdata);
6353 }
6354 break;
6355
6356 case CEPH_OSD_OP_TMAPPUT:
6357 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6358 if (pool.info.require_rollback()) {
6359 result = -EOPNOTSUPP;
6360 break;
6361 }
6362 {
6363 //_dout_lock.Lock();
6364 //osd_op.data.hexdump(*_dout);
6365 //_dout_lock.Unlock();
6366
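// A tmap payload is a header bufferlist followed by a u32 count and that
// many key/value pairs, which must be sorted by key; if the client sent
// them unsorted we re-encode via std::map (sorted) below.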
6367 // verify sort order
6368 bool unsorted = false;
6369 if (true) {
6370 bufferlist header;
6371 ::decode(header, bp);
6372 uint32_t n;
6373 ::decode(n, bp);
6374 string last_key;
6375 while (n--) {
6376 string key;
6377 ::decode(key, bp);
6378 dout(10) << "tmapput key " << key << dendl;
6379 bufferlist val;
6380 ::decode(val, bp);
6381 if (key < last_key) {
6382 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6383 unsorted = true;
6384 break;
6385 }
6386 last_key = key;
6387 }
6388 }
6389
6390 // write it
6391 vector<OSDOp> nops(1);
6392 OSDOp& newop = nops[0];
6393 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6394 newop.op.extent.offset = 0;
6395 newop.op.extent.length = osd_op.indata.length();
6396 newop.indata = osd_op.indata;
6397
6398 if (unsorted) {
6399 bp = osd_op.indata.begin();
6400 bufferlist header;
6401 map<string, bufferlist> m;
6402 ::decode(header, bp);
6403 ::decode(m, bp);
6404 assert(bp.end());
6405 bufferlist newbl;
6406 ::encode(header, newbl);
6407 ::encode(m, newbl);
6408 newop.indata = newbl;
6409 }
6410 result = do_osd_ops(ctx, nops);
6411 assert(result == 0);
6412 }
6413 break;
6414
6415 case CEPH_OSD_OP_TMAPUP:
6416 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6417 if (pool.info.require_rollback()) {
6418 result = -EOPNOTSUPP;
6419 break;
6420 }
6421 ++ctx->num_write;
6422 result = do_tmapup(ctx, bp, osd_op);
6423 break;
6424
6425 case CEPH_OSD_OP_TMAP2OMAP:
6426 ++ctx->num_write;
6427 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6428 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6429 break;
6430
6431 // OMAP Read ops
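// The omap read ops below share a pagination convention: the client
// passes start_after and max_return, the OSD clamps max_return to
// osd_max_omap_entries_per_request (and bounds the payload by
// osd_max_omap_bytes_per_request), and the reply encodes the entry
// count, the entries, and a 'truncated' flag telling the caller to issue
// another request starting after the last key returned. Illustrative
// client-side sketch (hypothetical key/limit) via librados:
//   librados::ObjectReadOperation rd;
//   std::map<std::string, ceph::bufferlist> vals;
//   bool more;
//   rd.omap_get_vals2("start_after_key", 512, &vals, &more, nullptr);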
6432 case CEPH_OSD_OP_OMAPGETKEYS:
6433 ++ctx->num_read;
6434 {
6435 string start_after;
6436 uint64_t max_return;
6437 try {
6438 ::decode(start_after, bp);
6439 ::decode(max_return, bp);
6440 }
6441 catch (buffer::error& e) {
6442 result = -EINVAL;
6443 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6444 goto fail;
6445 }
6446 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6447 max_return = cct->_conf->osd_max_omap_entries_per_request;
6448 }
6449 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6450
6451 bufferlist bl;
6452 uint32_t num = 0;
6453 bool truncated = false;
6454 if (oi.is_omap()) {
6455 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6456 coll, ghobject_t(soid)
6457 );
6458 assert(iter);
6459 iter->upper_bound(start_after);
6460 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6461 if (num >= max_return ||
6462 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6463 truncated = true;
6464 break;
6465 }
6466 ::encode(iter->key(), bl);
6467 }
6468 } // else return empty out_set
6469 ::encode(num, osd_op.outdata);
6470 osd_op.outdata.claim_append(bl);
6471 ::encode(truncated, osd_op.outdata);
6472 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6473 ctx->delta_stats.num_rd++;
6474 }
6475 break;
6476
6477 case CEPH_OSD_OP_OMAPGETVALS:
6478 ++ctx->num_read;
6479 {
6480 string start_after;
6481 uint64_t max_return;
6482 string filter_prefix;
6483 try {
6484 ::decode(start_after, bp);
6485 ::decode(max_return, bp);
6486 ::decode(filter_prefix, bp);
6487 }
6488 catch (buffer::error& e) {
6489 result = -EINVAL;
6490 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6491 goto fail;
6492 }
6493 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6494 max_return = cct->_conf->osd_max_omap_entries_per_request;
6495 }
6496 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6497
6498 uint32_t num = 0;
6499 bool truncated = false;
6500 bufferlist bl;
6501 if (oi.is_omap()) {
6502 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6503 coll, ghobject_t(soid)
6504 );
6505 if (!iter) {
6506 result = -ENOENT;
6507 goto fail;
6508 }
6509 iter->upper_bound(start_after);
6510 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6511 for (num = 0;
6512 iter->valid() &&
6513 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6514 ++num, iter->next(false)) {
6515 dout(20) << "Found key " << iter->key() << dendl;
6516 if (num >= max_return ||
6517 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6518 truncated = true;
6519 break;
6520 }
6521 ::encode(iter->key(), bl);
6522 ::encode(iter->value(), bl);
6523 }
6524 } // else return empty out_set
6525 ::encode(num, osd_op.outdata);
6526 osd_op.outdata.claim_append(bl);
6527 ::encode(truncated, osd_op.outdata);
6528 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6529 ctx->delta_stats.num_rd++;
6530 }
6531 break;
6532
6533 case CEPH_OSD_OP_OMAPGETHEADER:
6534 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6535 if (!oi.is_omap()) {
6536 // return empty header
6537 break;
6538 }
6539 ++ctx->num_read;
6540 {
6541 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6542 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6543 ctx->delta_stats.num_rd++;
6544 }
6545 break;
6546
6547 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6548 ++ctx->num_read;
6549 {
6550 set<string> keys_to_get;
6551 try {
6552 ::decode(keys_to_get, bp);
6553 }
6554 catch (buffer::error& e) {
6555 result = -EINVAL;
6556 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6557 goto fail;
6558 }
6559 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6560 map<string, bufferlist> out;
6561 if (oi.is_omap()) {
6562 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6563 } // else return empty omap entries
6564 ::encode(out, osd_op.outdata);
6565 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6566 ctx->delta_stats.num_rd++;
6567 }
6568 break;
6569
6570 case CEPH_OSD_OP_OMAP_CMP:
6571 ++ctx->num_read;
6572 {
6573 if (!obs.exists || oi.is_whiteout()) {
6574 result = -ENOENT;
6575 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6576 break;
6577 }
6578 map<string, pair<bufferlist, int> > assertions;
6579 try {
6580 ::decode(assertions, bp);
6581 }
6582 catch (buffer::error& e) {
6583 result = -EINVAL;
6584 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6585 goto fail;
6586 }
6587 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6588
6589 map<string, bufferlist> out;
6590
6591 if (oi.is_omap()) {
6592 set<string> to_get;
6593 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6594 i != assertions.end();
6595 ++i)
6596 to_get.insert(i->first);
6597 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6598 to_get, &out);
6599 if (r < 0) {
6600 result = r;
6601 break;
6602 }
6603 } // else leave out empty
6604
6605 // Should set num_rd_kb based on the encoded length of the map
6606 ctx->delta_stats.num_rd++;
6607
6608 int r = 0;
6609 bufferlist empty;
6610 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6611 i != assertions.end();
6612 ++i) {
6613 auto out_entry = out.find(i->first);
6614 bufferlist &bl = (out_entry != out.end()) ?
6615 out_entry->second : empty;
6616 switch (i->second.second) {
6617 case CEPH_OSD_CMPXATTR_OP_EQ:
6618 if (!(bl == i->second.first)) {
6619 r = -ECANCELED;
6620 }
6621 break;
6622 case CEPH_OSD_CMPXATTR_OP_LT:
6623 if (!(bl < i->second.first)) {
6624 r = -ECANCELED;
6625 }
6626 break;
6627 case CEPH_OSD_CMPXATTR_OP_GT:
6628 if (!(bl > i->second.first)) {
6629 r = -ECANCELED;
6630 }
6631 break;
6632 default:
6633 r = -EINVAL;
6634 break;
6635 }
6636 if (r < 0)
6637 break;
6638 }
6639 if (r < 0) {
6640 result = r;
6641 }
6642 }
6643 break;
6644
6645 // OMAP Write ops
6646 case CEPH_OSD_OP_OMAPSETVALS:
6647 if (!pool.info.supports_omap()) {
6648 result = -EOPNOTSUPP;
6649 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6650 break;
6651 }
6652 ++ctx->num_write;
6653 {
6654 maybe_create_new_object(ctx);
6655 bufferlist to_set_bl;
6656 try {
6657 decode_str_str_map_to_bl(bp, &to_set_bl);
6658 }
6659 catch (buffer::error& e) {
6660 result = -EINVAL;
6661 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6662 goto fail;
6663 }
6664 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6665 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6666 dout(20) << "setting vals: " << dendl;
6667 map<string,bufferlist> to_set;
6668 bufferlist::iterator pt = to_set_bl.begin();
6669 ::decode(to_set, pt);
6670 for (map<string, bufferlist>::iterator i = to_set.begin();
6671 i != to_set.end();
6672 ++i) {
6673 dout(20) << "\t" << i->first << dendl;
6674 }
6675 }
6676 t->omap_setkeys(soid, to_set_bl);
6677 ctx->delta_stats.num_wr++;
6678 }
6679 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6680 obs.oi.clear_omap_digest();
6681 break;
6682
6683 case CEPH_OSD_OP_OMAPSETHEADER:
6684 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6685 if (!pool.info.supports_omap()) {
6686 result = -EOPNOTSUPP;
6687 break;
6688 }
6689 ++ctx->num_write;
6690 {
6691 maybe_create_new_object(ctx);
6692 t->omap_setheader(soid, osd_op.indata);
6693 ctx->delta_stats.num_wr++;
6694 }
6695 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6696 obs.oi.clear_omap_digest();
6697 break;
6698
6699 case CEPH_OSD_OP_OMAPCLEAR:
6700 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6701 if (!pool.info.supports_omap()) {
6702 result = -EOPNOTSUPP;
6703 break;
6704 }
6705 ++ctx->num_write;
6706 {
6707 if (!obs.exists || oi.is_whiteout()) {
6708 result = -ENOENT;
6709 break;
6710 }
6711 if (oi.is_omap()) {
6712 t->omap_clear(soid);
6713 ctx->delta_stats.num_wr++;
6714 obs.oi.clear_omap_digest();
6715 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6716 }
6717 }
6718 break;
6719
6720 case CEPH_OSD_OP_OMAPRMKEYS:
6721 if (!pool.info.supports_omap()) {
6722 result = -EOPNOTSUPP;
6723 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6724 break;
6725 }
6726 ++ctx->num_write;
6727 {
6728 if (!obs.exists || oi.is_whiteout()) {
6729 result = -ENOENT;
6730 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6731 break;
6732 }
6733 bufferlist to_rm_bl;
6734 try {
6735 decode_str_set_to_bl(bp, &to_rm_bl);
6736 }
6737 catch (buffer::error& e) {
6738 result = -EINVAL;
6739 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6740 goto fail;
6741 }
6742 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6743 t->omap_rmkeys(soid, to_rm_bl);
6744 ctx->delta_stats.num_wr++;
6745 }
6746 obs.oi.clear_omap_digest();
6747 break;
6748
6749 case CEPH_OSD_OP_COPY_GET:
6750 ++ctx->num_read;
6751 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6752 soid.snap.val);
6753 if (op_finisher == nullptr) {
6754 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6755 } else {
6756 result = op_finisher->execute();
6757 }
6758 break;
6759
6760 case CEPH_OSD_OP_COPY_FROM:
6761 ++ctx->num_write;
6762 {
6763 object_t src_name;
6764 object_locator_t src_oloc;
6765 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6766 version_t src_version = op.copy_from.src_version;
6767 try {
6768 ::decode(src_name, bp);
6769 ::decode(src_oloc, bp);
6770 }
6771 catch (buffer::error& e) {
6772 result = -EINVAL;
6773 tracepoint(osd,
6774 do_osd_op_pre_copy_from,
6775 soid.oid.name.c_str(),
6776 soid.snap.val,
6777 "???",
6778 0,
6779 "???",
6780 "???",
6781 0,
6782 src_snapid,
6783 src_version);
6784 goto fail;
6785 }
6786 tracepoint(osd,
6787 do_osd_op_pre_copy_from,
6788 soid.oid.name.c_str(),
6789 soid.snap.val,
6790 src_name.name.c_str(),
6791 src_oloc.pool,
6792 src_oloc.key.c_str(),
6793 src_oloc.nspace.c_str(),
6794 src_oloc.hash,
6795 src_snapid,
6796 src_version);
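// COPY_FROM runs in two phases: the first pass through this op kicks off
// an async copy (registering a CopyFromFinisher) and returns
// -EINPROGRESS; once the copy completes, the op is re-executed and the
// finisher path below applies the copied data.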
6797 if (op_finisher == nullptr) {
6798 // start
6799 pg_t raw_pg;
6800 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6801 hobject_t src(src_name, src_oloc.key, src_snapid,
6802 raw_pg.ps(), raw_pg.pool(),
6803 src_oloc.nspace);
6804 if (src == soid) {
6805 dout(20) << " copy from self is invalid" << dendl;
6806 result = -EINVAL;
6807 break;
6808 }
6809 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6810 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6811 new CopyFromFinisher(cb));
6812 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6813 op.copy_from.flags,
6814 false,
6815 op.copy_from.src_fadvise_flags,
6816 op.flags);
6817 result = -EINPROGRESS;
6818 } else {
6819 // finish
6820 result = op_finisher->execute();
6821 assert(result == 0);
6822
6823 // COPY_FROM cannot be executed multiple times -- it must restart
6824 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6825 }
6826 }
6827 break;
6828
6829 default:
6830 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6831 dout(1) << "unrecognized osd op " << op.op
6832 << " " << ceph_osd_op_name(op.op)
6833 << dendl;
6834 result = -EOPNOTSUPP;
6835 }
6836
6837 fail:
6838 osd_op.rval = result;
6839 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6840 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6841 result = 0;
6842
6843 if (result < 0)
6844 break;
6845 }
6846 return result;
6847 }
6848
6849 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6850 {
6851 if (ctx->new_obs.oi.size == 0) {
6852 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6853 return -ENODATA;
6854 }
6855 vector<OSDOp> nops(1);
6856 OSDOp &newop = nops[0];
6857 newop.op.op = CEPH_OSD_OP_TMAPGET;
6858 do_osd_ops(ctx, nops);
6859 try {
6860 bufferlist::iterator i = newop.outdata.begin();
6861 ::decode(*header, i);
6862 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6863 } catch (...) {
6864 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6865 << dendl;
6866 return -EINVAL;
6867 }
6868 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6869 << dendl;
6870 return 0;
6871 }
6872
6873 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6874 const SnapSet& ss)
6875 {
6876 // verify that all clones have been evicted
6877 dout(20) << __func__ << " verifying clones are absent "
6878 << ss << dendl;
6879 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6880 p != ss.clones.end();
6881 ++p) {
6882 hobject_t clone_oid = soid;
6883 clone_oid.snap = *p;
6884 if (is_missing_object(clone_oid))
6885 return -EBUSY;
6886 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6887 if (clone_obc && clone_obc->obs.exists) {
6888 dout(10) << __func__ << " cannot evict head before clone "
6889 << clone_oid << dendl;
6890 return -EBUSY;
6891 }
6892 if (copy_ops.count(clone_oid)) {
6893 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6894 << clone_oid << dendl;
6895 return -EBUSY;
6896 }
6897 }
6898 return 0;
6899 }
6900
6901 inline int PrimaryLogPG::_delete_oid(
6902 OpContext *ctx,
6903 bool no_whiteout, // no whiteouts, no matter what.
6904 bool try_no_whiteout) // try not to whiteout
6905 {
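// Returns 0 on success and -ENOENT if there is nothing to delete.
// Depending on the cache-tier mode and the no_whiteout/try_no_whiteout
// hints, "delete" either removes the object outright or replaces it with
// a zero-length whiteout stub so the deletion can later be flushed to
// the base tier.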
6906 SnapSet& snapset = ctx->new_snapset;
6907 ObjectState& obs = ctx->new_obs;
6908 object_info_t& oi = obs.oi;
6909 const hobject_t& soid = oi.soid;
6910 PGTransaction* t = ctx->op_t.get();
6911
6912 // cache: set whiteout on delete?
6913 bool whiteout = false;
6914 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6915 && !no_whiteout
6916 && !try_no_whiteout) {
6917 whiteout = true;
6918 }
6919 bool legacy;
6920 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6921 legacy = false;
6922 // in luminous or later, we can't delete the head if there are
6923 // clones. we trust the caller passing no_whiteout has already
6924 // verified they don't exist.
6925 if (!snapset.clones.empty() ||
6926 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6927 if (no_whiteout) {
6928 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6929 << dendl;
6930 } else {
6931 dout(20) << __func__ << " has or will have clones; will whiteout"
6932 << dendl;
6933 whiteout = true;
6934 }
6935 }
6936 } else {
6937 legacy = true;
6938 }
6939 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6940 << " no_whiteout=" << (int)no_whiteout
6941 << " try_no_whiteout=" << (int)try_no_whiteout
6942 << dendl;
6943 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6944 return -ENOENT;
6945
6946 t->remove(soid);
6947
6948 if (oi.size > 0) {
6949 interval_set<uint64_t> ch;
6950 ch.insert(0, oi.size);
6951 ctx->modified_ranges.union_of(ch);
6952 }
6953
6954 ctx->delta_stats.num_wr++;
6955 if (soid.is_snap()) {
6956 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6957 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6958 } else {
6959 ctx->delta_stats.num_bytes -= oi.size;
6960 }
6961 oi.size = 0;
6962 oi.new_object();
6963
6964 // disconnect all watchers
6965 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6966 oi.watchers.begin();
6967 p != oi.watchers.end();
6968 ++p) {
6969 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6970 ctx->watch_disconnects.push_back(
6971 watch_disconnect_t(p->first.first, p->first.second, true));
6972 }
6973 oi.watchers.clear();
6974
6975 if (whiteout) {
6976 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6977 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6978 ctx->delta_stats.num_whiteouts++;
6979 t->create(soid);
6980 osd->logger->inc(l_osd_tier_whiteout);
6981 return 0;
6982 }
6983
6984 // delete the head
6985 ctx->delta_stats.num_objects--;
6986 if (soid.is_snap())
6987 ctx->delta_stats.num_object_clones--;
6988 if (oi.is_whiteout()) {
6989 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6990 ctx->delta_stats.num_whiteouts--;
6991 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6992 }
6993 if (oi.is_cache_pinned()) {
6994 ctx->delta_stats.num_objects_pinned--;
6995 }
6996 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6997 snapset.head_exists = false;
6998 }
6999 obs.exists = false;
7000 return 0;
7001 }
7002
7003 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7004 {
7005 SnapSet& snapset = ctx->new_snapset;
7006 ObjectState& obs = ctx->new_obs;
7007 object_info_t& oi = obs.oi;
7008 const hobject_t& soid = oi.soid;
7009 PGTransaction* t = ctx->op_t.get();
7010 snapid_t snapid = (uint64_t)op.snap.snapid;
7011 hobject_t missing_oid;
7012
7013 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7014
7015 ObjectContextRef rollback_to;
7016 int ret = find_object_context(
7017 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7018 soid.get_namespace()),
7019 &rollback_to, false, false, &missing_oid);
7020 if (ret == -EAGAIN) {
7021 /* clone must be missing */
7022 assert(is_degraded_or_backfilling_object(missing_oid));
7023 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7024 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7025 block_write_on_degraded_snap(missing_oid, ctx->op);
7026 return ret;
7027 }
7028 {
7029 ObjectContextRef promote_obc;
7030 cache_result_t tier_mode_result;
7031 if (obs.exists && obs.oi.has_manifest()) {
7032 tier_mode_result =
7033 maybe_handle_manifest_detail(
7034 ctx->op,
7035 true,
7036 rollback_to);
7037 } else {
7038 tier_mode_result =
7039 maybe_handle_cache_detail(
7040 ctx->op,
7041 true,
7042 rollback_to,
7043 ret,
7044 missing_oid,
7045 true,
7046 false,
7047 &promote_obc);
7048 }
7049 switch (tier_mode_result) {
7050 case cache_result_t::NOOP:
7051 break;
7052 case cache_result_t::BLOCKED_PROMOTE:
7053 assert(promote_obc);
7054 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7055 return -EAGAIN;
7056 case cache_result_t::BLOCKED_FULL:
7057 block_write_on_full_cache(soid, ctx->op);
7058 return -EAGAIN;
7059 case cache_result_t::REPLIED_WITH_EAGAIN:
7060 assert(0 == "this can't happen, no rollback on replica");
7061 default:
7062 assert(0 == "must promote was set, other values are not valid");
7063 return -EAGAIN;
7064 }
7065 }
7066
7067 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7068 // there's no snapshot here, or there's no object.
7069 // if there's no snapshot, we delete the object; otherwise, do nothing.
7070 dout(20) << "_rollback_to deleting head on " << soid.oid
7071 << " because got ENOENT|whiteout on find_object_context" << dendl;
7072 if (ctx->obc->obs.oi.watchers.size()) {
7073 // Cannot delete an object with watchers
7074 ret = -EBUSY;
7075 } else {
7076 _delete_oid(ctx, false, false);
7077 ret = 0;
7078 }
7079 } else if (ret) {
7080 // ummm....huh? It *can't* return anything else at time of writing.
7081 assert(0 == "unexpected error code in _rollback_to");
7082 } else { //we got our context, let's use it to do the rollback!
7083 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7084 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7085 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7086 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7087 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7088 ret = -EAGAIN;
7089 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7090 // rolling back to the head; we just need to clone it.
7091 ctx->modify = true;
7092 } else {
7093 /* 1) Delete current head
7094 * 2) Clone correct snapshot into head
7095 * 3) Calculate clone_overlaps by following overlaps
7096 * forward from rollback snapshot */
7097 dout(10) << "_rollback_to deleting " << soid.oid
7098 << " and rolling back to old snap" << dendl;
7099
7100 if (obs.exists) {
7101 t->remove(soid);
7102 }
7103 t->clone(soid, rollback_to_sobject);
7104 snapset.head_exists = true;
7105 t->add_obc(rollback_to);
7106
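// Compute which bytes changed between the rollback snap and head:
// intersecting the clone_overlap interval sets from the rollback clone
// forward yields the bytes untouched since that snap, so everything
// outside that intersection must be marked modified.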
7107 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7108 snapset.clone_overlap.lower_bound(snapid);
7109 assert(iter != snapset.clone_overlap.end());
7110 interval_set<uint64_t> overlaps = iter->second;
7111 for ( ;
7112 iter != snapset.clone_overlap.end();
7113 ++iter)
7114 overlaps.intersection_of(iter->second);
7115
7116 if (obs.oi.size > 0) {
7117 interval_set<uint64_t> modified;
7118 modified.insert(0, obs.oi.size);
7119 overlaps.intersection_of(modified);
7120 modified.subtract(overlaps);
7121 ctx->modified_ranges.union_of(modified);
7122 }
7123
7124 // Adjust the cached objectcontext
7125 maybe_create_new_object(ctx, true);
7126 ctx->delta_stats.num_bytes -= obs.oi.size;
7127 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7128 obs.oi.size = rollback_to->obs.oi.size;
7129 if (rollback_to->obs.oi.is_data_digest())
7130 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7131 else
7132 obs.oi.clear_data_digest();
7133 if (rollback_to->obs.oi.is_omap_digest())
7134 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7135 else
7136 obs.oi.clear_omap_digest();
7137
7138 if (rollback_to->obs.oi.is_omap()) {
7139 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7140 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7141 } else {
7142 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7143 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7144 }
7145
7146 snapset.head_exists = true;
7147 }
7148 }
7149 return ret;
7150 }
7151
7152 void PrimaryLogPG::_make_clone(
7153 OpContext *ctx,
7154 PGTransaction* t,
7155 ObjectContextRef obc,
7156 const hobject_t& head, const hobject_t& coid,
7157 object_info_t *poi)
7158 {
7159 bufferlist bv;
7160 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7161
7162 t->clone(coid, head);
7163 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7164 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7165 }
7166
7167 void PrimaryLogPG::make_writeable(OpContext *ctx)
7168 {
7169 const hobject_t& soid = ctx->obs->oi.soid;
7170 SnapContext& snapc = ctx->snapc;
7171
7172 // clone?
7173 assert(soid.snap == CEPH_NOSNAP);
7174 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7175 << " snapc=" << snapc << dendl;
7176
7177 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7178 if (ctx->new_obs.exists) {
7179 // we will mark the object dirty
7180 if (ctx->undirty && was_dirty) {
7181 dout(20) << " clearing DIRTY flag" << dendl;
7182 assert(ctx->new_obs.oi.is_dirty());
7183 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7184 --ctx->delta_stats.num_objects_dirty;
7185 osd->logger->inc(l_osd_tier_clean);
7186 } else if (!was_dirty && !ctx->undirty) {
7187 dout(20) << " setting DIRTY flag" << dendl;
7188 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7189 ++ctx->delta_stats.num_objects_dirty;
7190 osd->logger->inc(l_osd_tier_dirty);
7191 }
7192 } else {
7193 if (was_dirty) {
7194 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7195 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7196 --ctx->delta_stats.num_objects_dirty;
7197 }
7198 }
7199
7200 if ((ctx->new_obs.exists &&
7201 ctx->new_obs.oi.is_omap()) &&
7202 (!ctx->obc->obs.exists ||
7203 !ctx->obc->obs.oi.is_omap())) {
7204 ++ctx->delta_stats.num_objects_omap;
7205 }
7206 if ((!ctx->new_obs.exists ||
7207 !ctx->new_obs.oi.is_omap()) &&
7208 (ctx->obc->obs.exists &&
7209 ctx->obc->obs.oi.is_omap())) {
7210 --ctx->delta_stats.num_objects_omap;
7211 }
7212
7213 // use newer snapc?
7214 if (ctx->new_snapset.seq > snapc.seq) {
7215 snapc.seq = ctx->new_snapset.seq;
7216 snapc.snaps = ctx->new_snapset.snaps;
7217 filter_snapc(snapc.snaps);
7218 dout(10) << " using newer snapc " << snapc << dendl;
7219 }
7220
7221 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7222 snapc.snaps.size() && // there are snaps
7223 !ctx->cache_evict &&
7224 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7225 // clone
7226 hobject_t coid = soid;
7227 coid.snap = snapc.seq;
7228
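// snapc.snaps is ordered newest-first; count how many leading snaps are
// newer than the last snapset seq, since those are the snaps this new
// clone will represent.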
7229 unsigned l;
7230 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7231
7232 vector<snapid_t> snaps(l);
7233 for (unsigned i=0; i<l; i++)
7234 snaps[i] = snapc.snaps[i];
7235
7236 // prepare clone
7237 object_info_t static_snap_oi(coid);
7238 object_info_t *snap_oi;
7239 if (is_primary()) {
7240 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7241 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7242 ctx->clone_obc->obs.oi = static_snap_oi;
7243 ctx->clone_obc->obs.exists = true;
7244 ctx->clone_obc->ssc = ctx->obc->ssc;
7245 ctx->clone_obc->ssc->ref++;
7246 if (pool.info.require_rollback())
7247 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7248 snap_oi = &ctx->clone_obc->obs.oi;
7249 bool got = ctx->lock_manager.get_write_greedy(
7250 coid,
7251 ctx->clone_obc,
7252 ctx->op);
7253 assert(got);
7254 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7255 } else {
7256 snap_oi = &static_snap_oi;
7257 }
7258 snap_oi->version = ctx->at_version;
7259 snap_oi->prior_version = ctx->obs->oi.version;
7260 snap_oi->copy_user_bits(ctx->obs->oi);
7261
7262 bool legacy = ctx->new_snapset.is_legacy() ||
7263 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7264 if (legacy) {
7265 snap_oi->legacy_snaps = snaps;
7266 }
7267
7268 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7269
7270 ctx->delta_stats.num_objects++;
7271 if (snap_oi->is_dirty()) {
7272 ctx->delta_stats.num_objects_dirty++;
7273 osd->logger->inc(l_osd_tier_dirty);
7274 }
7275 if (snap_oi->is_omap())
7276 ctx->delta_stats.num_objects_omap++;
7277 if (snap_oi->is_cache_pinned())
7278 ctx->delta_stats.num_objects_pinned++;
7279 ctx->delta_stats.num_object_clones++;
7280 ctx->new_snapset.clones.push_back(coid.snap);
7281 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7282 if (!legacy) {
7283 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7284 }
7285
7286 // clone_overlap should contain an entry for each clone
7287 // (an empty interval_set if there is no overlap)
7288 ctx->new_snapset.clone_overlap[coid.snap];
7289 if (ctx->obs->oi.size)
7290 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7291
7292 // log clone
7293 dout(10) << " cloning v " << ctx->obs->oi.version
7294 << " to " << coid << " v " << ctx->at_version
7295 << " snaps=" << snaps
7296 << " snapset=" << ctx->new_snapset << dendl;
7297 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7298 ctx->obs->oi.version,
7299 ctx->obs->oi.user_version,
7300 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7301 ::encode(snaps, ctx->log.back().snaps);
7302
7303 ctx->at_version.version++;
7304 }
7305
7306 // update most recent clone_overlap and usage stats
7307 if (ctx->new_snapset.clones.size() > 0) {
7308 /* we need to check whether the most recent clone exists; if it has been
7309 * evicted, it is not included in the stats */
7310 hobject_t last_clone_oid = soid;
7311 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7312 if (is_present_clone(last_clone_oid)) {
7313 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7314 ctx->modified_ranges.intersection_of(newest_overlap);
7315 // modified_ranges is still in use by the clone
7316 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7317 newest_overlap.subtract(ctx->modified_ranges);
7318 }
7319 }
7320
7321 // update snapset with latest snap context
7322 ctx->new_snapset.seq = snapc.seq;
7323 ctx->new_snapset.snaps = snapc.snaps;
7324 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7325 // pessimistic assumption that this is a net-new legacy SnapSet
7326 ctx->delta_stats.num_legacy_snapsets++;
7327 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7328 } else if (ctx->new_snapset.is_legacy()) {
7329 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7330 }
7331 dout(20) << "make_writeable " << soid
7332 << " done, snapset=" << ctx->new_snapset << dendl;
7333 }
7334
7335
7336 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7337 interval_set<uint64_t>& modified, uint64_t offset,
7338 uint64_t length, bool write_full)
7339 {
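// Record the written extent in 'modified', grow the object (and
// num_bytes) if the write extends past the current size, and account one
// write plus the written kilobytes in the delta stats.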
7340 interval_set<uint64_t> ch;
7341 if (write_full) {
7342 if (oi.size)
7343 ch.insert(0, oi.size);
7344 } else if (length)
7345 ch.insert(offset, length);
7346 modified.union_of(ch);
7347 if (write_full || offset + length > oi.size) {
7348 uint64_t new_size = offset + length;
7349 delta_stats.num_bytes -= oi.size;
7350 delta_stats.num_bytes += new_size;
7351 oi.size = new_size;
7352 }
7353 delta_stats.num_wr++;
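  // SHIFT_ROUND_UP(length, 10) counts whole KiB, rounding up; e.g. a
  // hypothetical 4097-byte write adds 5 to num_wr_kb.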
7354 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7355 }
7356
7357 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7358 {
7359 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7360 delta_stats.num_bytes += p.get_len();
7361 }
7362 }
7363
7364 void PrimaryLogPG::complete_disconnect_watches(
7365 ObjectContextRef obc,
7366 const list<watch_disconnect_t> &to_disconnect)
7367 {
7368 for (list<watch_disconnect_t>::const_iterator i =
7369 to_disconnect.begin();
7370 i != to_disconnect.end();
7371 ++i) {
7372 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7373 auto watchers_entry = obc->watchers.find(watcher);
7374 if (watchers_entry != obc->watchers.end()) {
7375 WatchRef watch = watchers_entry->second;
7376 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7377 obc->watchers.erase(watcher);
7378 watch->remove(i->send_disconnect);
7379 } else {
7380 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7381 << watcher << dendl;
7382 }
7383 }
7384 }
7385
7386 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7387 {
7388 entity_name_t entity = ctx->reqid.name;
7389 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7390
7391 // disconnects first
7392 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7393
7394 assert(conn);
7395
7396 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7397 if (!session.get())
7398 return;
7399 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7400
7401 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7402 i != ctx->watch_connects.end();
7403 ++i) {
7404 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7405 dout(15) << "do_osd_op_effects applying watch connect on session "
7406 << session.get() << " watcher " << watcher << dendl;
7407 WatchRef watch;
7408 if (ctx->obc->watchers.count(watcher)) {
7409 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7410 << dendl;
7411 watch = ctx->obc->watchers[watcher];
7412 } else {
7413 dout(15) << "do_osd_op_effects new watcher " << watcher
7414 << dendl;
7415 watch = Watch::makeWatchRef(
7416 this, osd, ctx->obc, i->first.timeout_seconds,
7417 i->first.cookie, entity, conn->get_peer_addr());
7418 ctx->obc->watchers.insert(
7419 make_pair(
7420 watcher,
7421 watch));
7422 }
7423 watch->connect(conn, i->second);
7424 }
7425
7426 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7427 p != ctx->notifies.end();
7428 ++p) {
7429 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7430 ConnectionRef conn(ctx->op->get_req()->get_connection());
7431 NotifyRef notif(
7432 Notify::makeNotifyRef(
7433 conn,
7434 ctx->reqid.name.num(),
7435 p->bl,
7436 p->timeout,
7437 p->cookie,
7438 p->notify_id,
7439 ctx->obc->obs.oi.user_version,
7440 osd));
7441 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7442 ctx->obc->watchers.begin();
7443 i != ctx->obc->watchers.end();
7444 ++i) {
7445 dout(10) << "starting notify on watch " << i->first << dendl;
7446 i->second->start_notify(notif);
7447 }
7448 notif->init();
7449 }
7450
7451 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7452 p != ctx->notify_acks.end();
7453 ++p) {
7454 if (p->watch_cookie)
7455 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7456 else
7457 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7458 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7459 ctx->obc->watchers.begin();
7460 i != ctx->obc->watchers.end();
7461 ++i) {
7462 if (i->first.second != entity) continue;
7463 if (p->watch_cookie &&
7464 p->watch_cookie.get() != i->first.first) continue;
7465 dout(10) << "acking notify on watch " << i->first << dendl;
7466 i->second->notify_ack(p->notify_id, p->reply_bl);
7467 }
7468 }
7469 }
7470
7471 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7472 {
7473 ostringstream ss;
7474 ss << "temp_" << info.pgid << "_" << get_role()
7475 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7476 hobject_t hoid = target.make_temp_hobject(ss.str());
7477 dout(20) << __func__ << " " << hoid << dendl;
7478 return hoid;
7479 }
7480
7481 hobject_t PrimaryLogPG::get_temp_recovery_object(
7482 const hobject_t& target,
7483 eversion_t version)
7484 {
7485 ostringstream ss;
7486 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7487 << "_" << version
7488 << "_" << info.history.same_interval_since
7489 << "_" << target.snap;
7490 // pgid + version + interval + snapid is unique, and short
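  // e.g. (hypothetical): "temp_recovering_1.4s0_91'247_12_head"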
7491 hobject_t hoid = target.make_temp_hobject(ss.str());
7492 dout(20) << __func__ << " " << hoid << dendl;
7493 return hoid;
7494 }
7495
7496 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7497 {
7498 assert(!ctx->ops->empty());
7499
7500 const hobject_t& soid = ctx->obs->oi.soid;
7501
7502 // valid snap context?
7503 if (!ctx->snapc.is_valid()) {
7504 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7505 return -EINVAL;
7506 }
7507
7508 // prepare the actual mutation
7509 int result = do_osd_ops(ctx, *ctx->ops);
7510 if (result < 0) {
7511 if (ctx->op->may_write() &&
7512 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7513 // need to save the error code in the pg log, to detect dup ops,
7514 // but do nothing else
7515 ctx->update_log_only = true;
7516 }
7517 return result;
7518 }
7519
7520 // read-op? write-op noop? done?
7521 if (ctx->op_t->empty() && !ctx->modify) {
7522 unstable_stats.add(ctx->delta_stats);
7523 if (ctx->op->may_write() &&
7524 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7525 ctx->update_log_only = true;
7526 }
7527 return result;
7528 }
7529
7530 // check for full
7531 if ((ctx->delta_stats.num_bytes > 0 ||
7532 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7533 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7534 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7535 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7536 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7537 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7538 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7539 << dendl;
7540 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7541 // they tried, they failed.
7542 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7543 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7544 } else {
7545 // drop request
7546 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7547 return -EAGAIN;
7548 }
7549 }
7550
7551 // clone, if necessary
7552 if (soid.snap == CEPH_NOSNAP)
7553 make_writeable(ctx);
7554
7555 finish_ctx(ctx,
7556 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7557 pg_log_entry_t::DELETE);
7558
7559 return result;
7560 }
7561
7562 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7563 {
7564 const hobject_t& soid = ctx->obs->oi.soid;
7565 dout(20) << __func__ << " " << soid << " " << ctx
7566 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7567 << dendl;
7568 utime_t now = ceph_clock_now();
7569
7570 // snapset
7571 bufferlist bss;
7572
7573 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7574 ::encode(ctx->new_snapset, bss);
7575 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7576 !ctx->new_snapset.is_legacy());
7577
7578 if (ctx->new_obs.exists) {
7579 if (!ctx->obs->exists) {
7580 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7581 hobject_t snapoid = soid.get_snapdir();
7582 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7583 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7584 ctx->at_version,
7585 ctx->snapset_obc->obs.oi.version,
7586 0, osd_reqid_t(), ctx->mtime, 0));
7587 ctx->op_t->remove(snapoid);
7588
7589 ctx->at_version.version++;
7590
7591 ctx->snapset_obc->obs.exists = false;
7592 }
7593 }
7594 } else if (!ctx->new_snapset.clones.empty() &&
7595 !ctx->cache_evict &&
7596 !ctx->new_snapset.head_exists &&
7597 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7598 // save snapset on _snap
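      // (pre-luminous only, per the assert below: with the head gone but
      // clones remaining, the SnapSet must live on the snapdir object)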
7599 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7600 info.pgid.pool(), soid.get_namespace());
7601 dout(10) << " final snapset " << ctx->new_snapset
7602 << " in " << snapoid << dendl;
7603 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7604 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7605 ctx->at_version,
7606 eversion_t(),
7607 0, osd_reqid_t(), ctx->mtime, 0));
7608
7609 if (!ctx->snapset_obc)
7610 ctx->snapset_obc = get_object_context(snapoid, true);
7611 bool got = false;
7612 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7613 got = ctx->lock_manager.get_write_greedy(
7614 snapoid,
7615 ctx->snapset_obc,
7616 ctx->op);
7617 } else {
7618 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7619 got = ctx->lock_manager.get_lock_type(
7620 ObjectContext::RWState::RWEXCL,
7621 snapoid,
7622 ctx->snapset_obc,
7623 ctx->op);
7624 }
7625 assert(got);
7626 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7627 ctx->snapset_obc->obs.exists = true;
7628 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7629 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7630 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7631 ctx->snapset_obc->obs.oi.local_mtime = now;
7632
7633 map<string, bufferlist> attrs;
7634 bufferlist bv(sizeof(ctx->new_obs.oi));
7635 ::encode(ctx->snapset_obc->obs.oi, bv,
7636 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7637 ctx->op_t->create(snapoid);
7638 attrs[OI_ATTR].claim(bv);
7639 attrs[SS_ATTR].claim(bss);
7640 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7641 ctx->at_version.version++;
7642 }
7643 }
7644
7645 // finish and log the op.
7646 if (ctx->user_modify) {
7647 // update the user_version for any modify ops, except for the watch op
7648 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7649 /* In order for new clients and old clients to interoperate properly
7650 * when exchanging versions, we need to lower-bound the user_version
7651 * (which our new clients pay proper attention to)
7652 * by the at_version (which is all the old clients can ever see). */
7653 if (ctx->at_version.version > ctx->user_at_version)
7654 ctx->user_at_version = ctx->at_version.version;
7655 ctx->new_obs.oi.user_version = ctx->user_at_version;
7656 }
7657 ctx->bytes_written = ctx->op_t->get_bytes_written();
7658
7659 if (ctx->new_obs.exists) {
7660 // on the head object
7661 ctx->new_obs.oi.version = ctx->at_version;
7662 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7663 ctx->new_obs.oi.last_reqid = ctx->reqid;
7664 if (ctx->mtime != utime_t()) {
7665 ctx->new_obs.oi.mtime = ctx->mtime;
7666 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7667 ctx->new_obs.oi.local_mtime = now;
7668 } else {
7669 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7670 }
7671
7672 map <string, bufferlist> attrs;
7673 bufferlist bv(sizeof(ctx->new_obs.oi));
7674 ::encode(ctx->new_obs.oi, bv,
7675 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7676 attrs[OI_ATTR].claim(bv);
7677
7678 if (soid.snap == CEPH_NOSNAP) {
7679 dout(10) << " final snapset " << ctx->new_snapset
7680 << " in " << soid << dendl;
7681 attrs[SS_ATTR].claim(bss);
7682 } else {
7683 dout(10) << " no snapset (this is a clone)" << dendl;
7684 }
7685 ctx->op_t->setattrs(soid, attrs);
7686 } else {
7687 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7688 }
7689
7690 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7691 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7692
7693 // append to log
7694 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7695 ctx->obs->oi.version,
7696 ctx->user_at_version, ctx->reqid,
7697 ctx->mtime, 0));
7698 if (soid.snap < CEPH_NOSNAP) {
7699 switch (log_op_type) {
7700 case pg_log_entry_t::MODIFY:
7701 case pg_log_entry_t::PROMOTE:
7702 case pg_log_entry_t::CLEAN:
7703 if (legacy_snapset) {
7704 dout(20) << __func__ << " encoding legacy_snaps "
7705 << ctx->new_obs.oi.legacy_snaps
7706 << dendl;
7707 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7708 } else {
7709 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7710 << dendl;
7711 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7712 }
7713 break;
7714 default:
7715 break;
7716 }
7717 }
7718
7719 if (!ctx->extra_reqids.empty()) {
7720 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7721 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7722 }
7723
7724 // apply new object state.
7725 ctx->obc->obs = ctx->new_obs;
7726
7727 if (soid.is_head() && !ctx->obc->obs.exists &&
7728 (!maintain_ssc || ctx->cache_evict)) {
7729 ctx->obc->ssc->exists = false;
7730 ctx->obc->ssc->snapset = SnapSet();
7731 } else {
7732 ctx->obc->ssc->exists = true;
7733 ctx->obc->ssc->snapset = ctx->new_snapset;
7734 }
7735 }
7736
7737 void PrimaryLogPG::apply_stats(
7738 const hobject_t &soid,
7739 const object_stat_sum_t &delta_stats) {
7740
7741 info.stats.stats.add(delta_stats);
7742
7743 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7744 i != backfill_targets.end();
7745 ++i) {
7746 pg_shard_t bt = *i;
7747 pg_info_t& pinfo = peer_info[bt];
7748 if (soid <= pinfo.last_backfill)
7749 pinfo.stats.stats.add(delta_stats);
7750 else if (soid <= last_backfill_started)
7751 pending_backfill_updates[soid].stats.add(delta_stats);
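      // (deltas for an object inside the in-flight backfill window are
      // deferred here and folded into the peer's stats once that object's
      // backfill completes)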
7752 }
7753
7754 if (is_primary() && scrubber.active) {
7755 if (soid < scrubber.start) {
7756 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7757 << "," << scrubber.end << ")" << dendl;
7758 scrub_cstat.add(delta_stats);
7759 } else {
7760 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7761 << "," << scrubber.end << ")" << dendl;
7762 }
7763 }
7764 }
7765
7766 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7767 {
7768 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7769 assert(ctx->async_reads_complete());
7770
7771 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7772 p != ctx->ops->end() && result >= 0; ++p) {
7773 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7774 result = p->rval;
7775 break;
7776 }
7777 ctx->bytes_read += p->outdata.length();
7778 }
7779 ctx->reply->claim_op_out_data(*ctx->ops);
7780 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7781
7782 MOSDOpReply *reply = ctx->reply;
7783 ctx->reply = nullptr;
7784
7785 if (result >= 0) {
7786 if (!ctx->ignore_log_op_stats) {
7787 log_op_stats(ctx);
7788 publish_stats_to_osd();
7789 }
7790
7791 // on read, return the current object version
7792 if (ctx->obs) {
7793 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7794 } else {
7795 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7796 }
7797 } else if (result == -ENOENT) {
7798 // on ENOENT, set a floor for what the next user version will be.
7799 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7800 }
7801
7802 reply->set_result(result);
7803 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7804 osd->send_message_osd_client(reply, m->get_connection());
7805 close_op_ctx(ctx);
7806 }
7807
7808 // ========================================================================
7809 // copyfrom
7810
7811 struct C_Copyfrom : public Context {
7812 PrimaryLogPGRef pg;
7813 hobject_t oid;
7814 epoch_t last_peering_reset;
7815 ceph_tid_t tid;
7816 PrimaryLogPG::CopyOpRef cop;
7817 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7818 const PrimaryLogPG::CopyOpRef& c)
7819 : pg(p), oid(o), last_peering_reset(lpr),
7820 tid(0), cop(c)
7821 {}
7822 void finish(int r) override {
7823 if (r == -ECANCELED)
7824 return;
7825 pg->lock();
7826 if (last_peering_reset == pg->get_last_peering_reset()) {
7827 pg->process_copy_chunk(oid, tid, r);
7828 }
7829 pg->unlock();
7830 }
7831 };
7832
7833 struct C_CopyFrom_AsyncReadCb : public Context {
7834 OSDOp *osd_op;
7835 object_copy_data_t reply_obj;
7836 uint64_t features;
7837 size_t len;
7838 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7839 osd_op(osd_op), features(features), len(0) {}
7840 void finish(int r) override {
7841 osd_op->rval = r;
7842 if (r < 0) {
7843 return;
7844 }
7845
7846 assert(len > 0);
7847 assert(len <= reply_obj.data.length());
7848 bufferlist bl;
7849 bl.substr_of(reply_obj.data, 0, len);
7850 reply_obj.data.swap(bl);
7851 ::encode(reply_obj, osd_op->outdata, features);
7852 }
7853 };
7854
7855 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7856 OSDOp& osd_op, ObjectContextRef &obc)
7857 {
7858 object_info_t& oi = obc->obs.oi;
7859 hobject_t& soid = oi.soid;
7860 int result = 0;
7861 object_copy_cursor_t cursor;
7862 uint64_t out_max;
7863 bool skip_data_digest =
7864 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
7865 g_conf->osd_distrust_data_digest;
7866
7867 try {
7868 ::decode(cursor, bp);
7869 ::decode(out_max, bp);
7870 }
7871 catch (buffer::error& e) {
7872 result = -EINVAL;
7873 return result;
7874 }
7875
7876 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7877 uint64_t features = op->get_features();
7878
7879 bool async_read_started = false;
7880 object_copy_data_t _reply_obj;
7881 C_CopyFrom_AsyncReadCb *cb = NULL;
7882 if (pool.info.require_rollback()) {
7883 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7884 }
7885 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7886 // size, mtime
7887 reply_obj.size = oi.size;
7888 reply_obj.mtime = oi.mtime;
7889 assert(obc->ssc);
7890 if (soid.snap < CEPH_NOSNAP) {
7891 if (obc->ssc->snapset.is_legacy()) {
7892 reply_obj.snaps = oi.legacy_snaps;
7893 } else {
7894 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7895 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7896 reply_obj.snaps = p->second;
7897 }
7898 } else {
7899 reply_obj.snap_seq = obc->ssc->snapset.seq;
7900 }
7901 if (!skip_data_digest && oi.is_data_digest()) {
7902 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7903 reply_obj.data_digest = oi.data_digest;
7904 }
7905 if (oi.is_omap_digest()) {
7906 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7907 reply_obj.omap_digest = oi.omap_digest;
7908 }
7909 reply_obj.truncate_seq = oi.truncate_seq;
7910 reply_obj.truncate_size = oi.truncate_size;
7911
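  // The copy-get cursor walks the source in three phases (attrs, then data,
  // then omap), resuming where the previous chunk stopped and bounded by the
  // client-supplied out_max budget. A hypothetical three-chunk walk:
  //   chunk 1: all attrs + first out_max bytes of data
  //   chunk 2: remaining data + some omap keys
  //   chunk 3: remaining omap keys + reqids; cursor.is_complete() is now true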
7912 // attrs
7913 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7914 if (!cursor.attr_complete) {
7915 result = getattrs_maybe_cache(
7916 ctx->obc,
7917 &out_attrs);
7918 if (result < 0) {
7919 if (cb) {
7920 delete cb;
7921 }
7922 return result;
7923 }
7924 cursor.attr_complete = true;
7925 dout(20) << " got attrs" << dendl;
7926 }
7927
7928 int64_t left = out_max - osd_op.outdata.length();
7929
7930 // data
7931 bufferlist& bl = reply_obj.data;
7932 if (left > 0 && !cursor.data_complete) {
7933 if (cursor.data_offset < oi.size) {
7934 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7935 if (cb) {
7936 async_read_started = true;
7937 ctx->pending_async_reads.push_back(
7938 make_pair(
7939 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7940 make_pair(&bl, cb)));
7941 cb->len = max_read;
7942
7943 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7944 new ReadFinisher(osd_op));
7945 result = -EINPROGRESS;
7946
7947 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7948 } else {
7949 result = pgbackend->objects_read_sync(
7950 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7951 if (result < 0)
7952 return result;
7953 }
7954 left -= max_read;
7955 cursor.data_offset += max_read;
7956 }
7957 if (cursor.data_offset == oi.size) {
7958 cursor.data_complete = true;
7959 dout(20) << " got data" << dendl;
7960 }
7961 assert(cursor.data_offset <= oi.size);
7962 }
7963
7964 // omap
7965 uint32_t omap_keys = 0;
7966 if (!pool.info.supports_omap() || !oi.is_omap()) {
7967 cursor.omap_complete = true;
7968 } else {
7969 if (left > 0 && !cursor.omap_complete) {
7970 assert(cursor.data_complete);
7971 if (cursor.omap_offset.empty()) {
7972 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7973 &reply_obj.omap_header);
7974 }
7975 bufferlist omap_data;
7976 ObjectMap::ObjectMapIterator iter =
7977 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7978 assert(iter);
7979 iter->upper_bound(cursor.omap_offset);
7980 for (; iter->valid(); iter->next(false)) {
7981 ++omap_keys;
7982 ::encode(iter->key(), omap_data);
7983 ::encode(iter->value(), omap_data);
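  // (each ::encode above also prepends a 4-byte length, hence the two +4
  // terms in the budget below)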
7984 left -= iter->key().length() + 4 + iter->value().length() + 4;
7985 if (left <= 0)
7986 break;
7987 }
7988 if (omap_keys) {
7989 ::encode(omap_keys, reply_obj.omap_data);
7990 reply_obj.omap_data.claim_append(omap_data);
7991 }
7992 if (iter->valid()) {
7993 cursor.omap_offset = iter->key();
7994 } else {
7995 cursor.omap_complete = true;
7996 dout(20) << " got omap" << dendl;
7997 }
7998 }
7999 }
8000
8001 if (cursor.is_complete()) {
8002 // include reqids only in the final step. this is a bit fragile
8003 // but it works...
8004 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
8005 dout(20) << " got reqids" << dendl;
8006 }
8007
8008 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8009 << " " << out_attrs.size() << " attrs"
8010 << " " << bl.length() << " bytes"
8011 << " " << reply_obj.omap_header.length() << " omap header bytes"
8012 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8013 << omap_keys << " keys"
8014 << " " << reply_obj.reqids.size() << " reqids"
8015 << dendl;
8016 reply_obj.cursor = cursor;
8017 if (!async_read_started) {
8018 ::encode(reply_obj, osd_op.outdata, features);
8019 }
8020 if (cb && !async_read_started) {
8021 delete cb;
8022 }
8023
8024 if (result > 0) {
8025 result = 0;
8026 }
8027 return result;
8028 }
8029
8030 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8031 OSDOp& osd_op)
8032 {
8033 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8034 // be careful not to modify anything else that will upset a racing
8035 // operator<<
8036 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
8037 uint64_t features = m->get_features();
8038 object_copy_data_t reply_obj;
8039
8040 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
8041 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8042 ::encode(reply_obj, osd_op.outdata, features);
8043 osd_op.rval = -ENOENT;
8044 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8045 reply->claim_op_out_data(m->ops);
8046 reply->set_result(-ENOENT);
8047 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8048 osd->send_message_osd_client(reply, m->get_connection());
8049 }
8050
8051 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8052 hobject_t src, object_locator_t oloc,
8053 version_t version, unsigned flags,
8054 bool mirror_snapset,
8055 unsigned src_obj_fadvise_flags,
8056 unsigned dest_obj_fadvise_flags)
8057 {
8058 const hobject_t& dest = obc->obs.oi.soid;
8059 dout(10) << __func__ << " " << dest
8060 << " from " << src << " " << oloc << " v" << version
8061 << " flags " << flags
8062 << (mirror_snapset ? " mirror_snapset" : "")
8063 << dendl;
8064
8065 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8066 src.snap == CEPH_SNAPDIR));
8067
8068 // cancel a previous in-progress copy?
8069 if (copy_ops.count(dest)) {
8070 // FIXME: if the src etc match, we could avoid restarting from the
8071 // beginning.
8072 CopyOpRef cop = copy_ops[dest];
8073 vector<ceph_tid_t> tids;
8074 cancel_copy(cop, false, &tids);
8075 osd->objecter->op_cancel(tids, -ECANCELED);
8076 }
8077
8078 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8079 mirror_snapset, src_obj_fadvise_flags,
8080 dest_obj_fadvise_flags));
8081 copy_ops[dest] = cop;
8082 obc->start_block();
8083
8084 _copy_some(obc, cop);
8085 }
8086
8087 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8088 {
8089 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8090
8091 unsigned flags = 0;
8092 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8093 flags |= CEPH_OSD_FLAG_FLUSH;
8094 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8095 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8096 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8097 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8098 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8099 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8100 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8101 flags |= CEPH_OSD_FLAG_RWORDERED;
8102
8103 C_GatherBuilder gather(cct);
8104
8105 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8106 // list snaps too.
8107 assert(cop->src.snap == CEPH_NOSNAP);
8108 ObjectOperation op;
8109 op.list_snaps(&cop->results.snapset, NULL);
8110 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8111 CEPH_SNAPDIR, NULL,
8112 flags, gather.new_sub(), NULL);
8113 cop->objecter_tid2 = tid;
8114 }
8115
8116 ObjectOperation op;
8117 if (cop->results.user_version) {
8118 op.assert_version(cop->results.user_version);
8119 } else {
8120 // we should learn the version after the first chunk, if we didn't know
8121 // it already!
8122 assert(cop->cursor.is_initial());
8123 }
8124 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8125 &cop->results.object_size, &cop->results.mtime,
8126 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8127 &cop->results.snaps, &cop->results.snap_seq,
8128 &cop->results.flags,
8129 &cop->results.source_data_digest,
8130 &cop->results.source_omap_digest,
8131 &cop->results.reqids,
8132 &cop->results.truncate_seq,
8133 &cop->results.truncate_size,
8134 &cop->rval);
8135 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8136
8137 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8138 get_last_peering_reset(), cop);
8139 gather.set_finisher(new C_OnFinisher(fin,
8140 &osd->objecter_finisher));
8141
8142 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8143 cop->src.snap, NULL,
8144 flags,
8145 gather.new_sub(),
8146 // discover the object version if we don't know it yet
8147 cop->results.user_version ? NULL : &cop->results.user_version);
8148 fin->tid = tid;
8149 cop->objecter_tid = tid;
8150 gather.activate();
8151 }
8152
8153 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8154 {
8155 vector<ceph_tid_t> tids;
8156 dout(10) << __func__ << " " << oid << " tid " << tid
8157 << " " << cpp_strerror(r) << dendl;
8158 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8159 if (p == copy_ops.end()) {
8160 dout(10) << __func__ << " no copy_op found" << dendl;
8161 return;
8162 }
8163 CopyOpRef cop = p->second;
8164 if (tid != cop->objecter_tid) {
8165 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8166 << " tid " << cop->objecter_tid << dendl;
8167 return;
8168 }
8169
8170 if (cop->omap_data.length() || cop->omap_header.length())
8171 cop->results.has_omap = true;
8172
8173 if (r >= 0 && !pool.info.supports_omap() &&
8174 (cop->omap_data.length() || cop->omap_header.length())) {
8175 r = -EOPNOTSUPP;
8176 }
8177 cop->objecter_tid = 0;
8178 cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
8179 ObjectContextRef& cobc = cop->obc;
8180
8181 if (r < 0)
8182 goto out;
8183
8184 assert(cop->rval >= 0);
8185
8186 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8187 // verify snap hasn't been deleted
8188 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8189 while (p != cop->results.snaps.end()) {
8190 if (pool.info.is_removed_snap(*p)) {
8191 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8192 << dendl;
8193 for (vector<snapid_t>::iterator q = p + 1;
8194 q != cop->results.snaps.end();
8195 ++q)
8196 *(q - 1) = *q;
8197 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8198 } else {
8199 ++p;
8200 }
8201 }
8202 if (cop->results.snaps.empty()) {
8203 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8204 r = -ENOENT;
8205 goto out;
8206 }
8207 }
8208
8209 assert(cop->rval >= 0);
8210
8211 if (!cop->temp_cursor.data_complete) {
8212 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8213 }
8214 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8215 if (cop->omap_header.length()) {
8216 cop->results.omap_digest =
8217 cop->omap_header.crc32c(cop->results.omap_digest);
8218 }
8219 if (cop->omap_data.length()) {
8220 bufferlist keys;
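  // skip the leading 4-byte key count so the digest covers only the
  // encoded keys and values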
8221 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8222 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8223 }
8224 }
8225
8226 if (!cop->temp_cursor.attr_complete) {
8227 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8228 p != cop->attrs.end();
8229 ++p) {
8230 cop->results.attrs[string("_") + p->first] = p->second;
8231 }
8232 cop->attrs.clear();
8233 }
8234
8235 if (!cop->cursor.is_complete()) {
8236 // write out what we have so far
8237 if (cop->temp_cursor.is_initial()) {
8238 assert(!cop->results.started_temp_obj);
8239 cop->results.started_temp_obj = true;
8240 cop->results.temp_oid = generate_temp_object(oid);
8241 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8242 }
8243 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8244 OpContextUPtr ctx = simple_opc_create(tempobc);
8245 if (cop->temp_cursor.is_initial()) {
8246 ctx->new_temp_oid = cop->results.temp_oid;
8247 }
8248 _write_copy_chunk(cop, ctx->op_t.get());
8249 simple_opc_submit(std::move(ctx));
8250 dout(10) << __func__ << " fetching more" << dendl;
8251 _copy_some(cobc, cop);
8252 return;
8253 }
8254
8255 // verify digests?
8256 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8257 dout(20) << __func__ << std::hex
8258 << " got digest: rx data 0x" << cop->results.data_digest
8259 << " omap 0x" << cop->results.omap_digest
8260 << ", source: data 0x" << cop->results.source_data_digest
8261 << " omap 0x" << cop->results.source_omap_digest
8262 << std::dec
8263 << " flags " << cop->results.flags
8264 << dendl;
8265 }
8266 if (cop->results.is_data_digest() &&
8267 cop->results.data_digest != cop->results.source_data_digest) {
8268 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8269 << " != source 0x" << cop->results.source_data_digest << std::dec
8270 << dendl;
8271 osd->clog->error() << info.pgid << " copy from " << cop->src
8272 << " to " << cop->obc->obs.oi.soid << std::hex
8273 << " data digest 0x" << cop->results.data_digest
8274 << " != source 0x" << cop->results.source_data_digest
8275 << std::dec;
8276 r = -EIO;
8277 goto out;
8278 }
8279 if (cop->results.is_omap_digest() &&
8280 cop->results.omap_digest != cop->results.source_omap_digest) {
8281 derr << __func__ << std::hex
8282 << " omap digest 0x" << cop->results.omap_digest
8283 << " != source 0x" << cop->results.source_omap_digest
8284 << std::dec << dendl;
8285 osd->clog->error() << info.pgid << " copy from " << cop->src
8286 << " to " << cop->obc->obs.oi.soid << std::hex
8287 << " omap digest 0x" << cop->results.omap_digest
8288 << " != source 0x" << cop->results.source_omap_digest
8289 << std::dec;
8290 r = -EIO;
8291 goto out;
8292 }
8293 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8294 derr << __func__ << " injecting copyfrom failure" << dendl;
8295 r = -EIO;
8296 goto out;
8297 }
8298
8299 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8300 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8301 ObjectState& obs = cop->obc->obs;
8302 if (cop->temp_cursor.is_initial()) {
8303 dout(20) << "fill_in_final_tx: writing "
8304 << "directly to final object" << dendl;
8305 // write directly to final object
8306 cop->results.temp_oid = obs.oi.soid;
8307 _write_copy_chunk(cop, t);
8308 } else {
8309 // finish writing to temp object, then move into place
8310 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8311 _write_copy_chunk(cop, t);
8312 t->rename(obs.oi.soid, cop->results.temp_oid);
8313 }
8314 t->setattrs(obs.oi.soid, cop->results.attrs);
8315 });
8316
8317 dout(20) << __func__ << " success; committing" << dendl;
8318
8319 out:
8320 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8321 CopyCallbackResults results(r, &cop->results);
8322 cop->cb->complete(results);
8323
8324 copy_ops.erase(cobc->obs.oi.soid);
8325 cobc->stop_block();
8326
8327 if (r < 0 && cop->results.started_temp_obj) {
8328 dout(10) << __func__ << " deleting partial temp object "
8329 << cop->results.temp_oid << dendl;
8330 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8331 OpContextUPtr ctx = simple_opc_create(tempobc);
8332 ctx->op_t->remove(cop->results.temp_oid);
8333 ctx->discard_temp_oid = cop->results.temp_oid;
8334 simple_opc_submit(std::move(ctx));
8335 }
8336
8337 // cancel and requeue proxy ops on this object
8338 if (!r) {
8339 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8340 it != proxyread_ops.end();) {
8341 if (it->second->soid == cobc->obs.oi.soid) {
8342 cancel_proxy_read((it++)->second, &tids);
8343 } else {
8344 ++it;
8345 }
8346 }
8347 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8348 it != proxywrite_ops.end();) {
8349 if (it->second->soid == cobc->obs.oi.soid) {
8350 cancel_proxy_write((it++)->second, &tids);
8351 } else {
8352 ++it;
8353 }
8354 }
8355 osd->objecter->op_cancel(tids, -ECANCELED);
8356 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8357 }
8358
8359 kick_object_context_blocked(cobc);
8360 }
8361
8362 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
8363 vector<ceph_tid_t> tids;
8364 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8365 it != proxyread_ops.end();) {
8366 if (it->second->soid == oid) {
8367 cancel_proxy_read((it++)->second, &tids);
8368 } else {
8369 ++it;
8370 }
8371 }
8372 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8373 it != proxywrite_ops.end();) {
8374 if (it->second->soid == oid) {
8375 cancel_proxy_write((it++)->second, &tids);
8376 } else {
8377 ++it;
8378 }
8379 }
8380 osd->objecter->op_cancel(tids, -ECANCELED);
8381 kick_proxy_ops_blocked(oid);
8382 }
8383
8384 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8385 {
8386 dout(20) << __func__ << " " << cop
8387 << " " << cop->attrs.size() << " attrs"
8388 << " " << cop->data.length() << " bytes"
8389 << " " << cop->omap_header.length() << " omap header bytes"
8390 << " " << cop->omap_data.length() << " omap data bytes"
8391 << dendl;
8392 if (!cop->temp_cursor.attr_complete) {
8393 t->create(cop->results.temp_oid);
8394 }
8395 if (!cop->temp_cursor.data_complete) {
8396 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8397 cop->cursor.data_offset);
8398 if (pool.info.requires_aligned_append() &&
8399 !cop->cursor.data_complete) {
8400 /**
8401 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8402 * to pick it up on the next pass.
8403 */
8404 assert(cop->temp_cursor.data_offset %
8405 pool.info.required_alignment() == 0);
8406 if (cop->data.length() % pool.info.required_alignment() != 0) {
8407 uint64_t to_trim =
8408 cop->data.length() % pool.info.required_alignment();
8409 bufferlist bl;
8410 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8411 cop->data.swap(bl);
8412 cop->cursor.data_offset -= to_trim;
8413 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8414 cop->cursor.data_offset);
8415 }
8416 }
8417 if (cop->data.length()) {
8418 t->write(
8419 cop->results.temp_oid,
8420 cop->temp_cursor.data_offset,
8421 cop->data.length(),
8422 cop->data,
8423 cop->dest_obj_fadvise_flags);
8424 }
8425 cop->data.clear();
8426 }
8427 if (pool.info.supports_omap()) {
8428 if (!cop->temp_cursor.omap_complete) {
8429 if (cop->omap_header.length()) {
8430 t->omap_setheader(
8431 cop->results.temp_oid,
8432 cop->omap_header);
8433 cop->omap_header.clear();
8434 }
8435 if (cop->omap_data.length()) {
8436 map<string,bufferlist> omap;
8437 bufferlist::iterator p = cop->omap_data.begin();
8438 ::decode(omap, p);
8439 t->omap_setkeys(cop->results.temp_oid, omap);
8440 cop->omap_data.clear();
8441 }
8442 }
8443 } else {
8444 assert(cop->omap_header.length() == 0);
8445 assert(cop->omap_data.length() == 0);
8446 }
8447 cop->temp_cursor = cop->cursor;
8448 }
8449
8450 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8451 {
8452 OpContext *ctx = cb->ctx;
8453 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8454
8455 ObjectState& obs = ctx->new_obs;
8456 if (obs.exists) {
8457 dout(20) << __func__ << ": exists, removing" << dendl;
8458 ctx->op_t->remove(obs.oi.soid);
8459 } else {
8460 ctx->delta_stats.num_objects++;
8461 obs.exists = true;
8462 }
8463 if (cb->is_temp_obj_used()) {
8464 ctx->discard_temp_oid = cb->results->temp_oid;
8465 }
8466 cb->results->fill_in_final_tx(ctx->op_t.get());
8467
8468 // CopyFromCallback fills this in for us
8469 obs.oi.user_version = ctx->user_at_version;
8470
8471 if (cb->results->is_data_digest()) {
8472 obs.oi.set_data_digest(cb->results->data_digest);
8473 } else {
8474 obs.oi.clear_data_digest();
8475 }
8476 if (cb->results->is_omap_digest()) {
8477 obs.oi.set_omap_digest(cb->results->omap_digest);
8478 } else {
8479 obs.oi.clear_omap_digest();
8480 }
8481
8482 obs.oi.truncate_seq = cb->results->truncate_seq;
8483 obs.oi.truncate_size = cb->results->truncate_size;
8484
8485 ctx->extra_reqids = cb->results->reqids;
8486
8487 // cache: clear whiteout?
8488 if (obs.oi.is_whiteout()) {
8489 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8490 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8491 --ctx->delta_stats.num_whiteouts;
8492 }
8493
8494 if (cb->results->has_omap) {
8495 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8496 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8497 } else {
8498 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8499 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8500 }
8501
8502 interval_set<uint64_t> ch;
8503 if (obs.oi.size > 0)
8504 ch.insert(0, obs.oi.size);
8505 ctx->modified_ranges.union_of(ch);
8506
8507 if (cb->get_data_size() != obs.oi.size) {
8508 ctx->delta_stats.num_bytes -= obs.oi.size;
8509 obs.oi.size = cb->get_data_size();
8510 ctx->delta_stats.num_bytes += obs.oi.size;
8511 }
8512 ctx->delta_stats.num_wr++;
8513 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8514
8515 osd->logger->inc(l_osd_copyfrom);
8516 }
8517
8518 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8519 ObjectContextRef obc)
8520 {
8521 const hobject_t& soid = obc->obs.oi.soid;
8522 dout(10) << __func__ << " " << soid << " r=" << r
8523 << " uv" << results->user_version << dendl;
8524
8525 if (r == -ECANCELED) {
8526 return;
8527 }
8528
8529 if (r != -ENOENT && soid.is_snap()) {
8530 if (results->snaps.empty()) {
8531 // we must have read "snap" content from the head object in
8532 // the base pool. use snap_seq to construct what snaps should
8533 // be for this clone (what it was before we evicted the clean
8534 // clone from this pool, and what it will be when we flush and
8535 // the clone eventually happens in the base pool).
8536 SnapSet& snapset = obc->ssc->snapset;
8537 vector<snapid_t>::iterator p = snapset.snaps.begin();
8538 while (p != snapset.snaps.end() && *p > soid.snap)
8539 ++p;
8540 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8541 results->snaps.push_back(*p);
8542 ++p;
8543 }
8544 }
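    // e.g. (hypothetical): snapset.snaps [8,6,4] (newest first) with
    // soid.snap 5 and snap_seq 3 reconstructs snaps [4] for this clone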
8545
8546 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8547 filter_snapc(results->snaps);
8548
8549 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8550 if (results->snaps.empty()) {
8551 dout(20) << __func__
8552 << " snaps are empty, clone is invalid,"
8553 << " setting r to ENOENT" << dendl;
8554 r = -ENOENT;
8555 }
8556 }
8557
8558 if (r < 0 && results->started_temp_obj) {
8559 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8560 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8561 assert(tempobc);
8562 OpContextUPtr ctx = simple_opc_create(tempobc);
8563 ctx->op_t->remove(results->temp_oid);
8564 simple_opc_submit(std::move(ctx));
8565 results->started_temp_obj = false;
8566 }
8567
8568 if (r == -ENOENT && soid.is_snap()) {
8569 dout(10) << __func__
8570 << ": enoent while trying to promote clone, " << soid
8571 << " must have been trimmed, removing from snapset"
8572 << dendl;
8573 hobject_t head(soid.get_head());
8574 ObjectContextRef obc = get_object_context(head, false);
8575 assert(obc);
8576
8577 OpContextUPtr tctx = simple_opc_create(obc);
8578 tctx->at_version = get_next_version();
8579 filter_snapc(tctx->new_snapset.snaps);
8580 vector<snapid_t> new_clones;
8581 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8582 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8583 i != tctx->new_snapset.clones.end();
8584 ++i) {
8585 if (*i != soid.snap) {
8586 new_clones.push_back(*i);
8587 auto p = tctx->new_snapset.clone_snaps.find(*i);
8588 if (p != tctx->new_snapset.clone_snaps.end()) {
8589 new_clone_snaps[*i] = p->second;
8590 }
8591 }
8592 }
8593 tctx->new_snapset.clones.swap(new_clones);
8594 tctx->new_snapset.clone_overlap.erase(soid.snap);
8595 tctx->new_snapset.clone_size.erase(soid.snap);
8596 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8597
8598 // take RWWRITE lock for duration of our local write. ignore starvation.
8599 if (!tctx->lock_manager.take_write_lock(
8600 head,
8601 obc)) {
8602 assert(0 == "problem!");
8603 }
8604 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8605
8606 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8607
8608 simple_opc_submit(std::move(tctx));
8609 return;
8610 }
8611
8612 bool whiteout = false;
8613 if (r == -ENOENT) {
8614 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8615 dout(10) << __func__ << " whiteout " << soid << dendl;
8616 whiteout = true;
8617 }
8618
8619 if (r < 0 && !whiteout) {
8620 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8621 // pass error to everyone blocked on this object
8622 // FIXME: this is pretty sloppy, but at this point we got
8623 // something unexpected and don't have many other options.
8624 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8625 waiting_for_blocked_object.find(soid);
8626 if (blocked_iter != waiting_for_blocked_object.end()) {
8627 while (!blocked_iter->second.empty()) {
8628 osd->reply_op_error(blocked_iter->second.front(), r);
8629 blocked_iter->second.pop_front();
8630 }
8631 waiting_for_blocked_object.erase(blocked_iter);
8632 }
8633 return;
8634 }
8635
8636 osd->promote_finish(results->object_size);
8637
8638 OpContextUPtr tctx = simple_opc_create(obc);
8639 tctx->at_version = get_next_version();
8640
8641 ++tctx->delta_stats.num_objects;
8642 if (soid.snap < CEPH_NOSNAP)
8643 ++tctx->delta_stats.num_object_clones;
8644 tctx->new_obs.exists = true;
8645
8646 tctx->extra_reqids = results->reqids;
8647
8648 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8649 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8650
8651 if (whiteout) {
8652 // create a whiteout
8653 tctx->op_t->create(soid);
8654 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8655 ++tctx->delta_stats.num_whiteouts;
8656 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8657 osd->logger->inc(l_osd_tier_whiteout);
8658 } else {
8659 if (results->has_omap) {
8660 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8661 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8662 ++tctx->delta_stats.num_objects_omap;
8663 }
8664
8665 results->fill_in_final_tx(tctx->op_t.get());
8666 if (results->started_temp_obj) {
8667 tctx->discard_temp_oid = results->temp_oid;
8668 }
8669 tctx->new_obs.oi.size = results->object_size;
8670 tctx->new_obs.oi.user_version = results->user_version;
8671 if (results->is_data_digest()) {
8672 tctx->new_obs.oi.set_data_digest(results->data_digest);
8673 } else {
8674 tctx->new_obs.oi.clear_data_digest();
8675 }
8676 if (results->is_omap_digest()) {
8677 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8678 } else {
8679 tctx->new_obs.oi.clear_omap_digest();
8680 }
8681 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8682 tctx->new_obs.oi.truncate_size = results->truncate_size;
8683
8684 if (soid.snap != CEPH_NOSNAP) {
8685 if (legacy_snapset) {
8686 tctx->new_obs.oi.legacy_snaps = results->snaps;
8687 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8688 } else {
8689 // it's already in the snapset
8690 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8691 }
8692 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8693 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8694 results->object_size);
8695 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8696
8697 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8698 } else {
8699 tctx->delta_stats.num_bytes += results->object_size;
8700 }
8701 }
8702
8703 if (results->mirror_snapset) {
8704 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8705 tctx->new_snapset.from_snap_set(
8706 results->snapset,
8707 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8708 }
8709 tctx->new_snapset.head_exists = true;
8710 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8711
8712 // take RWWRITE lock for duration of our local write. ignore starvation.
8713 if (!tctx->lock_manager.take_write_lock(
8714 obc->obs.oi.soid,
8715 obc)) {
8716 assert(0 == "problem!");
8717 }
8718 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8719
8720 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8721
8722 simple_opc_submit(std::move(tctx));
8723
8724 osd->logger->inc(l_osd_tier_promote);
8725
8726 if (agent_state &&
8727 agent_state->is_idle())
8728 agent_choose_mode();
8729 }
8730
8731 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
8732 vector<ceph_tid_t> *tids)
8733 {
8734 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8735 << " from " << cop->src << " " << cop->oloc
8736 << " v" << cop->results.user_version << dendl;
8737
8738 // cancel objecter op, if we can
8739 if (cop->objecter_tid) {
8740 tids->push_back(cop->objecter_tid);
8741 cop->objecter_tid = 0;
8742 if (cop->objecter_tid2) {
8743 tids->push_back(cop->objecter_tid2);
8744 cop->objecter_tid2 = 0;
8745 }
8746 }
8747
8748 copy_ops.erase(cop->obc->obs.oi.soid);
8749 cop->obc->stop_block();
8750
8751 kick_object_context_blocked(cop->obc);
8752 cop->results.should_requeue = requeue;
8753 CopyCallbackResults result(-ECANCELED, &cop->results);
8754 cop->cb->complete(result);
8755
8756 // There may still be an objecter callback referencing this copy op.
8757 // That callback will not need the obc since it's been canceled, and
8758 // we need the obc reference to go away prior to flush.
8759 cop->obc = ObjectContextRef();
8760 }
8761
8762 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
8763 {
8764 dout(10) << __func__ << dendl;
8765 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8766 while (p != copy_ops.end()) {
8767 // requeue this op? can I queue up all of them?
8768 cancel_copy((p++)->second, requeue, tids);
8769 }
8770 }
8771
8772
8773 // ========================================================================
8774 // flush
8775 //
8776 // Flush a dirty object in the cache tier by writing it back to the
8777 // base tier. The sequence looks like:
8778 //
8779 // * send a copy-from operation to the base tier to copy the current
8780 // version of the object
8781 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8782 // * on completion, we check if the object has been modified. if so,
8783 // just reply with -EAGAIN.
8784 // * try to take a write lock so we can clear the dirty flag. if this
8785 // fails, wait and retry
8786 // * start a repop that clears the bit.
8787 //
8788 // If we have to wait, we will retry by coming back through the
8789 // start_flush method. We check if a flush is already in progress
8790 // and, if so, try to finish it by rechecking the version and trying
8791 // to clear the dirty bit.
8792 //
8793 // In order for the cache-flush (a write op) to not block the copy-get
8794 // from reading the object, the client *must* set the SKIPRWLOCKS
8795 // flag.
8796 //
8797 // NOTE: normally writes are strictly ordered for the client, but
8798 // flushes are special in that they can be reordered with respect to
8799 // other writes. In particular, we can't have a flush request block
8800 // an update to the cache pool object!
8801
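// A minimal client-side sketch (librados C++ API, in the style of the
// tiering tests) of a non-blocking cache-try-flush that sets the
// SKIPRWLOCKS flag required above; the "foo" object and the
// cluster/cache_ioctx handles are hypothetical:
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c = cluster.aio_create_completion();
//   int r = cache_ioctx.aio_operate(
//     "foo", c, &op,
//     librados::OPERATION_IGNORE_OVERLAY | librados::OPERATION_SKIPRWLOCKS,
//     NULL);
//   c->wait_for_complete();
//   r = c->get_return_value();  // -EBUSY if a racing write re-dirtied it
//   c->release();
//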
8802 struct C_Flush : public Context {
8803 PrimaryLogPGRef pg;
8804 hobject_t oid;
8805 epoch_t last_peering_reset;
8806 ceph_tid_t tid;
8807 utime_t start;
8808 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8809 : pg(p), oid(o), last_peering_reset(lpr),
8810 tid(0), start(ceph_clock_now())
8811 {}
8812 void finish(int r) override {
8813 if (r == -ECANCELED)
8814 return;
8815 pg->lock();
8816 if (last_peering_reset == pg->get_last_peering_reset()) {
8817 pg->finish_flush(oid, tid, r);
8818 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8819 }
8820 pg->unlock();
8821 }
8822 };
8823
8824 int PrimaryLogPG::start_flush(
8825 OpRequestRef op, ObjectContextRef obc,
8826 bool blocking, hobject_t *pmissing,
8827 boost::optional<std::function<void()>> &&on_flush)
8828 {
8829 const object_info_t& oi = obc->obs.oi;
8830 const hobject_t& soid = oi.soid;
8831 dout(10) << __func__ << " " << soid
8832 << " v" << oi.version
8833 << " uv" << oi.user_version
8834 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8835 << dendl;
8836
8837 // get a filtered snapset; we need to drop any snaps that have been removed
8838 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8839
8840 // verify there are no older dirty clones
8841 {
8842 dout(20) << " snapset " << snapset << dendl;
8843 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8844 while (p != snapset.clones.rend() && *p >= soid.snap)
8845 ++p;
8846 if (p != snapset.clones.rend()) {
8847 hobject_t next = soid;
8848 next.snap = *p;
8849 assert(next.snap < soid.snap);
8850 if (pg_log.get_missing().is_missing(next)) {
8851 dout(10) << __func__ << " missing clone is " << next << dendl;
8852 if (pmissing)
8853 *pmissing = next;
8854 return -ENOENT;
8855 }
8856 ObjectContextRef older_obc = get_object_context(next, false);
8857 if (older_obc) {
8858 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8859 << dendl;
8860 if (older_obc->obs.oi.is_dirty()) {
8861 dout(10) << __func__ << " next oldest clone is dirty: "
8862 << older_obc->obs.oi << dendl;
8863 return -EBUSY;
8864 }
8865 } else {
8866 dout(20) << __func__ << " next oldest clone " << next
8867 << " is not present; implicitly clean" << dendl;
8868 }
8869 } else {
8870 dout(20) << __func__ << " no older clones" << dendl;
8871 }
8872 }
8873
8874 if (blocking)
8875 obc->start_block();
8876
8877 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8878 if (p != flush_ops.end()) {
8879 FlushOpRef fop = p->second;
8880 if (fop->op == op) {
8881 // we couldn't take the write lock on a cache-try-flush before;
8882 // now we are trying again for the lock.
8883 return try_flush_mark_clean(fop);
8884 }
8885 if (fop->flushed_version == obc->obs.oi.user_version &&
8886 (fop->blocking || !blocking)) {
8887 // nonblocking can join anything
8888 // blocking can only join a blocking flush
8889 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8890 if (op)
8891 fop->dup_ops.push_back(op);
8892 return -EAGAIN; // clean up this ctx; op will retry later
8893 }
8894
8895 // cancel current flush since it will fail anyway, or because we
8896 // are blocking and the existing flush is nonblocking.
8897 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8898 if (fop->op)
8899 osd->reply_op_error(fop->op, -EBUSY);
8900 while (!fop->dup_ops.empty()) {
8901 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8902 fop->dup_ops.pop_front();
8903 }
8904 vector<ceph_tid_t> tids;
8905 cancel_flush(fop, false, &tids);
8906 osd->objecter->op_cancel(tids, -ECANCELED);
8907 }
8908
8909 /**
8910 * In general, we need to send a delete and a copyfrom.
8911 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8912 * where 4 is marked as clean. To flush 10, we have to:
8913 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8914 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8915 *
8916 * There is a complicating case. Suppose there had been a clone 7
8917 * for snaps [7, 6] which has since been trimmed because those snaps no longer exist.
8918 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8919 * the delete, the snap will be promoted to 5, and the head will become
8920 * a snapdir. When the copy-from goes through, we'll end up with
8921 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8922 *
8923 * Another complication is the case where there is an interval change
8924 * after doing the delete and the flush but before marking the object
8925 * clean. We'll happily delete head and then recreate it at the same
8926 * sequence number, which works out ok.
8927 */
8928
8929 SnapContext snapc, dsnapc;
8930 if (snapset.seq != 0) {
8931 if (soid.snap == CEPH_NOSNAP) {
8932 snapc.seq = snapset.seq;
8933 snapc.snaps = snapset.snaps;
8934 } else {
8935 snapid_t min_included_snap;
8936 if (snapset.is_legacy()) {
8937 min_included_snap = oi.legacy_snaps.back();
8938 } else {
8939 auto p = snapset.clone_snaps.find(soid.snap);
8940 assert(p != snapset.clone_snaps.end());
8941 min_included_snap = p->second.back();
8942 }
8943 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8944 }
8945
8946 snapid_t prev_snapc = 0;
8947 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8948 citer != snapset.clones.rend();
8949 ++citer) {
8950 if (*citer < soid.snap) {
8951 prev_snapc = *citer;
8952 break;
8953 }
8954 }
8955
8956 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8957 }
8958
8959 object_locator_t base_oloc(soid);
8960 base_oloc.pool = pool.info.tier_of;
8961
8962 if (dsnapc.seq < snapc.seq) {
8963 ObjectOperation o;
8964 o.remove();
8965 osd->objecter->mutate(
8966 soid.oid,
8967 base_oloc,
8968 o,
8969 dsnapc,
8970 ceph::real_clock::from_ceph_timespec(oi.mtime),
8971 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8972 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8973 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8974 }
8975
8976 FlushOpRef fop(std::make_shared<FlushOp>());
8977 fop->obc = obc;
8978 fop->flushed_version = oi.user_version;
8979 fop->blocking = blocking;
8980 fop->on_flush = std::move(on_flush);
8981 fop->op = op;
8982
8983 ObjectOperation o;
8984 if (oi.is_whiteout()) {
8985 fop->removal = true;
8986 o.remove();
8987 } else {
8988 object_locator_t oloc(soid);
8989 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8990 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8991 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8992 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8993 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8994 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8995
8996 // hint that the base tier need not cache the data after this flush
8997 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8998 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8999 }
9000 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
9001
9002 ceph_tid_t tid = osd->objecter->mutate(
9003 soid.oid, base_oloc, o, snapc,
9004 ceph::real_clock::from_ceph_timespec(oi.mtime),
9005 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
9006 new C_OnFinisher(fin,
9007 &osd->objecter_finisher));
9008 /* we're under the pg lock and fin->finish() is grabbing that */
9009 fin->tid = tid;
9010 fop->objecter_tid = tid;
9011
9012 flush_ops[soid] = fop;
9013 info.stats.stats.sum.num_flush++;
9014 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
9015 return -EINPROGRESS;
9016 }
9017
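/*
 * Objecter completion for the flush started above. A tid mismatch means
 * this FlushOp was restarted or superseded, so the stale completion is
 * ignored; -ENOENT is tolerated when the flush was a removal.
 */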
9018 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
9019 {
9020 dout(10) << __func__ << " " << oid << " tid " << tid
9021 << " " << cpp_strerror(r) << dendl;
9022 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
9023 if (p == flush_ops.end()) {
9024 dout(10) << __func__ << " no flush_op found" << dendl;
9025 return;
9026 }
9027 FlushOpRef fop = p->second;
9028 if (tid != fop->objecter_tid) {
9029 dout(10) << __func__ << " tid " << tid << " != fop " << fop
9030 << " tid " << fop->objecter_tid << dendl;
9031 return;
9032 }
9033 ObjectContextRef obc = fop->obc;
9034 fop->objecter_tid = 0;
9035
9036 if (r < 0 && !(r == -ENOENT && fop->removal)) {
9037 if (fop->op)
9038 osd->reply_op_error(fop->op, -EBUSY);
9039 if (fop->blocking) {
9040 obc->stop_block();
9041 kick_object_context_blocked(obc);
9042 }
9043
9044 if (!fop->dup_ops.empty()) {
9045 dout(20) << __func__ << " requeueing dups" << dendl;
9046 requeue_ops(fop->dup_ops);
9047 }
9048 if (fop->on_flush) {
9049 (*(fop->on_flush))();
9050 fop->on_flush = boost::none;
9051 }
9052 flush_ops.erase(oid);
9053 return;
9054 }
9055
9056 r = try_flush_mark_clean(fop);
9057 if (r == -EBUSY && fop->op) {
9058 osd->reply_op_error(fop->op, r);
9059 }
9060 }
9061
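/*
 * Attempt to mark the object clean after a successful flush. Return
 * values, as implemented below: -EBUSY if the object changed while the
 * flush was in flight, -EAGAIN if we must wait for the write lock or for
 * scrub, -ECANCELED if the attempt is abandoned, 0 if the object was
 * flushed and immediately evicted, and -EINPROGRESS once the
 * clean-marking repop has been submitted.
 */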
9062 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
9063 {
9064 ObjectContextRef obc = fop->obc;
9065 const hobject_t& oid = obc->obs.oi.soid;
9066
9067 if (fop->blocking) {
9068 obc->stop_block();
9069 kick_object_context_blocked(obc);
9070 }
9071
9072 if (fop->flushed_version != obc->obs.oi.user_version ||
9073 !obc->obs.exists) {
9074 if (obc->obs.exists)
9075 dout(10) << __func__ << " flushed_version " << fop->flushed_version
9076 << " != current " << obc->obs.oi.user_version
9077 << dendl;
9078 else
9079 dout(10) << __func__ << " object no longer exists" << dendl;
9080
9081 if (!fop->dup_ops.empty()) {
9082 dout(20) << __func__ << " requeueing dups" << dendl;
9083 requeue_ops(fop->dup_ops);
9084 }
9085 if (fop->on_flush) {
9086 (*(fop->on_flush))();
9087 fop->on_flush = boost::none;
9088 }
9089 flush_ops.erase(oid);
9090 if (fop->blocking)
9091 osd->logger->inc(l_osd_tier_flush_fail);
9092 else
9093 osd->logger->inc(l_osd_tier_try_flush_fail);
9094 return -EBUSY;
9095 }
9096
9097 if (!fop->blocking &&
9098 write_blocked_by_scrub(oid)) {
9099 if (fop->op) {
9100 dout(10) << __func__ << " blocked by scrub" << dendl;
9101 requeue_op(fop->op);
9102 requeue_ops(fop->dup_ops);
9103 return -EAGAIN; // will retry
9104 } else {
9105 osd->logger->inc(l_osd_tier_try_flush_fail);
9106 vector<ceph_tid_t> tids;
9107 cancel_flush(fop, false, &tids);
9108 osd->objecter->op_cancel(tids, -ECANCELED);
9109 return -ECANCELED;
9110 }
9111 }
9112
9113 // successfully flushed, can we evict this object?
9114 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9115 agent_maybe_evict(obc, true)) {
9116 osd->logger->inc(l_osd_tier_clean);
9117 if (fop->on_flush) {
9118 (*(fop->on_flush))();
9119 fop->on_flush = boost::none;
9120 }
9121 flush_ops.erase(oid);
9122 return 0;
9123 }
9124
9125 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9126 OpContextUPtr ctx = simple_opc_create(fop->obc);
9127
9128 // successfully flushed; can we clear the dirty bit?
9129 // take the write lock manually, since this ctx was not created
9130 // through the normal op pipeline that would have acquired it for us.
9131 if (ctx->lock_manager.get_lock_type(
9132 ObjectContext::RWState::RWWRITE,
9133 oid,
9134 obc,
9135 fop->op)) {
9136 dout(20) << __func__ << " took write lock" << dendl;
9137 } else if (fop->op) {
9138 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
9139 << fop->dup_ops << dendl;
9140 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
9141 for (auto op : fop->dup_ops) {
9142 bool locked = ctx->lock_manager.get_lock_type(
9143 ObjectContext::RWState::RWWRITE,
9144 oid,
9145 obc,
9146 op);
9147 assert(!locked);
9148 }
9149 close_op_ctx(ctx.release()); // release ctx only after the dups are queued as waiters
9150 return -EAGAIN; // will retry
9151 } else {
9152 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9153 close_op_ctx(ctx.release());
9154 osd->logger->inc(l_osd_tier_try_flush_fail);
9155 vector<ceph_tid_t> tids;
9156 cancel_flush(fop, false, &tids);
9157 osd->objecter->op_cancel(tids, -ECANCELED);
9158 return -ECANCELED;
9159 }
9160
9161 if (fop->on_flush) {
9162 ctx->register_on_finish(*(fop->on_flush));
9163 fop->on_flush = boost::none;
9164 }
9165
9166 ctx->at_version = get_next_version();
9167
9168 ctx->new_obs = obc->obs;
9169 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9170 --ctx->delta_stats.num_objects_dirty;
9171
9172 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9173
9174 osd->logger->inc(l_osd_tier_clean);
9175
9176 if (!fop->dup_ops.empty() || fop->op) {
9177 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9178 list<OpRequestRef> ls;
9179 if (fop->op)
9180 ls.push_back(fop->op);
9181 ls.splice(ls.end(), fop->dup_ops);
9182 requeue_ops(ls);
9183 }
9184
9185 simple_opc_submit(std::move(ctx));
9186
9187 flush_ops.erase(oid);
9188
9189 if (fop->blocking)
9190 osd->logger->inc(l_osd_tier_flush);
9191 else
9192 osd->logger->inc(l_osd_tier_try_flush);
9193
9194 return -EINPROGRESS;
9195 }
9196
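/*
 * Abort an in-flight FlushOp: collect its objecter tids into *tids for
 * the caller to cancel, unblock the obc if we were blocking, and
 * optionally requeue the originating op and any dups.
 */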
9197 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
9198 vector<ceph_tid_t> *tids)
9199 {
9200 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9201 << fop->objecter_tid << dendl;
9202 if (fop->objecter_tid) {
9203 tids->push_back(fop->objecter_tid);
9204 fop->objecter_tid = 0;
9205 }
9206 if (fop->io_tids.size()) {
9207 for (auto &p : fop->io_tids) {
9208 tids->push_back(p.second);
9209 p.second = 0;
9210 }
9211 }
9212 if (fop->blocking && fop->obc->is_blocked()) {
9213 fop->obc->stop_block();
9214 kick_object_context_blocked(fop->obc);
9215 }
9216 if (requeue) {
9217 if (fop->op)
9218 requeue_op(fop->op);
9219 requeue_ops(fop->dup_ops);
9220 }
9221 if (fop->on_flush) {
9222 (*(fop->on_flush))();
9223 fop->on_flush = boost::none;
9224 }
9225 flush_ops.erase(fop->obc->obs.oi.soid);
9226 }
9227
9228 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
9229 {
9230 dout(10) << __func__ << dendl;
9231 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9232 while (p != flush_ops.end()) {
9233 cancel_flush((p++)->second, requeue, tids);
9234 }
9235 }
9236
9237 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9238 {
9239 if (!pool.info.allow_incomplete_clones())
9240 return true;
9241 if (is_missing_object(coid))
9242 return true;
9243 ObjectContextRef obc = get_object_context(coid, false);
9244 return obc && obc->obs.exists;
9245 }
9246
9247 // ========================================================================
9248 // rep op gather
9249
9250 class C_OSD_RepopApplied : public Context {
9251 PrimaryLogPGRef pg;
9252 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9253 public:
9254 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9255 : pg(pg), repop(repop) {}
9256 void finish(int) override {
9257 pg->repop_all_applied(repop.get());
9258 }
9259 };
9260
9261
9262 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9263 {
9264 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9265 << dendl;
9266 assert(!repop->applies_with_commit);
9267 repop->all_applied = true;
9268 if (!repop->rep_aborted) {
9269 eval_repop(repop);
9270 }
9271 }
9272
9273 class C_OSD_RepopCommit : public Context {
9274 PrimaryLogPGRef pg;
9275 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9276 public:
9277 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9278 : pg(pg), repop(repop) {}
9279 void finish(int) override {
9280 pg->repop_all_committed(repop.get());
9281 }
9282 };
9283
9284 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9285 {
9286 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9287 << dendl;
9288 repop->all_committed = true;
9289 if (repop->applies_with_commit) {
9290 assert(!repop->all_applied);
9291 repop->all_applied = true;
9292 }
9293
9294 if (!repop->rep_aborted) {
9295 if (repop->v != eversion_t()) {
9296 last_update_ondisk = repop->v;
9297 last_complete_ondisk = repop->pg_local_last_complete;
9298 }
9299 eval_repop(repop);
9300 }
9301 }
9302
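// Called once a local transaction has been applied; advances
// last_update_applied, which gates both primary-side scrub chunks and
// replica scrub replies below.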
9303 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9304 {
9305 dout(10) << "op_applied version " << applied_version << dendl;
9306 if (applied_version == eversion_t())
9307 return;
9308 assert(applied_version > last_update_applied);
9309 assert(applied_version <= info.last_update);
9310 last_update_applied = applied_version;
9311 if (is_primary()) {
9312 if (scrubber.active) {
9313 if (last_update_applied >= scrubber.subset_last_update) {
9314 if (ops_blocked_by_scrub()) {
9315 requeue_scrub(true);
9316 } else {
9317 requeue_scrub(false);
9318 }
9319
9320 }
9321 } else {
9322 assert(scrubber.start == scrubber.end);
9323 }
9324 } else {
9325 if (scrubber.active_rep_scrub) {
9326 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9327 scrubber.active_rep_scrub->get_req())->scrub_to) {
9328 osd->enqueue_back(
9329 info.pgid,
9330 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9331 scrubber.active_rep_scrub = OpRequestRef();
9332 }
9333 }
9334 }
9335 }
9336
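/*
 * Drive a RepGather through its lifecycle: once all replicas have
 * committed, run the on_committed callbacks and reply to any queued dup
 * ops; once all have applied, run on_applied; and when both are done,
 * pop finished repops off the front of the (ordered) repop_queue.
 */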
9337 void PrimaryLogPG::eval_repop(RepGather *repop)
9338 {
9339 const MOSDOp *m = NULL;
9340 if (repop->op)
9341 m = static_cast<const MOSDOp *>(repop->op->get_req());
9342
9343 if (m)
9344 dout(10) << "eval_repop " << *repop
9345 << (repop->rep_done ? " DONE" : "")
9346 << dendl;
9347 else
9348 dout(10) << "eval_repop " << *repop << " (no op)"
9349 << (repop->rep_done ? " DONE" : "")
9350 << dendl;
9351
9352 if (repop->rep_done)
9353 return;
9354
9355 // ondisk?
9356 if (repop->all_committed) {
9357 dout(10) << " commit: " << *repop << dendl;
9358 for (auto p = repop->on_committed.begin();
9359 p != repop->on_committed.end();
9360 repop->on_committed.erase(p++)) {
9361 (*p)();
9362 }
9363 // send dup commits, in order
9364 if (waiting_for_ondisk.count(repop->v)) {
9365 assert(waiting_for_ondisk.begin()->first == repop->v);
9366 for (list<pair<OpRequestRef, version_t> >::iterator i =
9367 waiting_for_ondisk[repop->v].begin();
9368 i != waiting_for_ondisk[repop->v].end();
9369 ++i) {
9370 osd->reply_op_error(i->first, repop->r, repop->v,
9371 i->second);
9372 }
9373 waiting_for_ondisk.erase(repop->v);
9374 }
9375 }
9376
9377 // applied?
9378 if (repop->all_applied) {
9379 if (repop->applies_with_commit) {
9380 assert(repop->on_applied.empty());
9381 }
9382 dout(10) << " applied: " << *repop << " " << dendl;
9383 for (auto p = repop->on_applied.begin();
9384 p != repop->on_applied.end();
9385 repop->on_applied.erase(p++)) {
9386 (*p)();
9387 }
9388 }
9389
9390 // done.
9391 if (repop->all_applied && repop->all_committed) {
9392 repop->rep_done = true;
9393
9394 publish_stats_to_osd();
9395 calc_min_last_complete_ondisk();
9396
9397 dout(10) << " removing " << *repop << dendl;
9398 assert(!repop_queue.empty());
9399 dout(20) << " q front is " << *repop_queue.front() << dendl;
9400 if (repop_queue.front() != repop) {
9401 if (!repop->applies_with_commit) {
9402 dout(0) << " removing " << *repop << dendl;
9403 dout(0) << " q front is " << *repop_queue.front() << dendl;
9404 assert(repop_queue.front() == repop);
9405 }
9406 } else {
9407 RepGather *to_remove = nullptr;
9408 while (!repop_queue.empty() &&
9409 (to_remove = repop_queue.front())->rep_done) {
9410 repop_queue.pop_front();
9411 for (auto p = to_remove->on_success.begin();
9412 p != to_remove->on_success.end();
9413 to_remove->on_success.erase(p++)) {
9414 (*p)();
9415 }
9416 remove_repop(to_remove);
9417 }
9418 }
9419 }
9420 }
9421
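// Hand the prepared transaction to the backend. We optimistically bump
// each peer's last_update/last_complete here (the repop only completes
// once all of actingbackfill commit), take ondisk write locks that are
// released by onapplied_sync, and extend the projected log.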
9422 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9423 {
9424 FUNCTRACE();
9425 const hobject_t& soid = ctx->obs->oi.soid;
9426 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9427 << " o " << soid
9428 << dendl;
9429
9430 repop->v = ctx->at_version;
9431 if (ctx->at_version > eversion_t()) {
9432 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9433 i != actingbackfill.end();
9434 ++i) {
9435 if (*i == get_primary()) continue;
9436 pg_info_t &pinfo = peer_info[*i];
9437 // keep peer_info up to date
9438 if (pinfo.last_complete == pinfo.last_update)
9439 pinfo.last_complete = ctx->at_version;
9440 pinfo.last_update = ctx->at_version;
9441 }
9442 }
9443
9444 ctx->obc->ondisk_write_lock();
9445
9446 bool unlock_snapset_obc = false;
9447 ctx->op_t->add_obc(ctx->obc);
9448 if (ctx->clone_obc) {
9449 ctx->clone_obc->ondisk_write_lock();
9450 ctx->op_t->add_obc(ctx->clone_obc);
9451 }
9452 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9453 ctx->obc->obs.oi.soid) {
9454 ctx->snapset_obc->ondisk_write_lock();
9455 unlock_snapset_obc = true;
9456 ctx->op_t->add_obc(ctx->snapset_obc);
9457 }
9458
9459 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9460 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9461 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9462 ctx->obc,
9463 ctx->clone_obc,
9464 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9465 if (!(ctx->log.empty())) {
9466 assert(ctx->at_version >= projected_last_update);
9467 projected_last_update = ctx->at_version;
9468 }
9469 for (auto &&entry: ctx->log) {
9470 projected_log.add(entry);
9471 }
9472 pgbackend->submit_transaction(
9473 soid,
9474 ctx->delta_stats,
9475 ctx->at_version,
9476 std::move(ctx->op_t),
9477 pg_trim_to,
9478 min_last_complete_ondisk,
9479 ctx->log,
9480 ctx->updated_hset_history,
9481 onapplied_sync,
9482 on_all_applied,
9483 on_all_commit,
9484 repop->rep_tid,
9485 ctx->reqid,
9486 ctx->op);
9487 }
9488
9489 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9490 OpContext *ctx, ObjectContextRef obc,
9491 ceph_tid_t rep_tid)
9492 {
9493 if (ctx->op)
9494 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9495 else
9496 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9497
9498 RepGather *repop = new RepGather(
9499 ctx, rep_tid, info.last_complete, false);
9500
9501 repop->start = ceph_clock_now();
9502
9503 repop_queue.push_back(&repop->queue_item);
9504 repop->get();
9505
9506 osd->logger->inc(l_osd_op_wip);
9507
9508 dout(10) << __func__ << ": " << *repop << dendl;
9509 return repop;
9510 }
9511
9512 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9513 eversion_t version,
9514 int r,
9515 ObcLockManager &&manager,
9516 OpRequestRef &&op,
9517 boost::optional<std::function<void(void)> > &&on_complete)
9518 {
9519 RepGather *repop = new RepGather(
9520 std::move(manager),
9521 std::move(op),
9522 std::move(on_complete),
9523 osd->get_tid(),
9524 info.last_complete,
9525 true,
9526 r);
9527 repop->v = version;
9528
9529 repop->start = ceph_clock_now();
9530
9531 repop_queue.push_back(&repop->queue_item);
9532
9533 osd->logger->inc(l_osd_op_wip);
9534
9535 dout(10) << __func__ << ": " << *repop << dendl;
9536 return boost::intrusive_ptr<RepGather>(repop);
9537 }
9538
9539 void PrimaryLogPG::remove_repop(RepGather *repop)
9540 {
9541 dout(20) << __func__ << " " << *repop << dendl;
9542
9543 for (auto p = repop->on_finish.begin();
9544 p != repop->on_finish.end();
9545 repop->on_finish.erase(p++)) {
9546 (*p)();
9547 }
9548
9549 release_object_locks(
9550 repop->lock_manager);
9551 repop->put();
9552
9553 osd->logger->dec(l_osd_op_wip);
9554 }
9555
9556 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9557 {
9558 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9559 ceph_tid_t rep_tid = osd->get_tid();
9560 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9561 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9562 ctx->op_t.reset(new PGTransaction());
9563 ctx->mtime = ceph_clock_now();
9564 return ctx;
9565 }
9566
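// simple_opc_create() above and simple_opc_submit() below form the
// pattern used for internal mutations in this file (watch timeouts,
// clearing the DIRTY flag, etc.); a minimal sketch of the pattern:
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   /* ... adjust ctx->new_obs and/or ctx->op_t, append to ctx->log ... */
//   simple_opc_submit(std::move(ctx));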
9567 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9568 {
9569 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9570 dout(20) << __func__ << " " << repop << dendl;
9571 issue_repop(repop, ctx.get());
9572 eval_repop(repop);
9573 calc_trim_to();
9574 repop->put();
9575 }
9576
9577
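/*
 * Persist log entries that are not tied to an object write (e.g. marking
 * unfound objects lost). On clusters at or above jewel this sends
 * MOSDPGUpdateLogMissing and waits for every shard (including ourselves)
 * to commit; on older clusters it falls back to sending an MOSDPGLog to
 * each peer.
 */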
9578 void PrimaryLogPG::submit_log_entries(
9579 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9580 ObcLockManager &&manager,
9581 boost::optional<std::function<void(void)> > &&_on_complete,
9582 OpRequestRef op,
9583 int r)
9584 {
9585 dout(10) << __func__ << " " << entries << dendl;
9586 assert(is_primary());
9587
9588 eversion_t version;
9589 if (!entries.empty()) {
9590 assert(entries.rbegin()->version >= projected_last_update);
9591 version = projected_last_update = entries.rbegin()->version;
9592 }
9593
9594 boost::intrusive_ptr<RepGather> repop;
9595 boost::optional<std::function<void(void)> > on_complete;
9596 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9597 repop = new_repop(
9598 version,
9599 r,
9600 std::move(manager),
9601 std::move(op),
9602 std::move(_on_complete));
9603 } else {
9604 on_complete = std::move(_on_complete);
9605 }
9606
9607 pgbackend->call_write_ordered(
9608 [this, entries, repop, on_complete]() {
9609 ObjectStore::Transaction t;
9610 eversion_t old_last_update = info.last_update;
9611 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
9612
9613
9614 set<pg_shard_t> waiting_on;
9615 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9616 i != actingbackfill.end();
9617 ++i) {
9618 pg_shard_t peer(*i);
9619 if (peer == pg_whoami) continue;
9620 assert(peer_missing.count(peer));
9621 assert(peer_info.count(peer));
9622 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9623 assert(repop);
9624 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9625 entries,
9626 spg_t(info.pgid.pgid, i->shard),
9627 pg_whoami.shard,
9628 get_osdmap()->get_epoch(),
9629 last_peering_reset,
9630 repop->rep_tid,
9631 pg_trim_to,
9632 min_last_complete_ondisk);
9633 osd->send_message_osd_cluster(
9634 peer.osd, m, get_osdmap()->get_epoch());
9635 waiting_on.insert(peer);
9636 } else {
9637 MOSDPGLog *m = new MOSDPGLog(
9638 peer.shard, pg_whoami.shard,
9639 info.last_update.epoch,
9640 info);
9641 m->log.log = entries;
9642 m->log.tail = old_last_update;
9643 m->log.head = info.last_update;
9644 osd->send_message_osd_cluster(
9645 peer.osd, m, get_osdmap()->get_epoch());
9646 }
9647 }
9648 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9649 ceph_tid_t rep_tid = repop->rep_tid;
9650 waiting_on.insert(pg_whoami);
9651 log_entry_update_waiting_on.insert(
9652 make_pair(
9653 rep_tid,
9654 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9655 ));
9656 struct OnComplete : public Context {
9657 PrimaryLogPGRef pg;
9658 ceph_tid_t rep_tid;
9659 epoch_t epoch;
9660 OnComplete(
9661 PrimaryLogPGRef pg,
9662 ceph_tid_t rep_tid,
9663 epoch_t epoch)
9664 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9665 void finish(int) override {
9666 pg->lock();
9667 if (!pg->pg_has_reset_since(epoch)) {
9668 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9669 assert(it != pg->log_entry_update_waiting_on.end());
9670 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9671 assert(it2 != it->second.waiting_on.end());
9672 it->second.waiting_on.erase(it2);
9673 if (it->second.waiting_on.empty()) {
9674 pg->repop_all_committed(it->second.repop.get());
9675 pg->log_entry_update_waiting_on.erase(it);
9676 }
9677 }
9678 pg->unlock();
9679 }
9680 };
9681 t.register_on_commit(
9682 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9683 } else {
9684 if (on_complete) {
9685 struct OnComplete : public Context {
9686 PrimaryLogPGRef pg;
9687 std::function<void(void)> on_complete;
9688 epoch_t epoch;
9689 OnComplete(
9690 PrimaryLogPGRef pg,
9691 const std::function<void(void)> &on_complete,
9692 epoch_t epoch)
9693 : pg(pg),
9694 on_complete(std::move(on_complete)),
9695 epoch(epoch) {}
9696 void finish(int) override {
9697 pg->lock();
9698 if (!pg->pg_has_reset_since(epoch))
9699 on_complete();
9700 pg->unlock();
9701 }
9702 };
9703 t.register_on_complete(
9704 new OnComplete{
9705 this, *on_complete, get_osdmap()->get_epoch()
9706 });
9707 }
9708 }
9709 t.register_on_applied(
9710 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9711 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9712 assert(r == 0);
9713 });
9714
9715 calc_trim_to();
9716 }
9717
9718 void PrimaryLogPG::cancel_log_updates()
9719 {
9720 // get rid of all the LogUpdateCtx so their references to repops are
9721 // dropped
9722 log_entry_update_waiting_on.clear();
9723 }
9724
9725 // -------------------------------------------------------
9726
9727 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9728 {
9729 pair<hobject_t, ObjectContextRef> i;
9730 while (object_contexts.get_next(i.first, &i)) {
9731 ObjectContextRef obc(i.second);
9732 get_obc_watchers(obc, pg_watchers);
9733 }
9734 }
9735
9736 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9737 {
9738 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9739 obc->watchers.begin();
9740 j != obc->watchers.end();
9741 ++j) {
9742 obj_watch_item_t owi;
9743
9744 owi.obj = obc->obs.oi.soid;
9745 owi.wi.addr = j->second->get_peer_addr();
9746 owi.wi.name = j->second->get_entity();
9747 owi.wi.cookie = j->second->get_cookie();
9748 owi.wi.timeout_seconds = j->second->get_timeout();
9749
9750 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9751 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9752
9753 pg_watchers.push_back(owi);
9754 }
9755 }
9756
9757 void PrimaryLogPG::check_blacklisted_watchers()
9758 {
9759 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9760 pair<hobject_t, ObjectContextRef> i;
9761 while (object_contexts.get_next(i.first, &i))
9762 check_blacklisted_obc_watchers(i.second);
9763 }
9764
9765 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9766 {
9767 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9768 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9769 obc->watchers.begin();
9770 k != obc->watchers.end();
9771 ) {
9772 // Advance iterator now so handle_watch_timeout() can erase the element
9773 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9774 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9775 entity_addr_t ea = j->second->get_peer_addr();
9776 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9777 if (get_osdmap()->is_blacklisted(ea)) {
9778 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9779 assert(j->second->get_pg() == this);
9780 j->second->unregister_cb();
9781 handle_watch_timeout(j->second);
9782 }
9783 }
9784 }
9785
9786 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9787 {
9788 assert(is_active());
9789 assert((recovering.count(obc->obs.oi.soid) ||
9790 !is_missing_object(obc->obs.oi.soid)) ||
9791 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9792 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9793 pg_log_entry_t::LOST_REVERT &&
9794 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9795 obc->obs.oi.version));
9796
9797 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9798 assert(obc->watchers.empty());
9799 // populate unconnected_watchers
9800 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9801 obc->obs.oi.watchers.begin();
9802 p != obc->obs.oi.watchers.end();
9803 ++p) {
9804 utime_t expire = info.stats.last_became_active;
9805 expire += p->second.timeout_seconds;
9806 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9807 WatchRef watch(
9808 Watch::makeWatchRef(
9809 this, osd, obc, p->second.timeout_seconds, p->first.first,
9810 p->first.second, p->second.addr));
9811 watch->disconnect();
9812 obc->watchers.insert(
9813 make_pair(
9814 make_pair(p->first.first, p->first.second),
9815 watch));
9816 }
9817 // Look for watchers from blacklisted clients and drop
9818 check_blacklisted_obc_watchers(obc);
9819 }
9820
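// A watch lease expired. If we can write right now, submit a MODIFY log
// entry (via the simple_opc machinery) that drops the watcher from the
// object_info_t; otherwise park the retry behind degraded recovery or
// scrub via a delayed callback.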
9821 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9822 {
9823 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9824 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9825
9826 if (!is_active()) {
9827 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9828 return;
9829 }
9830 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9831 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9832 watch->get_delayed_cb()
9833 );
9834 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9835 << obc->obs.oi.soid
9836 << dendl;
9837 return;
9838 }
9839
9840 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
9841 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9842 << obc->obs.oi.soid
9843 << dendl;
9844 scrubber.add_callback(
9845 watch->get_delayed_cb() // retry this timeout once the scrub unblocks
9846 );
9847 return;
9848 }
9849
9850 OpContextUPtr ctx = simple_opc_create(obc);
9851 ctx->at_version = get_next_version();
9852
9853 object_info_t& oi = ctx->new_obs.oi;
9854 oi.watchers.erase(make_pair(watch->get_cookie(),
9855 watch->get_entity()));
9856
9857 list<watch_disconnect_t> watch_disconnects = {
9858 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9859 };
9860 ctx->register_on_success(
9861 [this, obc, watch_disconnects]() {
9862 complete_disconnect_watches(obc, watch_disconnects);
9863 });
9864
9865
9866 PGTransaction *t = ctx->op_t.get();
9867 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9868 ctx->at_version,
9869 oi.version,
9870 0,
9871 osd_reqid_t(), ctx->mtime, 0));
9872
9873 oi.prior_version = obc->obs.oi.version;
9874 oi.version = ctx->at_version;
9875 bufferlist bl;
9876 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9877 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9878
9879 // apply new object state.
9880 ctx->obc->obs = ctx->new_obs;
9881
9882 // no ctx->delta_stats
9883 simple_opc_submit(std::move(ctx));
9884 }
9885
9886 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9887 SnapSetContext *ssc)
9888 {
9889 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9890 assert(obc->destructor_callback == NULL);
9891 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9892 obc->obs.oi = oi;
9893 obc->obs.exists = false;
9894 obc->ssc = ssc;
9895 if (ssc)
9896 register_snapset_context(ssc);
9897 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9898 if (is_active())
9899 populate_obc_watchers(obc);
9900 return obc;
9901 }
9902
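/*
 * Look up (or, if can_create, construct) the ObjectContext for soid:
 * first in the obc cache, then by decoding OI_ATTR from the provided
 * attrs or from disk. Returns a null ref (-ENOENT to callers) if the
 * object is absent and can_create is false, or if the object info or
 * snapset cannot be obtained.
 */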
9903 ObjectContextRef PrimaryLogPG::get_object_context(
9904 const hobject_t& soid,
9905 bool can_create,
9906 const map<string, bufferlist> *attrs)
9907 {
9908 assert(
9909 attrs || !pg_log.get_missing().is_missing(soid) ||
9910 // or this is a revert... see recover_primary()
9911 (pg_log.get_log().objects.count(soid) &&
9912 pg_log.get_log().objects.find(soid)->second->op ==
9913 pg_log_entry_t::LOST_REVERT));
9914 ObjectContextRef obc = object_contexts.lookup(soid);
9915 osd->logger->inc(l_osd_object_ctx_cache_total);
9916 if (obc) {
9917 osd->logger->inc(l_osd_object_ctx_cache_hit);
9918 dout(10) << __func__ << ": found obc in cache: " << obc
9919 << dendl;
9920 } else {
9921 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9922 // check disk
9923 bufferlist bv;
9924 if (attrs) {
9925 assert(attrs->count(OI_ATTR));
9926 bv = attrs->find(OI_ATTR)->second;
9927 } else {
9928 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9929 if (r < 0) {
9930 if (!can_create) {
9931 dout(10) << __func__ << ": no obc for soid "
9932 << soid << " and !can_create"
9933 << dendl;
9934 return ObjectContextRef(); // -ENOENT!
9935 }
9936
9937 dout(10) << __func__ << ": no obc for soid "
9938 << soid << " but can_create"
9939 << dendl;
9940 // new object.
9941 object_info_t oi(soid);
9942 SnapSetContext *ssc = get_snapset_context(
9943 soid, true, 0, false);
9944 assert(ssc);
9945 obc = create_object_context(oi, ssc);
9946 dout(10) << __func__ << ": " << obc << " " << soid
9947 << " " << obc->rwstate
9948 << " oi: " << obc->obs.oi
9949 << " ssc: " << obc->ssc
9950 << " snapset: " << obc->ssc->snapset << dendl;
9951 return obc;
9952 }
9953 }
9954
9955 object_info_t oi;
9956 try {
9957 bufferlist::iterator bliter = bv.begin();
9958 ::decode(oi, bliter);
9959 } catch (...) {
9960 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9961 return ObjectContextRef(); // -ENOENT!
9962 }
9963
9964 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9965
9966 obc = object_contexts.lookup_or_create(oi.soid);
9967 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9968 obc->obs.oi = oi;
9969 obc->obs.exists = true;
9970
9971 obc->ssc = get_snapset_context(
9972 soid, true,
9973 soid.has_snapset() ? attrs : 0);
9974
9975 if (is_active())
9976 populate_obc_watchers(obc);
9977
9978 if (pool.info.require_rollback()) {
9979 if (attrs) {
9980 obc->attr_cache = *attrs;
9981 } else {
9982 int r = pgbackend->objects_get_attrs(
9983 soid,
9984 &obc->attr_cache);
9985 assert(r == 0);
9986 }
9987 }
9988
9989 dout(10) << __func__ << ": creating obc from disk: " << obc
9990 << dendl;
9991 }
9992
9993 // XXX: Caller doesn't expect this
9994 if (obc->ssc == NULL) {
9995 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9996 return ObjectContextRef(); // -ENOENT!
9997 }
9998
9999 dout(10) << __func__ << ": " << obc << " " << soid
10000 << " " << obc->rwstate
10001 << " oi: " << obc->obs.oi
10002 << " exists: " << (int)obc->obs.exists
10003 << " ssc: " << obc->ssc
10004 << " snapset: " << obc->ssc->snapset << dendl;
10005 return obc;
10006 }
10007
10008 void PrimaryLogPG::context_registry_on_change()
10009 {
10010 pair<hobject_t, ObjectContextRef> i;
10011 while (object_contexts.get_next(i.first, &i)) {
10012 ObjectContextRef obc(i.second);
10013 if (obc) {
10014 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10015 obc->watchers.begin();
10016 j != obc->watchers.end();
10017 obc->watchers.erase(j++)) {
10018 j->second->discard();
10019 }
10020 }
10021 }
10022 }
10023
10024
10025 /*
10026 * If we return an error, and set *pmissing, then promoting that
10027 * object may help.
10028 *
10029 * If we return -EAGAIN, we will always set *pmissing to the missing
10030 * object to wait for.
10031 *
10032 * If we return an error but do not set *pmissing, then we know the
10033 * object does not exist.
10034 */
10035 int PrimaryLogPG::find_object_context(const hobject_t& oid,
10036 ObjectContextRef *pobc,
10037 bool can_create,
10038 bool map_snapid_to_clone,
10039 hobject_t *pmissing)
10040 {
10041 FUNCTRACE();
10042 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
10043 // want the head?
10044 if (oid.snap == CEPH_NOSNAP) {
10045 ObjectContextRef obc = get_object_context(oid, can_create);
10046 if (!obc) {
10047 if (pmissing)
10048 *pmissing = oid;
10049 return -ENOENT;
10050 }
10051 dout(10) << "find_object_context " << oid
10052 << " @" << oid.snap
10053 << " oi=" << obc->obs.oi
10054 << dendl;
10055 *pobc = obc;
10056
10057 return 0;
10058 }
10059
10060 hobject_t head = oid.get_head();
10061
10062 // want the snapdir?
10063 if (oid.snap == CEPH_SNAPDIR) {
10064 // return head or snapdir, whichever exists.
10065 ObjectContextRef headobc = get_object_context(head, can_create);
10066 ObjectContextRef obc = headobc;
10067 if (!obc || !obc->obs.exists)
10068 obc = get_object_context(oid, can_create);
10069 if (!obc || !obc->obs.exists) {
10070 // if we have neither, we would want to promote the head.
10071 if (pmissing)
10072 *pmissing = head;
10073 if (pobc)
10074 *pobc = headobc; // may be null
10075 return -ENOENT;
10076 }
10077 dout(10) << "find_object_context " << oid
10078 << " @" << oid.snap
10079 << " oi=" << obc->obs.oi
10080 << dendl;
10081 *pobc = obc;
10082
10083 // always populate ssc for SNAPDIR...
10084 if (!obc->ssc)
10085 obc->ssc = get_snapset_context(
10086 oid, true);
10087 return 0;
10088 }
10089
10090 // we want a snap
10091 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
10092 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
10093 return -ENOENT;
10094 }
10095
10096 SnapSetContext *ssc = get_snapset_context(oid, can_create);
10097 if (!ssc || !(ssc->exists || can_create)) {
10098 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
10099 if (pmissing)
10100 *pmissing = head; // start by getting the head
10101 if (ssc)
10102 put_snapset_context(ssc);
10103 return -ENOENT;
10104 }
10105
10106 if (map_snapid_to_clone) {
10107 dout(10) << "find_object_context " << oid << " @" << oid.snap
10108 << " snapset " << ssc->snapset
10109 << " map_snapid_to_clone=true" << dendl;
10110 if (oid.snap > ssc->snapset.seq) {
10111 // already must be readable
10112 ObjectContextRef obc = get_object_context(head, false);
10113 dout(10) << "find_object_context " << oid << " @" << oid.snap
10114 << " snapset " << ssc->snapset
10115 << " maps to head" << dendl;
10116 *pobc = obc;
10117 put_snapset_context(ssc);
10118 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10119 } else {
10120 vector<snapid_t>::const_iterator citer = std::find(
10121 ssc->snapset.clones.begin(),
10122 ssc->snapset.clones.end(),
10123 oid.snap);
10124 if (citer == ssc->snapset.clones.end()) {
10125 dout(10) << "find_object_context " << oid << " @" << oid.snap
10126 << " snapset " << ssc->snapset
10127 << " maps to nothing" << dendl;
10128 put_snapset_context(ssc);
10129 return -ENOENT;
10130 }
10131
10132 dout(10) << "find_object_context " << oid << " @" << oid.snap
10133 << " snapset " << ssc->snapset
10134 << " maps to " << oid << dendl;
10135
10136 if (pg_log.get_missing().is_missing(oid)) {
10137 dout(10) << "find_object_context " << oid << " @" << oid.snap
10138 << " snapset " << ssc->snapset
10139 << " " << oid << " is missing" << dendl;
10140 if (pmissing)
10141 *pmissing = oid;
10142 put_snapset_context(ssc);
10143 return -EAGAIN;
10144 }
10145
10146 ObjectContextRef obc = get_object_context(oid, false);
10147 if (!obc || !obc->obs.exists) {
10148 dout(10) << "find_object_context " << oid << " @" << oid.snap
10149 << " snapset " << ssc->snapset
10150 << " " << oid << " is not present" << dendl;
10151 if (pmissing)
10152 *pmissing = oid;
10153 put_snapset_context(ssc);
10154 return -ENOENT;
10155 }
10156 dout(10) << "find_object_context " << oid << " @" << oid.snap
10157 << " snapset " << ssc->snapset
10158 << " " << oid << " HIT" << dendl;
10159 *pobc = obc;
10160 put_snapset_context(ssc);
10161 return 0;
10162 }
10163 ceph_abort(); //unreachable
10164 }
10165
10166 dout(10) << "find_object_context " << oid << " @" << oid.snap
10167 << " snapset " << ssc->snapset << dendl;
10168
10169 // head?
10170 if (oid.snap > ssc->snapset.seq) {
10171 if (ssc->snapset.head_exists) {
10172 ObjectContextRef obc = get_object_context(head, false);
10173 dout(10) << "find_object_context " << head
10174 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10175 << " -- HIT " << obc->obs
10176 << dendl;
10177 if (!obc->ssc)
10178 obc->ssc = ssc;
10179 else {
10180 assert(ssc == obc->ssc);
10181 put_snapset_context(ssc);
10182 }
10183 *pobc = obc;
10184 return 0;
10185 }
10186 dout(10) << "find_object_context " << head
10187 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10188 << " but head dne -- DNE"
10189 << dendl;
10190 put_snapset_context(ssc);
10191 return -ENOENT;
10192 }
10193
10194 // which clone would it be?
10195 unsigned k = 0;
10196 while (k < ssc->snapset.clones.size() &&
10197 ssc->snapset.clones[k] < oid.snap)
10198 k++;
10199 if (k == ssc->snapset.clones.size()) {
10200 dout(10) << "find_object_context no clones with last >= oid.snap "
10201 << oid.snap << " -- DNE" << dendl;
10202 put_snapset_context(ssc);
10203 return -ENOENT;
10204 }
10205 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10206 info.pgid.pool(), oid.get_namespace());
10207
10208 if (pg_log.get_missing().is_missing(soid)) {
10209 dout(20) << "find_object_context " << soid << " missing, try again later"
10210 << dendl;
10211 if (pmissing)
10212 *pmissing = soid;
10213 put_snapset_context(ssc);
10214 return -EAGAIN;
10215 }
10216
10217 ObjectContextRef obc = get_object_context(soid, false);
10218 if (!obc || !obc->obs.exists) {
10219 if (pmissing)
10220 *pmissing = soid;
10221 put_snapset_context(ssc);
10222 if (is_degraded_or_backfilling_object(soid)) {
10223 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10224 return -EAGAIN;
10225 } else {
10226 dout(20) << __func__ << " missing clone " << soid << dendl;
10227 return -ENOENT;
10228 }
10229 }
10230
10231 if (!obc->ssc) {
10232 obc->ssc = ssc;
10233 } else {
10234 assert(obc->ssc == ssc);
10235 put_snapset_context(ssc);
10236 }
10237 ssc = 0;
10238
10239 // clone
10240 dout(20) << "find_object_context " << soid
10241 << " snapset " << obc->ssc->snapset
10242 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10243 << dendl;
10244 snapid_t first, last;
10245 if (obc->ssc->snapset.is_legacy()) {
10246 first = obc->obs.oi.legacy_snaps.back();
10247 last = obc->obs.oi.legacy_snaps.front();
10248 } else {
10249 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10250 assert(p != obc->ssc->snapset.clone_snaps.end());
10251 if (p->second.empty()) {
10252 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
10253 assert(!cct->_conf->osd_debug_verify_snaps);
10254 return -ENOENT;
10255 }
10256 first = p->second.back();
10257 last = p->second.front();
10258 }
10259 if (first <= oid.snap) {
10260 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10261 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10262 *pobc = obc;
10263 return 0;
10264 } else {
10265 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10266 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10267 return -ENOENT;
10268 }
10269 }
10270
10271 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10272 {
10273 if (obc->ssc)
10274 put_snapset_context(obc->ssc);
10275 }
10276
10277 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10278 {
10279 object_info_t& oi = obc->obs.oi;
10280
10281 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10282 object_stat_sum_t stat;
10283
10284 stat.num_bytes += oi.size;
10285
10286 if (oi.soid.snap != CEPH_SNAPDIR)
10287 stat.num_objects++;
10288 if (oi.is_dirty())
10289 stat.num_objects_dirty++;
10290 if (oi.is_whiteout())
10291 stat.num_whiteouts++;
10292 if (oi.is_omap())
10293 stat.num_objects_omap++;
10294 if (oi.is_cache_pinned())
10295 stat.num_objects_pinned++;
10296
10297 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10298 stat.num_object_clones++;
10299
10300 if (!obc->ssc)
10301 obc->ssc = get_snapset_context(oi.soid, false);
10302 assert(obc->ssc);
10303
10304 // subtract off clone overlap
10305 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10306 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10307 for (interval_set<uint64_t>::const_iterator r = o.begin();
10308 r != o.end();
10309 ++r) {
10310 stat.num_bytes -= r.get_len();
10311 }
10312 }
10313 }
10314
10315 // add it in
10316 pgstat->stats.sum.add(stat);
10317 }
10318
10319 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10320 {
10321 const hobject_t& soid = obc->obs.oi.soid;
10322 if (obc->is_blocked()) {
10323 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10324 return;
10325 }
10326
10327 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10328 if (p != waiting_for_blocked_object.end()) {
10329 list<OpRequestRef>& ls = p->second;
10330 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10331 requeue_ops(ls);
10332 waiting_for_blocked_object.erase(p);
10333 }
10334
10335 map<hobject_t, ObjectContextRef>::iterator i =
10336 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10337 if (i != objects_blocked_on_snap_promotion.end()) {
10338 assert(i->second == obc);
10339 objects_blocked_on_snap_promotion.erase(i);
10340 }
10341
10342 if (obc->requeue_scrub_on_unblock) {
10343 obc->requeue_scrub_on_unblock = false;
10344 requeue_scrub();
10345 }
10346 }
10347
10348 SnapSetContext *PrimaryLogPG::get_snapset_context(
10349 const hobject_t& oid,
10350 bool can_create,
10351 const map<string, bufferlist> *attrs,
10352 bool oid_existed)
10353 {
10354 Mutex::Locker l(snapset_contexts_lock);
10355 SnapSetContext *ssc;
10356 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10357 oid.get_snapdir());
10358 if (p != snapset_contexts.end()) {
10359 if (can_create || p->second->exists) {
10360 ssc = p->second;
10361 } else {
10362 return NULL;
10363 }
10364 } else {
10365 bufferlist bv;
10366 if (!attrs) {
10367 int r = -ENOENT;
10368 if (!(oid.is_head() && !oid_existed))
10369 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10370 if (r < 0) {
10371 // try _snapset
10372 if (!(oid.is_snapdir() && !oid_existed))
10373 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10374 if (r < 0 && !can_create)
10375 return NULL;
10376 }
10377 } else {
10378 assert(attrs->count(SS_ATTR));
10379 bv = attrs->find(SS_ATTR)->second;
10380 }
10381 ssc = new SnapSetContext(oid.get_snapdir());
10382 _register_snapset_context(ssc);
10383 if (bv.length()) {
10384 bufferlist::iterator bvp = bv.begin();
10385 try {
10386 ssc->snapset.decode(bvp);
10387 } catch (buffer::error& e) {
10388 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10389 return NULL;
10390 }
10391 ssc->exists = true;
10392 } else {
10393 ssc->exists = false;
10394 }
10395 }
10396 assert(ssc);
10397 ssc->ref++;
10398 return ssc;
10399 }
10400
10401 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10402 {
10403 Mutex::Locker l(snapset_contexts_lock);
10404 --ssc->ref;
10405 if (ssc->ref == 0) {
10406 if (ssc->registered)
10407 snapset_contexts.erase(ssc->oid);
10408 delete ssc;
10409 }
10410 }
10411
10412 /** pull - request object from a peer
10413 */
10414
10415 /*
10416 * Return values:
10417 * NONE - didn't pull anything
10418 * YES - pulled what the caller wanted
10419 * OTHER - needed to pull something else first (_head or _snapdir)
10420 */
10421 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10422
10423 int PrimaryLogPG::recover_missing(
10424 const hobject_t &soid, eversion_t v,
10425 int priority,
10426 PGBackend::RecoveryHandle *h)
10427 {
10428 if (missing_loc.is_unfound(soid)) {
10429 dout(7) << "pull " << soid
10430 << " v " << v
10431 << " but it is unfound" << dendl;
10432 return PULL_NONE;
10433 }
10434
10435 if (missing_loc.is_deleted(soid)) {
10436 start_recovery_op(soid);
10437 assert(!recovering.count(soid));
10438 recovering.insert(make_pair(soid, ObjectContextRef()));
10439 epoch_t cur_epoch = get_osdmap()->get_epoch();
10440 remove_missing_object(soid, v, new FunctionContext(
10441 [=](int) {
10442 lock();
10443 if (!pg_has_reset_since(cur_epoch)) {
10444 bool object_missing = false;
10445 for (const auto& shard : actingbackfill) {
10446 if (shard == pg_whoami)
10447 continue;
10448 if (peer_missing[shard].is_missing(soid)) {
10449 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10450 object_missing = true;
10451 break;
10452 }
10453 }
10454 if (!object_missing) {
10455 object_stat_sum_t stat_diff;
10456 stat_diff.num_objects_recovered = 1;
10457 on_global_recover(soid, stat_diff, true);
10458 } else {
10459 auto recovery_handle = pgbackend->open_recovery_op();
10460 pgbackend->recover_delete_object(soid, v, recovery_handle);
10461 pgbackend->run_recovery_op(recovery_handle, priority);
10462 }
10463 }
10464 unlock();
10465 }));
10466 return PULL_YES;
10467 }
10468
10469 // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10470 ObjectContextRef obc;
10471 ObjectContextRef head_obc;
10472 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10473 // do we have the head and/or snapdir?
10474 hobject_t head = soid.get_head();
10475 if (pg_log.get_missing().is_missing(head)) {
10476 if (recovering.count(head)) {
10477 dout(10) << " missing but already recovering head " << head << dendl;
10478 return PULL_NONE;
10479 } else {
10480 int r = recover_missing(
10481 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10482 h);
10483 if (r != PULL_NONE)
10484 return PULL_OTHER;
10485 return PULL_NONE;
10486 }
10487 }
10488 head = soid.get_snapdir();
10489 if (pg_log.get_missing().is_missing(head)) {
10490 if (recovering.count(head)) {
10491 dout(10) << " missing but already recovering snapdir " << head << dendl;
10492 return PULL_NONE;
10493 } else {
10494 int r = recover_missing(
10495 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10496 h);
10497 if (r != PULL_NONE)
10498 return PULL_OTHER;
10499 return PULL_NONE;
10500 }
10501 }
10502
10503 // we must have one or the other
10504 head_obc = get_object_context(
10505 soid.get_head(),
10506 false,
10507 0);
10508 if (!head_obc)
10509 head_obc = get_object_context(
10510 soid.get_snapdir(),
10511 false,
10512 0);
10513 assert(head_obc);
10514 }
10515 start_recovery_op(soid);
10516 assert(!recovering.count(soid));
10517 recovering.insert(make_pair(soid, obc));
10518 int r = pgbackend->recover_object(
10519 soid,
10520 v,
10521 head_obc,
10522 obc,
10523 h);
10524 // This is only a pull which shouldn't return an error
10525 assert(r >= 0);
10526 return PULL_YES;
10527 }
10528
10529 void PrimaryLogPG::send_remove_op(
10530 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10531 {
10532 ceph_tid_t tid = osd->get_tid();
10533 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10534
10535 dout(10) << "send_remove_op " << oid << " from osd." << peer
10536 << " tid " << tid << dendl;
10537
10538 MOSDSubOp *subop = new MOSDSubOp(
10539 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10540 oid, CEPH_OSD_FLAG_ACK,
10541 get_osdmap()->get_epoch(), tid, v);
10542 subop->ops = vector<OSDOp>(1);
10543 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10544
10545 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10546 }
10547
10548 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10549 eversion_t v, Context *on_complete)
10550 {
10551 dout(20) << __func__ << " " << soid << " " << v << dendl;
10552 assert(on_complete != nullptr);
10553 // delete locally
10554 ObjectStore::Transaction t;
10555 remove_snap_mapped_object(t, soid);
10556
10557 ObjectRecoveryInfo recovery_info;
10558 recovery_info.soid = soid;
10559 recovery_info.version = v;
10560
10561 epoch_t cur_epoch = get_osdmap()->get_epoch();
10562 t.register_on_complete(new FunctionContext(
10563 [=](int) {
10564 lock();
10565 if (!pg_has_reset_since(cur_epoch)) {
10566 ObjectStore::Transaction t2;
10567 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10568 t2.register_on_complete(on_complete);
10569 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10570 assert(r == 0);
10571 unlock();
10572 } else {
10573 unlock();
10574 on_complete->complete(-EAGAIN);
10575 }
10576 }));
10577 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10578 assert(r == 0);
10579 }
10580
10581 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10582 {
10583 dout(10) << "finish_degraded_object " << oid << dendl;
10584 if (callbacks_for_degraded_object.count(oid)) {
10585 list<Context*> contexts;
10586 contexts.swap(callbacks_for_degraded_object[oid]);
10587 callbacks_for_degraded_object.erase(oid);
10588 for (list<Context*>::iterator i = contexts.begin();
10589 i != contexts.end();
10590 ++i) {
10591 (*i)->complete(0);
10592 }
10593 }
10594 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10595 oid.get_head());
10596 if (i != objects_blocked_on_degraded_snap.end() &&
10597 i->second == oid.snap)
10598 objects_blocked_on_degraded_snap.erase(i);
10599 }
10600
10601 void PrimaryLogPG::_committed_pushed_object(
10602 epoch_t epoch, eversion_t last_complete)
10603 {
10604 lock();
10605 if (!pg_has_reset_since(epoch)) {
10606 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10607 last_complete_ondisk = last_complete;
10608
10609 if (last_complete_ondisk == info.last_update) {
10610 if (!is_primary()) {
10611 // We are either a replica or a backfill target,
10612 // and we are fully up to date; tell the primary!
10613 osd->send_message_osd_cluster(
10614 get_primary().osd,
10615 new MOSDPGTrim(
10616 get_osdmap()->get_epoch(),
10617 spg_t(info.pgid.pgid, get_primary().shard),
10618 last_complete_ondisk),
10619 get_osdmap()->get_epoch());
10620 } else {
10621 calc_min_last_complete_ondisk();
10622 }
10623 }
10624
10625 } else {
10626 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10627 }
10628
10629 unlock();
10630 }
10631
10632 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10633 {
10634 lock();
10635 dout(20) << __func__ << dendl;
10636 if (obc) {
10637 dout(20) << "obc = " << *obc << dendl;
10638 }
10639 assert(active_pushes >= 1);
10640 --active_pushes;
10641
10642 // requeue an active chunky scrub waiting on recovery ops
10643 if (!deleting && active_pushes == 0
10644 && scrubber.is_chunky_scrub_active()) {
10645 if (ops_blocked_by_scrub()) {
10646 requeue_scrub(true);
10647 } else {
10648 requeue_scrub(false);
10649 }
10650 }
10651 unlock();
10652 }
10653
10654 void PrimaryLogPG::_applied_recovered_object_replica()
10655 {
10656 lock();
10657 dout(20) << __func__ << dendl;
10658 assert(active_pushes >= 1);
10659 --active_pushes;
10660
10661 // requeue an active chunky scrub waiting on recovery ops
10662 if (!deleting && active_pushes == 0 &&
10663 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10664 scrubber.active_rep_scrub->get_req())->chunky) {
10665 osd->enqueue_back(
10666 info.pgid,
10667 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10668 scrubber.active_rep_scrub = OpRequestRef();
10669 }
10670 unlock();
10671 }
10672
10673 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10674 {
10675 dout(10) << "got missing " << oid << " v " << v << dendl;
10676 pg_log.recover_got(oid, v, info);
10677 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10678 dout(10) << "last_complete now " << info.last_complete
10679 << " log.complete_to " << pg_log.get_log().complete_to->version
10680 << dendl;
10681 } else {
10682 dout(10) << "last_complete now " << info.last_complete
10683 << " log.complete_to at end" << dendl;
10684 //below is not true in the repair case.
10685 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10686 assert(info.last_complete == info.last_update);
10687 }
10688 }
10689
10690 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10691 {
10692 list<pg_shard_t> fl = { pg_whoami };
10693 failed_push(fl, soid);
10694 }
10695
10696 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10697 {
10698 dout(20) << __func__ << ": " << soid << dendl;
10699 assert(recovering.count(soid));
10700 auto obc = recovering[soid];
10701 if (obc) {
10702 list<OpRequestRef> blocked_ops;
10703 obc->drop_recovery_read(&blocked_ops);
10704 requeue_ops(blocked_ops);
10705 }
10706 recovering.erase(soid);
10707 for (auto&& i : from)
10708 missing_loc.remove_location(soid, i);
10709 dout(0) << __func__ << " " << soid << " from shard " << from
10710 << ", reps on " << missing_loc.get_locations(soid)
10711 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10712 finish_recovery_op(soid); // close out this attempt
10713 }
10714
10715 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10716 {
10717 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10718 assert(m->get_type() == MSG_OSD_SUBOP);
10719 dout(7) << "sub_op_remove " << m->poid << dendl;
10720
10721 op->mark_started();
10722
10723 ObjectStore::Transaction t;
10724 remove_snap_mapped_object(t, m->poid);
10725 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10726 assert(r == 0);
10727 }
10728
10729 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10730 {
10731 eversion_t v;
10732 pg_missing_item pmi;
10733 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10734 assert(is_missing);
10735 v = pmi.have;
10736 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10737
10738 assert(!actingbackfill.empty());
10739 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10740 i != actingbackfill.end();
10741 ++i) {
10742 if (*i == get_primary()) continue;
10743 pg_shard_t peer = *i;
10744 if (!peer_missing[peer].is_missing(oid)) {
10745 continue;
10746 }
10747 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10748 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10749 if (h > v)
10750 v = h;
10751 }
10752
10753 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10754 return v;
10755 }
10756
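// Replica-side handler for MOSDPGUpdateLogMissing (the >= jewel path of
// submit_log_entries() above): apply the entries to our log and missing
// set, then reply with our last_complete once the local transaction
// commits (with a pre-kraken compatibility quirk handled below).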
10757 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10758 {
10759 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10760 op->get_req());
10761 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10762 ObjectStore::Transaction t;
10763 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
10764 if (m->pg_trim_to != eversion_t())
10765 op_trim_to = m->pg_trim_to;
10766 if (m->pg_roll_forward_to != eversion_t())
10767 op_roll_forward_to = m->pg_roll_forward_to;
10768
10769 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
10770
10771 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
10772 eversion_t new_lcod = info.last_complete;
10773
10774 Context *complete = new FunctionContext(
10775 [=](int) {
10776 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10777 op->get_req());
10778 lock();
10779 if (!pg_has_reset_since(msg->get_epoch())) {
10780 update_last_complete_ondisk(new_lcod);
10781 MOSDPGUpdateLogMissingReply *reply =
10782 new MOSDPGUpdateLogMissingReply(
10783 spg_t(info.pgid.pgid, primary_shard().shard),
10784 pg_whoami.shard,
10785 msg->get_epoch(),
10786 msg->min_epoch,
10787 msg->get_tid(),
10788 new_lcod);
10789 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10790 msg->get_connection()->send_message(reply);
10791 }
10792 unlock();
10793 });
10794
10795 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10796 t.register_on_commit(complete);
10797 } else {
10798 /* Hack to work around the fact that ReplicatedBackend sends
10799 * ack+commit if commit happens first
10800 *
10801 * This behavior is no longer necessary, but we preserve it so old
10802 * primaries can keep their repops in order */
10803 if (pool.info.ec_pool()) {
10804 t.register_on_complete(complete);
10805 } else {
10806 t.register_on_commit(complete);
10807 }
10808 }
10809 t.register_on_applied(
10810 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10811 int tr = osd->store->queue_transaction(
10812 osr.get(),
10813 std::move(t),
10814 nullptr);
10815 assert(tr == 0);
10816 }
10817
10818 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10819 {
10820 const MOSDPGUpdateLogMissingReply *m =
10821 static_cast<const MOSDPGUpdateLogMissingReply*>(
10822 op->get_req());
10823 dout(20) << __func__ << " got reply from "
10824 << m->get_from() << dendl;
10825
10826 auto it = log_entry_update_waiting_on.find(m->get_tid());
10827 if (it != log_entry_update_waiting_on.end()) {
10828 if (it->second.waiting_on.count(m->get_from())) {
10829 it->second.waiting_on.erase(m->get_from());
10830 if (m->last_complete_ondisk != eversion_t()) {
10831 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
10832 }
10833 } else {
10834 osd->clog->error()
10835 << info.pgid << " got reply "
10836 << *m << " from shard we are not waiting for "
10837 << m->get_from();
10838 }
10839
10840 if (it->second.waiting_on.empty()) {
10841 repop_all_committed(it->second.repop.get());
10842 log_entry_update_waiting_on.erase(it);
10843 }
10844 } else {
10845 osd->clog->error()
10846 << info.pgid << " got reply "
10847 << *m << " on unknown tid " << m->get_tid();
10848 }
10849 }
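
/* A sketch of the log-only update round trip implemented above (helpers
 * outside this section paraphrased):
 *
 *   primary:  submit_log_entries() sends MOSDPGUpdateLogMissing to each
 *             replica and records the tid in log_entry_update_waiting_on
 *   replica:  do_update_log_missing() applies the entries and answers with
 *             MOSDPGUpdateLogMissingReply once the transaction commits
 *   primary:  do_update_log_missing_reply() erases the sender from
 *             waiting_on; when the set drains, repop_all_committed() fires
 */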
10850
10851 /* Mark all unfound objects as lost.
10852 */
10853 void PrimaryLogPG::mark_all_unfound_lost(
10854 int what,
10855 ConnectionRef con,
10856 ceph_tid_t tid)
10857 {
10858 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10859 list<hobject_t> oids;
10860
10861 dout(30) << __func__ << ": log before:\n";
10862 pg_log.get_log().print(*_dout);
10863 *_dout << dendl;
10864
10865 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10866
10867 utime_t mtime = ceph_clock_now();
10868 map<hobject_t, pg_missing_item>::const_iterator m =
10869 missing_loc.get_needs_recovery().begin();
10870 map<hobject_t, pg_missing_item>::const_iterator mend =
10871 missing_loc.get_needs_recovery().end();
10872
10873 ObcLockManager manager;
10874 eversion_t v = get_next_version();
10875 v.epoch = get_osdmap()->get_epoch();
10876 uint64_t num_unfound = missing_loc.num_unfound();
10877 while (m != mend) {
10878 const hobject_t &oid(m->first);
10879 if (!missing_loc.is_unfound(oid)) {
10880 // We only care about unfound objects
10881 ++m;
10882 continue;
10883 }
10884
10885 ObjectContextRef obc;
10886 eversion_t prev;
10887
10888 switch (what) {
10889 case pg_log_entry_t::LOST_MARK:
10890 assert(0 == "actually, not implemented yet!");
10891 break;
10892
10893 case pg_log_entry_t::LOST_REVERT:
10894 prev = pick_newest_available(oid);
10895 if (prev > eversion_t()) {
10896 // log it
10897 pg_log_entry_t e(
10898 pg_log_entry_t::LOST_REVERT, oid, v,
10899 m->second.need, 0, osd_reqid_t(), mtime, 0);
10900 e.reverting_to = prev;
10901 e.mark_unrollbackable();
10902 log_entries.push_back(e);
10903 dout(10) << e << dendl;
10904
10905 // we are now missing the new version; recovery code will sort it out.
10906 ++v.version;
10907 ++m;
10908 break;
10909 }
10910
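      // fall through: pick_newest_available() found no prior version on
      // any shard, so handle the revert as a LOST_DELETE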
10911 case pg_log_entry_t::LOST_DELETE:
10912 {
10913 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10914 0, osd_reqid_t(), mtime, 0);
10915 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10916 if (pool.info.require_rollback()) {
10917 e.mod_desc.try_rmobject(v.version);
10918 } else {
10919 e.mark_unrollbackable();
10920 }
10921 } // otherwise, just do what we used to do
10922 dout(10) << e << dendl;
10923 log_entries.push_back(e);
10924 oids.push_back(oid);
10925
10926 // If a context is found, mark the object as deleted to guard
10927 // against racing with a new creation. This can happen if the
10928 // object was lost and the primary hit EIO.
10929 obc = object_contexts.lookup(oid);
10930 if (obc)
10931 obc->obs.exists = false;
10932
10933 ++v.version;
10934 ++m;
10935 }
10936 break;
10937
10938 default:
10939 ceph_abort();
10940 }
10941 }
10942
10943 info.stats.stats_invalid = true;
10944
10945 submit_log_entries(
10946 log_entries,
10947 std::move(manager),
10948 boost::optional<std::function<void(void)> >(
10949 [this, oids, con, num_unfound, tid]() {
10950 if (perform_deletes_during_peering()) {
10951 for (auto oid : oids) {
10952 // clear old locations - merge_new_log_entries will have
10953 // handled rebuilding missing_loc for each of these
10954 // objects if we have the RECOVERY_DELETES flag
10955 missing_loc.recovered(oid);
10956 }
10957 }
10958
10959 if (is_recovery_unfound()) {
10960 queue_peering_event(
10961 CephPeeringEvtRef(
10962 std::make_shared<CephPeeringEvt>(
10963 get_osdmap()->get_epoch(),
10964 get_osdmap()->get_epoch(),
10965 DoRecovery())));
10966 } else if (is_backfill_unfound()) {
10967 queue_peering_event(
10968 CephPeeringEvtRef(
10969 std::make_shared<CephPeeringEvt>(
10970 get_osdmap()->get_epoch(),
10971 get_osdmap()->get_epoch(),
10972 RequestBackfill())));
10973 } else {
10974 queue_recovery();
10975 }
10976
10977 stringstream ss;
10978 ss << "pg has " << num_unfound
10979 << " objects unfound and apparently lost marking";
10980 string rs = ss.str();
10981 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10982 osd->clog->info() << rs;
10983 if (con) {
10984 MCommandReply *reply = new MCommandReply(0, rs);
10985 reply->set_tid(tid);
10986 con->send_message(reply);
10987 }
10988 }),
10989 OpRequestRef());
10990 }
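
/* This path is normally reached from the admin command
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 * (note the "do_command" dout above); the reply travels back over the
 * Connection captured in `con`.
 */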
10991
10992 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10993 {
10994 assert(repop_queue.empty());
10995 }
10996
10997 /*
10998 * pg status change notification
10999 */
11000
11001 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11002 {
11003 list<OpRequestRef> rq;
11004
11005 // apply all repops
11006 while (!repop_queue.empty()) {
11007 RepGather *repop = repop_queue.front();
11008 repop_queue.pop_front();
11009 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11010 repop->rep_aborted = true;
11011 repop->on_applied.clear();
11012 repop->on_committed.clear();
11013 repop->on_success.clear();
11014
11015 if (requeue) {
11016 if (repop->op) {
11017 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11018 rq.push_back(repop->op);
11019 repop->op = OpRequestRef();
11020 }
11021
11022 // also requeue any dups, interleaved into position
11023 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
11024 waiting_for_ondisk.find(repop->v);
11025 if (p != waiting_for_ondisk.end()) {
11026 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11027 for (list<pair<OpRequestRef, version_t> >::iterator i =
11028 p->second.begin();
11029 i != p->second.end();
11030 ++i) {
11031 rq.push_back(i->first);
11032 }
11033 waiting_for_ondisk.erase(p);
11034 }
11035 }
11036
11037 remove_repop(repop);
11038 }
11039
11040 assert(repop_queue.empty());
11041
11042 if (requeue) {
11043 requeue_ops(rq);
11044 if (!waiting_for_ondisk.empty()) {
11045 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
11046 waiting_for_ondisk.begin();
11047 i != waiting_for_ondisk.end();
11048 ++i) {
11049 for (list<pair<OpRequestRef, version_t> >::iterator j =
11050 i->second.begin();
11051 j != i->second.end();
11052 ++j) {
11053 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
11054 << i->first << dendl;
11055 }
11056 }
11057 assert(waiting_for_ondisk.empty());
11058 }
11059 }
11060
11061 waiting_for_ondisk.clear();
11062 }
11063
11064 void PrimaryLogPG::on_flushed()
11065 {
11066 assert(flushes_in_progress > 0);
11067 flushes_in_progress--;
11068 if (flushes_in_progress == 0) {
11069 requeue_ops(waiting_for_flush);
11070 }
11071 if (!is_peered() || !is_primary()) {
11072 pair<hobject_t, ObjectContextRef> i;
11073 while (object_contexts.get_next(i.first, &i)) {
11074 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
11075 }
11076 assert(object_contexts.empty());
11077 }
11078 pgbackend->on_flushed();
11079 }
11080
11081 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
11082 {
11083 dout(10) << "on_removal" << dendl;
11084
11085 // adjust info to backfill
11086 info.set_last_backfill(hobject_t());
11087 pg_log.reset_backfill();
11088 dirty_info = true;
11089
11090
11091 // clear log
11092 PGLogEntryHandler rollbacker{this, t};
11093 pg_log.roll_forward(&rollbacker);
11094
11095 write_if_dirty(*t);
11096
11097 if (!deleting)
11098 on_shutdown();
11099 }
11100
11101 void PrimaryLogPG::clear_async_reads()
11102 {
11103 dout(10) << __func__ << dendl;
11104 for(auto& i : in_progress_async_reads) {
11105 dout(10) << "clear ctx: "
11106 << "OpRequestRef " << i.first
11107 << " OpContext " << i.second
11108 << dendl;
11109 close_op_ctx(i.second);
11110 }
11111 }
11112
11113 void PrimaryLogPG::on_shutdown()
11114 {
11115 dout(10) << "on_shutdown" << dendl;
11116
11117 // remove from queues
11118 osd->pg_stat_queue_dequeue(this);
11119 osd->peering_wq.dequeue(this);
11120
11121 // handles queue races
11122 deleting = true;
11123
11124 if (recovery_queued) {
11125 recovery_queued = false;
11126 osd->clear_queued_recovery(this);
11127 }
11128
11129 clear_scrub_reserved();
11130 scrub_clear_state();
11131
11132 unreg_next_scrub();
11133
11134 vector<ceph_tid_t> tids;
11135 cancel_copy_ops(false, &tids);
11136 cancel_flush_ops(false, &tids);
11137 cancel_proxy_ops(false, &tids);
11138 osd->objecter->op_cancel(tids, -ECANCELED);
11139
11140 apply_and_flush_repops(false);
11141 cancel_log_updates();
11142 // we must remove PGRefs, so do this prior to the release_backoffs() callers
11143 clear_backoffs();
11144 // clean up snap trim references
11145 snap_trimmer_machine.process_event(Reset());
11146
11147 pgbackend->on_change();
11148
11149 context_registry_on_change();
11150 object_contexts.clear();
11151
11152 clear_async_reads();
11153
11154 osd->remote_reserver.cancel_reservation(info.pgid);
11155 osd->local_reserver.cancel_reservation(info.pgid);
11156
11157 clear_primary_state();
11158 cancel_recovery();
11159 }
11160
11161 void PrimaryLogPG::on_activate()
11162 {
11163 // all clean?
11164 if (needs_recovery()) {
11165 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11166 queue_peering_event(
11167 CephPeeringEvtRef(
11168 std::make_shared<CephPeeringEvt>(
11169 get_osdmap()->get_epoch(),
11170 get_osdmap()->get_epoch(),
11171 DoRecovery())));
11172 } else if (needs_backfill()) {
11173 dout(10) << "activate queueing backfill" << dendl;
11174 queue_peering_event(
11175 CephPeeringEvtRef(
11176 std::make_shared<CephPeeringEvt>(
11177 get_osdmap()->get_epoch(),
11178 get_osdmap()->get_epoch(),
11179 RequestBackfill())));
11180 } else {
11181 dout(10) << "activate all replicas clean, no recovery" << dendl;
11182 eio_errors_to_process = false;
11183 queue_peering_event(
11184 CephPeeringEvtRef(
11185 std::make_shared<CephPeeringEvt>(
11186 get_osdmap()->get_epoch(),
11187 get_osdmap()->get_epoch(),
11188 AllReplicasRecovered())));
11189 }
11190
11191 publish_stats_to_osd();
11192
11193 if (!backfill_targets.empty()) {
11194 last_backfill_started = earliest_backfill();
11195 new_backfill = true;
11196 assert(!last_backfill_started.is_max());
11197 dout(5) << "on activate: bft=" << backfill_targets
11198 << " from " << last_backfill_started << dendl;
11199 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11200 i != backfill_targets.end();
11201 ++i) {
11202 dout(5) << "target shard " << *i
11203 << " from " << peer_info[*i].last_backfill
11204 << dendl;
11205 }
11206 }
11207
11208 hit_set_setup();
11209 agent_setup();
11210 }
11211
11212 void PrimaryLogPG::_on_new_interval()
11213 {
11214 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11215 if (!pg_log.get_missing().may_include_deletes &&
11216 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11217 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11218 }
11219 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11220 }
11221
11222 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11223 {
11224 dout(10) << "on_change" << dendl;
11225
11226 if (hit_set && hit_set->insert_count() == 0) {
11227 dout(20) << " discarding empty hit_set" << dendl;
11228 hit_set_clear();
11229 }
11230
11231 if (recovery_queued) {
11232 recovery_queued = false;
11233 osd->clear_queued_recovery(this);
11234 }
11235
11236 // requeue everything in the reverse of the order in which it
11237 // should be reexamined.
11238 requeue_ops(waiting_for_peered);
11239 requeue_ops(waiting_for_flush);
11240 requeue_ops(waiting_for_active);
11241
11242 clear_scrub_reserved();
11243
11244 vector<ceph_tid_t> tids;
11245 cancel_copy_ops(is_primary(), &tids);
11246 cancel_flush_ops(is_primary(), &tids);
11247 cancel_proxy_ops(is_primary(), &tids);
11248 osd->objecter->op_cancel(tids, -ECANCELED);
11249
11250 // requeue object waiters
11251 for (auto& p : waiting_for_unreadable_object) {
11252 release_backoffs(p.first);
11253 }
11254 if (is_primary()) {
11255 requeue_object_waiters(waiting_for_unreadable_object);
11256 } else {
11257 waiting_for_unreadable_object.clear();
11258 }
11259 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11260 p != waiting_for_degraded_object.end();
11261 waiting_for_degraded_object.erase(p++)) {
11262 release_backoffs(p->first);
11263 if (is_primary())
11264 requeue_ops(p->second);
11265 else
11266 p->second.clear();
11267 finish_degraded_object(p->first);
11268 }
11269
11270 // requeues waiting_for_scrub
11271 scrub_clear_state();
11272
11273 for (auto p = waiting_for_blocked_object.begin();
11274 p != waiting_for_blocked_object.end();
11275 waiting_for_blocked_object.erase(p++)) {
11276 if (is_primary())
11277 requeue_ops(p->second);
11278 else
11279 p->second.clear();
11280 }
11281 for (auto i = callbacks_for_degraded_object.begin();
11282 i != callbacks_for_degraded_object.end();
11283 ) {
11284 finish_degraded_object((i++)->first);
11285 }
11286 assert(callbacks_for_degraded_object.empty());
11287
11288 if (is_primary()) {
11289 requeue_ops(waiting_for_cache_not_full);
11290 } else {
11291 waiting_for_cache_not_full.clear();
11292 }
11293 objects_blocked_on_cache_full.clear();
11294
11295 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11296 in_progress_async_reads.begin();
11297 i != in_progress_async_reads.end();
11298 in_progress_async_reads.erase(i++)) {
11299 close_op_ctx(i->second);
11300 if (is_primary())
11301 requeue_op(i->first);
11302 }
11303
11304 // this will requeue ops we were working on but didn't finish, and
11305 // any dups
11306 apply_and_flush_repops(is_primary());
11307 cancel_log_updates();
11308
11309 // do this *after* apply_and_flush_repops so that we catch any newly
11310 // registered watches.
11311 context_registry_on_change();
11312
11313 pgbackend->on_change_cleanup(t);
11314 scrubber.cleanup_store(t);
11315 pgbackend->on_change();
11316
11317 // clear snap_trimmer state
11318 snap_trimmer_machine.process_event(Reset());
11319
11320 debug_op_order.clear();
11321 unstable_stats.clear();
11322
11323 // we don't want to cache object_contexts through the interval change
11324 // NOTE: we actually assert that all currently live references are dead
11325 // by the time the flush for the next interval completes.
11326 object_contexts.clear();
11327
11328 // should have been cleared above by finishing all of the degraded objects
11329 assert(objects_blocked_on_degraded_snap.empty());
11330 }
11331
11332 void PrimaryLogPG::on_role_change()
11333 {
11334 dout(10) << "on_role_change" << dendl;
11335 if (get_role() != 0 && hit_set) {
11336 dout(10) << " clearing hit set" << dendl;
11337 hit_set_clear();
11338 }
11339 }
11340
11341 void PrimaryLogPG::on_pool_change()
11342 {
11343 dout(10) << __func__ << dendl;
11344 // requeue cache full waiters just in case the cache_mode is
11345 // changing away from writeback mode. note that if we are not
11346 // active the normal requeuing machinery is sufficient (and properly
11347 // ordered).
11348 if (is_active() &&
11349 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11350 !waiting_for_cache_not_full.empty()) {
11351 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11352 << dendl;
11353 requeue_ops(waiting_for_cache_not_full);
11354 objects_blocked_on_cache_full.clear();
11355 }
11356 hit_set_setup();
11357 agent_setup();
11358 }
11359
11360 // clear state. called on recovery completion AND cancellation.
11361 void PrimaryLogPG::_clear_recovery_state()
11362 {
11363 missing_loc.clear();
11364 #ifdef DEBUG_RECOVERY_OIDS
11365 recovering_oids.clear();
11366 #endif
11367 last_backfill_started = hobject_t();
11368 set<hobject_t>::iterator i = backfills_in_flight.begin();
11369 while (i != backfills_in_flight.end()) {
11370 assert(recovering.count(*i));
11371 backfills_in_flight.erase(i++);
11372 }
11373
11374 list<OpRequestRef> blocked_ops;
11375 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11376 i != recovering.end();
11377 recovering.erase(i++)) {
11378 if (i->second) {
11379 i->second->drop_recovery_read(&blocked_ops);
11380 requeue_ops(blocked_ops);
11381 }
11382 }
11383 assert(backfills_in_flight.empty());
11384 pending_backfill_updates.clear();
11385 assert(recovering.empty());
11386 pgbackend->clear_recovery_state();
11387 }
11388
11389 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11390 {
11391 dout(20) << __func__ << ": " << soid << dendl;
11392 assert(recovering.count(soid));
11393 ObjectContextRef obc = recovering[soid];
11394 if (obc) {
11395 list<OpRequestRef> blocked_ops;
11396 obc->drop_recovery_read(&blocked_ops);
11397 requeue_ops(blocked_ops);
11398 }
11399 recovering.erase(soid);
11400 finish_recovery_op(soid);
11401 release_backoffs(soid);
11402 if (waiting_for_degraded_object.count(soid)) {
11403 dout(20) << " kicking degraded waiters on " << soid << dendl;
11404 requeue_ops(waiting_for_degraded_object[soid]);
11405 waiting_for_degraded_object.erase(soid);
11406 }
11407 if (waiting_for_unreadable_object.count(soid)) {
11408 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11409 requeue_ops(waiting_for_unreadable_object[soid]);
11410 waiting_for_unreadable_object.erase(soid);
11411 }
11412 if (is_missing_object(soid))
11413 pg_log.set_last_requested(0); // get recover_primary to start over
11414 finish_degraded_object(soid);
11415 }
11416
11417 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11418 {
11419 /*
11420 * check that any peers we are planning to pull (or are currently
11421 * pulling) objects from are dealt with.
11422 */
11423 missing_loc.check_recovery_sources(osdmap);
11424 pgbackend->check_recovery_sources(osdmap);
11425
11426 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11427 i != peer_log_requested.end();
11428 ) {
11429 if (!osdmap->is_up(i->osd)) {
11430 dout(10) << "peer_log_requested removing " << *i << dendl;
11431 peer_log_requested.erase(i++);
11432 } else {
11433 ++i;
11434 }
11435 }
11436
11437 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11438 i != peer_missing_requested.end();
11439 ) {
11440 if (!osdmap->is_up(i->osd)) {
11441 dout(10) << "peer_missing_requested removing " << *i << dendl;
11442 peer_missing_requested.erase(i++);
11443 } else {
11444 ++i;
11445 }
11446 }
11447 }
11448
11449 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11450 {
11451 set<pg_shard_t> now_down;
11452 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11453 p != missing_loc_sources.end();
11454 ) {
11455 if (osdmap->is_up(p->osd)) {
11456 ++p;
11457 continue;
11458 }
11459 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11460 now_down.insert(*p);
11461 missing_loc_sources.erase(p++);
11462 }
11463
11464 if (now_down.empty()) {
11465 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11466 } else {
11467 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11468 << missing_loc_sources << dendl;
11469
11470 // filter missing_loc
11471 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11472 while (p != missing_loc.end()) {
11473 set<pg_shard_t>::iterator q = p->second.begin();
11474 while (q != p->second.end())
11475 if (now_down.count(*q)) {
11476 p->second.erase(q++);
11477 } else {
11478 ++q;
11479 }
11480 if (p->second.empty())
11481 missing_loc.erase(p++);
11482 else
11483 ++p;
11484 }
11485 }
11486 }
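
// Note: filtering missing_loc above can leave an object with no known
// location at all; such objects count as unfound until another source for
// them is discovered.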
11487
11488
11489 bool PrimaryLogPG::start_recovery_ops(
11490 uint64_t max,
11491 ThreadPool::TPHandle &handle,
11492 uint64_t *ops_started)
11493 {
11494 uint64_t& started = *ops_started;
11495 started = 0;
11496 bool work_in_progress = false;
11497 assert(is_primary());
11498
11499 if (!state_test(PG_STATE_RECOVERING) &&
11500 !state_test(PG_STATE_BACKFILLING)) {
11501 /* TODO: I think this case is broken and will make do_recovery()
11502 * unhappy since we're returning false */
11503 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11504 return false;
11505 }
11506
11507 const auto &missing = pg_log.get_missing();
11508
11509 unsigned int num_missing = missing.num_missing();
11510 uint64_t num_unfound = get_num_unfound();
11511
11512 if (num_missing == 0) {
11513 info.last_complete = info.last_update;
11514 }
11515
11516 if (num_missing == num_unfound) {
11517 // All of the missing objects we have are unfound.
11518 // Recover the replicas.
11519 started = recover_replicas(max, handle);
11520 }
11521 if (!started) {
11522 // We still have missing objects that we should grab from replicas.
11523 started += recover_primary(max, handle);
11524 }
11525 if (!started && num_unfound != get_num_unfound()) {
11526 // second chance to recover replicas
11527 started = recover_replicas(max, handle);
11528 }
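  // Ordering above: (1) if every locally-missing object is also unfound,
  // push to the replicas first; (2) if nothing was started, pull the
  // primary's own missing objects; (3) if that pass turned up locations
  // for previously unfound objects, retry the replica pushes in the same
  // call.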
11529
11530 if (started)
11531 work_in_progress = true;
11532
11533 bool deferred_backfill = false;
11534 if (recovering.empty() &&
11535 state_test(PG_STATE_BACKFILLING) &&
11536 !backfill_targets.empty() && started < max &&
11537 missing.num_missing() == 0 &&
11538 waiting_on_backfill.empty()) {
11539 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11540 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11541 deferred_backfill = true;
11542 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11543 !is_degraded()) {
11544 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11545 deferred_backfill = true;
11546 } else if (!backfill_reserved) {
11547 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11548 if (!backfill_reserving) {
11549 dout(10) << "queueing RequestBackfill" << dendl;
11550 backfill_reserving = true;
11551 queue_peering_event(
11552 CephPeeringEvtRef(
11553 std::make_shared<CephPeeringEvt>(
11554 get_osdmap()->get_epoch(),
11555 get_osdmap()->get_epoch(),
11556 RequestBackfill())));
11557 }
11558 deferred_backfill = true;
11559 } else {
11560 started += recover_backfill(max - started, handle, &work_in_progress);
11561 }
11562 }
11563
11564 dout(10) << " started " << started << dendl;
11565 osd->logger->inc(l_osd_rop, started);
11566
11567 if (!recovering.empty() ||
11568 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11569 return work_in_progress;
11570
11571 assert(recovering.empty());
11572 assert(recovery_ops_active == 0);
11573
11574 dout(10) << __func__ << " needs_recovery: "
11575 << missing_loc.get_needs_recovery()
11576 << dendl;
11577 dout(10) << __func__ << " missing_loc: "
11578 << missing_loc.get_missing_locs()
11579 << dendl;
11580 int unfound = get_num_unfound();
11581 if (unfound) {
11582 dout(10) << " still have " << unfound << " unfound" << dendl;
11583 return work_in_progress;
11584 }
11585
11586 if (missing.num_missing() > 0) {
11587 // this shouldn't happen!
11588 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11589 << missing.num_missing() << ": " << missing.get_items();
11590 return work_in_progress;
11591 }
11592
11593 if (needs_recovery()) {
11594 // this shouldn't happen!
11595 // We already checked num_missing() so we must have missing replicas
11596 osd->clog->error() << info.pgid
11597 << " Unexpected Error: recovery ending with missing replicas";
11598 return work_in_progress;
11599 }
11600
11601 if (state_test(PG_STATE_RECOVERING)) {
11602 state_clear(PG_STATE_RECOVERING);
11603 state_clear(PG_STATE_FORCED_RECOVERY);
11604 if (needs_backfill()) {
11605 dout(10) << "recovery done, queuing backfill" << dendl;
11606 queue_peering_event(
11607 CephPeeringEvtRef(
11608 std::make_shared<CephPeeringEvt>(
11609 get_osdmap()->get_epoch(),
11610 get_osdmap()->get_epoch(),
11611 RequestBackfill())));
11612 } else {
11613 dout(10) << "recovery done, no backfill" << dendl;
11614 eio_errors_to_process = false;
11615 state_clear(PG_STATE_FORCED_BACKFILL);
11616 queue_peering_event(
11617 CephPeeringEvtRef(
11618 std::make_shared<CephPeeringEvt>(
11619 get_osdmap()->get_epoch(),
11620 get_osdmap()->get_epoch(),
11621 AllReplicasRecovered())));
11622 }
11623 } else { // backfilling
11624 state_clear(PG_STATE_BACKFILLING);
11625 state_clear(PG_STATE_FORCED_BACKFILL);
11626 state_clear(PG_STATE_FORCED_RECOVERY);
11627 dout(10) << "recovery done, backfill done" << dendl;
11628 eio_errors_to_process = false;
11629 queue_peering_event(
11630 CephPeeringEvtRef(
11631 std::make_shared<CephPeeringEvt>(
11632 get_osdmap()->get_epoch(),
11633 get_osdmap()->get_epoch(),
11634 Backfilled())));
11635 }
11636
11637 return false;
11638 }
11639
11640 /**
11641 * start up to max recovery ops on objects missing from the primary.
11642 * return the number of ops started.
11643 */
11644 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11645 {
11646 assert(is_primary());
11647
11648 const auto &missing = pg_log.get_missing();
11649
11650 dout(10) << "recover_primary recovering " << recovering.size()
11651 << " in pg" << dendl;
11652 dout(10) << "recover_primary " << missing << dendl;
11653 dout(25) << "recover_primary " << missing.get_items() << dendl;
11654
11655 // look at log!
11656 pg_log_entry_t *latest = 0;
11657 unsigned started = 0;
11658 int skipped = 0;
11659
11660 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11661 map<version_t, hobject_t>::const_iterator p =
11662 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11663 while (p != missing.get_rmissing().end()) {
11664 handle.reset_tp_timeout();
11665 hobject_t soid;
11666 version_t v = p->first;
11667
11668 if (pg_log.get_log().objects.count(p->second)) {
11669 latest = pg_log.get_log().objects.find(p->second)->second;
11670 assert(latest->is_update() || latest->is_delete());
11671 soid = latest->soid;
11672 } else {
11673 latest = 0;
11674 soid = p->second;
11675 }
11676 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11677 ++p;
11678
11679 hobject_t head = soid.get_head();
11680
11681 eversion_t need = item.need;
11682
11683 dout(10) << "recover_primary "
11684 << soid << " " << item.need
11685 << (missing.is_missing(soid) ? " (missing)":"")
11686 << (missing.is_missing(head) ? " (missing head)":"")
11687 << (recovering.count(soid) ? " (recovering)":"")
11688 << (recovering.count(head) ? " (recovering head)":"")
11689 << dendl;
11690
11691 if (latest) {
11692 switch (latest->op) {
11693 case pg_log_entry_t::CLONE:
11694 /*
11695 * Handling for this special case removed for now, until we
11696 * can correctly construct an accurate SnapSet from the old
11697 * one.
11698 */
11699 break;
11700
11701 case pg_log_entry_t::LOST_REVERT:
11702 {
11703 if (item.have == latest->reverting_to) {
11704 ObjectContextRef obc = get_object_context(soid, true);
11705
11706 if (obc->obs.oi.version == latest->version) {
11707 // I'm already reverting
11708 dout(10) << " already reverting " << soid << dendl;
11709 } else {
11710 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11711 obc->ondisk_write_lock();
11712 obc->obs.oi.version = latest->version;
11713
11714 ObjectStore::Transaction t;
11715 bufferlist b2;
11716 obc->obs.oi.encode(
11717 b2,
11718 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11719 assert(!pool.info.require_rollback());
11720 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11721
11722 recover_got(soid, latest->version);
11723 missing_loc.add_location(soid, pg_whoami);
11724
11725 ++active_pushes;
11726
11727 osd->store->queue_transaction(osr.get(), std::move(t),
11728 new C_OSD_AppliedRecoveredObject(this, obc),
11729 new C_OSD_CommittedPushedObject(
11730 this,
11731 get_osdmap()->get_epoch(),
11732 info.last_complete),
11733 new C_OSD_OndiskWriteUnlock(obc));
11734 continue;
11735 }
11736 } else {
11737 /*
11738 * Pull the old version of the object. Update missing_loc here to have the location
11739 * of the version we want.
11740 *
11741 * This doesn't use the usual missing_loc paths, but that's okay:
11742 * - if we have it locally, we hit the case above, and go from there.
11743 * - if we don't, we always pass through this case during recovery and set up the location
11744 * properly.
11745 * - this way we don't need to mangle the missing code to be general about needing an old
11746 * version...
11747 */
11748 eversion_t alternate_need = latest->reverting_to;
11749 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11750
11751 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11752 p != peer_missing.end();
11753 ++p)
11754 if (p->second.is_missing(soid, need) &&
11755 p->second.get_items().at(soid).have == alternate_need) {
11756 missing_loc.add_location(soid, p->first);
11757 }
11758 dout(10) << " will pull " << alternate_need << " or " << need
11759 << " from one of " << missing_loc.get_locations(soid)
11760 << dendl;
11761 }
11762 }
11763 break;
11764 }
11765 }
11766
11767 if (!recovering.count(soid)) {
11768 if (recovering.count(head)) {
11769 ++skipped;
11770 } else {
11771 int r = recover_missing(
11772 soid, need, get_recovery_op_priority(), h);
11773 switch (r) {
11774 case PULL_YES:
11775 ++started;
11776 break;
11777 case PULL_OTHER:
11778 ++started;
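      // fall through: recovery had to start with a different object
      // (e.g. the head or snapdir), so this one is also counted as
      // skipped and last_requested is not advanced past it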
11779 case PULL_NONE:
11780 ++skipped;
11781 break;
11782 default:
11783 ceph_abort();
11784 }
11785 if (started >= max)
11786 break;
11787 }
11788 }
11789
11790 // only advance last_requested if we haven't skipped anything
11791 if (!skipped)
11792 pg_log.set_last_requested(v);
11793 }
11794
11795 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11796 return started;
11797 }
11798
11799 bool PrimaryLogPG::primary_error(
11800 const hobject_t& soid, eversion_t v)
11801 {
11802 pg_log.missing_add(soid, v, eversion_t());
11803 pg_log.set_last_requested(0);
11804 missing_loc.remove_location(soid, pg_whoami);
11805 bool uhoh = true;
11806 assert(!actingbackfill.empty());
11807 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11808 i != actingbackfill.end();
11809 ++i) {
11810 if (*i == get_primary()) continue;
11811 pg_shard_t peer = *i;
11812 if (!peer_missing[peer].is_missing(soid, v)) {
11813 missing_loc.add_location(soid, peer);
11814 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11815 << ", there should be a copy on shard " << peer << dendl;
11816 uhoh = false;
11817 }
11818 }
11819 if (uhoh)
11820 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11821 else
11822 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11823 << ", will try copies on " << missing_loc.get_locations(soid);
11824 return uhoh;
11825 }
11826
11827 int PrimaryLogPG::prep_object_replica_deletes(
11828 const hobject_t& soid, eversion_t v,
11829 PGBackend::RecoveryHandle *h)
11830 {
11831 assert(is_primary());
11832 dout(10) << __func__ << ": on " << soid << dendl;
11833
11834 start_recovery_op(soid);
11835 assert(!recovering.count(soid));
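  // A null ObjectContextRef is deliberate: delete recovery needs no object
  // data, and the null ref distinguishes this `recovering` entry from a
  // push.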
11836 recovering.insert(make_pair(soid, ObjectContextRef()));
11837
11838 pgbackend->recover_delete_object(soid, v, h);
11839 return 1;
11840 }
11841
11842 int PrimaryLogPG::prep_object_replica_pushes(
11843 const hobject_t& soid, eversion_t v,
11844 PGBackend::RecoveryHandle *h)
11845 {
11846 assert(is_primary());
11847 dout(10) << __func__ << ": on " << soid << dendl;
11848
11849 // NOTE: we know we will get a valid oloc off of disk here.
11850 ObjectContextRef obc = get_object_context(soid, false);
11851 if (!obc) {
11852 primary_error(soid, v);
11853 return 0;
11854 }
11855
11856 if (!obc->get_recovery_read()) {
11857 dout(20) << "recovery delayed on " << soid
11858 << "; could not get rw_manager lock" << dendl;
11859 return 0;
11860 } else {
11861 dout(20) << "recovery got recovery read lock on " << soid
11862 << dendl;
11863 }
11864
11865 start_recovery_op(soid);
11866 assert(!recovering.count(soid));
11867 recovering.insert(make_pair(soid, obc));
11868
11869 /* We need this in case there is an in-progress write on the object. In fact,
11870 * the only possible write is an update to the xattr due to a lost_revert --
11871 * a client write would be blocked since the object is degraded.
11872 * In almost all cases, therefore, this lock should be uncontended.
11873 */
11874 obc->ondisk_read_lock();
11875 int r = pgbackend->recover_object(
11876 soid,
11877 v,
11878 ObjectContextRef(),
11879 obc, // has snapset context
11880 h);
11881 obc->ondisk_read_unlock();
11882 if (r < 0) {
11883 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11884 primary_failed(soid);
11885 primary_error(soid, v);
11886 return 0;
11887 }
11888 return 1;
11889 }
11890
11891 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11892 {
11893 dout(10) << __func__ << "(" << max << ")" << dendl;
11894 uint64_t started = 0;
11895
11896 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11897
11898 // this is FAR from an optimal recovery order. pretty lame, really.
11899 assert(!actingbackfill.empty());
11900 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11901 i != actingbackfill.end();
11902 ++i) {
11903 if (*i == get_primary()) continue;
11904 pg_shard_t peer = *i;
11905 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11906 assert(pm != peer_missing.end());
11907 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11908 assert(pi != peer_info.end());
11909 size_t m_sz = pm->second.num_missing();
11910
11911 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11912 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11913
11914 // oldest first!
11915 const pg_missing_t &m(pm->second);
11916 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11917 p != m.get_rmissing().end() && started < max;
11918 ++p) {
11919 handle.reset_tp_timeout();
11920 const hobject_t soid(p->second);
11921
11922 if (missing_loc.is_unfound(soid)) {
11923 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11924 continue;
11925 }
11926
11927 if (soid > pi->second.last_backfill) {
11928 if (!recovering.count(soid)) {
11929 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11930 derr << __func__ << ": object added to missing set for backfill, but "
11931 << "is not in recovering, error!" << dendl;
11932 ceph_abort();
11933 }
11934 continue;
11935 }
11936
11937 if (recovering.count(soid)) {
11938 dout(10) << __func__ << ": already recovering " << soid << dendl;
11939 continue;
11940 }
11941
11942 if (missing_loc.is_deleted(soid)) {
11943 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11944 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11945 started += prep_object_replica_deletes(soid, r->second.need, h);
11946 continue;
11947 }
11948
11949 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11950 dout(10) << __func__ << ": " << soid.get_head()
11951 << " still missing on primary" << dendl;
11952 continue;
11953 }
11954
11955 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11956 dout(10) << __func__ << ": " << soid.get_snapdir()
11957 << " still missing on primary" << dendl;
11958 continue;
11959 }
11960
11961 if (pg_log.get_missing().is_missing(soid)) {
11962 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11963 continue;
11964 }
11965
11966 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11967 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11968 started += prep_object_replica_pushes(soid, r->second.need,
11969 h);
11970 }
11971 }
11972
11973 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11974 return started;
11975 }
11976
11977 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11978 {
11979 hobject_t e = hobject_t::get_max();
11980 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11981 i != backfill_targets.end();
11982 ++i) {
11983 pg_shard_t peer = *i;
11984 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11985 peer_backfill_info.find(peer);
11986 assert(iter != peer_backfill_info.end());
11987 if (iter->second.begin < e)
11988 e = iter->second.begin;
11989 }
11990 return e;
11991 }
11992
11993 bool PrimaryLogPG::all_peer_done() const
11994 {
11995 // Primary hasn't got any more objects
11996 assert(backfill_info.empty());
11997
11998 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11999 i != backfill_targets.end();
12000 ++i) {
12001 pg_shard_t bt = *i;
12002 map<pg_shard_t, BackfillInterval>::const_iterator piter =
12003 peer_backfill_info.find(bt);
12004 assert(piter != peer_backfill_info.end());
12005 const BackfillInterval& pbi = piter->second;
12006 // See if peer has more to process
12007 if (!pbi.extends_to_end() || !pbi.empty())
12008 return false;
12009 }
12010 return true;
12011 }
12012
12013 /**
12014 * recover_backfill
12015 *
12016 * Invariants:
12017 *
12018 * backfilled: fully pushed to replica or present in replica's missing set (both
12019 * our copy and theirs).
12020 *
12021 * All objects on a backfill_target in
12022 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12023 * objects have been actually deleted and all logically-valid objects are replicated.
12024 * There may be PG objects in this interval yet to be backfilled.
12025 *
12026 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12027 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12028 *
12029 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
12030 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12031 * interval remain on the backfill target.
12032 *
12033 * For a backfill target, all objects <= peer_info[target].last_backfill
12034 * have been backfilled to the target.
12035 *
12036 * There *MAY* be missing/outdated objects between last_backfill_started and
12037 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12038 * io created objects since the last scan. For this reason, we call
12039 * update_range() again before continuing backfill.
12040 */
12041 uint64_t PrimaryLogPG::recover_backfill(
12042 uint64_t max,
12043 ThreadPool::TPHandle &handle, bool *work_started)
12044 {
12045 dout(10) << "recover_backfill (" << max << ")"
12046 << " bft=" << backfill_targets
12047 << " last_backfill_started " << last_backfill_started
12048 << (new_backfill ? " new_backfill":"")
12049 << dendl;
12050 assert(!backfill_targets.empty());
12051
12052 // Initialize from prior backfill state
12053 if (new_backfill) {
12054 // on_activate() was called prior to getting here
12055 assert(last_backfill_started == earliest_backfill());
12056 new_backfill = false;
12057
12058 // initialize BackfillIntervals
12059 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12060 i != backfill_targets.end();
12061 ++i) {
12062 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
12063 }
12064 backfill_info.reset(last_backfill_started);
12065
12066 backfills_in_flight.clear();
12067 pending_backfill_updates.clear();
12068 }
12069
12070 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12071 i != backfill_targets.end();
12072 ++i) {
12073 dout(10) << "peer osd." << *i
12074 << " info " << peer_info[*i]
12075 << " interval " << peer_backfill_info[*i].begin
12076 << "-" << peer_backfill_info[*i].end
12077 << " " << peer_backfill_info[*i].objects.size() << " objects"
12078 << dendl;
12079 }
12080
12081 // update our local interval to cope with recent changes
12082 backfill_info.begin = last_backfill_started;
12083 update_range(&backfill_info, handle);
12084
12085 unsigned ops = 0;
12086 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
12087 set<hobject_t> add_to_stat;
12088
12089 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12090 i != backfill_targets.end();
12091 ++i) {
12092 peer_backfill_info[*i].trim_to(
12093 std::max(peer_info[*i].last_backfill, last_backfill_started));
12094 }
12095 backfill_info.trim_to(last_backfill_started);
12096
12097 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
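  // Each iteration below: (1) extend the local interval if it is
  // exhausted, (2) ask any peer whose interval is exhausted to rescan,
  // (3) compare the earliest peer object against backfill_info.begin --
  // an object the peers hold below our begin no longer exists locally and
  // is queued in to_remove; anything else is pushed to (or replaced on)
  // the targets that need it.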
12098 while (ops < max) {
12099 if (backfill_info.begin <= earliest_peer_backfill() &&
12100 !backfill_info.extends_to_end() && backfill_info.empty()) {
12101 hobject_t next = backfill_info.end;
12102 backfill_info.reset(next);
12103 backfill_info.end = hobject_t::get_max();
12104 update_range(&backfill_info, handle);
12105 backfill_info.trim();
12106 }
12107
12108 dout(20) << " my backfill interval " << backfill_info << dendl;
12109
12110 bool sent_scan = false;
12111 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12112 i != backfill_targets.end();
12113 ++i) {
12114 pg_shard_t bt = *i;
12115 BackfillInterval& pbi = peer_backfill_info[bt];
12116
12117 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
12118 if (pbi.begin <= backfill_info.begin &&
12119 !pbi.extends_to_end() && pbi.empty()) {
12120 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
12121 epoch_t e = get_osdmap()->get_epoch();
12122 MOSDPGScan *m = new MOSDPGScan(
12123 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
12124 spg_t(info.pgid.pgid, bt.shard),
12125 pbi.end, hobject_t());
12126 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12127 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
12128 waiting_on_backfill.insert(bt);
12129 sent_scan = true;
12130 }
12131 }
12132
12133 // Count simultaneous scans as a single op and let those complete
12134 if (sent_scan) {
12135 ops++;
12136 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12137 break;
12138 }
12139
12140 if (backfill_info.empty() && all_peer_done()) {
12141 dout(10) << " reached end for both local and all peers" << dendl;
12142 break;
12143 }
12144
12145 // Get the object within the set of peers to operate on and
12146 // the set of targets to which that object applies.
12147 hobject_t check = earliest_peer_backfill();
12148
12149 if (check < backfill_info.begin) {
12150
12151 set<pg_shard_t> check_targets;
12152 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12153 i != backfill_targets.end();
12154 ++i) {
12155 pg_shard_t bt = *i;
12156 BackfillInterval& pbi = peer_backfill_info[bt];
12157 if (pbi.begin == check)
12158 check_targets.insert(bt);
12159 }
12160 assert(!check_targets.empty());
12161
12162 dout(20) << " BACKFILL removing " << check
12163 << " from peers " << check_targets << dendl;
12164 for (set<pg_shard_t>::iterator i = check_targets.begin();
12165 i != check_targets.end();
12166 ++i) {
12167 pg_shard_t bt = *i;
12168 BackfillInterval& pbi = peer_backfill_info[bt];
12169 assert(pbi.begin == check);
12170
12171 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12172 pbi.pop_front();
12173 }
12174
12175 /* This requires a bit of explanation. We compare head against
12176 * last_backfill to determine whether to send an operation
12177 * to the replica. A single write operation can touch up to three
12178 * objects: head, the snapdir, and a new clone which sorts closer to
12179 * head than any existing clone. If last_backfill points at a clone,
12180 * the transaction won't be sent and all 3 must lie on the right side
12181 * of the line (i.e., we'll backfill them later). If last_backfill
12182 * points at snapdir, it sorts greater than head, so we send the
12183 * transaction which is correct because all three must lie to the left
12184 * of the line.
12185 *
12186 * If it points at head, we have a bit of an issue. If head actually
12187 * exists, no problem, because any transaction which touches snapdir
12188 * must end up creating it (and deleting head), so sending the
12189 * operation won't pose a problem -- we'll end up having to scan it,
12190 * but it'll end up being the right version so we won't bother to
12191 * rebackfill it. However, if head doesn't exist, any write on head
12192 * will remove snapdir. For a replicated pool, this isn't a problem,
12193 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12194 * It only poses a problem for EC pools, because we never just delete
12195 * an object, we rename it into a rollback object. That operation
12196 * will end up crashing the osd with ENOENT. Tolerating the failure
12197 * wouldn't work either, even if snapdir exists, we'd be creating a
12198 * rollback object past the last_backfill line which wouldn't get
12199 * cleaned up (no rollback objects past the last_backfill line is an
12200 * existing important invariant). Thus, let's avoid the whole issue
12201 * by just not updating last_backfill_started here if head doesn't
12202 * exist and snapdir does. We aren't using up a recovery count here,
12203 * so we're going to recover snapdir immediately anyway. We'll only
12204 * fail "backward" if we fail to get the rw lock and that just means
12205 * we'll re-process this section of the hash space again.
12206 *
12207 * I'm choosing this hack here because the really "correct" answer is
12208 * going to be to unify snapdir and head into a single object (a
12209 * snapdir is really just a confusing way to talk about head existing
12210 * as a whiteout), but doing that is going to be a somewhat larger
12211 * undertaking.
12212 *
12213 * @see http://tracker.ceph.com/issues/17668
12214 */
12215 if (!(check.is_head() &&
12216 backfill_info.begin.is_snapdir() &&
12217 check == backfill_info.begin.get_head()))
12218 last_backfill_started = check;
12219
12220 // Don't increment ops here: deletions are cheap and, unlike
12221 // real recovery_ops, are not replied to, and we can't
12222 // increment ops without requeueing ourselves
12223 // for recovery.
12224 } else {
12225 eversion_t& obj_v = backfill_info.objects.begin()->second;
12226
12227 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12228 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12229 i != backfill_targets.end();
12230 ++i) {
12231 pg_shard_t bt = *i;
12232 BackfillInterval& pbi = peer_backfill_info[bt];
12233 // Find all check peers that have the wrong version
12234 if (check == backfill_info.begin && check == pbi.begin) {
12235 if (pbi.objects.begin()->second != obj_v) {
12236 need_ver_targs.push_back(bt);
12237 } else {
12238 keep_ver_targs.push_back(bt);
12239 }
12240 } else {
12241 pg_info_t& pinfo = peer_info[bt];
12242
12243 // Only include peers whose backfill line we've caught up to;
12244 // otherwise, they only appear to be missing this object
12245 // because their pbi.begin > backfill_info.begin.
12246 if (backfill_info.begin > pinfo.last_backfill)
12247 missing_targs.push_back(bt);
12248 else
12249 skip_targs.push_back(bt);
12250 }
12251 }
12252
12253 if (!keep_ver_targs.empty()) {
12254 // These peers have version obj_v
12255 dout(20) << " BACKFILL keeping " << check
12256 << " with ver " << obj_v
12257 << " on peers " << keep_ver_targs << dendl;
12258 //assert(!waiting_for_degraded_object.count(check));
12259 }
12260 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12261 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12262 assert(obc);
12263 if (obc->get_recovery_read()) {
12264 if (!need_ver_targs.empty()) {
12265 dout(20) << " BACKFILL replacing " << check
12266 << " with ver " << obj_v
12267 << " to peers " << need_ver_targs << dendl;
12268 }
12269 if (!missing_targs.empty()) {
12270 dout(20) << " BACKFILL pushing " << backfill_info.begin
12271 << " with ver " << obj_v
12272 << " to peers " << missing_targs << dendl;
12273 }
12274 vector<pg_shard_t> all_push = need_ver_targs;
12275 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12276
12277 handle.reset_tp_timeout();
12278 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12279 if (r < 0) {
12280 *work_started = true;
12281 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12282 break;
12283 }
12284 ops++;
12285 } else {
12286 *work_started = true;
12287 dout(20) << "backfill blocking on " << backfill_info.begin
12288 << "; could not get rw_manager lock" << dendl;
12289 break;
12290 }
12291 }
12292 dout(20) << "need_ver_targs=" << need_ver_targs
12293 << " keep_ver_targs=" << keep_ver_targs << dendl;
12294 dout(20) << "backfill_targets=" << backfill_targets
12295 << " missing_targs=" << missing_targs
12296 << " skip_targs=" << skip_targs << dendl;
12297
12298 last_backfill_started = backfill_info.begin;
12299 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12300 backfill_info.pop_front();
12301 vector<pg_shard_t> check_targets = need_ver_targs;
12302 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12303 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12304 i != check_targets.end();
12305 ++i) {
12306 pg_shard_t bt = *i;
12307 BackfillInterval& pbi = peer_backfill_info[bt];
12308 pbi.pop_front();
12309 }
12310 }
12311 }
12312
12313 hobject_t backfill_pos =
12314 std::min(backfill_info.begin, earliest_peer_backfill());
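
  // backfill_pos is the low-water mark: every object strictly below it has
  // been examined on every target (pushes may still be in flight; those
  // are tracked in backfills_in_flight).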
12315
12316 for (set<hobject_t>::iterator i = add_to_stat.begin();
12317 i != add_to_stat.end();
12318 ++i) {
12319 ObjectContextRef obc = get_object_context(*i, false);
12320 assert(obc);
12321 pg_stat_t stat;
12322 add_object_context_to_pg_stat(obc, &stat);
12323 pending_backfill_updates[*i] = stat;
12324 }
12325 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12326 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12327 for (unsigned i = 0; i < to_remove.size(); ++i) {
12328 handle.reset_tp_timeout();
12329 const hobject_t& oid = to_remove[i].get<0>();
12330 eversion_t v = to_remove[i].get<1>();
12331 pg_shard_t peer = to_remove[i].get<2>();
12332 MOSDPGBackfillRemove *m;
12333 auto it = reqs.find(peer);
12334 if (it != reqs.end()) {
12335 m = it->second;
12336 } else {
12337 m = reqs[peer] = new MOSDPGBackfillRemove(
12338 spg_t(info.pgid.pgid, peer.shard),
12339 get_osdmap()->get_epoch());
12340 }
12341 m->ls.push_back(make_pair(oid, v));
12342
12343 if (oid <= last_backfill_started)
12344 pending_backfill_updates[oid]; // add an empty stat so the accounting loop below can advance past this removal
12345 }
12346 for (auto p : reqs) {
12347 osd->send_message_osd_cluster(p.first.osd, p.second,
12348 get_osdmap()->get_epoch());
12349 }
12350 } else {
12351 // for jewel targets
12352 for (unsigned i = 0; i < to_remove.size(); ++i) {
12353 handle.reset_tp_timeout();
12354
12355 // ordered before any subsequent updates
12356 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12357 to_remove[i].get<2>());
12358
12359 if (to_remove[i].get<0>() <= last_backfill_started)
12360 pending_backfill_updates[to_remove[i].get<0>()]; // add an empty stat so the accounting loop below can advance past this removal
12361 }
12362 }
12363
12364 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12365
12366 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12367 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12368 i != backfills_in_flight.end();
12369 ++i) {
12370 dout(20) << *i << " is still in flight" << dendl;
12371 }
12372
12373 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12374 backfill_pos : *(backfills_in_flight.begin());
12375 hobject_t new_last_backfill = earliest_backfill();
12376 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12377 for (map<hobject_t, pg_stat_t>::iterator i =
12378 pending_backfill_updates.begin();
12379 i != pending_backfill_updates.end() &&
12380 i->first < next_backfill_to_complete;
12381 pending_backfill_updates.erase(i++)) {
12382 dout(20) << " pending_backfill_update " << i->first << dendl;
12383 assert(i->first > new_last_backfill);
12384 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12385 j != backfill_targets.end();
12386 ++j) {
12387 pg_shard_t bt = *j;
12388 pg_info_t& pinfo = peer_info[bt];
12389 // Add stats to all peers that were missing the object
12390 if (i->first > pinfo.last_backfill)
12391 pinfo.stats.add(i->second);
12392 }
12393 new_last_backfill = i->first;
12394 }
12395 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12396
12397 assert(!pending_backfill_updates.empty() ||
12398 new_last_backfill == last_backfill_started);
12399 if (pending_backfill_updates.empty() &&
12400 backfill_pos.is_max()) {
12401 assert(backfills_in_flight.empty());
12402 new_last_backfill = backfill_pos;
12403 last_backfill_started = backfill_pos;
12404 }
12405 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12406
12407 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12408 // all the backfill targets. Otherwise, we will move last_backfill up on
12409 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12410 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12411 i != backfill_targets.end();
12412 ++i) {
12413 pg_shard_t bt = *i;
12414 pg_info_t& pinfo = peer_info[bt];
12415
12416 if (new_last_backfill > pinfo.last_backfill) {
12417 pinfo.set_last_backfill(new_last_backfill);
12418 epoch_t e = get_osdmap()->get_epoch();
12419 MOSDPGBackfill *m = NULL;
12420 if (pinfo.last_backfill.is_max()) {
12421 m = new MOSDPGBackfill(
12422 MOSDPGBackfill::OP_BACKFILL_FINISH,
12423 e,
12424 last_peering_reset,
12425 spg_t(info.pgid.pgid, bt.shard));
12426 // Use default priority here, must match sub_op priority
12427 /* pinfo.stats might be wrong if we did log-based recovery on the
12428 * backfilled portion in addition to continuing backfill.
12429 */
12430 pinfo.stats = info.stats;
12431 start_recovery_op(hobject_t::get_max());
12432 } else {
12433 m = new MOSDPGBackfill(
12434 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12435 e,
12436 last_peering_reset,
12437 spg_t(info.pgid.pgid, bt.shard));
12438 // Use default priority here, must match sub_op priority
12439 }
12440 m->last_backfill = pinfo.last_backfill;
12441 m->stats = pinfo.stats;
12442 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12443 dout(10) << " peer " << bt
12444 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12445 << " / " << info.stats.stats.sum.num_objects << dendl;
12446 }
12447 }
12448
12449 if (ops)
12450 *work_started = true;
12451 return ops;
12452 }
12453
12454 int PrimaryLogPG::prep_backfill_object_push(
12455 hobject_t oid, eversion_t v,
12456 ObjectContextRef obc,
12457 vector<pg_shard_t> peers,
12458 PGBackend::RecoveryHandle *h)
12459 {
12460 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12461 assert(!peers.empty());
12462
12463 backfills_in_flight.insert(oid);
12464 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12465 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12466 assert(bpm != peer_missing.end());
12467 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12468 }
12469
12470 assert(!recovering.count(oid));
12471
12472 start_recovery_op(oid);
12473 recovering.insert(make_pair(oid, obc));
12474
12475 // We need to take the read_lock here in order to flush in-progress writes
12476 obc->ondisk_read_lock();
12477 int r = pgbackend->recover_object(
12478 oid,
12479 v,
12480 ObjectContextRef(),
12481 obc,
12482 h);
12483 obc->ondisk_read_unlock();
12484 if (r < 0) {
12485 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12486 primary_failed(oid);
12487 primary_error(oid, v);
12488 backfills_in_flight.erase(oid);
12489 missing_loc.add_missing(oid, v, eversion_t());
12490 }
12491 return r;
12492 }
12493
12494 void PrimaryLogPG::update_range(
12495 BackfillInterval *bi,
12496 ThreadPool::TPHandle &handle)
12497 {
12498 int local_min = cct->_conf->osd_backfill_scan_min;
12499 int local_max = cct->_conf->osd_backfill_scan_max;
12500
12501 if (bi->version < info.log_tail) {
12502 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12503 << dendl;
12504 osr->flush();
12505 if (last_update_applied >= info.log_tail) {
12506 bi->version = last_update_applied;
12507 } else {
12508 bi->version = info.last_update;
12509 }
12510 scan_range(local_min, local_max, bi, handle);
12511 }
12512
12513 if (bi->version >= projected_last_update) {
12514 dout(10) << __func__ << ": bi is current " << dendl;
12515 assert(bi->version == projected_last_update);
12516 } else if (bi->version >= info.log_tail) {
12517 if (pg_log.get_log().empty() && projected_log.empty()) {
12518 /* Because we don't move log_tail on split, the log might be
12519 * empty even if log_tail != last_update. However, the only
12520 * way to get here with an empty log is if log_tail is actually
12521 * eversion_t(), because otherwise the entry which changed
12522 * last_update since the last scan would have to be present.
12523 */
12524 assert(bi->version == eversion_t());
12525 return;
12526 }
12527
12528 dout(10) << __func__ << ": bi is old, (" << bi->version
12529 << ") can be updated with log to projected_last_update "
12530 << projected_last_update << dendl;
12531
12532 auto func = [&](const pg_log_entry_t &e) {
12533 dout(10) << __func__ << ": updating from version " << e.version
12534 << dendl;
12535 const hobject_t &soid = e.soid;
12536 if (soid >= bi->begin &&
12537 soid < bi->end) {
12538 if (e.is_update()) {
12539 dout(10) << __func__ << ": " << e.soid << " updated to version "
12540 << e.version << dendl;
12541 bi->objects.erase(e.soid);
12542 bi->objects.insert(
12543 make_pair(
12544 e.soid,
12545 e.version));
12546 } else if (e.is_delete()) {
12547 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12548 bi->objects.erase(e.soid);
12549 }
12550 }
12551 };
12552 dout(10) << "scanning pg log first" << dendl;
12553 pg_log.get_log().scan_log_after(bi->version, func);
12554 dout(10) << "scanning projected log" << dendl;
12555 projected_log.scan_log_after(bi->version, func);
12556 bi->version = projected_last_update;
12557 } else {
12558 assert(0 == "scan_range should have raised bi->version past log_tail");
12559 }
12560 }
12561
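/** scan_range
 *
 * (Re)populate *bi with between min and max objects starting at
 * bi->begin, recording each object's version.  Versions come from the
 * in-memory object context when available (primary only), otherwise
 * from the OI_ATTR attribute on disk.
 */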
12562 void PrimaryLogPG::scan_range(
12563 int min, int max, BackfillInterval *bi,
12564 ThreadPool::TPHandle &handle)
12565 {
12566 assert(is_locked());
12567 dout(10) << "scan_range from " << bi->begin << dendl;
12568 bi->clear_objects();
12569
12570 vector<hobject_t> ls;
12571 ls.reserve(max);
12572 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12573 assert(r >= 0);
12574 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12575 dout(20) << ls << dendl;
12576
12577 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12578 handle.reset_tp_timeout();
12579 ObjectContextRef obc;
12580 if (is_primary())
12581 obc = object_contexts.lookup(*p);
12582 if (obc) {
12583 bi->objects[*p] = obc->obs.oi.version;
12584 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12585 } else {
12586 bufferlist bl;
12587 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12588
12589 /* If the object does not exist here, it must have been removed
12590 * between the collection_list_partial and here. This can happen
12591 * for the first item in the range, which is usually last_backfill.
12592 */
12593 if (r == -ENOENT)
12594 continue;
12595
12596 assert(r >= 0);
12597 object_info_t oi(bl);
12598 bi->objects[*p] = oi.version;
12599 dout(20) << " " << *p << " " << oi.version << dendl;
12600 }
12601 }
12602 }
12603
12604
12605 /** check_local
12606 *
12607 * verifies that stray objects have been deleted
12608 */
12609 void PrimaryLogPG::check_local()
12610 {
12611 dout(10) << __func__ << dendl;
12612
12613 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12614
12615 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12616 return;
12617
12618 // just scan the log.
12619 set<hobject_t> did;
12620 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12621 p != pg_log.get_log().log.rend();
12622 ++p) {
12623 if (did.count(p->soid))
12624 continue;
12625 did.insert(p->soid);
12626
12627 if (p->is_delete() && !is_missing_object(p->soid)) {
12628 dout(10) << " checking " << p->soid
12629 << " at " << p->version << dendl;
12630 struct stat st;
12631 int r = osd->store->stat(
12632 ch,
12633 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12634 &st);
12635 if (r != -ENOENT) {
12636 derr << __func__ << " " << p->soid << " exists, but should have been "
12637 << "deleted" << dendl;
12638 assert(0 == "erroneously present object");
12639 }
12640 } else {
12641 // ignore old(+missing) objects
12642 }
12643 }
12644 }
12645
12646
12647
12648 // ===========================
12649 // hit sets
12650
12651 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12652 {
12653 ostringstream ss;
12654 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12655 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12656 info.pgid.ps(), info.pgid.pool(),
12657 cct->_conf->osd_hit_set_namespace);
12658 dout(20) << __func__ << " " << hoid << dendl;
12659 return hoid;
12660 }
12661
12662 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12663 utime_t end,
12664 bool using_gmt)
12665 {
12666 ostringstream ss;
12667 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12668 if (using_gmt) {
12669 start.gmtime(ss) << "_";
12670 end.gmtime(ss);
12671 } else {
12672 start.localtime(ss) << "_";
12673 end.localtime(ss);
12674 }
12675 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12676 info.pgid.ps(), info.pgid.pool(),
12677 cct->_conf->osd_hit_set_namespace);
12678 dout(20) << __func__ << " " << hoid << dendl;
12679 return hoid;
12680 }
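// The resulting name is roughly (timestamp format is illustrative):
//   hit_set_1.0_archive_2017-08-01 12:00:00.000000_2017-08-01 12:10:00.000000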
12681
12682 void PrimaryLogPG::hit_set_clear()
12683 {
12684 dout(20) << __func__ << dendl;
12685 hit_set.reset();
12686 hit_set_start_stamp = utime_t();
12687 }
12688
12689 void PrimaryLogPG::hit_set_setup()
12690 {
12691 if (!is_active() ||
12692 !is_primary()) {
12693 hit_set_clear();
12694 return;
12695 }
12696
12697 if (is_active() && is_primary() &&
12698 (!pool.info.hit_set_count ||
12699 !pool.info.hit_set_period ||
12700 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12701 hit_set_clear();
12702
12703 // only primary is allowed to remove all the hit set objects
12704 hit_set_remove_all();
12705 return;
12706 }
12707
12708 // FIXME: discard any previous data for now
12709 hit_set_create();
12710
12711 // include any writes we know about from the pg log. this doesn't
12712 // capture reads, but it is better than nothing!
12713 hit_set_apply_log();
12714 }
12715
12716 void PrimaryLogPG::hit_set_remove_all()
12717 {
12718 // If any archives are degraded we skip this
12719 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12720 p != info.hit_set.history.end();
12721 ++p) {
12722 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12723
12724 // Once we hit a degraded object just skip
12725 if (is_degraded_or_backfilling_object(aoid))
12726 return;
12727 if (write_blocked_by_scrub(aoid))
12728 return;
12729 }
12730
12731 if (!info.hit_set.history.empty()) {
12732 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12733 assert(p != info.hit_set.history.rend());
12734 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12735 assert(!is_degraded_or_backfilling_object(oid));
12736 ObjectContextRef obc = get_object_context(oid, false);
12737 assert(obc);
12738
12739 OpContextUPtr ctx = simple_opc_create(obc);
12740 ctx->at_version = get_next_version();
12741 ctx->updated_hset_history = info.hit_set;
12742 utime_t now = ceph_clock_now();
12743 ctx->mtime = now;
12744 hit_set_trim(ctx, 0);
12745 simple_opc_submit(std::move(ctx));
12746 }
12747
12748 info.hit_set = pg_hit_set_history_t();
12749 if (agent_state) {
12750 agent_state->discard_hit_sets();
12751 }
12752 }
12753
12754 void PrimaryLogPG::hit_set_create()
12755 {
12756 utime_t now = ceph_clock_now();
12757 // make a copy of the params to modify
12758 HitSet::Params params(pool.info.hit_set_params);
12759
12760 dout(20) << __func__ << " " << params << dendl;
12761 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12762 BloomHitSet::Params *p =
12763 static_cast<BloomHitSet::Params*>(params.impl.get());
12764
12765 // convert false positive rate so it holds up across the full period
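// (a union of N bloom filters, each with false positive rate p,
// has a combined rate of roughly 1 - (1-p)^N ~= N*p, so dividing
// by hit_set_count keeps the aggregate rate near the configured
// value)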
12766 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12767 if (p->get_fpp() <= 0.0)
12768 p->set_fpp(.01); // fpp cannot be zero!
12769
12770 // if we don't have a specified size, estimate the target size
12771 // based on the previous bin!
12772 if (p->target_size == 0 && hit_set) {
12773 utime_t dur = now - hit_set_start_stamp;
12774 unsigned unique = hit_set->approx_unique_insert_count();
12775 dout(20) << __func__ << " previous set had approx " << unique
12776 << " unique items over " << dur << " seconds" << dendl;
12777 p->target_size = (double)unique * (double)pool.info.hit_set_period
12778 / (double)dur;
12779 }
12780 if (p->target_size <
12781 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12782 p->target_size = cct->_conf->osd_hit_set_min_size;
12783
12784 if (p->target_size
12785 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12786 p->target_size = cct->_conf->osd_hit_set_max_size;
12787
12788 p->seed = now.sec();
12789
12790 dout(10) << __func__ << " target_size " << p->target_size
12791 << " fpp " << p->get_fpp() << dendl;
12792 }
12793 hit_set.reset(new HitSet(params));
12794 hit_set_start_stamp = now;
12795 }
12796
12797 /**
12798 * apply log entries to set
12799 *
12800 * this would only happen after peering, to at least capture writes
12801 * during an interval that was potentially lost.
12802 */
12803 bool PrimaryLogPG::hit_set_apply_log()
12804 {
12805 if (!hit_set)
12806 return false;
12807
12808 eversion_t to = info.last_update;
12809 eversion_t from = info.hit_set.current_last_update;
12810 if (to <= from) {
12811 dout(20) << __func__ << " no update" << dendl;
12812 return false;
12813 }
12814
12815 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12816 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12817 while (p != pg_log.get_log().log.rend() && p->version > to)
12818 ++p;
12819 while (p != pg_log.get_log().log.rend() && p->version > from) {
12820 hit_set->insert(p->soid);
12821 ++p;
12822 }
12823
12824 return true;
12825 }
12826
12827 void PrimaryLogPG::hit_set_persist()
12828 {
12829 dout(10) << __func__ << dendl;
12830 bufferlist bl;
12831 unsigned max = pool.info.hit_set_count;
12832
12833 utime_t now = ceph_clock_now();
12834 hobject_t oid;
12835
12836 // If any archives are degraded we skip this persist request
12837 // account for the additional entry being added below
12838 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12839 p != info.hit_set.history.end();
12840 ++p) {
12841 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12842
12843 // Once we hit a degraded object just skip further trim
12844 if (is_degraded_or_backfilling_object(aoid))
12845 return;
12846 if (write_blocked_by_scrub(aoid))
12847 return;
12848 }
12849
12850 // If backfill is in progress and we could possibly overlap with the
12851 // hit_set_* objects, back off. Since these all have
12852 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12853 // look just at that. This is necessary because our transactions
12854 // may include a modify of the new hit_set *and* a delete of the
12855 // old one, and this may span the backfill boundary.
12856 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12857 p != backfill_targets.end();
12858 ++p) {
12859 assert(peer_info.count(*p));
12860 const pg_info_t& pi = peer_info[*p];
12861 if (pi.last_backfill == hobject_t() ||
12862 pi.last_backfill.get_hash() == info.pgid.ps()) {
12863 dout(10) << __func__ << " backfill target osd." << *p
12864 << " last_backfill has not progressed past pgid ps"
12865 << dendl;
12866 return;
12867 }
12868 }
12869
12870
12871 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12872 new_hset.begin = hit_set_start_stamp;
12873 new_hset.end = now;
12874 oid = get_hit_set_archive_object(
12875 new_hset.begin,
12876 new_hset.end,
12877 new_hset.using_gmt);
12878
12879 // If the current object is degraded we skip this persist request
12880 if (write_blocked_by_scrub(oid))
12881 return;
12882
12883 hit_set->seal();
12884 ::encode(*hit_set, bl);
12885 dout(20) << __func__ << " archive " << oid << dendl;
12886
12887 if (agent_state) {
12888 agent_state->add_hit_set(new_hset.begin, hit_set);
12889 uint32_t size = agent_state->hit_set_map.size();
12890 if (size >= pool.info.hit_set_count) {
12891 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1 : 0;
12892 }
12893 hit_set_in_memory_trim(size);
12894 }
12895
12896 ObjectContextRef obc = get_object_context(oid, true);
12897 OpContextUPtr ctx = simple_opc_create(obc);
12898
12899 ctx->at_version = get_next_version();
12900 ctx->updated_hset_history = info.hit_set;
12901 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12902
12903 updated_hit_set_hist.current_last_update = info.last_update;
12904 new_hset.version = ctx->at_version;
12905
12906 updated_hit_set_hist.history.push_back(new_hset);
12907 hit_set_create();
12908
12909 // fabricate an object_info_t and SnapSet
12910 obc->obs.oi.version = ctx->at_version;
12911 obc->obs.oi.mtime = now;
12912 obc->obs.oi.size = bl.length();
12913 obc->obs.exists = true;
12914 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12915
12916 ctx->new_obs = obc->obs;
12917
12918 obc->ssc->snapset.head_exists = true;
12919 ctx->new_snapset = obc->ssc->snapset;
12920
12921 ctx->delta_stats.num_objects++;
12922 ctx->delta_stats.num_objects_hit_set_archive++;
12923 ctx->delta_stats.num_bytes += bl.length();
12924 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12925
12926 bufferlist bss;
12927 ::encode(ctx->new_snapset, bss);
12928 bufferlist boi(sizeof(ctx->new_obs.oi));
12929 ::encode(ctx->new_obs.oi, boi,
12930 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12931
12932 ctx->op_t->create(oid);
12933 if (bl.length()) {
12934 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12935 }
12936 map <string, bufferlist> attrs;
12937 attrs[OI_ATTR].claim(boi);
12938 attrs[SS_ATTR].claim(bss);
12939 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12940 ctx->log.push_back(
12941 pg_log_entry_t(
12942 pg_log_entry_t::MODIFY,
12943 oid,
12944 ctx->at_version,
12945 eversion_t(),
12946 0,
12947 osd_reqid_t(),
12948 ctx->mtime,
12949 0)
12950 );
12951
12952 hit_set_trim(ctx, max);
12953
12954 simple_opc_submit(std::move(ctx));
12955 }
12956
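// Trim the hit set history down to at most max archive objects,
// removing the oldest archives and recording a DELETE log entry for
// each in the provided op context.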
12957 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12958 {
12959 assert(ctx->updated_hset_history);
12960 pg_hit_set_history_t &updated_hit_set_hist =
12961 *(ctx->updated_hset_history);
12962 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12963 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12964 assert(p != updated_hit_set_hist.history.end());
12965 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12966
12967 assert(!is_degraded_or_backfilling_object(oid));
12968
12969 dout(20) << __func__ << " removing " << oid << dendl;
12970 ++ctx->at_version.version;
12971 ctx->log.push_back(
12972 pg_log_entry_t(pg_log_entry_t::DELETE,
12973 oid,
12974 ctx->at_version,
12975 p->version,
12976 0,
12977 osd_reqid_t(),
12978 ctx->mtime,
12979 0));
12980
12981 ctx->op_t->remove(oid);
12982 updated_hit_set_hist.history.pop_front();
12983
12984 ObjectContextRef obc = get_object_context(oid, false);
12985 assert(obc);
12986 --ctx->delta_stats.num_objects;
12987 --ctx->delta_stats.num_objects_hit_set_archive;
12988 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12989 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12990 }
12991 }
12992
12993 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12994 {
12995 while (agent_state->hit_set_map.size() > max_in_memory) {
12996 agent_state->remove_oldest_hit_set();
12997 }
12998 }
12999
13000
13001 // =======================================
13002 // cache agent
13003
13004 void PrimaryLogPG::agent_setup()
13005 {
13006 assert(is_locked());
13007 if (!is_active() ||
13008 !is_primary() ||
13009 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13010 pool.info.tier_of < 0 ||
13011 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13012 agent_clear();
13013 return;
13014 }
13015 if (!agent_state) {
13016 agent_state.reset(new TierAgentState);
13017
13018 // choose random starting position
13019 agent_state->position = hobject_t();
13020 agent_state->position.pool = info.pgid.pool();
13021 agent_state->position.set_hash(pool.info.get_random_pg_position(
13022 info.pgid.pgid,
13023 rand()));
13024 agent_state->start = agent_state->position;
13025
13026 dout(10) << __func__ << " allocated new state, position "
13027 << agent_state->position << dendl;
13028 } else {
13029 dout(10) << __func__ << " keeping existing state" << dendl;
13030 }
13031
13032 if (info.stats.stats_invalid) {
13033 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13034 }
13035
13036 agent_choose_mode();
13037 }
13038
13039 void PrimaryLogPG::agent_clear()
13040 {
13041 agent_stop();
13042 agent_state.reset(NULL);
13043 }
13044
13045 // Return false if no objects were operated on since the start of the object hash space
13046 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13047 {
13048 lock();
13049 if (!agent_state) {
13050 dout(10) << __func__ << " no agent state, stopping" << dendl;
13051 unlock();
13052 return true;
13053 }
13054
13055 assert(!deleting);
13056
13057 if (agent_state->is_idle()) {
13058 dout(10) << __func__ << " idle, stopping" << dendl;
13059 unlock();
13060 return true;
13061 }
13062
13063 osd->logger->inc(l_osd_agent_wake);
13064
13065 dout(10) << __func__
13066 << " max " << start_max
13067 << ", flush " << agent_state->get_flush_mode_name()
13068 << ", evict " << agent_state->get_evict_mode_name()
13069 << ", pos " << agent_state->position
13070 << dendl;
13071 assert(is_primary());
13072 assert(is_active());
13073
13074 agent_load_hit_sets();
13075
13076 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13077 assert(base_pool);
13078
13079 int ls_min = 1;
13080 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13081
13082 // list some objects. this conveniently lists clones (oldest to
13083 // newest) before heads... the same order we want to flush in.
13084 //
13085 // NOTE: do not flush the Sequencer. we will assume that the
13086 // listing we get back is imprecise.
13087 vector<hobject_t> ls;
13088 hobject_t next;
13089 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13090 &ls, &next);
13091 assert(r >= 0);
13092 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13093 int started = 0;
13094 for (vector<hobject_t>::iterator p = ls.begin();
13095 p != ls.end();
13096 ++p) {
13097 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13098 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13099 osd->logger->inc(l_osd_agent_skip);
13100 continue;
13101 }
13102 if (is_degraded_or_backfilling_object(*p)) {
13103 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13104 osd->logger->inc(l_osd_agent_skip);
13105 continue;
13106 }
13107 if (is_missing_object(p->get_head())) {
13108 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13109 osd->logger->inc(l_osd_agent_skip);
13110 continue;
13111 }
13112 ObjectContextRef obc = get_object_context(*p, false, NULL);
13113 if (!obc) {
13114 // we didn't flush; we may miss something here.
13115 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13116 osd->logger->inc(l_osd_agent_skip);
13117 continue;
13118 }
13119 if (!obc->obs.exists) {
13120 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13121 osd->logger->inc(l_osd_agent_skip);
13122 continue;
13123 }
13124 if (range_intersects_scrub(obc->obs.oi.soid,
13125 obc->obs.oi.soid.get_head())) {
13126 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13127 osd->logger->inc(l_osd_agent_skip);
13128 continue;
13129 }
13130 if (obc->is_blocked()) {
13131 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13132 osd->logger->inc(l_osd_agent_skip);
13133 continue;
13134 }
13135 if (obc->is_request_pending()) {
13136 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13137 osd->logger->inc(l_osd_agent_skip);
13138 continue;
13139 }
13140
13141 // be careful flushing omap to an EC pool.
13142 if (!base_pool->supports_omap() &&
13143 obc->obs.oi.is_omap()) {
13144 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13145 osd->logger->inc(l_osd_agent_skip);
13146 continue;
13147 }
13148
13149 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13150 agent_maybe_evict(obc, false))
13151 ++started;
13152 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13153 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13154 ++started;
13155 --agent_flush_quota;
13156 }
13157 if (started >= start_max) {
13158 // If finishing early, set "next" to the next object
13159 if (++p != ls.end())
13160 next = *p;
13161 break;
13162 }
13163 }
13164
13165 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13166 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13167 agent_state->hist_age = 0;
13168 agent_state->temp_hist.decay();
13169 }
13170
13171 // Total objects operated on so far
13172 int total_started = agent_state->started + started;
13173 bool need_delay = false;
13174
13175 dout(20) << __func__ << " start pos " << agent_state->position
13176 << " next start pos " << next
13177 << " started " << total_started << dendl;
13178
13179 // See if we've made a full pass over the object hash space
13180 // This might check at most ls_max objects a second time to notice that
13181 // we've checked every object at least once.
13182 if (agent_state->position < agent_state->start &&
13183 next >= agent_state->start) {
13184 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13185 if (total_started == 0)
13186 need_delay = true;
13187 else
13188 total_started = 0;
13189 agent_state->start = next;
13190 }
13191 agent_state->started = total_started;
13192
13193 // If we reached the end of the hash space, start over from the beginning
13194 if (next.is_max())
13195 agent_state->position = hobject_t();
13196 else
13197 agent_state->position = next;
13198
13199 // Discard old in memory HitSets
13200 hit_set_in_memory_trim(pool.info.hit_set_count);
13201
13202 if (need_delay) {
13203 assert(agent_state->delaying == false);
13204 agent_delay();
13205 unlock();
13206 return false;
13207 }
13208 agent_choose_mode();
13209 unlock();
13210 return true;
13211 }
13212
13213 void PrimaryLogPG::agent_load_hit_sets()
13214 {
13215 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13216 return;
13217 }
13218
13219 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13220 dout(10) << __func__ << dendl;
13221 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13222 p != info.hit_set.history.end(); ++p) {
13223 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13224 dout(10) << __func__ << " loading " << p->begin << "-"
13225 << p->end << dendl;
13226 if (!pool.info.is_replicated()) {
13227 // FIXME: EC not supported here yet
13228 derr << __func__ << " on non-replicated pool" << dendl;
13229 break;
13230 }
13231
13232 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13233 if (is_unreadable_object(oid)) {
13234 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13235 break;
13236 }
13237
13238 ObjectContextRef obc = get_object_context(oid, false);
13239 if (!obc) {
13240 derr << __func__ << ": could not load hitset " << oid << dendl;
13241 break;
13242 }
13243
13244 bufferlist bl;
13245 {
13246 obc->ondisk_read_lock();
13247 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13248 assert(r >= 0);
13249 obc->ondisk_read_unlock();
13250 }
13251 HitSetRef hs(new HitSet);
13252 bufferlist::iterator pbl = bl.begin();
13253 ::decode(*hs, pbl);
13254 agent_state->add_hit_set(p->begin.sec(), hs);
13255 }
13256 }
13257 }
13258 }
13259
13260 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13261 {
13262 if (!obc->obs.oi.is_dirty()) {
13263 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13264 osd->logger->inc(l_osd_agent_skip);
13265 return false;
13266 }
13267 if (obc->obs.oi.is_cache_pinned()) {
13268 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13269 osd->logger->inc(l_osd_agent_skip);
13270 return false;
13271 }
13272
13273 utime_t now = ceph_clock_now();
13274 utime_t ob_local_mtime;
13275 if (obc->obs.oi.local_mtime != utime_t()) {
13276 ob_local_mtime = obc->obs.oi.local_mtime;
13277 } else {
13278 ob_local_mtime = obc->obs.oi.mtime;
13279 }
13280 bool evict_mode_full =
13281 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13282 if (!evict_mode_full &&
13283 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13284 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13285 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13286 osd->logger->inc(l_osd_agent_skip);
13287 return false;
13288 }
13289
13290 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13291 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13292 osd->logger->inc(l_osd_agent_skip);
13293 return false;
13294 }
13295
13296 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13297
13298 // FIXME: flush anything dirty, regardless of what distribution of
13299 // ages we expect.
13300
13301 hobject_t oid = obc->obs.oi.soid;
13302 osd->agent_start_op(oid);
13303 // no need to capture a pg ref, can't outlive fop or ctx
13304 std::function<void()> on_flush = [this, oid]() {
13305 osd->agent_finish_op(oid);
13306 };
13307
13308 int result = start_flush(
13309 OpRequestRef(), obc, false, NULL,
13310 on_flush);
13311 if (result != -EINPROGRESS) {
13312 on_flush();
13313 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13314 << " with " << result << dendl;
13315 osd->logger->inc(l_osd_agent_skip);
13316 return false;
13317 }
13318
13319 osd->logger->inc(l_osd_agent_flush);
13320 return true;
13321 }
13322
13323 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13324 {
13325 const hobject_t& soid = obc->obs.oi.soid;
13326 if (!after_flush && obc->obs.oi.is_dirty()) {
13327 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13328 return false;
13329 }
13330 if (!obc->obs.oi.watchers.empty()) {
13331 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13332 return false;
13333 }
13334 if (obc->is_blocked()) {
13335 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13336 return false;
13337 }
13338 if (obc->obs.oi.is_cache_pinned()) {
13339 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13340 return false;
13341 }
13342
13343 if (soid.snap == CEPH_NOSNAP) {
13344 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13345 if (result < 0) {
13346 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13347 return false;
13348 }
13349 }
13350
13351 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13352 // is this object older than cache_min_evict_age?
13353 utime_t now = ceph_clock_now();
13354 utime_t ob_local_mtime;
13355 if (obc->obs.oi.local_mtime != utime_t()) {
13356 ob_local_mtime = obc->obs.oi.local_mtime;
13357 } else {
13358 ob_local_mtime = obc->obs.oi.mtime;
13359 }
13360 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13361 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13362 osd->logger->inc(l_osd_agent_skip);
13363 return false;
13364 }
13365 // is this object old and/or cold enough?
13366 int temp = 0;
13367 uint64_t temp_upper = 0, temp_lower = 0;
13368 if (hit_set)
13369 agent_estimate_temp(soid, &temp);
13370 agent_state->temp_hist.add(temp);
13371 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13372
13373 dout(20) << __func__
13374 << " temp " << temp
13375 << " pos " << temp_lower << "-" << temp_upper
13376 << ", evict_effort " << agent_state->evict_effort
13377 << dendl;
13378 dout(30) << "agent_state:\n";
13379 Formatter *f = Formatter::create("");
13380 f->open_object_section("agent_state");
13381 agent_state->dump(f);
13382 f->close_section();
13383 f->flush(*_dout);
13384 delete f;
13385 *_dout << dendl;
13386
13387 if (1000000 - temp_upper >= agent_state->evict_effort)
13388 return false;
13389 }
13390
13391 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13392 OpContextUPtr ctx = simple_opc_create(obc);
13393
13394 if (!ctx->lock_manager.get_lock_type(
13395 ObjectContext::RWState::RWWRITE,
13396 obc->obs.oi.soid,
13397 obc,
13398 OpRequestRef())) {
13399 close_op_ctx(ctx.release());
13400 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13401 return false;
13402 }
13403
13404 osd->agent_start_evict_op();
13405 ctx->register_on_finish(
13406 [this]() {
13407 osd->agent_finish_evict_op();
13408 });
13409
13410 ctx->at_version = get_next_version();
13411 assert(ctx->new_obs.exists);
13412 int r = _delete_oid(ctx.get(), true, false);
13413 if (obc->obs.oi.is_omap())
13414 ctx->delta_stats.num_objects_omap--;
13415 ctx->delta_stats.num_evict++;
13416 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13417 if (obc->obs.oi.is_dirty())
13418 --ctx->delta_stats.num_objects_dirty;
13419 assert(r == 0);
13420 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13421 simple_opc_submit(std::move(ctx));
13422 osd->logger->inc(l_osd_tier_evict);
13423 osd->logger->inc(l_osd_agent_evict);
13424 return true;
13425 }
13426
13427 void PrimaryLogPG::agent_stop()
13428 {
13429 dout(20) << __func__ << dendl;
13430 if (agent_state && !agent_state->is_idle()) {
13431 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13432 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13433 osd->agent_disable_pg(this, agent_state->evict_effort);
13434 }
13435 }
13436
13437 void PrimaryLogPG::agent_delay()
13438 {
13439 dout(20) << __func__ << dendl;
13440 if (agent_state && !agent_state->is_idle()) {
13441 assert(agent_state->delaying == false);
13442 agent_state->delaying = true;
13443 osd->agent_disable_pg(this, agent_state->evict_effort);
13444 }
13445 }
13446
13447 void PrimaryLogPG::agent_choose_mode_restart()
13448 {
13449 dout(20) << __func__ << dendl;
13450 lock();
13451 if (agent_state && agent_state->delaying) {
13452 agent_state->delaying = false;
13453 agent_choose_mode(true);
13454 }
13455 unlock();
13456 }
13457
13458 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13459 {
13460 bool requeued = false;
13461 // Let delay play out
13462 if (agent_state->delaying) {
13463 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13464 return requeued;
13465 }
13466
13467 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13468 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13469 unsigned evict_effort = 0;
13470
13471 if (info.stats.stats_invalid) {
13472 // idle; stats can't be trusted until we scrub.
13473 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13474 goto skip_calc;
13475 }
13476
13477 {
13478 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13479 assert(divisor > 0);
13480
13481 // adjust (effective) user objects down based on the number
13482 // of HitSet objects, which should not count toward our total since
13483 // they cannot be flushed.
13484 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13485
13486 // also exclude omap objects if ec backing pool
13487 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13488 assert(base_pool);
13489 if (!base_pool->supports_omap())
13490 unflushable += info.stats.stats.sum.num_objects_omap;
13491
13492 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13493 if (num_user_objects > unflushable)
13494 num_user_objects -= unflushable;
13495 else
13496 num_user_objects = 0;
13497
13498 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13499 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13500 num_user_bytes -= unflushable_bytes;
13501 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13502 num_user_bytes += num_overhead_bytes;
13503
13504 // also reduce the num_dirty by num_objects_omap
13505 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13506 if (!base_pool->supports_omap()) {
13507 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13508 num_dirty -= info.stats.stats.sum.num_objects_omap;
13509 else
13510 num_dirty = 0;
13511 }
13512
13513 dout(10) << __func__
13514 << " flush_mode: "
13515 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13516 << " evict_mode: "
13517 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13518 << " num_objects: " << info.stats.stats.sum.num_objects
13519 << " num_bytes: " << info.stats.stats.sum.num_bytes
13520 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13521 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13522 << " num_dirty: " << num_dirty
13523 << " num_user_objects: " << num_user_objects
13524 << " num_user_bytes: " << num_user_bytes
13525 << " num_overhead_bytes: " << num_overhead_bytes
13526 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13527 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13528 << dendl;
13529
13530 // get dirty, full ratios
13531 uint64_t dirty_micro = 0;
13532 uint64_t full_micro = 0;
13533 if (pool.info.target_max_bytes && num_user_objects > 0) {
13534 uint64_t avg_size = num_user_bytes / num_user_objects;
13535 dirty_micro =
13536 num_dirty * avg_size * 1000000 /
13537 MAX(pool.info.target_max_bytes / divisor, 1);
13538 full_micro =
13539 num_user_objects * avg_size * 1000000 /
13540 MAX(pool.info.target_max_bytes / divisor, 1);
13541 }
13542 if (pool.info.target_max_objects > 0) {
13543 uint64_t dirty_objects_micro =
13544 num_dirty * 1000000 /
13545 MAX(pool.info.target_max_objects / divisor, 1);
13546 if (dirty_objects_micro > dirty_micro)
13547 dirty_micro = dirty_objects_micro;
13548 uint64_t full_objects_micro =
13549 num_user_objects * 1000000 /
13550 MAX(pool.info.target_max_objects / divisor, 1);
13551 if (full_objects_micro > full_micro)
13552 full_micro = full_objects_micro;
13553 }
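// e.g., with hypothetical numbers: target_max_bytes=1TB over a
// divisor of 256 PGs gives ~4GB per PG; if this PG holds 3GB of
// user data of which 1GB is dirty, then full_micro ~= 750000
// (75%) and dirty_micro ~= 250000 (25%).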
13554 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13555 << " full " << ((float)full_micro / 1000000.0)
13556 << dendl;
13557
13558 // flush mode
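// (the slop term implements hysteresis: while idle we raise the
// thresholds so we don't start flushing on a slight excursion, and
// once active we lower them so we don't immediately stop again)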
13559 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13560 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13561 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13562 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13563 flush_target += flush_slop;
13564 flush_high_target += flush_slop;
13565 } else {
13566 flush_target -= MIN(flush_target, flush_slop);
13567 flush_high_target -= MIN(flush_high_target, flush_slop);
13568 }
13569
13570 if (dirty_micro > flush_high_target) {
13571 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13572 } else if (dirty_micro > flush_target) {
13573 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13574 }
13575
13576 // evict mode
13577 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13578 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13579 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13580 evict_target += evict_slop;
13581 else
13582 evict_target -= MIN(evict_target, evict_slop);
13583
13584 if (full_micro > 1000000) {
13585 // evict anything clean
13586 evict_mode = TierAgentState::EVICT_MODE_FULL;
13587 evict_effort = 1000000;
13588 } else if (full_micro > evict_target) {
13589 // set effort in [0..1] range based on where we are between evict_target and 100% full
13590 evict_mode = TierAgentState::EVICT_MODE_SOME;
13591 uint64_t over = full_micro - evict_target;
13592 uint64_t span = 1000000 - evict_target;
13593 evict_effort = MAX(over * 1000000 / span,
13594 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13595
13596 // quantize effort to avoid too much reordering in the agent_queue.
13597 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13598 assert(inc > 0);
13599 uint64_t was = evict_effort;
13600 evict_effort -= evict_effort % inc;
13601 if (evict_effort < inc)
13602 evict_effort = inc;
13603 assert(evict_effort >= inc && evict_effort <= 1000000);
13604 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13605 }
13606 }
13607
13608 skip_calc:
13609 bool old_idle = agent_state->is_idle();
13610 if (flush_mode != agent_state->flush_mode) {
13611 dout(5) << __func__ << " flush_mode "
13612 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13613 << " -> "
13614 << TierAgentState::get_flush_mode_name(flush_mode)
13615 << dendl;
13616 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13617 osd->agent_inc_high_count();
13618 info.stats.stats.sum.num_flush_mode_high = 1;
13619 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13620 info.stats.stats.sum.num_flush_mode_low = 1;
13621 }
13622 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13623 osd->agent_dec_high_count();
13624 info.stats.stats.sum.num_flush_mode_high = 0;
13625 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13626 info.stats.stats.sum.num_flush_mode_low = 0;
13627 }
13628 agent_state->flush_mode = flush_mode;
13629 }
13630 if (evict_mode != agent_state->evict_mode) {
13631 dout(5) << __func__ << " evict_mode "
13632 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13633 << " -> "
13634 << TierAgentState::get_evict_mode_name(evict_mode)
13635 << dendl;
13636 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13637 is_active()) {
13638 if (op)
13639 requeue_op(op);
13640 requeue_ops(waiting_for_flush);
13641 requeue_ops(waiting_for_active);
13642 requeue_ops(waiting_for_scrub);
13643 requeue_ops(waiting_for_cache_not_full);
13644 objects_blocked_on_cache_full.clear();
13645 requeued = true;
13646 }
13647 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13648 info.stats.stats.sum.num_evict_mode_some = 1;
13649 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13650 info.stats.stats.sum.num_evict_mode_full = 1;
13651 }
13652 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13653 info.stats.stats.sum.num_evict_mode_some = 0;
13654 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13655 info.stats.stats.sum.num_evict_mode_full = 0;
13656 }
13657 agent_state->evict_mode = evict_mode;
13658 }
13659 uint64_t old_effort = agent_state->evict_effort;
13660 if (evict_effort != agent_state->evict_effort) {
13661 dout(5) << __func__ << " evict_effort "
13662 << ((float)agent_state->evict_effort / 1000000.0)
13663 << " -> "
13664 << ((float)evict_effort / 1000000.0)
13665 << dendl;
13666 agent_state->evict_effort = evict_effort;
13667 }
13668
13669 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13670 // (including flush). This is probably fine (they should be
13671 // correlated) but it is not precisely correct.
13672 if (agent_state->is_idle()) {
13673 if (!restart && !old_idle) {
13674 osd->agent_disable_pg(this, old_effort);
13675 }
13676 } else {
13677 if (restart || old_idle) {
13678 osd->agent_enable_pg(this, agent_state->evict_effort);
13679 } else if (old_effort != agent_state->evict_effort) {
13680 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13681 }
13682 }
13683 return requeued;
13684 }
13685
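// Estimate an object's "temperature": 1000000 if it is in the current
// (open) hit set, plus a configurable per-age grade for each archived
// hit set (newest first) that contains it, consulting at most
// hit_set_search_last_n hits.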
13686 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13687 {
13688 assert(hit_set);
13689 assert(temp);
13690 *temp = 0;
13691 if (hit_set->contains(oid))
13692 *temp = 1000000;
13693 unsigned i = 0;
13694 int last_n = pool.info.hit_set_search_last_n;
13695 for (map<time_t,HitSetRef>::reverse_iterator p =
13696 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13697 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13698 if (p->second->contains(oid)) {
13699 *temp += pool.info.get_grade(i);
13700 --last_n;
13701 }
13702 }
13703 }
13704
13705 // Dup op detection
13706
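// Both checks below rely on repop_queue being in increasing version
// order: we can stop scanning at the first in-flight repop whose
// version is past v, since anything affecting v must sort before it.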
13707 bool PrimaryLogPG::already_complete(eversion_t v)
13708 {
13709 dout(20) << __func__ << ": " << v << dendl;
13710 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13711 !i.end();
13712 ++i) {
13713 dout(20) << __func__ << ": " << **i << dendl;
13714 // skip copy from temp object ops
13715 if ((*i)->v == eversion_t()) {
13716 dout(20) << __func__ << ": " << **i
13717 << " version is empty" << dendl;
13718 continue;
13719 }
13720 if ((*i)->v > v) {
13721 dout(20) << __func__ << ": " << **i
13722 << " (*i)->v past v" << dendl;
13723 break;
13724 }
13725 if (!(*i)->all_committed) {
13726 dout(20) << __func__ << ": " << **i
13727 << " not committed, returning false"
13728 << dendl;
13729 return false;
13730 }
13731 }
13732 dout(20) << __func__ << ": returning true" << dendl;
13733 return true;
13734 }
13735
13736 bool PrimaryLogPG::already_ack(eversion_t v)
13737 {
13738 dout(20) << __func__ << ": " << v << dendl;
13739 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13740 !i.end();
13741 ++i) {
13742 // skip copy from temp object ops
13743 if ((*i)->v == eversion_t()) {
13744 dout(20) << __func__ << ": " << **i
13745 << " version is empty" << dendl;
13746 continue;
13747 }
13748 if ((*i)->v > v) {
13749 dout(20) << __func__ << ": " << **i
13750 << " (*i)->v past v" << dendl;
13751 break;
13752 }
13753 if (!(*i)->all_applied) {
13754 dout(20) << __func__ << ": " << **i
13755 << " not applied, returning false"
13756 << dendl;
13757 return false;
13758 }
13759 }
13760 dout(20) << __func__ << ": returning true" << dendl;
13761 return true;
13762 }
13763
13764
13765 // ==========================================================================================
13766 // SCRUB
13767
13768
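// Walk the cached object contexts in [begin, end); if any object in
// the range is blocked (e.g. by an in-flight flush or promote), ask
// to have the scrub requeued when it unblocks and report the range
// as unavailable.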
13769 bool PrimaryLogPG::_range_available_for_scrub(
13770 const hobject_t &begin, const hobject_t &end)
13771 {
13772 pair<hobject_t, ObjectContextRef> next;
13773 next.second = object_contexts.lookup(begin);
13774 next.first = begin;
13775 bool more = true;
13776 while (more && next.first < end) {
13777 if (next.second && next.second->is_blocked()) {
13778 next.second->requeue_scrub_on_unblock = true;
13779 dout(10) << __func__ << ": scrub delayed, "
13780 << next.first << " is blocked"
13781 << dendl;
13782 return false;
13783 }
13784 more = object_contexts.get_next(next.first, &next);
13785 }
13786 return true;
13787 }
13788
13789 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13790 const vector<snapid_t>::reverse_iterator &curclone) {
13791 return snapset && curclone != snapset.get().clones.rend();
13792 }
13793
13794 void PrimaryLogPG::log_missing(unsigned missing,
13795 const boost::optional<hobject_t> &head,
13796 LogChannelRef clog,
13797 const spg_t &pgid,
13798 const char *func,
13799 const char *mode,
13800 bool allow_incomplete_clones)
13801 {
13802 assert(head);
13803 if (allow_incomplete_clones) {
13804 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13805 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13806 } else {
13807 clog->info() << mode << " " << pgid << " " << head.get()
13808 << " " << missing << " missing clone(s)";
13809 }
13810 }
13811
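/* Advance *curclone past any clones we expected to encounter before
 * `target` (or past all clones if target is unset), counting each as
 * missing and, unless incomplete clones are allowed (cache tiers),
 * logging a scrub error for it.  Returns the number of missing
 * clones.
 */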
13812 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13813 const boost::optional<SnapSet> &snapset,
13814 LogChannelRef clog,
13815 const spg_t &pgid,
13816 const char *mode,
13817 bool allow_incomplete_clones,
13818 boost::optional<snapid_t> target,
13819 vector<snapid_t>::reverse_iterator *curclone,
13820 inconsistent_snapset_wrapper &e)
13821 {
13822 assert(head);
13823 assert(snapset);
13824 unsigned missing = 0;
13825
13826 // NOTE: clones are in descending order, hence the **curclone > *target test here
13827 hobject_t next_clone(head.get());
13828 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13829 ++missing;
13830 // it is okay to be missing one or more clones in a cache tier.
13831 // skip higher-numbered clones in the list.
13832 if (!allow_incomplete_clones) {
13833 next_clone.snap = **curclone;
13834 clog->error() << mode << " " << pgid << " " << head.get()
13835 << " expected clone " << next_clone << " " << missing
13836 << " missing";
13837 ++scrubber.shallow_errors;
13838 e.set_clone_missing(next_clone.snap);
13839 }
13840 // Clones are descending
13841 ++(*curclone);
13842 }
13843 return missing;
13844 }
13845
13846 /*
13847 * Validate consistency of the object info and snap sets.
13848 *
13849 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13850 * the comparison of the objects is against multiple snapset.clones. There are
13851 * multiple clone lists and in between lists we expect head or snapdir.
13852 *
13853 * Example
13854 *
13855 * objects expected
13856 * ======= =======
13857 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13858 * obj2 head head/snapdir, head ok
13859 * [SnapSet clones 6 4 2 1]
13860 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13861 * obj2 snap 6 obj2 snap 6, match
13862 * obj2 snap 4 obj2 snap 4, match
13863 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13864 * [Snapset clones 3 1]
13865 * obj3 snap 3 obj3 snap 3 match
13866 * obj3 snap 1 obj3 snap 1 match
13867 * obj4 snapdir head/snapdir, snapdir ok
13868 * [Snapset clones 4]
13869 * EOL obj4 snap 4, (expected)
13870 */
13871 void PrimaryLogPG::scrub_snapshot_metadata(
13872 ScrubMap &scrubmap,
13873 const map<hobject_t,
13874 pair<boost::optional<uint32_t>,
13875 boost::optional<uint32_t>>> &missing_digest)
13876 {
13877 dout(10) << __func__ << dendl;
13878
13879 coll_t c(info.pgid);
13880 bool repair = state_test(PG_STATE_REPAIR);
13881 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13882 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13883 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13884
13885 /// snapsets to repair
13886 map<hobject_t,SnapSet> snapset_to_repair;
13887
13888 // traverse in reverse order.
13889 boost::optional<hobject_t> head;
13890 boost::optional<SnapSet> snapset; // If snapset is initialized, head (above) is too
13891 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13892 unsigned missing = 0;
13893 inconsistent_snapset_wrapper soid_error, head_error;
13894 unsigned soid_error_count = 0;
13895
13896 bufferlist last_data;
13897
13898 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13899 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13900 const hobject_t& soid = p->first;
13901 soid_error = inconsistent_snapset_wrapper{soid};
13902 object_stat_sum_t stat;
13903 boost::optional<object_info_t> oi;
13904
13905 if (!soid.is_snapdir())
13906 stat.num_objects++;
13907
13908 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13909 stat.num_objects_hit_set_archive++;
13910
13911 if (soid.is_snap()) {
13912 // it's a clone
13913 stat.num_object_clones++;
13914 }
13915
13916 // basic checks.
13917 if (p->second.attrs.count(OI_ATTR) == 0) {
13918 oi = boost::none;
13919 osd->clog->error() << mode << " " << info.pgid << " " << soid
13920 << " no '" << OI_ATTR << "' attr";
13921 ++scrubber.shallow_errors;
13922 soid_error.set_info_missing();
13923 } else {
13924 bufferlist bv;
13925 bv.push_back(p->second.attrs[OI_ATTR]);
13926 try {
13927 oi = object_info_t(); // Initialize optional<> before decode into it
13928 oi.get().decode(bv);
13929 } catch (buffer::error& e) {
13930 oi = boost::none;
13931 osd->clog->error() << mode << " " << info.pgid << " " << soid
13932 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13933 ++scrubber.shallow_errors;
13934 soid_error.set_info_corrupted();
13935 soid_error.set_info_missing(); // Not available too
13936 }
13937 }
13938
13939 if (oi) {
13940 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13941 osd->clog->error() << mode << " " << info.pgid << " " << soid
13942 << " on disk size (" << p->second.size
13943 << ") does not match object info size ("
13944 << oi->size << ") adjusted for ondisk to ("
13945 << pgbackend->be_get_ondisk_size(oi->size)
13946 << ")";
13947 soid_error.set_size_mismatch();
13948 ++scrubber.shallow_errors;
13949 }
13950
13951 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13952
13953 // A clone num_bytes will be added later when we have snapset
13954 if (!soid.is_snap()) {
13955 stat.num_bytes += oi->size;
13956 }
13957 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13958 stat.num_bytes_hit_set_archive += oi->size;
13959
13960 if (!soid.is_snapdir()) {
13961 if (oi->is_dirty())
13962 ++stat.num_objects_dirty;
13963 if (oi->is_whiteout())
13964 ++stat.num_whiteouts;
13965 if (oi->is_omap())
13966 ++stat.num_objects_omap;
13967 if (oi->is_cache_pinned())
13968 ++stat.num_objects_pinned;
13969 }
13970 } else {
13971 // pessimistic assumption that this object might contain a
13972 // legacy SnapSet
13973 stat.num_legacy_snapsets++;
13974 }
13975
13976 // Check for any problems while processing clones
13977 if (doing_clones(snapset, curclone)) {
13978 boost::optional<snapid_t> target;
13979 // Expecting an object with snap for current head
13980 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13981
13982 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13983 << soid << " while processing " << head.get() << dendl;
13984
13985 target = all_clones;
13986 } else {
13987 assert(soid.is_snap());
13988 target = soid.snap;
13989 }
13990
13991 // Log any clones we were expecting to be there up to target
13992 // This will set missing, but will be a no-op if snap.soid == *curclone.
13993 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13994 pool.info.allow_incomplete_clones(), target, &curclone,
13995 head_error);
13996 }
13997 bool expected;
13998 // Check doing_clones() again in case we ran process_clones_to()
13999 if (doing_clones(snapset, curclone)) {
14000 // A head/snapdir would have processed all clones above
14001 // or all greater than *curclone.
14002 assert(soid.is_snap() && *curclone <= soid.snap);
14003
14004 // After processing above clone snap should match the expected curclone
14005 expected = (*curclone == soid.snap);
14006 } else {
14007 // If we aren't doing clones any longer, then expecting head/snapdir
14008 expected = soid.has_snapset();
14009 }
14010 if (!expected) {
14011 // If we couldn't read the head's snapset, just ignore clones
14012 if (head && !snapset) {
14013 osd->clog->error() << mode << " " << info.pgid << " " << soid
14014 << " clone ignored due to missing snapset";
14015 } else {
14016 osd->clog->error() << mode << " " << info.pgid << " " << soid
14017 << " is an unexpected clone";
14018 }
14019 ++scrubber.shallow_errors;
14020 soid_error.set_headless();
14021 scrubber.store->add_snap_error(pool.id, soid_error);
14022 ++soid_error_count;
14023 if (head && soid.get_head() == head->get_head())
14024 head_error.set_clone(soid.snap);
14025 continue;
14026 }
14027
14028 // new snapset?
14029 if (soid.has_snapset()) {
14030
14031 if (missing) {
14032 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14033 pool.info.allow_incomplete_clones());
14034 }
14035
14036 // Save previous head error information
14037 if (head && (head_error.errors || soid_error_count))
14038 scrubber.store->add_snap_error(pool.id, head_error);
14039 // Set this as a new head object
14040 head = soid;
14041 missing = 0;
14042 head_error = soid_error;
14043 soid_error_count = 0;
14044
14045 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14046
14047 if (p->second.attrs.count(SS_ATTR) == 0) {
14048 osd->clog->error() << mode << " " << info.pgid << " " << soid
14049 << " no '" << SS_ATTR << "' attr";
14050 ++scrubber.shallow_errors;
14051 snapset = boost::none;
14052 head_error.set_snapset_missing();
14053 } else {
14054 bufferlist bl;
14055 bl.push_back(p->second.attrs[SS_ATTR]);
14056 bufferlist::iterator blp = bl.begin();
14057 try {
14058 snapset = SnapSet(); // Initialize optional<> before decoding into it
14059 ::decode(snapset.get(), blp);
14060 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14061 } catch (buffer::error& e) {
14062 snapset = boost::none;
14063 osd->clog->error() << mode << " " << info.pgid << " " << soid
14064 << " can't decode '" << SS_ATTR << "' attr " << e.what();
14065 ++scrubber.shallow_errors;
14066 head_error.set_snapset_corrupted();
14067 }
14068 }
14069
14070 if (snapset) {
14071 // what will be next?
14072 curclone = snapset->clones.rbegin();
14073
14074 if (!snapset->clones.empty()) {
14075 dout(20) << " snapset " << snapset.get() << dendl;
14076 if (snapset->seq == 0) {
14077 osd->clog->error() << mode << " " << info.pgid << " " << soid
14078 << " snaps.seq not set";
14079 ++scrubber.shallow_errors;
14080 head_error.set_snapset_error();
14081 }
14082 }
14083
14084 if (soid.is_head() && !snapset->head_exists) {
14085 osd->clog->error() << mode << " " << info.pgid << " " << soid
14086 << " snapset.head_exists=false, but head exists";
14087 ++scrubber.shallow_errors;
14088 head_error.set_head_mismatch();
14089 // Fix head_exists locally so is_legacy() returns correctly
14090 snapset->head_exists = true;
14091 }
14092 if (soid.is_snapdir() && snapset->head_exists) {
14093 osd->clog->error() << mode << " " << info.pgid << " " << soid
14094 << " snapset.head_exists=true, but snapdir exists";
14095 ++scrubber.shallow_errors;
14096 head_error.set_head_mismatch();
14097 // For symmetry fix this too, but probably doesn't matter
14098 snapset->head_exists = false;
14099 }
14100
14101 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
14102 if (soid.is_snapdir()) {
14103 dout(10) << " will move snapset to head from " << soid << dendl;
14104 snapset_to_repair[soid.get_head()] = *snapset;
14105 } else if (snapset->is_legacy()) {
14106 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
14107 << dendl;
14108 snapset_to_repair[soid.get_head()] = *snapset;
14109 }
14110 } else {
14111 stat.num_legacy_snapsets++;
14112 }
14113 } else {
14114 // pessimistic assumption that this object might contain a
14115 // legacy SnapSet
14116 stat.num_legacy_snapsets++;
14117 }
14118 } else {
14119 assert(soid.is_snap());
14120 assert(head);
14121 assert(snapset);
14122 assert(soid.snap == *curclone);
14123
14124 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14125
14126 if (snapset->clone_size.count(soid.snap) == 0) {
14127 osd->clog->error() << mode << " " << info.pgid << " " << soid
14128 << " is missing in clone_size";
14129 ++scrubber.shallow_errors;
14130 soid_error.set_size_mismatch();
14131 } else {
14132 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14133 osd->clog->error() << mode << " " << info.pgid << " " << soid
14134 << " size " << oi->size << " != clone_size "
14135 << snapset->clone_size[*curclone];
14136 ++scrubber.shallow_errors;
14137 soid_error.set_size_mismatch();
14138 }
14139
14140 if (snapset->clone_overlap.count(soid.snap) == 0) {
14141 osd->clog->error() << mode << " " << info.pgid << " " << soid
14142 << " is missing in clone_overlap";
14143 ++scrubber.shallow_errors;
14144 soid_error.set_size_mismatch();
14145 } else {
14146 // This checking is based on get_clone_bytes(). The first 2 asserts
14147 // can't happen because we know we have a clone_size and
14148 // a clone_overlap. Now we check that the interval_set won't
14149 // cause the last assert.
14150 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14151 const interval_set<uint64_t> &overlap =
14152 snapset->clone_overlap.find(soid.snap)->second;
14153 bool bad_interval_set = false;
14154 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14155 i != overlap.end(); ++i) {
14156 if (size < i.get_len()) {
14157 bad_interval_set = true;
14158 break;
14159 }
14160 size -= i.get_len();
14161 }
14162
14163 if (bad_interval_set) {
14164 osd->clog->error() << mode << " " << info.pgid << " " << soid
14165 << " bad interval_set in clone_overlap";
14166 ++scrubber.shallow_errors;
14167 soid_error.set_size_mismatch();
14168 } else {
14169 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14170 }
14171 }
14172 }
14173
14174 // migrate legacy_snaps to snapset?
14175 auto p = snapset_to_repair.find(soid.get_head());
14176 if (p != snapset_to_repair.end()) {
14177 if (!oi || oi->legacy_snaps.empty()) {
14178 osd->clog->error() << mode << " " << info.pgid << " " << soid
14179 << " has no oi or legacy_snaps; cannot convert "
14180 << *snapset;
14181 ++scrubber.shallow_errors;
14182 } else {
14183 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14184 << " to snapset " << p->second << dendl;
14185 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
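// e.g. a clone with snap id 4 whose legacy oi.snaps is [4,3] becomes
// clone_snaps[4] = [4,3] in the head's new-style SnapSet (hypothetical
// values for illustration).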
14186 }
14187 }
14188
14189 // advance to the next expected clone
14190 ++curclone;
14191 if (soid_error.errors) {
14192 scrubber.store->add_snap_error(pool.id, soid_error);
14193 ++soid_error_count;
14194 }
14195 }
14196
14197 scrub_cstat.add(stat);
14198 }
14199
14200 if (doing_clones(snapset, curclone)) {
14201 dout(10) << __func__ << " " << mode << " " << info.pgid
14202 << " No more objects while processing " << head.get() << dendl;
14203
14204 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14205 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14206 head_error);
14207 }
14208 // Missing clones may have been found by the check above, or earlier,
14209 // before we dropped out of the loop for the last head.
14210 if (missing) {
14211 log_missing(missing, head, osd->clog, info.pgid, __func__,
14212 mode, pool.info.allow_incomplete_clones());
14213 }
14214 if (head && (head_error.errors || soid_error_count))
14215 scrubber.store->add_snap_error(pool.id, head_error);
14216
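// Scrub computed digests for objects whose stored object_info_t lacked
// them (collected in missing_digest during the scrub above); persist
// them via ordinary repops so replicas are updated too. Snapdir objects
// are skipped; their metadata is rewritten by the snapset conversion
// pass below.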
14217 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
14218 if (p->first.is_snapdir())
14219 continue;
14220 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14221 ObjectContextRef obc = get_object_context(p->first, false);
14222 if (!obc) {
14223 osd->clog->error() << info.pgid << " " << mode
14224 << " cannot get object context for object "
14225 << p->first;
14226 continue;
14227 } else if (obc->obs.oi.soid != p->first) {
14228 osd->clog->error() << info.pgid << " " << mode
14229 << " object " << p->first
14230 << " has a valid oi attr with a mismatched name, "
14231 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14232 continue;
14233 }
14234 OpContextUPtr ctx = simple_opc_create(obc);
14235 ctx->at_version = get_next_version();
14236 ctx->mtime = utime_t(); // do not update mtime
14237 if (p->second.first) {
14238 ctx->new_obs.oi.set_data_digest(*p->second.first);
14239 } else {
14240 ctx->new_obs.oi.clear_data_digest();
14241 }
14242 if (p->second.second) {
14243 ctx->new_obs.oi.set_omap_digest(*p->second.second);
14244 } else {
14245 ctx->new_obs.oi.clear_omap_digest();
14246 }
14247 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14248
14249 ctx->register_on_success(
14250 [this]() {
14251 dout(20) << "updating scrub digest" << dendl;
14252 if (--scrubber.num_digest_updates_pending == 0) {
14253 requeue_scrub();
14254 }
14255 });
14256
14257 simple_opc_submit(std::move(ctx));
14258 ++scrubber.num_digest_updates_pending;
14259 }
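// Each repop submitted above bumps num_digest_updates_pending; the
// register_on_success callback requeues the scrub only when the count
// drops back to zero, i.e. once every digest update has been applied.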
14260 for (auto& p : snapset_to_repair) {
14261 // cache pools may not have the clones, which means we won't know
14262 // what snaps they have. fake out the clone_snaps entries anyway (with
14263 // blank snap lists).
14264 p.second.head_exists = true;
14265 if (pool.info.allow_incomplete_clones()) {
14266 for (auto s : p.second.clones) {
14267 if (p.second.clone_snaps.count(s) == 0) {
14268 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14269 << s << dendl;
14270 p.second.clone_snaps[s];
14271 }
14272 }
14273 }
14274 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14275 p.second.is_legacy()) {
14276 // this happens if we encounter other errors above, like a missing
14277 // or extra clone.
14278 dout(10) << __func__ << " not writing snapset to " << p.first
14279 << " snapset " << p.second << " clones " << p.second.clones
14280 << "; didn't convert fully" << dendl;
14281 scrub_cstat.sum.num_legacy_snapsets++;
14282 continue;
14283 }
14284 dout(10) << __func__ << " writing snapset to " << p.first
14285 << " " << p.second << dendl;
14286 ObjectContextRef obc = get_object_context(p.first, true);
14287 if (!obc) {
14288 osd->clog->error() << info.pgid << " " << mode
14289 << " cannot get object context for object "
14290 << p.first;
14291 continue;
14292 } else if (obc->obs.oi.soid != p.first) {
14293 osd->clog->error() << info.pgid << " " << mode
14294 << " object " << p.first
14295 << " has a valid oi attr with a mismatched name, "
14296 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14297 continue;
14298 }
14299 ObjectContextRef snapset_obc;
14300 if (!obc->obs.exists) {
14301 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14302 if (!snapset_obc) {
14303 osd->clog->error() << info.pgid << " " << mode
14304 << " cannot get object context for "
14305 << p.first.get_snapdir();
14306 continue;
14307 }
14308 }
14309 OpContextUPtr ctx = simple_opc_create(obc);
14310 PGTransaction *t = ctx->op_t.get();
14311 ctx->snapset_obc = snapset_obc;
14312 ctx->at_version = get_next_version();
14313 ctx->mtime = utime_t(); // do not update mtime
14314 ctx->new_snapset = p.second;
14315 if (!ctx->new_obs.exists) {
14316 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14317 ctx->new_obs.exists = true;
14318 ctx->new_snapset.head_exists = true;
14319 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14320 ++ctx->delta_stats.num_whiteouts;
14321 ++ctx->delta_stats.num_objects;
14322 t->create(p.first);
14323 if (p.first < scrubber.start) {
14324 dout(20) << __func__ << " kludging around update outside of scrub range"
14325 << dendl;
14326 } else {
14327 scrub_cstat.add(ctx->delta_stats);
14328 }
14329 }
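// The head created above exists only as a whiteout carrying the repaired
// SnapSet; FLAG_WHITEOUT keeps it logically nonexistent for ordinary
// client access (a summary of the flag's intent here, not new behavior).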
14330 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14331 assert(!ctx->new_snapset.is_legacy());
14332 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14333 ctx->register_on_success(
14334 [this]() {
14335 dout(20) << "updating snapset" << dendl;
14336 if (--scrubber.num_digest_updates_pending == 0) {
14337 requeue_scrub();
14338 }
14339 });
14340
14341 simple_opc_submit(std::move(ctx));
14342 ++scrubber.num_digest_updates_pending;
14343 }
14344
14345 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14346 }
14347
14348 void PrimaryLogPG::_scrub_clear_state()
14349 {
14350 scrub_cstat = object_stat_collection_t();
14351 }
14352
14353 void PrimaryLogPG::_scrub_finish()
14354 {
14355 bool repair = state_test(PG_STATE_REPAIR);
14356 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14357 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14358
14359 if (info.stats.stats_invalid) {
14360 info.stats.stats = scrub_cstat;
14361 info.stats.stats_invalid = false;
14362
14363 if (agent_state)
14364 agent_choose_mode();
14365 }
14366
14367 dout(10) << mode << " got "
14368 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14369 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14370 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14371 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14372 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14373 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14374 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14375 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14376 << dendl;
14377
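// Compare what this scrub observed against the PG's accumulated stats.
// Categories whose stored values are flagged *_stats_invalid (e.g. after
// an upgrade from a release that predates the counter) are skipped to
// avoid reporting false mismatches.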
14378 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14379 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14380 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14381 !info.stats.dirty_stats_invalid) ||
14382 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14383 !info.stats.omap_stats_invalid) ||
14384 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14385 !info.stats.pin_stats_invalid) ||
14386 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14387 !info.stats.hitset_stats_invalid) ||
14388 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14389 !info.stats.hitset_bytes_stats_invalid) ||
14390 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14391 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14392 osd->clog->error() << info.pgid << " " << mode
14393 << " stat mismatch, got "
14394 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14395 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14396 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14397 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14398 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14399 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14400 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14401 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14402 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14403 ++scrubber.shallow_errors;
14404
14405 if (repair) {
14406 ++scrubber.fixed;
14407 info.stats.stats = scrub_cstat;
14408 info.stats.dirty_stats_invalid = false;
14409 info.stats.omap_stats_invalid = false;
14410 info.stats.hitset_stats_invalid = false;
14411 info.stats.hitset_bytes_stats_invalid = false;
14412 publish_stats_to_osd();
14413 share_pg_info();
14414 }
14415 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14416 info.stats.stats.sum.num_legacy_snapsets) {
14417 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14418 << " from " << info.stats.stats.sum.num_legacy_snapsets
14419 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14420 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14421 publish_stats_to_osd();
14422 share_pg_info();
14423 }
14424 // Clear the object context cache so subsequent ops see repaired state, not stale cached contexts
14425 if (repair)
14426 object_contexts.clear();
14427 }
14428
14429 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14430 {
14431 return osd->check_osdmap_full(missing_on);
14432 }
14433
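// Invoked when a read on the primary hits an unrecoverable error such as
// EIO: mark the object missing so normal recovery can repair it from a
// replica, and park the op until the object becomes readable again.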
14434 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14435 {
14436 // Only supports replicated pools
14437 assert(!pool.info.require_rollback());
14438 assert(is_primary());
14439
14440 dout(10) << __func__ << " " << soid
14441 << " peers osd.{" << actingbackfill << "}" << dendl;
14442
14443 if (!is_clean()) {
14444 block_for_clean(soid, op);
14445 return -EAGAIN;
14446 }
14447
14448 assert(!pg_log.get_missing().is_missing(soid));
14449 bufferlist bv;
14450 object_info_t oi;
14451 eversion_t v;
14452 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14453 if (r < 0) {
14454 // getattr failed; leave v default-constructed and try to repair without a version
14455 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14456 << soid << " error=" << r << dendl;
14457 } else try {
14458 bufferlist::iterator bliter = bv.begin();
14459 ::decode(oi, bliter);
14460 v = oi.version;
14461 } catch (...) {
14462 // Leave v default-constructed. This will fail when sent to older OSDs,
14463 // but that is not much worse than failing here.
14464 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14465 }
14466
14467 missing_loc.add_missing(soid, v, eversion_t());
14468 if (primary_error(soid, v)) {
14469 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14470 // XXX: If we knew that no down osd could contain this object,
14471 // it would be nice to return EIO here.
14472 // If a "never fail" flag were available, rbd could use it to
14473 // avoid returning EIO until the object is marked lost.
14474
14475 // Drop through to save this op in case an osd comes up with the object.
14476 }
14477
14478 // Restart the op after object becomes readable again
14479 waiting_for_unreadable_object[soid].push_back(op);
14480 op->mark_delayed("waiting for missing object");
14481
14482 if (!eio_errors_to_process) {
14483 eio_errors_to_process = true;
14484 assert(is_clean());
14485 queue_peering_event(
14486 CephPeeringEvtRef(
14487 std::make_shared<CephPeeringEvt>(
14488 get_osdmap()->get_epoch(),
14489 get_osdmap()->get_epoch(),
14490 DoRecovery())));
14491 } else {
14492 // A prior error must have already cleared clean state and queued recovery
14493 // or a map change has triggered re-peering.
14494 // Not inlining the recovery by calling maybe_kick_recovery(soid);
14495 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14496 }
14497
14498 return -EAGAIN;
14499 }
14500
14501 /*---SnapTrimmer Logging---*/
14502 #undef dout_prefix
14503 #define dout_prefix *_dout << pg->gen_prefix()
14504
14505 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14506 {
14507 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14508 }
14509
14510 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14511 {
14512 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14513 }
14514
14515 /*---SnapTrimmer states---*/
14516 #undef dout_prefix
14517 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14518 << "SnapTrimmer state<" << get_state_name() << ">: ")
14519
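/*
 * Rough shape of the trim statechart as exercised by the reactions
 * below (states whose definitions live elsewhere in this file are
 * included for orientation):
 *
 *   NotTrimming --KickTrim--> Trimming/WaitReservation
 *                 (or WaitScrub while a scrub is active)
 *   WaitReservation --SnapTrimReserved--> AwaitAsyncWork
 *   AwaitAsyncWork --DoSnapWork--> WaitRepops   (work submitted)
 *                                | WaitRWLock   (clone write-locked)
 *                                | NotTrimming  (snap done, or error)
 */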
14520 /* NotTrimming */
14521 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14522 : my_base(ctx),
14523 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14524 {
14525 context< SnapTrimmer >().log_enter(state_name);
14526 }
14527
14528 void PrimaryLogPG::NotTrimming::exit()
14529 {
14530 context< SnapTrimmer >().log_exit(state_name, enter_time);
14531 }
14532
14533 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14534 {
14535 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14536 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14537
14538 if (!(pg->is_primary() && pg->is_active())) {
14539 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14540 return discard_event();
14541 }
14542 if (!pg->is_clean() ||
14543 pg->snap_trimq.empty()) {
14544 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14545 return discard_event();
14546 }
14547 if (pg->scrubber.active) {
14548 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14549 return transit< WaitScrub >();
14550 } else {
14551 return transit< Trimming >();
14552 }
14553 }
14554
14555 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14556 {
14557 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14558 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14559
14560 pending = nullptr;
14561 if (!context< SnapTrimmer >().can_trim()) {
14562 post_event(KickTrim());
14563 return transit< NotTrimming >();
14564 }
14565
14566 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14567 ldout(pg->cct, 10) << "NotTrimming: trimming "
14568 << pg->snap_trimq.range_start()
14569 << dendl;
14570 return transit< AwaitAsyncWork >();
14571 }
14572
14573 /* AwaitAsyncWork */
14574 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14575 : my_base(ctx),
14576 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14577 {
14578 auto *pg = context< SnapTrimmer >().pg;
14579 context< SnapTrimmer >().log_enter(state_name);
14580 pg->osd->queue_for_snap_trim(pg);
14581 pg->state_set(PG_STATE_SNAPTRIM);
14582 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14583 pg->publish_stats_to_osd();
14584 }
14585
14586 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14587 {
14588 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14589 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14590 auto &in_flight = context<Trimming>().in_flight;
14591 assert(in_flight.empty());
14592
14593 assert(pg->is_primary() && pg->is_active());
14594 if (!context< SnapTrimmer >().can_trim()) {
14595 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14596 post_event(KickTrim());
14597 return transit< NotTrimming >();
14598 }
14599
14600 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14601
14602 vector<hobject_t> to_trim;
14603 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14604 to_trim.reserve(max);
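// osd_pg_max_concurrent_snap_trims bounds how many clones this PG trims
// per DoSnapWork pass, keeping each burst of trim repops small.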
14605 int r = pg->snap_mapper.get_next_objects_to_trim(
14606 snap_to_trim,
14607 max,
14608 &to_trim);
14609 if (r != 0 && r != -ENOENT) {
14610 lderr(pg->cct) << "get_next_objects_to_trim returned "
14611 << cpp_strerror(r) << dendl;
14612 assert(0 == "get_next_objects_to_trim returned an invalid code");
14613 } else if (r == -ENOENT) {
14614 // Done!
14615 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14616
14617 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14618 << " to purged_snaps"
14619 << dendl;
14620 pg->info.purged_snaps.insert(snap_to_trim);
14621 pg->snap_trimq.erase(snap_to_trim);
14622 ldout(pg->cct, 10) << "purged_snaps now "
14623 << pg->info.purged_snaps << ", snap_trimq now "
14624 << pg->snap_trimq << dendl;
14625
14626 ObjectStore::Transaction t;
14627 pg->dirty_big_info = true;
14628 pg->write_if_dirty(t);
14629 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14630 assert(tr == 0);
14631
14632 pg->share_pg_info();
14633 post_event(KickTrim());
14634 return transit< NotTrimming >();
14635 }
14636 assert(!to_trim.empty());
14637
14638 for (auto &&object: to_trim) {
14639 // start trimming this clone
14640 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14641 OpContextUPtr ctx;
14642 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14643 if (error) {
14644 if (error == -ENOLCK) {
14645 ldout(pg->cct, 10) << "could not get write lock on obj "
14646 << object << dendl;
14647 } else {
14648 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14649 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14650 }
14651 if (!in_flight.empty()) {
14652 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14653 return transit< WaitRepops >();
14654 }
14655 if (error == -ENOLCK) {
14656 ldout(pg->cct, 10) << "waiting for it to clear"
14657 << dendl;
14658 return transit< WaitRWLock >();
14659 } else {
14660 return transit< NotTrimming >();
14661 }
14662 }
14663
14664 in_flight.insert(object);
14665 ctx->register_on_success(
14666 [pg, object, &in_flight]() {
14667 assert(in_flight.find(object) != in_flight.end());
14668 in_flight.erase(object);
14669 if (in_flight.empty()) {
14670 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14671 pg->snap_trimmer_machine.process_event(Reset());
14672 } else {
14673 pg->snap_trimmer_machine.process_event(RepopsComplete());
14674 }
14675 }
14676 });
14677
14678 pg->simple_opc_submit(std::move(ctx));
14679 }
14680
14681 return transit< WaitRepops >();
14682 }
14683
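// The *_maybe_cache helpers let erasure-coded (rollback) pools answer
// attr reads from obc->attr_cache rather than the backend; as the bodies
// below show, the write-side variants currently just delegate to the
// transaction.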
14684 void PrimaryLogPG::setattr_maybe_cache(
14685 ObjectContextRef obc,
14686 OpContext *op,
14687 PGTransaction *t,
14688 const string &key,
14689 bufferlist &val)
14690 {
14691 t->setattr(obc->obs.oi.soid, key, val);
14692 }
14693
14694 void PrimaryLogPG::setattrs_maybe_cache(
14695 ObjectContextRef obc,
14696 OpContext *op,
14697 PGTransaction *t,
14698 map<string, bufferlist> &attrs)
14699 {
14700 t->setattrs(obc->obs.oi.soid, attrs);
14701 }
14702
14703 void PrimaryLogPG::rmattr_maybe_cache(
14704 ObjectContextRef obc,
14705 OpContext *op,
14706 PGTransaction *t,
14707 const string &key)
14708 {
14709 t->rmattr(obc->obs.oi.soid, key);
14710 }
14711
14712 int PrimaryLogPG::getattr_maybe_cache(
14713 ObjectContextRef obc,
14714 const string &key,
14715 bufferlist *val)
14716 {
14717 if (pool.info.require_rollback()) {
14718 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14719 if (i != obc->attr_cache.end()) {
14720 if (val)
14721 *val = i->second;
14722 return 0;
14723 } else {
14724 return -ENODATA;
14725 }
14726 }
14727 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14728 }
14729
14730 int PrimaryLogPG::getattrs_maybe_cache(
14731 ObjectContextRef obc,
14732 map<string, bufferlist> *out)
14733 {
14734 int r = 0;
14735 assert(out);
14736 if (pool.info.require_rollback()) {
14737 *out = obc->attr_cache;
14738 } else {
14739 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14740 }
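// Keep only user xattrs, which are stored with a leading '_' on disk,
// and strip that prefix; the size check excludes OI_ATTR (a bare "_"),
// and SS_ATTR lacks the prefix, so internal attrs drop out.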
14741 map<string, bufferlist> tmp;
14742 for (map<string, bufferlist>::iterator i = out->begin();
14743 i != out->end();
14744 ++i) {
14745 if (i->first.size() > 1 && i->first[0] == '_')
14746 tmp[i->first.substr(1)].claim(i->second);
14747 }
14748 tmp.swap(*out);
14749 return r;
14750 }
14751
14752 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14753 return osd->check_failsafe_full(ss);
14754 }
14755
14756 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14757 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14758
14759 #ifdef PG_DEBUG_REFS
14760 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14761 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14762 #endif
14763
14764 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14765 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }