// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

/**
 * The CopyCallback class defines an interface for completions to the
 * start_copy code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  ~CopyCallback() override {}
};
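
/*
 * A minimal sketch of a CopyCallback implementation (LogOnlyCopyCb is a
 * hypothetical name, not upstream code); the CopyResults fields it reads
 * (object_size, user_version) are the ones CopyFromCallback below uses.
 *
 *   class LogOnlyCopyCb : public PrimaryLogPG::CopyCallback {
 *     void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *       int r = results.get<0>();               // 0, -ECANCELED, or -errno
 *       PrimaryLogPG::CopyResults *cr = results.get<1>();
 *       if (r == 0) {
 *         // consume cr->object_size, cr->user_version, ...
 *       }
 *       delete cr;  // per the contract above, the callback owns cr
 *     }
 *   };
 */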

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
  bool sync_finish(int r) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};

Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}
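
/*
 * Usage sketch for the Blessed* wrappers (C_MyCompletion is illustrative):
 * wrap a deferred completion so it is silently dropped, rather than run,
 * if the PG has been reset (e.g. a new interval) since the wrapper was
 * created.
 *
 *   Context *fin = new C_MyCompletion(...);
 *   t->register_on_commit(bless_context(fin));  // fires only if
 *                                               // !pg_has_reset_since(e)
 *
 * BlessedContext and BlessedGenContext take the PG lock around the check
 * and completion; UnlockedBlessedGenContext performs the same epoch check
 * without touching the lock.
 */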

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    pg->lock();
    pg->_applied_recovered_object(obc);
    pg->unlock();
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    pg->lock();
    pg->_applied_recovered_object_replica();
    pg->unlock();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
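
/*
 * Sketch of the async-read flow (local names are illustrative): an op
 * handler queues (offset, length, op_flags) triples, each with an output
 * bufferlist and an optional per-read completion, on pending_async_reads;
 * start_async_reads() then hands the whole batch to the backend.
 *
 *   ctx->pending_async_reads.push_back(
 *     make_pair(
 *       boost::make_tuple(off, len, flags),
 *       make_pair(&osd_op.outdata, static_cast<Context*>(nullptr))));
 *   ctx->start_async_reads(pg);
 *   // OnReadComplete -> finish_read() -> execute_ctx(ctx) restarts the
 *   // op once the batch completes.
 */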

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    ceph_assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
  missing_loc.add_location(soid, peer);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(peer_info.count(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= peer_info[peer].last_backfill;
  if (!should_send) {
    ceph_assert(is_backfill_targets(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << peer_info[peer].last_backfill << ")" << dendl;
    return should_send;
  }
  if (async_recovery_targets.count(peer) && peer_missing[peer].is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
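
/*
 * Worked example (hypothetical objects A < B < C in this pool): say a
 * backfill peer's last_backfill is A and the primary's
 * last_backfill_started is B. An op on A is sent normally; an op between
 * A and B is sent because the primary has already started backfilling
 * past it; an op on C, beyond both watermarks, ships only an empty repop
 * so the peer's log and version state stay in step without data for an
 * object it does not yet have.
 */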


ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  ceph_assert(!acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
       i != acting_recovery_backfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (async_recovery_targets.count(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
{
  for (auto &i: async_recovery_targets) {
    auto peer_missing_entry = peer_missing.find(i);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      dout(30) << __func__ << " " << soid << dendl;
      return true;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_head(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = pg_log.get_log().head.version;
  hobject_t soid;
  if (!pg_log.get_missing().get_rmissing().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator it = acting_recovery_backfill.begin();
       it != acting_recovery_backfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = peer_missing.find(peer);
    if (it_missing != peer_missing.end() &&
        !it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = peer_missing[peer].get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
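
/*
 * Worked example (hypothetical config values): with
 * osd_max_pg_log_entries = 3000 and
 * osd_force_recovery_pg_log_entries_factor = 1.3, forced recovery kicks
 * in once the approximate log size exceeds 3900 entries. The oldest
 * missing object across the primary and its acting_recovery_backfill
 * peers is then recovered first so the log can be trimmed again.
 */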

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::const_iterator &params) override
  {
    try {
      decode(xattr, params);
      decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  explicit PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::const_iterator &params) override
  {
    try {
      decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  auto iter = xattr_data.cbegin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
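
/*
 * Client-side sketch of the filter payload consumed above (all values
 * illustrative): the stream carries the filter type followed by
 * type-specific parameters; for the built-in "plain" filter those are an
 * xattr name and the exact value it must contain.
 *
 *   bufferlist params;
 *   encode(std::string("plain"), params);  // or "parent", or "cls.filt"
 *   encode(std::string("_key"), params);   // xattr to read (hypothetical)
 *   encode(std::string("value"), params);  // required contents (hypothetical)
 *
 * A dotted type such as "cls.filt" (hypothetical) is resolved by opening
 * object class "cls" and asking it for a filter named "filt".
 */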


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!async_recovery_targets.empty()) {
      f->open_array_section("async_recovery_targets");
      for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
           p != async_recovery_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!acting_recovery_backfill.empty()) {
      f->open_array_section("acting_recovery_backfill");
      for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
           p != acting_recovery_backfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.is_erasure()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
                mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = missing_loc.get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}
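
/*
 * The handlers above back the per-PG admin commands, commonly invoked as
 * (CLI spellings given for orientation):
 *
 *   ceph pg <pgid> query
 *   ceph pg <pgid> list_unfound
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *
 * "revert" is refused for erasure-coded pools, and nothing is marked
 * lost until all possible sources of the unfound objects have been
 * probed or are themselves known to be lost.
 */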

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
                 << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash()
                   << std::dec << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        encode(response, osd_op.outdata);
        if (filter)
          encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        ceph_assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        encode(response, osd_op.outdata);
        if (filter)
          encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL;  // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
                                  cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
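
/*
 * Worked example (hypothetical numbers): with target = 3000 entries and
 * an approximate log size of 3500, num_to_trim = min(500,
 * osd_pg_log_trim_max). If that falls below osd_pg_log_trim_min the trim
 * is deferred so work is batched; otherwise we walk num_to_trim entries
 * from the oldest end and trim to that version, capped at `limit`
 * (min_last_complete_ondisk, itself bounded by can_rollback_to) so
 * entries that may still be needed for rollback or by a lagging replica
 * are never trimmed.
 */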

void PrimaryLogPG::calc_trim_to_aggressive()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }
  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
    pg_log.get_head(),
    pg_log.get_can_rollback_to());
  dout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    dout(10) << __func__ << " approx pg log length = "
             << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    dout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin();  // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep;                      // start from tail
    eversion_t by_n_to_trim = eversion_t::max();  // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      i++;
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
                           const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
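
/*
 * Example (hypothetical ranges): if a client acks a backoff over [A, Z)
 * but this PG currently spans only [D, M), the ack is clamped to [D, M)
 * before Session::ack_backoff is called, so that (for instance, after a
 * PG split) each PG releases only the portion of the backoff it owns.
 */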
1755
1756 void PrimaryLogPG::do_request(
1757 OpRequestRef& op,
1758 ThreadPool::TPHandle &handle)
1759 {
1760 if (op->osd_trace) {
1761 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1762 op->pg_trace.event("do request");
1763 }
1764 // make sure we have a new enough map
1765 auto p = waiting_for_map.find(op->get_source());
1766 if (p != waiting_for_map.end()) {
1767 // preserve ordering
1768 dout(20) << __func__ << " waiting_for_map "
1769 << p->first << " not empty, queueing" << dendl;
1770 p->second.push_back(op);
1771 op->mark_delayed("waiting_for_map not empty");
1772 return;
1773 }
1774 if (!have_same_or_newer_map(op->min_epoch)) {
1775 dout(20) << __func__ << " min " << op->min_epoch
1776 << ", queue on waiting_for_map " << op->get_source() << dendl;
1777 waiting_for_map[op->get_source()].push_back(op);
1778 op->mark_delayed("op must wait for map");
1779 osd->request_osdmap_update(op->min_epoch);
1780 return;
1781 }
1782
1783 if (can_discard_request(op)) {
1784 return;
1785 }
1786
1787 // pg-wide backoffs
1788 const Message *m = op->get_req();
1789 int msg_type = m->get_type();
1790 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1791 SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
1792 if (!session)
1793 return; // drop it.
1794
1795 if (msg_type == CEPH_MSG_OSD_OP) {
1796 if (session->check_backoff(cct, info.pgid,
1797 info.pgid.pgid.get_hobj_start(), m)) {
1798 return;
1799 }
1800
1801 bool backoff =
1802 is_down() ||
1803 is_incomplete() ||
1804 (!is_active() && is_peered());
1805 if (g_conf()->osd_backoff_on_peering && !backoff) {
1806 if (is_peering()) {
1807 backoff = true;
1808 }
1809 }
1810 if (backoff) {
1811 add_pg_backoff(session);
1812 return;
1813 }
1814 }
1815 // pg backoff acks at pg-level
1816 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
1817 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1818 if (ba->begin != ba->end) {
1819 handle_backoff(op);
1820 return;
1821 }
1822 }
1823 }
1824
1825 if (!is_peered()) {
1826 // Delay unless PGBackend says it's ok
1827 if (pgbackend->can_handle_while_inactive(op)) {
1828 bool handled = pgbackend->handle_message(op);
1829 ceph_assert(handled);
1830 return;
1831 } else {
1832 waiting_for_peered.push_back(op);
1833 op->mark_delayed("waiting for peered");
1834 return;
1835 }
1836 }
1837
1838 if (flushes_in_progress > 0) {
1839 dout(20) << flushes_in_progress
1840 << " flushes_in_progress pending "
1841 << "waiting for flush on " << op << dendl;
1842 waiting_for_flush.push_back(op);
1843 op->mark_delayed("waiting for flush");
1844 return;
1845 }
1846
1847 ceph_assert(is_peered() && flushes_in_progress == 0);
1848 if (pgbackend->handle_message(op))
1849 return;
1850
1851 switch (msg_type) {
1852 case CEPH_MSG_OSD_OP:
1853 case CEPH_MSG_OSD_BACKOFF:
1854 if (!is_active()) {
1855 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1856 waiting_for_active.push_back(op);
1857 op->mark_delayed("waiting for active");
1858 return;
1859 }
1860 switch (msg_type) {
1861 case CEPH_MSG_OSD_OP:
1862 // verify client features
1863 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1864 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1865 osd->reply_op_error(op, -EOPNOTSUPP);
1866 return;
1867 }
1868 do_op(op);
1869 break;
1870 case CEPH_MSG_OSD_BACKOFF:
1871 // object-level backoff acks handled in osdop context
1872 handle_backoff(op);
1873 break;
1874 }
1875 break;
1876
1877 case MSG_OSD_PG_SCAN:
1878 do_scan(op, handle);
1879 break;
1880
1881 case MSG_OSD_PG_BACKFILL:
1882 do_backfill(op);
1883 break;
1884
1885 case MSG_OSD_PG_BACKFILL_REMOVE:
1886 do_backfill_remove(op);
1887 break;
1888
1889 case MSG_OSD_SCRUB_RESERVE:
1890 {
1891 const MOSDScrubReserve *m =
1892 static_cast<const MOSDScrubReserve*>(op->get_req());
1893 switch (m->type) {
1894 case MOSDScrubReserve::REQUEST:
1895 handle_scrub_reserve_request(op);
1896 break;
1897 case MOSDScrubReserve::GRANT:
1898 handle_scrub_reserve_grant(op, m->from);
1899 break;
1900 case MOSDScrubReserve::REJECT:
1901 handle_scrub_reserve_reject(op, m->from);
1902 break;
1903 case MOSDScrubReserve::RELEASE:
1904 handle_scrub_reserve_release(op);
1905 break;
1906 }
1907 }
1908 break;
1909
1910 case MSG_OSD_REP_SCRUB:
1911 replica_scrub(op, handle);
1912 break;
1913
1914 case MSG_OSD_REP_SCRUBMAP:
1915 do_replica_scrub_map(op);
1916 break;
1917
1918 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1919 do_update_log_missing(op);
1920 break;
1921
1922 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1923 do_update_log_missing_reply(op);
1924 break;
1925
1926 default:
1927 ceph_abort_msg("bad message type in do_request");
1928 }
1929 }
1930
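// Return the lowest last_backfill bound among all backfill targets;
// everything up to that bound has already been backfilled on every
// target, so backfill must resume from here.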
1931 hobject_t PrimaryLogPG::earliest_backfill() const
1932 {
1933 hobject_t e = hobject_t::get_max();
1934 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1935 i != backfill_targets.end();
1936 ++i) {
1937 pg_shard_t bt = *i;
1938 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1939 ceph_assert(iter != peer_info.end());
1940 if (iter->second.last_backfill < e)
1941 e = iter->second.last_backfill;
1942 }
1943 return e;
1944 }
1945
1946 /** do_op - do an op
1947 * pg lock will be held (if multithreaded)
1948 * osd_lock NOT held.
1949 */
1950 void PrimaryLogPG::do_op(OpRequestRef& op)
1951 {
1952 FUNCTRACE(cct);
1953 // NOTE: take a non-const pointer here; we must be careful not to
1954 // change anything that will break other reads on m (operator<<).
1955 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1956 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1957 if (m->finish_decode()) {
1958 op->reset_desc(); // for TrackedOp
1959 m->clear_payload();
1960 }
1961
1962 dout(20) << __func__ << ": op " << *m << dendl;
1963
1964 hobject_t head = m->get_hobj();
1965 head.snap = CEPH_NOSNAP;
1966
1967 if (!info.pgid.pgid.contains(
1968 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1969 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1970 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1971 << std::hex << head.get_hash() << std::dec << dendl;
1972 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1973 << " op " << *m;
1974 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
1975 return;
1976 }
1977
1978 bool can_backoff =
1979 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1980 SessionRef session;
1981 if (can_backoff) {
1982 session = static_cast<Session*>(m->get_connection()->get_priv().get());
1983 if (!session.get()) {
1984 dout(10) << __func__ << " no session" << dendl;
1985 return;
1986 }
1987
1988 if (session->check_backoff(cct, info.pgid, head, m)) {
1989 return;
1990 }
1991 }
1992
1993 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1994 // not implemented.
1995 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1996 osd->reply_op_error(op, -EINVAL);
1997 return;
1998 }
1999
2000 if (op->rmw_flags == 0) {
2001 int r = osd->osd->init_op_flags(op);
2002 if (r) {
2003 osd->reply_op_error(op, r);
2004 return;
2005 }
2006 }
2007
2008 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2009 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2010 op->may_read() &&
2011 !(op->may_write() || op->may_cache())) {
2012 // balanced reads; any replica will do
2013 if (!(is_primary() || is_replica())) {
2014 osd->handle_misdirected_op(this, op);
2015 return;
2016 }
2017 } else {
2018 // normal case; must be primary
2019 if (!is_primary()) {
2020 osd->handle_misdirected_op(this, op);
2021 return;
2022 }
2023 }
2024
2025 if (!op_has_sufficient_caps(op)) {
2026 osd->reply_op_error(op, -EPERM);
2027 return;
2028 }
2029
2030 if (op->includes_pg_op()) {
2031 return do_pg_op(op);
2032 }
2033
2034 // object name too long?
2035 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
2036 dout(4) << "do_op name is longer than "
2037 << cct->_conf->osd_max_object_name_len
2038 << " bytes" << dendl;
2039 osd->reply_op_error(op, -ENAMETOOLONG);
2040 return;
2041 }
2042 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
2043 dout(4) << "do_op locator is longer than "
2044 << cct->_conf->osd_max_object_name_len
2045 << " bytes" << dendl;
2046 osd->reply_op_error(op, -ENAMETOOLONG);
2047 return;
2048 }
2049 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
2050 dout(4) << "do_op namespace is longer than "
2051 << cct->_conf->osd_max_object_namespace_len
2052 << " bytes" << dendl;
2053 osd->reply_op_error(op, -ENAMETOOLONG);
2054 return;
2055 }
2056
2057 if (int r = osd->store->validate_hobject_key(head)) {
2058 dout(4) << "do_op object " << head << " invalid for backing store: "
2059 << r << dendl;
2060 osd->reply_op_error(op, r);
2061 return;
2062 }
2063
2064 // blacklisted?
2065 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
2066 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
2067 osd->reply_op_error(op, -EBLACKLISTED);
2068 return;
2069 }
2070
2071 // order this op as a write?
2072 bool write_ordered = op->rwordered();
2073
2074 // discard due to cluster full transition? (we discard any op that
2075 // originates before the cluster or pool is marked full; the client
2076 // will resend after the full flag is removed or if they expect the
2077 // op to succeed despite being full). The exceptions are FULL_FORCE and
2078 // FULL_TRY ops, which there is no reason to discard because they
2079 // bypass all full checks anyway. If this op isn't write-ordered,
2080 // we skip this check entirely.
2081 // FIXME: we exclude mds writes for now.
2082 if (write_ordered && !(m->get_source().is_mds() ||
2083 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2084 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2085 info.history.last_epoch_marked_full > m->get_map_epoch()) {
2086 dout(10) << __func__ << " discarding op sent before full " << m << " "
2087 << *m << dendl;
2088 return;
2089 }
2090 // The mds should have stopped writing before this point.
2091 // We can't allow the OSD to become non-startable even if the mds
2092 // is still writing as part of file removals.
2093 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
2094 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2095 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
2096 return;
2097 }
2098 int64_t poolid = get_pgid().pool();
2099 if (op->may_write()) {
2100
2101 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2102 if (!pi) {
2103 return;
2104 }
2105
2106 // invalid?
2107 if (m->get_snapid() != CEPH_NOSNAP) {
2108 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2109 osd->reply_op_error(op, -EINVAL);
2110 return;
2111 }
2112
2113 // too big?
2114 if (cct->_conf->osd_max_write_size &&
2115 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2116 // journal can't hold commit!
2117 derr << "do_op msg data len " << m->get_data_len()
2118 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2119 << " on " << *m << dendl;
2120 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2121 return;
2122 }
2123 }
2124
2125 dout(10) << "do_op " << *m
2126 << (op->may_write() ? " may_write" : "")
2127 << (op->may_read() ? " may_read" : "")
2128 << (op->may_cache() ? " may_cache" : "")
2129 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2130 << " flags " << ceph_osd_flag_string(m->get_flags())
2131 << dendl;
2132
2133 // missing object?
2134 if (is_unreadable_object(head)) {
2135 if (!is_primary()) {
2136 osd->reply_op_error(op, -EAGAIN);
2137 return;
2138 }
2139 if (can_backoff &&
2140 (g_conf()->osd_backoff_on_degraded ||
2141 (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2142 add_backoff(session, head, head);
2143 maybe_kick_recovery(head);
2144 } else {
2145 wait_for_unreadable_object(head, op);
2146 }
2147 return;
2148 }
2149
2150 if (write_ordered) {
2151 // degraded object?
2152 if (is_degraded_or_backfilling_object(head)) {
2153 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2154 add_backoff(session, head, head);
2155 maybe_kick_recovery(head);
2156 } else {
2157 wait_for_degraded_object(head, op);
2158 }
2159 return;
2160 }
2161
2162 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
2163 dout(20) << __func__ << ": waiting for scrub" << dendl;
2164 waiting_for_scrub.push_back(op);
2165 op->mark_delayed("waiting for scrub");
2166 return;
2167 }
2168
2169 // blocked on snap?
2170 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2171 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2172 hobject_t to_wait_on(head);
2173 to_wait_on.snap = blocked_iter->second;
2174 wait_for_degraded_object(to_wait_on, op);
2175 return;
2176 }
2177 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2178 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2179 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2180 return;
2181 }
2182 if (objects_blocked_on_cache_full.count(head)) {
2183 block_write_on_full_cache(head, op);
2184 return;
2185 }
2186 }
2187
2188 // dup/resent?
2189 if (op->may_write() || op->may_cache()) {
2190 // warning: we will get back *a* request for this reqid, but not
2191 // necessarily the most recent. this happens with flush and
2192 // promote ops, but we can't possibly have both in our log where
2193 // the original request is still not stable on disk, so for our
2194 // purposes here it doesn't matter which one we get.
2195 eversion_t version;
2196 version_t user_version;
2197 int return_code = 0;
2198 bool got = check_in_progress_op(
2199 m->get_reqid(), &version, &user_version, &return_code);
2200 if (got) {
2201 dout(3) << __func__ << " dup " << m->get_reqid()
2202 << " version " << version << dendl;
2203 if (already_complete(version)) {
2204 osd->reply_op_error(op, return_code, version, user_version);
2205 } else {
2206 dout(10) << " waiting for " << version << " to commit" << dendl;
2207 // always queue ondisk waiters, so that we can requeue if needed
2208 waiting_for_ondisk[version].emplace_back(op, user_version, return_code);
2209 op->mark_delayed("waiting for ondisk");
2210 }
2211 return;
2212 }
2213 }
2214
2215 ObjectContextRef obc;
2216 bool can_create = op->may_write();
2217 hobject_t missing_oid;
2218
2219 // kludge around the fact that LIST_SNAPS ops are sent with the CEPH_SNAPDIR snapid
2220 hobject_t _oid_head;
2221 if (m->get_snapid() == CEPH_SNAPDIR) {
2222 _oid_head = m->get_hobj().get_head();
2223 }
2224 const hobject_t& oid =
2225 m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj();
2226
2227 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2228 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2229 OSDOp& osd_op = *p;
2230
2231 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2232 if (m->get_snapid() != CEPH_SNAPDIR) {
2233 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2234 osd->reply_op_error(op, -EINVAL);
2235 return;
2236 }
2237 } else {
2238 if (m->get_snapid() == CEPH_SNAPDIR) {
2239 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2240 osd->reply_op_error(op, -EINVAL);
2241 return;
2242 }
2243 }
2244 }
2245
2246 // io blocked on obc?
2247 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2248 maybe_await_blocked_head(oid, op)) {
2249 return;
2250 }
2251
2252 int r = find_object_context(
2253 oid, &obc, can_create,
2254 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2255 &missing_oid);
2256
2257 // LIST_SNAPS needs the ssc too
2258 if (obc &&
2259 m->get_snapid() == CEPH_SNAPDIR &&
2260 !obc->ssc) {
2261 obc->ssc = get_snapset_context(oid, true);
2262 }
2263
2264 if (r == -EAGAIN) {
2265 // If we're not the primary for this PG, we just return -EAGAIN.
2266 // Otherwise, we have to wait for the object.
2267 if (is_primary()) {
2268 // missing the specific snap we need; requeue and wait.
2269 ceph_assert(!op->may_write()); // only happens on a read/cache
2270 wait_for_unreadable_object(missing_oid, op);
2271 return;
2272 }
2273 } else if (r == 0) {
2274 if (is_unreadable_object(obc->obs.oi.soid)) {
2275 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2276 << " is unreadable, waiting" << dendl;
2277 wait_for_unreadable_object(obc->obs.oi.soid, op);
2278 return;
2279 }
2280
2281 // degraded object? (the check above was for head; this could be a clone)
2282 if (write_ordered &&
2283 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2284 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2285 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2286 << " is degraded, waiting" << dendl;
2287 wait_for_degraded_object(obc->obs.oi.soid, op);
2288 return;
2289 }
2290 }
2291
2292 bool in_hit_set = false;
2293 if (hit_set) {
2294 if (obc.get()) {
2295 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2296 in_hit_set = true;
2297 } else {
2298 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2299 in_hit_set = true;
2300 }
2301 if (!op->hitset_inserted) {
2302 hit_set->insert(oid);
2303 op->hitset_inserted = true;
2304 if (hit_set->is_full() ||
2305 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2306 hit_set_persist();
2307 }
2308 }
2309 }
2310
2311 if (agent_state) {
2312 if (agent_choose_mode(false, op))
2313 return;
2314 }
2315
2316 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2317 if (maybe_handle_manifest(op,
2318 write_ordered,
2319 obc))
2320 return;
2321 }
2322
2323 if (maybe_handle_cache(op,
2324 write_ordered,
2325 obc,
2326 r,
2327 missing_oid,
2328 false,
2329 in_hit_set))
2330 return;
2331
2332 if (r && (r != -ENOENT || !obc)) {
2333 // copy the reqids for copy get on ENOENT
2334 if (r == -ENOENT &&
2335 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2336 fill_in_copy_get_noent(op, oid, m->ops[0]);
2337 return;
2338 }
2339 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2340 if (op->may_write() &&
2341 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2342 record_write_error(op, oid, nullptr, r);
2343 } else {
2344 osd->reply_op_error(op, r);
2345 }
2346 return;
2347 }
2348
2349 // make sure locator is consistent
2350 object_locator_t oloc(obc->obs.oi.soid);
2351 if (m->get_object_locator() != oloc) {
2352 dout(10) << " provided locator " << m->get_object_locator()
2353 << " != object's " << obc->obs.oi.soid << dendl;
2354 osd->clog->warn() << "bad locator " << m->get_object_locator()
2355 << " on object " << oloc
2356 << " op " << *m;
2357 }
2358
2359 // io blocked on obc?
2360 if (obc->is_blocked() &&
2361 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2362 wait_for_blocked_object(obc->obs.oi.soid, op);
2363 return;
2364 }
2365
2366 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2367
2368 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2369
2370 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2371 dout(20) << __func__ << ": skipping rw locks" << dendl;
2372 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2373 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2374
2375 // verify there is in fact a flush in progress
2376 // FIXME: we could make this a stronger test.
2377 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2378 if (p == flush_ops.end()) {
2379 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2380 reply_ctx(ctx, -EINVAL);
2381 return;
2382 }
2383 } else if (!get_rw_locks(write_ordered, ctx)) {
2384 dout(20) << __func__ << " waiting for rw locks " << dendl;
2385 op->mark_delayed("waiting for rw locks");
2386 close_op_ctx(ctx);
2387 return;
2388 }
2389 dout(20) << __func__ << " obc " << *obc << dendl;
2390
2391 if (r) {
2392 dout(20) << __func__ << " returned an error: " << r << dendl;
2393 close_op_ctx(ctx);
2394 if (op->may_write() &&
2395 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2396 record_write_error(op, oid, nullptr, r);
2397 } else {
2398 osd->reply_op_error(op, r);
2399 }
2400 return;
2401 }
2402
2403 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2404 ctx->ignore_cache = true;
2405 }
2406
2407 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2408 // This object is lost. Reading from it returns an error.
2409 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2410 << " is lost" << dendl;
2411 reply_ctx(ctx, -ENFILE);
2412 return;
2413 }
2414 if (!op->may_write() &&
2415 !op->may_cache() &&
2416 (!obc->obs.exists ||
2417 ((m->get_snapid() != CEPH_SNAPDIR) &&
2418 obc->obs.oi.is_whiteout()))) {
2419 // copy the reqids for copy get on ENOENT
2420 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2421 fill_in_copy_get_noent(op, oid, m->ops[0]);
2422 close_op_ctx(ctx);
2423 return;
2424 }
2425 reply_ctx(ctx, -ENOENT);
2426 return;
2427 }
2428
2429 op->mark_started();
2430
2431 execute_ctx(ctx);
2432 utime_t prepare_latency = ceph_clock_now();
2433 prepare_latency -= op->get_dequeued_time();
2434 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2435 if (op->may_read() && op->may_write()) {
2436 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2437 } else if (op->may_read()) {
2438 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2439 } else if (op->may_write() || op->may_cache()) {
2440 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2441 }
2442
2443 // force recovery of the oldest missing object if too many logs
2444 maybe_force_recovery();
2445 }
2446
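// Decide how an op on a manifest (redirect or chunked) object should be
// serviced: proxy it to the backing object(s), block it behind recovery,
// scrub, a promotion, or a flush, or fall through to normal processing
// (NOOP). Ops that manipulate the manifest itself (SET_REDIRECT,
// SET_CHUNK, TIER_PROMOTE, UNSET_MANIFEST) are always handled locally.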
2447 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2448 OpRequestRef op,
2449 bool write_ordered,
2450 ObjectContextRef obc)
2451 {
2452 ceph_assert(obc);
2453 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2454 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2455 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2456 return cache_result_t::NOOP;
2457 }
2458
2459 // if it is write-ordered and blocked, stop now
2460 if (obc->is_blocked() && write_ordered) {
2461 // we're already doing something with this object
2462 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2463 return cache_result_t::NOOP;
2464 }
2465
2466 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2467 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2468 OSDOp& osd_op = *p;
2469 ceph_osd_op& op = osd_op.op;
2470 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2471 op.op == CEPH_OSD_OP_SET_CHUNK ||
2472 op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2473 op.op == CEPH_OSD_OP_UNSET_MANIFEST) {
2474 return cache_result_t::NOOP;
2475 }
2476 }
2477
2478 switch (obc->obs.oi.manifest.type) {
2479 case object_manifest_t::TYPE_REDIRECT:
2480 if (op->may_write() || write_ordered) {
2481 do_proxy_write(op, obc);
2482 } else {
2483 // the object was already promoted (it has data); serve the read locally
2484 if (obc->obs.oi.size != 0) {
2485 return cache_result_t::NOOP;
2486 }
2487 do_proxy_read(op, obc);
2488 }
2489 return cache_result_t::HANDLED_PROXY;
2490 case object_manifest_t::TYPE_CHUNKED:
2491 {
2492 if (can_proxy_chunked_read(op, obc)) {
2493 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2494 if (p != flush_ops.end()) {
2495 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2496 return cache_result_t::HANDLED_PROXY;
2497 }
2498 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2499 return cache_result_t::HANDLED_PROXY;
2500 }
2501
2502 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2503 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2504 hobject_t head = m->get_hobj();
2505
2506 if (is_degraded_or_backfilling_object(head)) {
2507 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2508 wait_for_degraded_object(head, op);
2509 return cache_result_t::BLOCKED_RECOVERY;
2510 }
2511
2512 if (write_blocked_by_scrub(head)) {
2513 dout(20) << __func__ << ": waiting for scrub" << dendl;
2514 waiting_for_scrub.push_back(op);
2515 op->mark_delayed("waiting for scrub");
2516 return cache_result_t::BLOCKED_RECOVERY;
2517 }
2518
2519 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2520 if (p.second.is_missing()) {
2521 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2522 const object_locator_t oloc = m->get_object_locator();
2523 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2524 return cache_result_t::BLOCKED_PROMOTE;
2525 }
2526 }
2527
2528 bool all_dirty = true;
2529 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2530 if (!p.second.is_dirty()) {
2531 all_dirty = false;
2532 }
2533 }
2534 if (all_dirty) {
2535 start_flush(OpRequestRef(), obc, true, NULL, boost::none);
2536 }
2537 return cache_result_t::NOOP;
2538 }
2539 default:
2540 ceph_abort_msg("unrecognized manifest type");
2541 }
2542
2543 return cache_result_t::NOOP;
2544 }
2545
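// Completion for a single chunk write issued by do_manifest_flush().
// Runs on the objecter finisher; it retakes the PG lock and feeds the
// per-chunk result back into handle_manifest_flush().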
2546 struct C_ManifestFlush : public Context {
2547 PrimaryLogPGRef pg;
2548 hobject_t oid;
2549 epoch_t lpr;
2550 ceph_tid_t tid;
2551 utime_t start;
2552 uint64_t offset;
2553 uint64_t last_offset;
2554 C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e)
2555 : pg(p), oid(o), lpr(e),
2556 tid(0), start(ceph_clock_now())
2557 {}
2558 void finish(int r) override {
2559 if (r == -ECANCELED)
2560 return;
2561 pg->lock();
2562 pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr);
2563 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
2564 pg->unlock();
2565 }
2566 };
2567
2568 void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2569 uint64_t offset, uint64_t last_offset,
2570 epoch_t lpr)
2571 {
2572 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2573 if (p == flush_ops.end()) {
2574 dout(10) << __func__ << " no flush_op found" << dendl;
2575 return;
2576 }
2577 if (p->second->rval < 0) {
2578 return;
2579 }
2580 p->second->io_results[offset] = r;
2581 for (auto &ior: p->second->io_results) {
2582 if (ior.second < 0) {
2583 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2584 p->second->rval = r;
2585 return;
2586 }
2587 }
2588 if (p->second->chunks == p->second->io_results.size()) {
2589 if (lpr == get_last_peering_reset()) {
2590 ceph_assert(p->second->obc);
2591 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2592 }
2593 }
2594 }
2595
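// Set up a FlushOp for a manifest object and issue the first batch of
// chunk writebacks via do_manifest_flush(); on success the op is
// registered in flush_ops and -EINPROGRESS is returned to the caller.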
2596 int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
2597 boost::optional<std::function<void()>> &&on_flush)
2598 {
2599 auto p = obc->obs.oi.manifest.chunk_map.begin();
2600 FlushOpRef manifest_fop(std::make_shared<FlushOp>());
2601 manifest_fop->op = op;
2602 manifest_fop->obc = obc;
2603 manifest_fop->flushed_version = obc->obs.oi.user_version;
2604 manifest_fop->blocking = blocking;
2605 manifest_fop->on_flush = std::move(on_flush);
2606 int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking);
2607 if (r < 0) {
2608 return r;
2609 }
2610
2611 flush_ops[obc->obs.oi.soid] = manifest_fop;
2612 return -EINPROGRESS;
2613 }
2614
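// Write the dirty chunks of a manifest object back to their targets,
// starting at start_offset. The first loop sizes the batch: dirty chunk
// lengths are accumulated until they exceed get_copy_chunk_size(), and
// last_offset records where the batch ends. Each dirty chunk is then read
// locally and written to its target; when the pool has a fingerprint type
// configured, the chunk is instead stored by content hash through the
// "cas" object class and the old chunk's reference count is dropped.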
2615 int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
2616 uint64_t start_offset, bool block)
2617 {
2618 struct object_manifest_t &manifest = obc->obs.oi.manifest;
2619 hobject_t soid = obc->obs.oi.soid;
2620 ceph_tid_t tid;
2621 SnapContext snapc;
2622 uint64_t max_copy_size = 0, last_offset = 0;
2623
2624 map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset);
2625 ceph_assert(iter != manifest.chunk_map.end());
2626 for (;iter != manifest.chunk_map.end(); ++iter) {
2627 if (iter->second.is_dirty()) {
2628 last_offset = iter->first;
2629 max_copy_size += iter->second.length;
2630 }
2631 if (get_copy_chunk_size() < max_copy_size) {
2632 break;
2633 }
2634 }
2635
2636 iter = manifest.chunk_map.find(start_offset);
2637 for (;iter != manifest.chunk_map.end(); ++iter) {
2638 if (!iter->second.is_dirty()) {
2639 continue;
2640 }
2641 uint64_t tgt_length = iter->second.length;
2642 uint64_t tgt_offset = iter->second.offset;
2643 hobject_t tgt_soid = iter->second.oid;
2644 object_locator_t oloc(tgt_soid);
2645 ObjectOperation obj_op;
2646 bufferlist chunk_data;
2647 int r = pgbackend->objects_read_sync(
2648 soid, iter->first, tgt_length, 0, &chunk_data);
2649 if (r < 0) {
2650 dout(0) << __func__ << " read fail " << " offset: " << tgt_offset
2651 << " len: " << tgt_length << " r: " << r << dendl;
2652 return r;
2653 }
2654 if (!chunk_data.length()) {
2655 return -ENODATA;
2656 }
2657
2658 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
2659 CEPH_OSD_FLAG_RWORDERED;
2660 tgt_length = chunk_data.length();
2661 pg_pool_t::fingerprint_t fp_algo_t = pool.info.get_fingerprint_type();
2662 if (iter->second.has_reference() &&
2663 fp_algo_t != pg_pool_t::TYPE_FINGERPRINT_NONE) {
2664 switch (fp_algo_t) {
2665 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
2666 {
2667 sha1_digest_t sha1r = chunk_data.sha1();
2668 object_t fp_oid = sha1r.to_str();
2669 bufferlist in;
2670 if (fp_oid != tgt_soid.oid) {
2671 // decrement old chunk's reference count
2672 ObjectOperation dec_op;
2673 cls_chunk_refcount_put_op put_call;
2674 ::encode(put_call, in);
2675 dec_op.call("refcount", "chunk_put", in);
2676 // we don't wait for dec_op's completion; the dedup scrub will repair any missed decrement.
2677 tid = osd->objecter->mutate(
2678 tgt_soid.oid, oloc, dec_op, snapc,
2679 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2680 flags, NULL);
2681 in.clear();
2682 }
2683 tgt_soid.oid = fp_oid;
2684 iter->second.oid = tgt_soid;
2685 // add data op
2686 ceph_osd_op osd_op;
2687 osd_op.extent.offset = 0;
2688 osd_op.extent.length = chunk_data.length();
2689 encode(osd_op, in);
2690 encode(soid, in);
2691 in.append(chunk_data);
2692 obj_op.call("cas", "cas_write_or_get", in);
2693 break;
2694 }
2695 default:
2696 ceph_abort_msg("unrecognized fingerprint type");
2697 break;
2698 }
2699 } else {
2700 obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data);
2701 }
2702
2703 C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset());
2704 fin->offset = iter->first;
2705 fin->last_offset = last_offset;
2706 manifest_fop->chunks++;
2707
2708 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
2709 tid = osd->objecter->mutate(
2710 tgt_soid.oid, oloc, obj_op, snapc,
2711 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2712 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]));
2713 fin->tid = tid;
2714 manifest_fop->io_tids[iter->first] = tid;
2715
2716 dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length
2717 << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name
2718 << " tid: " << tid << dendl;
2719 if (last_offset < iter->first) {
2720 break;
2721 }
2722 }
2723
2724 return 0;
2725 }
2726
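// Called once a batch of chunk writes has completed: if dirty chunks
// remain beyond last_offset, issue the next do_manifest_flush() batch;
// otherwise complete the flush as a whole via finish_flush().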
2727 void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2728 ObjectContextRef obc, uint64_t last_offset)
2729 {
2730 dout(10) << __func__ << " " << oid << " tid " << tid
2731 << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl;
2732 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2733 if (p == flush_ops.end()) {
2734 dout(10) << __func__ << " no flush_op found" << dendl;
2735 return;
2736 }
2737 map<uint64_t, chunk_info_t>::iterator iter =
2738 obc->obs.oi.manifest.chunk_map.find(last_offset);
2739 ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end());
2740 for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) {
2741 if (iter->second.is_dirty() && last_offset < iter->first) {
2742 do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking);
2743 return;
2744 }
2745 }
2746 finish_flush(oid, tid, r);
2747 }
2748
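// Persist a pg_log ERROR entry for a failed write so that a resend with
// the same reqid is detected as a dup, and send the error reply to the
// client only once the log entry is durable.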
2749 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2750 MOSDOpReply *orig_reply, int r)
2751 {
2752 dout(20) << __func__ << " r=" << r << dendl;
2753 ceph_assert(op->may_write());
2754 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2755 mempool::osd_pglog::list<pg_log_entry_t> entries;
2756 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2757 get_next_version(), eversion_t(), 0,
2758 reqid, utime_t(), r));
2759
2760 struct OnComplete {
2761 PrimaryLogPG *pg;
2762 OpRequestRef op;
2763 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2764 int r;
2765 OnComplete(
2766 PrimaryLogPG *pg,
2767 OpRequestRef op,
2768 MOSDOpReply *orig_reply,
2769 int r)
2770 : pg(pg), op(op),
2771 orig_reply(orig_reply, false /* take over ref */), r(r)
2772 {}
2773 void operator()() {
2774 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2775 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2776 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2777 MOSDOpReply *reply = orig_reply.detach();
2778 if (reply == nullptr) {
2779 reply = new MOSDOpReply(m, r, pg->get_osdmap_epoch(),
2780 flags, true);
2781 }
2782 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2783 pg->osd->send_message_osd_client(reply, m->get_connection());
2784 }
2785 };
2786
2787 ObcLockManager lock_manager;
2788 submit_log_entries(
2789 entries,
2790 std::move(lock_manager),
2791 boost::optional<std::function<void(void)> >(
2792 OnComplete(this, op, orig_reply, r)),
2793 op,
2794 r);
2795 }
2796
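// Core cache-tier dispatch: based on the pool's cache_mode, decide
// whether to handle the op in this tier (NOOP), proxy it to the base
// pool, redirect the client there, or block it behind a promotion or a
// full cache.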
2797 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2798 OpRequestRef op,
2799 bool write_ordered,
2800 ObjectContextRef obc,
2801 int r, hobject_t missing_oid,
2802 bool must_promote,
2803 bool in_hit_set,
2804 ObjectContextRef *promote_obc)
2805 {
2806 // return quickly if caching is not enabled
2807 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2808 return cache_result_t::NOOP;
2809
2810 if (op &&
2811 op->get_req() &&
2812 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2813 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2814 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2815 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2816 return cache_result_t::NOOP;
2817 }
2818
2819 must_promote = must_promote || op->need_promote();
2820
2821 if (obc)
2822 dout(25) << __func__ << " " << obc->obs.oi << " "
2823 << (obc->obs.exists ? "exists" : "DNE")
2824 << " missing_oid " << missing_oid
2825 << " must_promote " << (int)must_promote
2826 << " in_hit_set " << (int)in_hit_set
2827 << dendl;
2828 else
2829 dout(25) << __func__ << " (no obc)"
2830 << " missing_oid " << missing_oid
2831 << " must_promote " << (int)must_promote
2832 << " in_hit_set " << (int)in_hit_set
2833 << dendl;
2834
2835 // if it is write-ordered and blocked, stop now
2836 if (obc.get() && obc->is_blocked() && write_ordered) {
2837 // we're already doing something with this object
2838 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2839 return cache_result_t::NOOP;
2840 }
2841
2842 if (r == -ENOENT && missing_oid == hobject_t()) {
2843 // we know this object is logically absent (e.g., an undefined clone)
2844 return cache_result_t::NOOP;
2845 }
2846
2847 if (obc.get() && obc->obs.exists) {
2848 osd->logger->inc(l_osd_op_cache_hit);
2849 return cache_result_t::NOOP;
2850 }
2851 if (!is_primary()) {
2852 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2853 osd->reply_op_error(op, -EAGAIN);
2854 return cache_result_t::REPLIED_WITH_EAGAIN;
2855 }
2856
2857 if (missing_oid == hobject_t() && obc.get()) {
2858 missing_oid = obc->obs.oi.soid;
2859 }
2860
2861 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2862 const object_locator_t oloc = m->get_object_locator();
2863
2864 if (op->need_skip_handle_cache()) {
2865 return cache_result_t::NOOP;
2866 }
2867
2868 OpRequestRef promote_op;
2869
2870 switch (pool.info.cache_mode) {
2871 case pg_pool_t::CACHEMODE_WRITEBACK:
2872 if (agent_state &&
2873 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2874 if (!op->may_write() && !op->may_cache() &&
2875 !write_ordered && !must_promote) {
2876 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2877 do_proxy_read(op);
2878 return cache_result_t::HANDLED_PROXY;
2879 }
2880 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2881 block_write_on_full_cache(missing_oid, op);
2882 return cache_result_t::BLOCKED_FULL;
2883 }
2884
2885 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2886 promote_object(obc, missing_oid, oloc, op, promote_obc);
2887 return cache_result_t::BLOCKED_PROMOTE;
2888 }
2889
2890 if (op->may_write() || op->may_cache()) {
2891 do_proxy_write(op);
2892
2893 // Promote too?
2894 if (!op->need_skip_promote() &&
2895 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2896 pool.info.min_write_recency_for_promote,
2897 OpRequestRef(),
2898 promote_obc)) {
2899 return cache_result_t::BLOCKED_PROMOTE;
2900 }
2901 return cache_result_t::HANDLED_PROXY;
2902 } else {
2903 do_proxy_read(op);
2904
2905 // Avoid duplicate promotion
2906 if (obc.get() && obc->is_blocked()) {
2907 if (promote_obc)
2908 *promote_obc = obc;
2909 return cache_result_t::BLOCKED_PROMOTE;
2910 }
2911
2912 // Promote too?
2913 if (!op->need_skip_promote()) {
2914 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2915 pool.info.min_read_recency_for_promote,
2916 promote_op, promote_obc);
2917 }
2918
2919 return cache_result_t::HANDLED_PROXY;
2920 }
2921 ceph_abort_msg("unreachable");
2922 return cache_result_t::NOOP;
2923
2924 case pg_pool_t::CACHEMODE_FORWARD:
2925 // FIXME: this mode allows requests to be reordered.
2926 do_cache_redirect(op);
2927 return cache_result_t::HANDLED_REDIRECT;
2928
2929 case pg_pool_t::CACHEMODE_READONLY:
2930 // TODO: clean this case up
2931 if (!obc.get() && r == -ENOENT) {
2932 // we don't have the object and op's a read
2933 promote_object(obc, missing_oid, oloc, op, promote_obc);
2934 return cache_result_t::BLOCKED_PROMOTE;
2935 }
2936 if (!r) { // it must be a write
2937 do_cache_redirect(op);
2938 return cache_result_t::HANDLED_REDIRECT;
2939 }
2940 // crap, there was a failure of some kind
2941 return cache_result_t::NOOP;
2942
2943 case pg_pool_t::CACHEMODE_READFORWARD:
2944 // Do writeback to the cache tier for writes
2945 if (op->may_write() || write_ordered || must_promote) {
2946 if (agent_state &&
2947 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2948 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2949 block_write_on_full_cache(missing_oid, op);
2950 return cache_result_t::BLOCKED_FULL;
2951 }
2952 promote_object(obc, missing_oid, oloc, op, promote_obc);
2953 return cache_result_t::BLOCKED_PROMOTE;
2954 }
2955
2956 // It is a read; redirect (forward) it to the base tier
2957 do_cache_redirect(op);
2958 return cache_result_t::HANDLED_REDIRECT;
2959
2960 case pg_pool_t::CACHEMODE_PROXY:
2961 if (!must_promote) {
2962 if (op->may_write() || op->may_cache() || write_ordered) {
2963 do_proxy_write(op);
2964 return cache_result_t::HANDLED_PROXY;
2965 } else {
2966 do_proxy_read(op);
2967 return cache_result_t::HANDLED_PROXY;
2968 }
2969 }
2970 // ugh, we're forced to promote.
2971 if (agent_state &&
2972 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2973 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2974 block_write_on_full_cache(missing_oid, op);
2975 return cache_result_t::BLOCKED_FULL;
2976 }
2977 promote_object(obc, missing_oid, oloc, op, promote_obc);
2978 return cache_result_t::BLOCKED_PROMOTE;
2979
2980 case pg_pool_t::CACHEMODE_READPROXY:
2981 // Do writeback to the cache tier for writes
2982 if (op->may_write() || write_ordered || must_promote) {
2983 if (agent_state &&
2984 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2985 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2986 block_write_on_full_cache(missing_oid, op);
2987 return cache_result_t::BLOCKED_FULL;
2988 }
2989 promote_object(obc, missing_oid, oloc, op, promote_obc);
2990 return cache_result_t::BLOCKED_PROMOTE;
2991 }
2992
2993 // It is a read; proxy it to the base tier
2994 do_proxy_read(op);
2995 return cache_result_t::HANDLED_PROXY;
2996
2997 default:
2998 ceph_abort_msg("unrecognized cache_mode");
2999 }
3000 return cache_result_t::NOOP;
3001 }
3002
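// Decide whether to promote based on hit-set recency: recency 0 always
// promotes; recency 1 requires the object to be in the current hit set;
// recency N requires hits in the current set plus the N-1 most recent
// archived sets, scanned newest-first and stopping at the first miss.
// For example, with recency 3 an object present in the current set and
// the two newest archived sets is promoted, subject to promote_throttle().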
3003 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
3004 const hobject_t& missing_oid,
3005 const object_locator_t& oloc,
3006 bool in_hit_set,
3007 uint32_t recency,
3008 OpRequestRef promote_op,
3009 ObjectContextRef *promote_obc)
3010 {
3011 dout(20) << __func__ << " missing_oid " << missing_oid
3012 << " in_hit_set " << in_hit_set << dendl;
3013
3014 switch (recency) {
3015 case 0:
3016 break;
3017 case 1:
3018 // Check if in the current hit set
3019 if (in_hit_set) {
3020 break;
3021 } else {
3022 // not promoting
3023 return false;
3024 }
3025 break;
3026 default:
3027 {
3028 unsigned count = (int)in_hit_set;
3029 if (count) {
3030 // Check if in other hit sets
3031 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
3032 for (map<time_t,HitSetRef>::reverse_iterator itor =
3033 agent_state->hit_set_map.rbegin();
3034 itor != agent_state->hit_set_map.rend();
3035 ++itor) {
3036 if (!itor->second->contains(oid)) {
3037 break;
3038 }
3039 ++count;
3040 if (count >= recency) {
3041 break;
3042 }
3043 }
3044 }
3045 if (count >= recency) {
3046 break;
3047 }
3048 return false; // not promoting
3049 }
3050 break;
3051 }
3052
3053 if (osd->promote_throttle()) {
3054 dout(10) << __func__ << " promote throttled" << dendl;
3055 return false;
3056 }
3057 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
3058 return true;
3059 }
3060
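// Bounce the op back to the client: reply with -ENOENT plus a redirect
// hint pointing at the base (tier_of) pool, so the client's objecter
// resends the op there itself.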
3061 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
3062 {
3063 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3064 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
3065 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
3066 flags, false);
3067 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
3068 reply->set_redirect(redir);
3069 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
3070 << op << dendl;
3071 m->get_connection()->send_message(reply);
3072 return;
3073 }
3074
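// Completion for a proxied read: runs on the objecter finisher, retakes
// the PG lock, and (if there has been no intervening peering reset)
// hands the result to finish_proxy_read().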
3075 struct C_ProxyRead : public Context {
3076 PrimaryLogPGRef pg;
3077 hobject_t oid;
3078 epoch_t last_peering_reset;
3079 ceph_tid_t tid;
3080 PrimaryLogPG::ProxyReadOpRef prdop;
3081 utime_t start;
3082 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3083 const PrimaryLogPG::ProxyReadOpRef& prd)
3084 : pg(p), oid(o), last_peering_reset(lpr),
3085 tid(0), prdop(prd), start(ceph_clock_now())
3086 {}
3087 void finish(int r) override {
3088 if (prdop->canceled)
3089 return;
3090 pg->lock();
3091 if (prdop->canceled) {
3092 pg->unlock();
3093 return;
3094 }
3095 if (last_peering_reset == pg->get_last_peering_reset()) {
3096 pg->finish_proxy_read(oid, tid, r);
3097 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3098 }
3099 pg->unlock();
3100 }
3101 };
3102
3103 struct C_ProxyChunkRead : public Context {
3104 PrimaryLogPGRef pg;
3105 hobject_t oid;
3106 epoch_t last_peering_reset;
3107 ceph_tid_t tid;
3108 PrimaryLogPG::ProxyReadOpRef prdop;
3109 utime_t start;
3110 ObjectOperation *obj_op;
3111 int op_index = 0;
3112 uint64_t req_offset = 0;
3113 ObjectContextRef obc;
3114 uint64_t req_total_len = 0;
3115 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3116 const PrimaryLogPG::ProxyReadOpRef& prd)
3117 : pg(p), oid(o), last_peering_reset(lpr),
3118 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
3119 {}
3120 void finish(int r) override {
3121 if (prdop->canceled)
3122 return;
3123 pg->lock();
3124 if (prdop->canceled) {
3125 pg->unlock();
3126 return;
3127 }
3128 if (last_peering_reset == pg->get_last_peering_reset()) {
3129 if (r >= 0) {
3130 if (!prdop->ops[op_index].outdata.length()) {
3131 ceph_assert(req_total_len);
3132 bufferlist list;
3133 bufferptr bptr(req_total_len);
3134 list.push_back(std::move(bptr));
3135 prdop->ops[op_index].outdata.append(list);
3136 }
3137 ceph_assert(obj_op);
3138 uint64_t copy_offset;
3139 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
3140 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
3141 } else {
3142 copy_offset = 0;
3143 }
3144 prdop->ops[op_index].outdata.copy_in(copy_offset, obj_op->ops[0].outdata.length(),
3145 obj_op->ops[0].outdata.c_str());
3146 }
3147
3148 pg->finish_proxy_read(oid, tid, r);
3149 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3150 if (obj_op) {
3151 delete obj_op;
3152 }
3153 }
3154 pg->unlock();
3155 }
3156 };
3157
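// Forward a read to the backing object without promoting it: for a
// redirect manifest the target comes from the manifest itself;
// otherwise the op is proxied to the same object in the base (tier_of)
// pool.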
3158 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3159 {
3160 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3161 // stash the result in the request's OSDOp vector
3162 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3163 object_locator_t oloc;
3164 hobject_t soid;
3165 /* extensible tier */
3166 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3167 switch (obc->obs.oi.manifest.type) {
3168 case object_manifest_t::TYPE_REDIRECT:
3169 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3170 soid = obc->obs.oi.manifest.redirect_target;
3171 break;
3172 default:
3173 ceph_abort_msg("unrecognized manifest type");
3174 }
3175 } else {
3176 /* proxy */
3177 soid = m->get_hobj();
3178 oloc = object_locator_t(m->get_object_locator());
3179 oloc.pool = pool.info.tier_of;
3180 }
3181 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3182
3183 // pass through some original flags that make sense.
3184 // - leave out redirection and balancing flags since we are
3185 // already proxying through the primary
3186 // - leave off read/write/exec flags that are derived from the op
3187 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3188 CEPH_OSD_FLAG_ORDERSNAP |
3189 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3190 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3191
3192 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3193
3194 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3195
3196 ObjectOperation obj_op;
3197 obj_op.dup(prdop->ops);
3198
3199 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3200 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3201 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3202 ceph_osd_op op = obj_op.ops[i].op;
3203 switch (op.op) {
3204 case CEPH_OSD_OP_READ:
3205 case CEPH_OSD_OP_SYNC_READ:
3206 case CEPH_OSD_OP_SPARSE_READ:
3207 case CEPH_OSD_OP_CHECKSUM:
3208 case CEPH_OSD_OP_CMPEXT:
3209 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3210 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3211 }
3212 }
3213 }
3214
3215 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3216 prdop);
3217 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3218 ceph_tid_t tid = osd->objecter->read(
3219 soid.oid, oloc, obj_op,
3220 m->get_snapid(), NULL,
3221 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3222 &prdop->user_version,
3223 &prdop->data_offset,
3224 m->get_features());
3225 fin->tid = tid;
3226 prdop->objecter_tid = tid;
3227 proxyread_ops[tid] = prdop;
3228 in_progress_proxy_ops[soid].push_back(op);
3229 }
3230
3231 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3232 {
3233 dout(10) << __func__ << " " << oid << " tid " << tid
3234 << " " << cpp_strerror(r) << dendl;
3235
3236 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3237 if (p == proxyread_ops.end()) {
3238 dout(10) << __func__ << " no proxyread_op found" << dendl;
3239 return;
3240 }
3241 ProxyReadOpRef prdop = p->second;
3242 if (tid != prdop->objecter_tid) {
3243 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3244 << " tid " << prdop->objecter_tid << dendl;
3245 return;
3246 }
3247 if (oid != prdop->soid) {
3248 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3249 << " soid " << prdop->soid << dendl;
3250 return;
3251 }
3252 proxyread_ops.erase(tid);
3253
3254 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3255 if (q == in_progress_proxy_ops.end()) {
3256 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3257 return;
3258 }
3259 ceph_assert(q->second.size());
3260 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3261 q->second.end(),
3262 prdop->op);
3263 ceph_assert(it != q->second.end());
3264 OpRequestRef op = *it;
3265 q->second.erase(it);
3266 if (q->second.size() == 0) {
3267 in_progress_proxy_ops.erase(oid);
3268 } else if (std::find(q->second.begin(),
3269 q->second.end(),
3270 prdop->op) != q->second.end()) {
3271 /* this op has other proxied (chunked) reads still in flight */
3272 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3273 return;
3274 }
3275
3276 osd->logger->inc(l_osd_tier_proxy_read);
3277
3278 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3279 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3280 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3281 ctx->user_at_version = prdop->user_version;
3282 ctx->data_off = prdop->data_offset;
3283 ctx->ignore_log_op_stats = true;
3284 complete_read_ctx(r, ctx);
3285 }
3286
3287 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3288 {
3289 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3290 if (p == in_progress_proxy_ops.end())
3291 return;
3292
3293 list<OpRequestRef>& ls = p->second;
3294 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3295 requeue_ops(ls);
3296 in_progress_proxy_ops.erase(p);
3297 }
3298
3299 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3300 vector<ceph_tid_t> *tids)
3301 {
3302 dout(10) << __func__ << " " << prdop->soid << dendl;
3303 prdop->canceled = true;
3304
3305 // cancel objecter op, if we can
3306 if (prdop->objecter_tid) {
3307 tids->push_back(prdop->objecter_tid);
3308 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3309 prdop->ops[i].outdata.clear();
3310 }
3311 proxyread_ops.erase(prdop->objecter_tid);
3312 prdop->objecter_tid = 0;
3313 }
3314 }
3315
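// Cancel all in-flight proxy reads and writes, collecting their
// objecter tids for the caller to cancel, and requeue the original
// client ops if requested.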
3316 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3317 {
3318 dout(10) << __func__ << dendl;
3319
3320 // cancel proxy reads
3321 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3322 while (p != proxyread_ops.end()) {
3323 cancel_proxy_read((p++)->second, tids);
3324 }
3325
3326 // cancel proxy writes
3327 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3328 while (q != proxywrite_ops.end()) {
3329 cancel_proxy_write((q++)->second, tids);
3330 }
3331
3332 if (requeue) {
3333 map<hobject_t, list<OpRequestRef>>::iterator p =
3334 in_progress_proxy_ops.begin();
3335 while (p != in_progress_proxy_ops.end()) {
3336 list<OpRequestRef>& ls = p->second;
3337 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3338 << " requests" << dendl;
3339 requeue_ops(ls);
3340 in_progress_proxy_ops.erase(p++);
3341 }
3342 } else {
3343 in_progress_proxy_ops.clear();
3344 }
3345 }
3346
3347 struct C_ProxyWrite_Commit : public Context {
3348 PrimaryLogPGRef pg;
3349 hobject_t oid;
3350 epoch_t last_peering_reset;
3351 ceph_tid_t tid;
3352 PrimaryLogPG::ProxyWriteOpRef pwop;
3353 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3354 const PrimaryLogPG::ProxyWriteOpRef& pw)
3355 : pg(p), oid(o), last_peering_reset(lpr),
3356 tid(0), pwop(pw)
3357 {}
3358 void finish(int r) override {
3359 if (pwop->canceled)
3360 return;
3361 pg->lock();
3362 if (pwop->canceled) {
3363 pg->unlock();
3364 return;
3365 }
3366 if (last_peering_reset == pg->get_last_peering_reset()) {
3367 pg->finish_proxy_write(oid, tid, r);
3368 }
3369 pg->unlock();
3370 }
3371 };
3372
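// Forward a write to the backing object, mirroring do_proxy_read();
// the client reply is sent from finish_proxy_write() once the base
// pool commits.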
3373 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3374 {
3375 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3376 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3377 object_locator_t oloc;
3378 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3379 hobject_t soid;
3380 /* extensible tier */
3381 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3382 switch (obc->obs.oi.manifest.type) {
3383 case object_manifest_t::TYPE_REDIRECT:
3384 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3385 soid = obc->obs.oi.manifest.redirect_target;
3386 break;
3387 default:
3388 ceph_abort_msg("unrecognized manifest type");
3389 }
3390 } else {
3391 /* proxy */
3392 soid = m->get_hobj();
3393 oloc = object_locator_t(m->get_object_locator());
3394 oloc.pool = pool.info.tier_of;
3395 }
3396
3397 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3398 if (!(op->may_write() || op->may_cache())) {
3399 flags |= CEPH_OSD_FLAG_RWORDERED;
3400 }
3401 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3402
3403 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3404 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3405 pwop->mtime = m->get_mtime();
3406
3407 ObjectOperation obj_op;
3408 obj_op.dup(pwop->ops);
3409
3410 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3411 this, soid, get_last_peering_reset(), pwop);
3412 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3413 ceph_tid_t tid = osd->objecter->mutate(
3414 soid.oid, oloc, obj_op, snapc,
3415 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3416 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3417 &pwop->user_version, pwop->reqid);
3418 fin->tid = tid;
3419 pwop->objecter_tid = tid;
3420 proxywrite_ops[tid] = pwop;
3421 in_progress_proxy_ops[soid].push_back(op);
3422 }
3423
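// Split each read op at chunk boundaries and proxy the pieces to their
// backing chunk objects. For example, a read of length 8192 at offset 0
// over 4096-byte chunks becomes two proxied reads, one per chunk.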
3424 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3425 ObjectContextRef obc, bool write_ordered)
3426 {
3427 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3428 OSDOp *osd_op = NULL;
3429 for (unsigned int i = 0; i < m->ops.size(); i++) {
3430 osd_op = &m->ops[i];
3431 uint64_t cursor = osd_op->op.extent.offset;
3432 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3433 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3434 object_manifest_t *manifest = &obc->obs.oi.manifest;
3435 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3436
3437 while (cursor < op_length) {
3438 chunk_index = 0;
3439 chunk_length = 0;
3440 /* find the right chunk position for cursor */
3441 for (auto &p : manifest->chunk_map) {
3442 if (p.first <= cursor && p.first + p.second.length > cursor) {
3443 chunk_length = p.second.length;
3444 chunk_index = p.first;
3445 break;
3446 }
3447 }
3448 /* cursor is not covered by any chunk */
3449 if (!chunk_index && !chunk_length) {
3450 if (cursor == osd_op->op.extent.offset) {
3451 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3452 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3453 ctx->data_off = osd_op->op.extent.offset;
3454 ctx->ignore_log_op_stats = true;
3455 complete_read_ctx(0, ctx);
3456 }
3457 break;
3458 }
3459 uint64_t next_length = chunk_length;
3460 /* the size to read -> | op length | */
3461 /* | a chunk | */
3462 if (cursor + next_length > op_length) {
3463 next_length = op_length - cursor;
3464 }
3465 /* the size to read -> | op length | */
3466 /* | a chunk | */
3467 if (cursor + next_length > chunk_index + chunk_length) {
3468 next_length = chunk_index + chunk_length - cursor;
3469 }
3470
3471 chunk_read[cursor] = {{chunk_index, next_length}};
3472 cursor += next_length;
3473 }
3474
3475 req_len = cursor - osd_op->op.extent.offset;
3476 for (auto &p : chunk_read) {
3477 auto chunks = p.second.begin();
3478 dout(20) << __func__ << " chunk_index: " << chunks->first
3479 << " next_length: " << chunks->second << " cursor: "
3480 << p.first << dendl;
3481 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3482 }
3483 }
3484 }
3485
3486 struct RefCountCallback : public Context {
3487 public:
3488 PrimaryLogPG *pg;
3489 PrimaryLogPG::OpContext *ctx;
3490 OSDOp& osd_op;
3491 epoch_t last_peering_reset;
3492
3493 RefCountCallback(PrimaryLogPG *pg, PrimaryLogPG::OpContext *ctx,
3494 OSDOp &osd_op, epoch_t lpr)
3495 : pg(pg), ctx(ctx), osd_op(osd_op), last_peering_reset(lpr)
3496 {}
3497 void finish(int r) override {
3498 pg->lock();
3499 if (last_peering_reset == pg->get_last_peering_reset()) {
3500 if (r >= 0) {
3501 osd_op.rval = 0;
3502 pg->execute_ctx(ctx);
3503 } else {
3504 if (ctx->op) {
3505 pg->osd->reply_op_error(ctx->op, r);
3506 }
3507 pg->close_op_ctx(ctx);
3508 }
3509 }
3510 pg->unlock();
3511 }
3512 };
3513
3514 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3515 OSDOp& osd_op;
3516
3517 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3518 }
3519
3520 int execute() override {
3521 return osd_op.rval;
3522 }
3523 };
3524
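// Adjust the reference count on a dedup chunk by invoking the "cas"
// object class on the target: chunk_get takes a reference, chunk_put
// drops one. cb, if provided, runs on completion via the objecter
// finisher.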
3525 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
3526 SnapContext snapc, bool get, Context *cb, uint64_t offset)
3527 {
3528 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3529 CEPH_OSD_FLAG_RWORDERED;
3530
3531 dout(10) << __func__ << " Start refcount for " << soid << dendl;
3532
3533 ObjectOperation obj_op;
3534 bufferlist in;
3535 if (get) {
3536 cls_chunk_refcount_get_op call;
3537 call.source = obc->obs.oi.soid;
3538 ::encode(call, in);
3539 obj_op.call("cas", "chunk_get", in);
3540 } else {
3541 cls_chunk_refcount_put_op call;
3542 call.source = obc->obs.oi.soid;
3543 ::encode(call, in);
3544 obj_op.call("cas", "chunk_put", in);
3545 }
3546
3547 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3548 Context *c;
3549 if (cb) {
3550 c = new C_OnFinisher(cb, osd->objecter_finishers[n]);
3551 } else {
3552 c = NULL;
3553 }
3554
3555 osd->objecter->mutate(
3556 soid.oid, oloc, obj_op, snapc,
3557 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
3558 flags, c);
3559 }
3560
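// Proxy one chunk-sized piece of a read: translate the client extent
// into the chunk object's own offset space and issue the read against
// the chunk's location; C_ProxyChunkRead copies the returned data into
// the right position of the original op's outdata.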
3561 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3562 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3563 uint64_t req_total_len, bool write_ordered)
3564 {
3565 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3566 object_manifest_t *manifest = &obc->obs.oi.manifest;
3567 if (!manifest->chunk_map.count(chunk_index)) {
3568 return;
3569 }
3570 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3571 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3572 hobject_t ori_soid = m->get_hobj();
3573 object_locator_t oloc(soid);
3574 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3575 if (write_ordered) {
3576 flags |= CEPH_OSD_FLAG_RWORDERED;
3577 }
3578
3579 if (!chunk_length || soid == hobject_t()) {
3580 return;
3581 }
3582
3583 /* same as do_proxy_read() */
3584 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3585 CEPH_OSD_FLAG_ORDERSNAP |
3586 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3587 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3588
3589 dout(10) << __func__ << " Start chunk proxy read for " << *m
3590 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3591 << " req_length: " << req_length << dendl;
3592
3593 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3594
3595 ObjectOperation *pobj_op = new ObjectOperation;
3596 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3597
3598 if (chunk_index <= req_offset) {
3599 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3600 } else {
3601 ceph_abort_msg("chunk_index > req_offset");
3602 }
3603 osd_op.op.extent.length = req_length;
3604
3605 ObjectOperation obj_op;
3606 obj_op.dup(pobj_op->ops);
3607
3608 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3609 prdop);
3610 fin->obj_op = pobj_op;
3611 fin->op_index = op_index;
3612 fin->req_offset = req_offset;
3613 fin->obc = obc;
3614 fin->req_total_len = req_total_len;
3615
3616 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3617 ceph_tid_t tid = osd->objecter->read(
3618 soid.oid, oloc, obj_op,
3619 m->get_snapid(), NULL,
3620 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3621 &prdop->user_version,
3622 &prdop->data_offset,
3623 m->get_features());
3624 fin->tid = tid;
3625 prdop->objecter_tid = tid;
3626 proxyread_ops[tid] = prdop;
3627 in_progress_proxy_ops[ori_soid].push_back(op);
3628 }
3629
3630 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3631 {
3632 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3633 OSDOp *osd_op = NULL;
3634 bool ret = true;
3635 for (unsigned int i = 0; i < m->ops.size(); i++) {
3636 osd_op = &m->ops[i];
3637 ceph_osd_op op = osd_op->op;
3638 switch (op.op) {
3639 case CEPH_OSD_OP_READ:
3640 case CEPH_OSD_OP_SYNC_READ: {
3641 uint64_t cursor = osd_op->op.extent.offset;
3642 uint64_t remain = osd_op->op.extent.length;
3643
3644 /* requested chunks exist in chunk_map ? */
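// The whole requested range must be covered by chunks that are missing
// locally (i.e. whose data still lives on the remote target object); a
// locally present chunk, or a gap in coverage, means we cannot proxy.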
3645 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3646 if (p.first <= cursor && p.first + p.second.length > cursor) {
3647 if (!p.second.is_missing()) {
3648 return false;
3649 }
3650 if (p.second.length >= remain) {
3651 remain = 0;
3652 break;
3653 } else {
3654 remain = remain - p.second.length;
3655 }
3656 cursor += p.second.length;
3657 }
3658 }
3659
3660 if (remain) {
3661 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3662 return false;
3663 }
3664 continue;
3665 }
3666 default:
3667 return false;
3668 }
3669 }
3670 return ret;
3671 }
3672
3673 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3674 {
3675 dout(10) << __func__ << " " << oid << " tid " << tid
3676 << " " << cpp_strerror(r) << dendl;
3677
3678 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3679 if (p == proxywrite_ops.end()) {
3680 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3681 return;
3682 }
3683 ProxyWriteOpRef pwop = p->second;
3684 ceph_assert(tid == pwop->objecter_tid);
3685 ceph_assert(oid == pwop->soid);
3686
3687 proxywrite_ops.erase(tid);
3688
3689 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3690 if (q == in_progress_proxy_ops.end()) {
3691 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3692 delete pwop->ctx;
3693 pwop->ctx = NULL;
3694 return;
3695 }
3696 list<OpRequestRef>& in_progress_op = q->second;
3697 ceph_assert(in_progress_op.size());
3698 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3699 in_progress_op.end(),
3700 pwop->op);
3701 ceph_assert(it != in_progress_op.end());
3702 in_progress_op.erase(it);
3703 if (in_progress_op.size() == 0) {
3704 in_progress_proxy_ops.erase(oid);
3705 } else if (std::find(in_progress_op.begin(),
3706 in_progress_op.end(),
3707 pwop->op) != in_progress_op.end()) {
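// the same client op is still queued for another in-flight proxied op on
// this object; drop this ctx and defer the reply to the last completion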
3708 if (pwop->ctx)
3709 delete pwop->ctx;
3710 pwop->ctx = NULL;
3711 dout(20) << __func__ << " " << oid << " tid " << tid
3712 << " in_progress_op size: "
3713 << in_progress_op.size() << dendl;
3714 return;
3715 }
3716
3717 osd->logger->inc(l_osd_tier_proxy_write);
3718
3719 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3720 ceph_assert(m != NULL);
3721
3722 if (!pwop->sent_reply) {
3723 // send commit.
3724 MOSDOpReply *reply = pwop->ctx->reply;
3725 if (reply)
3726 pwop->ctx->reply = NULL;
3727 else {
3728 reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0, true);
3729 reply->set_reply_versions(eversion_t(), pwop->user_version);
3730 }
3731 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3732 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3733 osd->send_message_osd_client(reply, m->get_connection());
3734 pwop->sent_reply = true;
3735 pwop->ctx->op->mark_commit_sent();
3736 }
3737
3738 delete pwop->ctx;
3739 pwop->ctx = NULL;
3740 }
3741
3742 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3743 vector<ceph_tid_t> *tids)
3744 {
3745 dout(10) << __func__ << " " << pwop->soid << dendl;
3746 pwop->canceled = true;
3747
3748 // cancel objecter op, if we can
3749 if (pwop->objecter_tid) {
3750 tids->push_back(pwop->objecter_tid);
3751 delete pwop->ctx;
3752 pwop->ctx = NULL;
3753 proxywrite_ops.erase(pwop->objecter_tid);
3754 pwop->objecter_tid = 0;
3755 }
3756 }
3757
3758 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3759 ObjectContextRef obc;
3760 PrimaryLogPG *pg;
3761 utime_t start;
3762 public:
3763 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3764 : obc(obc_),
3765 pg(pg_),
3766 start(ceph_clock_now()) {}
3767
3768 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3769 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3770 int r = results.get<0>();
3771 pg->finish_promote(r, results_data, obc);
3772 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3773 }
3774 };
3775
3776 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3777 ObjectContextRef obc;
3778 PrimaryLogPG *pg;
3779 utime_t start;
3780 PrimaryLogPG::OpContext *ctx;
3781 PrimaryLogPG::CopyCallbackResults promote_results;
3782 public:
3783 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3784 : obc(obc_),
3785 pg(pg_),
3786 start(ceph_clock_now()), ctx(ctx) {}
3787
3788 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3789 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3790 int r = results.get<0>();
3791 if (ctx) {
3792 promote_results = results;
3793 pg->execute_ctx(ctx);
3794 } else {
3795 pg->finish_promote_manifest(r, results_data, obc);
3796 }
3797 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3798 }
3799 friend struct PromoteFinisher;
3800 };
3801
3802 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3803 PromoteManifestCallback *promote_callback;
3804
3805 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3806 : promote_callback(promote_callback) {
3807 }
3808
3809 int execute() override {
3810 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3811 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3812 promote_callback->promote_results.get<1>(),
3813 promote_callback->obc);
3814 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3815 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3816 promote_callback->promote_results.get<1>(),
3817 promote_callback->obc);
3818 } else {
3819 ceph_abort_msg("unrecognized manifest type");
3820 }
3821 return 0;
3822 }
3823 };
3824
3825 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3826 const hobject_t& missing_oid,
3827 const object_locator_t& oloc,
3828 OpRequestRef op,
3829 ObjectContextRef *promote_obc)
3830 {
3831 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3832 ceph_assert(hoid != hobject_t());
3833 if (write_blocked_by_scrub(hoid)) {
3834 dout(10) << __func__ << " " << hoid
3835 << " blocked by scrub" << dendl;
3836 if (op) {
3837 waiting_for_scrub.push_back(op);
3838 op->mark_delayed("waiting for scrub");
3839 dout(10) << __func__ << " " << hoid
3840 << " placing op in waiting_for_scrub" << dendl;
3841 } else {
3842 dout(10) << __func__ << " " << hoid
3843 << " no op, dropping on the floor" << dendl;
3844 }
3845 return;
3846 }
3847 if (!obc) { // we need to create an ObjectContext
3848 ceph_assert(missing_oid != hobject_t());
3849 obc = get_object_context(missing_oid, true);
3850 }
3851 if (promote_obc)
3852 *promote_obc = obc;
3853
3854 /*
3855 * If there are in-flight proxy reads for this object, don't use
3856 * DONTNEED until the promote completes.
3857 */
3858 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3859 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3860 if (q == in_progress_proxy_ops.end()) {
3861 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3862 }
3863
3864 CopyCallback *cb;
3865 object_locator_t my_oloc;
3866 hobject_t src_hoid;
3867 if (!obc->obs.oi.has_manifest()) {
3868 my_oloc = oloc;
3869 my_oloc.pool = pool.info.tier_of;
3870 src_hoid = obc->obs.oi.soid;
3871 cb = new PromoteCallback(obc, this);
3872 } else {
3873 if (obc->obs.oi.manifest.is_chunked()) {
3874 src_hoid = obc->obs.oi.soid;
3875 cb = new PromoteManifestCallback(obc, this);
3876 } else if (obc->obs.oi.manifest.is_redirect()) {
3877 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3878 my_oloc = src_oloc;
3879 src_hoid = obc->obs.oi.manifest.redirect_target;
3880 cb = new PromoteCallback(obc, this);
3881 } else {
3882 ceph_abort_msg("unrecognized manifest type");
3883 }
3884 }
3885
3886 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3887 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3888 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3889 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3890 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3891 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3892 src_fadvise_flags, 0);
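// start_copy marks the obc blocked (asserted below); the client op that
// triggered the promote waits on the blocked object until the copy completes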
3893
3894 ceph_assert(obc->is_blocked());
3895
3896 if (op)
3897 wait_for_blocked_object(obc->obs.oi.soid, op);
3898 info.stats.stats.sum.num_promote++;
3899 }
3900
3901 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3902 {
3903 FUNCTRACE(cct);
3904 dout(10) << __func__ << " " << ctx << dendl;
3905 ctx->reset_obs(ctx->obc);
3906 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3907 OpRequestRef op = ctx->op;
3908 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3909 ObjectContextRef obc = ctx->obc;
3910 const hobject_t& soid = obc->obs.oi.soid;
3911
3912 // this method must be idempotent since we may call it several times
3913 // before we finally apply the resulting transaction.
3914 ctx->op_t.reset(new PGTransaction);
3915
3916 if (op->may_write() || op->may_cache()) {
3917 // snap
3918 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3919 pool.info.is_pool_snaps_mode()) {
3920 // use pool's snapc
3921 ctx->snapc = pool.snapc;
3922 } else {
3923 // client specified snapc
3924 ctx->snapc.seq = m->get_snap_seq();
3925 ctx->snapc.snaps = m->get_snaps();
3926 filter_snapc(ctx->snapc.snaps);
3927 }
3928 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3929 ctx->snapc.seq < obc->ssc->snapset.seq) {
3930 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3931 << " < snapset seq " << obc->ssc->snapset.seq
3932 << " on " << obc->obs.oi.soid << dendl;
3933 reply_ctx(ctx, -EOLDSNAPC);
3934 return;
3935 }
3936
3937 // version
3938 ctx->at_version = get_next_version();
3939 ctx->mtime = m->get_mtime();
3940
3941 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3942 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3943 << " snapc " << ctx->snapc
3944 << " snapset " << obc->ssc->snapset
3945 << dendl;
3946 } else {
3947 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3948 << " ov " << obc->obs.oi.version
3949 << dendl;
3950 }
3951
3952 if (!ctx->user_at_version)
3953 ctx->user_at_version = obc->obs.oi.user_version;
3954 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3955
3956 {
3957 #ifdef WITH_LTTNG
3958 osd_reqid_t reqid = ctx->op->get_reqid();
3959 #endif
3960 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3961 reqid.name._num, reqid.tid, reqid.inc);
3962 }
3963
3964 int result = prepare_transaction(ctx);
3965
3966 {
3967 #ifdef WITH_LTTNG
3968 osd_reqid_t reqid = ctx->op->get_reqid();
3969 #endif
3970 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3971 reqid.name._num, reqid.tid, reqid.inc);
3972 }
3973
3974 bool pending_async_reads = !ctx->pending_async_reads.empty();
3975 if (result == -EINPROGRESS || pending_async_reads) {
3976 // come back later.
3977 if (pending_async_reads) {
3978 ceph_assert(pool.info.is_erasure());
3979 in_progress_async_reads.push_back(make_pair(op, ctx));
3980 ctx->start_async_reads(this);
3981 }
3982 return;
3983 }
3984
3985 if (result == -EAGAIN) {
3986 // clean up after the ctx
3987 close_op_ctx(ctx);
3988 return;
3989 }
3990
3991 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3992 // prepare the reply
3993 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0,
3994 successful_write);
3995
3996 // Write operations aren't allowed to return a data payload because
3997 // we can't do so reliably. If the client has to resend the request
3998 // and it has already been applied, we will return 0 with no
3999 // payload. Non-deterministic behavior is no good. However, it is
4000 // possible to construct an operation that does a read, does a guard
4001 // check (e.g., CMPXATTR), and then a write. Then we either succeed
4002 // with the write, or return a CMPXATTR and the read value.
4003 if (successful_write) {
4004 // write. normalize the result code.
4005 dout(20) << " zeroing write result code " << result << dendl;
4006 result = 0;
4007 }
4008 ctx->reply->set_result(result);
4009
4010 // read or error?
4011 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
4012 // finish side-effects
4013 if (result >= 0)
4014 do_osd_op_effects(ctx, m->get_connection());
4015
4016 complete_read_ctx(result, ctx);
4017 return;
4018 }
4019
4020 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
4021
4022 ceph_assert(op->may_write() || op->may_cache());
4023
4024 // trim log?
4025 if (hard_limit_pglog())
4026 calc_trim_to_aggressive();
4027 else
4028 calc_trim_to();
4029
4030 // verify that we are doing this in order?
4031 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
4032 !pool.info.is_tier() && !pool.info.has_tiers()) {
4033 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
4034 ceph_tid_t t = m->get_tid();
4035 client_t n = m->get_source().num();
4036 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
4037 if (p == cm.end()) {
4038 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
4039 cm[n] = t;
4040 } else {
4041 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
4042 if (p->second > t) {
4043 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
4044 ceph_abort_msg("out of order op");
4045 }
4046 p->second = t;
4047 }
4048 }
4049
4050 if (ctx->update_log_only) {
4051 if (result >= 0)
4052 do_osd_op_effects(ctx, m->get_connection());
4053
4054 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
4055 // save just what we need from ctx
4056 MOSDOpReply *reply = ctx->reply;
4057 ctx->reply = nullptr;
4058 reply->claim_op_out_data(*ctx->ops);
4059 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
4060 close_op_ctx(ctx);
4061
4062 if (result == -ENOENT) {
4063 reply->set_enoent_reply_versions(info.last_update,
4064 info.last_user_version);
4065 }
4066 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4067 // append to pg log for dup detection - don't save buffers for now
4068 record_write_error(op, soid, reply, result);
4069 return;
4070 }
4071
4072 // no need to capture PG ref, repop cancel will handle that
4073 // Can capture the ctx by pointer, it's owned by the repop
4074 ctx->register_on_commit(
4075 [m, ctx, this](){
4076 if (ctx->op)
4077 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4078
4079 if (m && !ctx->sent_reply) {
4080 MOSDOpReply *reply = ctx->reply;
4081 if (reply)
4082 ctx->reply = nullptr;
4083 else {
4084 reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, true);
4085 reply->set_reply_versions(ctx->at_version,
4086 ctx->user_at_version);
4087 }
4088 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4089 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4090 osd->send_message_osd_client(reply, m->get_connection());
4091 ctx->sent_reply = true;
4092 ctx->op->mark_commit_sent();
4093 }
4094 });
4095 ctx->register_on_success(
4096 [ctx, this]() {
4097 do_osd_op_effects(
4098 ctx,
4099 ctx->op ? ctx->op->get_req()->get_connection() :
4100 ConnectionRef());
4101 });
4102 ctx->register_on_finish(
4103 [ctx]() {
4104 delete ctx;
4105 });
4106
4107 // issue replica writes
4108 ceph_tid_t rep_tid = osd->get_tid();
4109
4110 RepGather *repop = new_repop(ctx, obc, rep_tid);
4111
4112 issue_repop(repop, ctx);
4113 eval_repop(repop);
4114 repop->put();
4115 }
4116
4117 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4118 release_object_locks(ctx->lock_manager);
4119
4120 ctx->op_t.reset();
4121
4122 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4123 ctx->on_finish.erase(p++)) {
4124 (*p)();
4125 }
4126 delete ctx;
4127 }
4128
4129 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4130 {
4131 if (ctx->op)
4132 osd->reply_op_error(ctx->op, r);
4133 close_op_ctx(ctx);
4134 }
4135
4136 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
4137 {
4138 if (ctx->op)
4139 osd->reply_op_error(ctx->op, r, v, uv);
4140 close_op_ctx(ctx);
4141 }
4142
4143 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4144 const uint64_t inb,
4145 const uint64_t outb)
4146 {
4147 const MOSDOp* const m = static_cast<const MOSDOp*>(op.get_req());
4148 const utime_t now = ceph_clock_now();
4149
4150 const utime_t latency = now - m->get_recv_stamp();
4151 const utime_t process_latency = now - op.get_dequeued_time();
4152
4153 osd->logger->inc(l_osd_op);
4154
4155 osd->logger->inc(l_osd_op_outb, outb);
4156 osd->logger->inc(l_osd_op_inb, inb);
4157 osd->logger->tinc(l_osd_op_lat, latency);
4158 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4159
4160 if (op.may_read() && op.may_write()) {
4161 osd->logger->inc(l_osd_op_rw);
4162 osd->logger->inc(l_osd_op_rw_inb, inb);
4163 osd->logger->inc(l_osd_op_rw_outb, outb);
4164 osd->logger->tinc(l_osd_op_rw_lat, latency);
4165 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4166 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4167 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4168 } else if (op.may_read()) {
4169 osd->logger->inc(l_osd_op_r);
4170 osd->logger->inc(l_osd_op_r_outb, outb);
4171 osd->logger->tinc(l_osd_op_r_lat, latency);
4172 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4173 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4174 } else if (op.may_write() || op.may_cache()) {
4175 osd->logger->inc(l_osd_op_w);
4176 osd->logger->inc(l_osd_op_w_inb, inb);
4177 osd->logger->tinc(l_osd_op_w_lat, latency);
4178 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4179 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4180 } else {
4181 ceph_abort();
4182 }
4183
4184 dout(15) << "log_op_stats " << *m
4185 << " inb " << inb
4186 << " outb " << outb
4187 << " lat " << latency << dendl;
4188
4189 if (m_dynamic_perf_stats.is_enabled()) {
4190 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4191 }
4192 }
4193
4194 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4195 const std::list<OSDPerfMetricQuery> &queries)
4196 {
4197 m_dynamic_perf_stats.set_queries(queries);
4198 }
4199
4200 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4201 {
4202 std::swap(m_dynamic_perf_stats, *stats);
4203 }
4204
4205 void PrimaryLogPG::do_scan(
4206 OpRequestRef op,
4207 ThreadPool::TPHandle &handle)
4208 {
4209 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
4210 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4211 dout(10) << "do_scan " << *m << dendl;
4212
4213 op->mark_started();
4214
4215 switch (m->op) {
4216 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4217 {
4218 auto dpp = get_dpp();
4219 if (osd->check_backfill_full(dpp)) {
4220 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4221 queue_peering_event(
4222 PGPeeringEventRef(
4223 std::make_shared<PGPeeringEvent>(
4224 get_osdmap_epoch(),
4225 get_osdmap_epoch(),
4226 BackfillTooFull())));
4227 return;
4228 }
4229
4230 BackfillInterval bi;
4231 bi.begin = m->begin;
4232 // No need to flush; there won't be any in-progress writes occurring
4233 // past m->begin
4234 scan_range(
4235 cct->_conf->osd_backfill_scan_min,
4236 cct->_conf->osd_backfill_scan_max,
4237 &bi,
4238 handle);
4239 MOSDPGScan *reply = new MOSDPGScan(
4240 MOSDPGScan::OP_SCAN_DIGEST,
4241 pg_whoami,
4242 get_osdmap_epoch(), m->query_epoch,
4243 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4244 encode(bi.objects, reply->get_data());
4245 osd->send_message_osd_cluster(reply, m->get_connection());
4246 }
4247 break;
4248
4249 case MOSDPGScan::OP_SCAN_DIGEST:
4250 {
4251 pg_shard_t from = m->from;
4252
4253 // Check that from is in backfill_targets vector
4254 ceph_assert(is_backfill_targets(from));
4255
4256 BackfillInterval& bi = peer_backfill_info[from];
4257 bi.begin = m->begin;
4258 bi.end = m->end;
4259 auto p = m->get_data().cbegin();
4260
4261 // take care to preserve ordering!
4262 bi.clear_objects();
4263 ::decode_noclear(bi.objects, p);
4264
4265 if (waiting_on_backfill.erase(from)) {
4266 if (waiting_on_backfill.empty()) {
4267 ceph_assert(peer_backfill_info.size() == backfill_targets.size());
4268 finish_recovery_op(hobject_t::get_max());
4269 }
4270 } else {
4271 // we canceled backfill for a while because a peer was too full, and
4272 // this is an extra response from a peer that was not too full
4273 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4274 }
4275 }
4276 break;
4277 }
4278 }
4279
4280 void PrimaryLogPG::do_backfill(OpRequestRef op)
4281 {
4282 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
4283 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4284 dout(10) << "do_backfill " << *m << dendl;
4285
4286 op->mark_started();
4287
4288 switch (m->op) {
4289 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4290 {
4291 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4292
4293 MOSDPGBackfill *reply = new MOSDPGBackfill(
4294 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4295 get_osdmap_epoch(),
4296 m->query_epoch,
4297 spg_t(info.pgid.pgid, get_primary().shard));
4298 reply->set_priority(get_recovery_op_priority());
4299 osd->send_message_osd_cluster(reply, m->get_connection());
4300 queue_peering_event(
4301 PGPeeringEventRef(
4302 std::make_shared<PGPeeringEvent>(
4303 get_osdmap_epoch(),
4304 get_osdmap_epoch(),
4305 RecoveryDone())));
4306 }
4307 // fall-thru
4308
4309 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4310 {
4311 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4312
4313 info.set_last_backfill(m->last_backfill);
4314 // During backfill, submit_push_data() tracks num_bytes, which is needed in
4315 // case backfill stops and restarts. We want to know how many bytes this
4316 // pg is consuming on disk in order to compute the amount of space to
4317 // reserve for new backfill data if it won't otherwise fit.
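// i.e. for OP_BACKFILL_PROGRESS keep the locally tracked num_bytes and take
// the remaining stats from the primary; for OP_BACKFILL_FINISH take the
// primary's stats wholesale.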
4318 if (m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS) {
4319 dout(0) << __func__ << " primary " << m->stats.stats.sum.num_bytes << " local " << info.stats.stats.sum.num_bytes << dendl;
4320 int64_t bytes = info.stats.stats.sum.num_bytes;
4321 info.stats = m->stats;
4322 info.stats.stats.sum.num_bytes = bytes;
4323 } else {
4324 dout(0) << __func__ << " final " << m->stats.stats.sum.num_bytes << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4325 info.stats = m->stats;
4326 }
4327
4328 ObjectStore::Transaction t;
4329 dirty_info = true;
4330 write_if_dirty(t);
4331 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4332 ceph_assert(tr == 0);
4333 }
4334 break;
4335
4336 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4337 {
4338 ceph_assert(is_primary());
4339 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4340 finish_recovery_op(hobject_t::get_max());
4341 }
4342 break;
4343 }
4344 }
4345
4346 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4347 {
4348 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4349 op->get_req());
4350 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4351 dout(7) << __func__ << " " << m->ls << dendl;
4352
4353 op->mark_started();
4354
4355 ObjectStore::Transaction t;
4356 for (auto& p : m->ls) {
4357 if (is_remote_backfilling()) {
4358 struct stat st;
4359 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4360 pg_whoami.shard) , &st);
4361 if (r == 0) {
4362 sub_local_num_bytes(st.st_size);
4363 int64_t usersize;
4364 if (pool.info.is_erasure()) {
4365 bufferlist bv;
4366 int r = osd->store->getattr(
4367 ch,
4368 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4369 OI_ATTR,
4370 bv);
4371 if (r >= 0) {
4372 object_info_t oi(bv);
4373 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4374 } else {
4375 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4376 << " can't get object info" << dendl;
4377 usersize = 0;
4378 }
4379 } else {
4380 usersize = st.st_size;
4381 }
4382 sub_num_bytes(usersize);
4383 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4384 << " sub actual data by " << st.st_size
4385 << " sub num_bytes by " << usersize
4386 << dendl;
4387 }
4388 }
4389 remove_snap_mapped_object(t, p.first);
4390 }
4391 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4392 ceph_assert(r == 0);
4393 }
4394
4395 int PrimaryLogPG::trim_object(
4396 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
4397 {
4398 *ctxp = NULL;
4399
4400 // load clone info
4401 bufferlist bl;
4402 ObjectContextRef obc = get_object_context(coid, false, NULL);
4403 if (!obc || !obc->ssc || !obc->ssc->exists) {
4404 osd->clog->error() << __func__ << ": Cannot trim " << coid
4405 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4406 return -ENOENT;
4407 }
4408
4409 hobject_t head_oid = coid.get_head();
4410 ObjectContextRef head_obc = get_object_context(head_oid, false);
4411 if (!head_obc) {
4412 osd->clog->error() << __func__ << ": Cannot trim " << coid
4413 << " repair needed, no snapset obc for " << head_oid;
4414 return -ENOENT;
4415 }
4416
4417 SnapSet& snapset = obc->ssc->snapset;
4418
4419 object_info_t &coi = obc->obs.oi;
4420 auto citer = snapset.clone_snaps.find(coid.snap);
4421 if (citer == snapset.clone_snaps.end()) {
4422 osd->clog->error() << "No clone_snaps in snapset " << snapset
4423 << " for object " << coid << "\n";
4424 return -ENOENT;
4425 }
4426 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4427 if (old_snaps.empty()) {
4428 osd->clog->error() << "No object info snaps for object " << coid;
4429 return -ENOENT;
4430 }
4431
4432 dout(10) << coid << " old_snaps " << old_snaps
4433 << " old snapset " << snapset << dendl;
4434 if (snapset.seq == 0) {
4435 osd->clog->error() << "No snapset.seq for object " << coid;
4436 return -ENOENT;
4437 }
4438
4439 set<snapid_t> new_snaps;
4440 for (set<snapid_t>::iterator i = old_snaps.begin();
4441 i != old_snaps.end();
4442 ++i) {
4443 if (!pool.info.is_removed_snap(*i))
4444 new_snaps.insert(*i);
4445 }
4446
4447 vector<snapid_t>::iterator p = snapset.clones.end();
4448
4449 if (new_snaps.empty()) {
4450 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4451 if (p == snapset.clones.end()) {
4452 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4453 return -ENOENT;
4454 }
4455 }
4456
4457 OpContextUPtr ctx = simple_opc_create(obc);
4458 ctx->head_obc = head_obc;
4459
4460 if (!ctx->lock_manager.get_snaptrimmer_write(
4461 coid,
4462 obc,
4463 first)) {
4464 close_op_ctx(ctx.release());
4465 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4466 return -ENOLCK;
4467 }
4468
4469 if (!ctx->lock_manager.get_snaptrimmer_write(
4470 head_oid,
4471 head_obc,
4472 first)) {
4473 close_op_ctx(ctx.release());
4474 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4475 return -ENOLCK;
4476 }
4477
4478 ctx->at_version = get_next_version();
4479
4480 PGTransaction *t = ctx->op_t.get();
4481
4482 if (new_snaps.empty()) {
4483 // remove clone
4484 dout(10) << coid << " snaps " << old_snaps << " -> "
4485 << new_snaps << " ... deleting" << dendl;
4486
4487 // ...from snapset
4488 ceph_assert(p != snapset.clones.end());
4489
4490 snapid_t last = coid.snap;
4491 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4492
4493 if (p != snapset.clones.begin()) {
4494 // not the oldest... merge overlap into next older clone
4495 vector<snapid_t>::iterator n = p - 1;
4496 hobject_t prev_coid = coid;
4497 prev_coid.snap = *n;
4498 bool adjust_prev_bytes = is_present_clone(prev_coid);
4499
4500 if (adjust_prev_bytes)
4501 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4502
4503 snapset.clone_overlap[*n].intersection_of(
4504 snapset.clone_overlap[*p]);
4505
4506 if (adjust_prev_bytes)
4507 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4508 }
4509 ctx->delta_stats.num_objects--;
4510 if (coi.is_dirty())
4511 ctx->delta_stats.num_objects_dirty--;
4512 if (coi.is_omap())
4513 ctx->delta_stats.num_objects_omap--;
4514 if (coi.is_whiteout()) {
4515 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4516 ctx->delta_stats.num_whiteouts--;
4517 }
4518 ctx->delta_stats.num_object_clones--;
4519 if (coi.is_cache_pinned())
4520 ctx->delta_stats.num_objects_pinned--;
4521 if (coi.has_manifest())
4522 ctx->delta_stats.num_objects_manifest--;
4523 obc->obs.exists = false;
4524
4525 snapset.clones.erase(p);
4526 snapset.clone_overlap.erase(last);
4527 snapset.clone_size.erase(last);
4528 snapset.clone_snaps.erase(last);
4529
4530 ctx->log.push_back(
4531 pg_log_entry_t(
4532 pg_log_entry_t::DELETE,
4533 coid,
4534 ctx->at_version,
4535 ctx->obs->oi.version,
4536 0,
4537 osd_reqid_t(),
4538 ctx->mtime,
4539 0)
4540 );
4541 t->remove(coid);
4542 t->update_snaps(
4543 coid,
4544 old_snaps,
4545 new_snaps);
4546
4547 coi = object_info_t(coid);
4548
4549 ctx->at_version.version++;
4550 } else {
4551 // save adjusted snaps for this object
4552 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4553 snapset.clone_snaps[coid.snap] =
4554 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4555 // we still do a 'modify' event on this object just to trigger a
4556 // snapmapper.update ... :(
4557
4558 coi.prior_version = coi.version;
4559 coi.version = ctx->at_version;
4560 bl.clear();
4561 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4562 t->setattr(coid, OI_ATTR, bl);
4563
4564 ctx->log.push_back(
4565 pg_log_entry_t(
4566 pg_log_entry_t::MODIFY,
4567 coid,
4568 coi.version,
4569 coi.prior_version,
4570 0,
4571 osd_reqid_t(),
4572 ctx->mtime,
4573 0)
4574 );
4575 ctx->at_version.version++;
4576
4577 t->update_snaps(
4578 coid,
4579 old_snaps,
4580 new_snaps);
4581 }
4582
4583 // save head snapset
4584 dout(10) << coid << " new snapset " << snapset << " on "
4585 << head_obc->obs.oi << dendl;
4586 if (snapset.clones.empty() &&
4587 (head_obc->obs.oi.is_whiteout() &&
4588 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4589 !head_obc->obs.oi.is_cache_pinned())) {
4590 // NOTE: this arguably constitutes minor interference with the
4591 // tiering agent if this is a cache tier since a snap trim event
4592 // is effectively evicting a whiteout we might otherwise want to
4593 // keep around.
4594 dout(10) << coid << " removing " << head_oid << dendl;
4595 ctx->log.push_back(
4596 pg_log_entry_t(
4597 pg_log_entry_t::DELETE,
4598 head_oid,
4599 ctx->at_version,
4600 head_obc->obs.oi.version,
4601 0,
4602 osd_reqid_t(),
4603 ctx->mtime,
4604 0)
4605 );
4606 derr << "removing snap head" << dendl;
4607 object_info_t& oi = head_obc->obs.oi;
4608 ctx->delta_stats.num_objects--;
4609 if (oi.is_dirty()) {
4610 ctx->delta_stats.num_objects_dirty--;
4611 }
4612 if (oi.is_omap())
4613 ctx->delta_stats.num_objects_omap--;
4614 if (oi.is_whiteout()) {
4615 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4616 ctx->delta_stats.num_whiteouts--;
4617 }
4618 if (oi.is_cache_pinned()) {
4619 ctx->delta_stats.num_objects_pinned--;
4620 }
4621 if (coi.has_manifest())
4622 ctx->delta_stats.num_objects_manifest--;
4623 head_obc->obs.exists = false;
4624 head_obc->obs.oi = object_info_t(head_oid);
4625 t->remove(head_oid);
4626 } else {
4627 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4628 snapset.filter(pool.info);
4629 dout(10) << coid << " writing updated snapset on " << head_oid
4630 << ", snapset is " << snapset << dendl;
4631 ctx->log.push_back(
4632 pg_log_entry_t(
4633 pg_log_entry_t::MODIFY,
4634 head_oid,
4635 ctx->at_version,
4636 head_obc->obs.oi.version,
4637 0,
4638 osd_reqid_t(),
4639 ctx->mtime,
4640 0)
4641 );
4642
4643 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4644 head_obc->obs.oi.version = ctx->at_version;
4645
4646 map <string, bufferlist> attrs;
4647 bl.clear();
4648 encode(snapset, bl);
4649 attrs[SS_ATTR].claim(bl);
4650
4651 bl.clear();
4652 encode(head_obc->obs.oi, bl,
4653 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4654 attrs[OI_ATTR].claim(bl);
4655 t->setattrs(head_oid, attrs);
4656 }
4657
4658 *ctxp = std::move(ctx);
4659 return 0;
4660 }
4661
4662 void PrimaryLogPG::kick_snap_trim()
4663 {
4664 ceph_assert(is_active());
4665 ceph_assert(is_primary());
4666 if (is_clean() &&
4667 !state_test(PG_STATE_PREMERGE) &&
4668 !snap_trimq.empty()) {
4669 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4670 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4671 } else {
4672 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4673 snap_trimmer_machine.process_event(KickTrim());
4674 }
4675 }
4676 }
4677
4678 void PrimaryLogPG::snap_trimmer_scrub_complete()
4679 {
4680 if (is_primary() && is_active() && is_clean()) {
4681 ceph_assert(!snap_trimq.empty());
4682 snap_trimmer_machine.process_event(ScrubComplete());
4683 }
4684 }
4685
4686 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4687 {
4688 if (deleting || pg_has_reset_since(queued)) {
4689 return;
4690 }
4691
4692 ceph_assert(is_primary());
4693
4694 dout(10) << "snap_trimmer posting" << dendl;
4695 snap_trimmer_machine.process_event(DoSnapWork());
4696 dout(10) << "snap_trimmer complete" << dendl;
4697 return;
4698 }
4699
4700 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4701 {
4702 __u64 v2;
4703
4704 string v2s(xattr.c_str(), xattr.length());
4705 if (v2s.length())
4706 v2 = strtoull(v2s.c_str(), NULL, 10);
4707 else
4708 v2 = 0;
4709
4710 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4711
4712 switch (op) {
4713 case CEPH_OSD_CMPXATTR_OP_EQ:
4714 return (v1 == v2);
4715 case CEPH_OSD_CMPXATTR_OP_NE:
4716 return (v1 != v2);
4717 case CEPH_OSD_CMPXATTR_OP_GT:
4718 return (v1 > v2);
4719 case CEPH_OSD_CMPXATTR_OP_GTE:
4720 return (v1 >= v2);
4721 case CEPH_OSD_CMPXATTR_OP_LT:
4722 return (v1 < v2);
4723 case CEPH_OSD_CMPXATTR_OP_LTE:
4724 return (v1 <= v2);
4725 default:
4726 return -EINVAL;
4727 }
4728 }
4729
4730 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4731 {
4732 string v2s(xattr.c_str(), xattr.length());
4733
4734 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4735
4736 switch (op) {
4737 case CEPH_OSD_CMPXATTR_OP_EQ:
4738 return (v1s.compare(v2s) == 0);
4739 case CEPH_OSD_CMPXATTR_OP_NE:
4740 return (v1s.compare(v2s) != 0);
4741 case CEPH_OSD_CMPXATTR_OP_GT:
4742 return (v1s.compare(v2s) > 0);
4743 case CEPH_OSD_CMPXATTR_OP_GTE:
4744 return (v1s.compare(v2s) >= 0);
4745 case CEPH_OSD_CMPXATTR_OP_LT:
4746 return (v1s.compare(v2s) < 0);
4747 case CEPH_OSD_CMPXATTR_OP_LTE:
4748 return (v1s.compare(v2s) <= 0);
4749 default:
4750 return -EINVAL;
4751 }
4752 }
4753
4754 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4755 {
4756 ceph_osd_op& op = osd_op.op;
4757 vector<OSDOp> write_ops(1);
4758 OSDOp& write_op = write_ops[0];
4759 uint64_t write_length = op.writesame.length;
4760 int result = 0;
4761
4762 if (!write_length)
4763 return 0;
4764
4765 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4766 return -EINVAL;
4767
4768 if (op.writesame.data_length != osd_op.indata.length()) {
4769 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4770 return -EINVAL;
4771 }
4772
4773 while (write_length) {
4774 write_op.indata.append(osd_op.indata);
4775 write_length -= op.writesame.data_length;
4776 }
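// write_op.indata now holds the client-supplied pattern repeated
// write_length / data_length times, i.e. exactly op.writesame.length bytes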
4777
4778 write_op.op.op = CEPH_OSD_OP_WRITE;
4779 write_op.op.extent.offset = op.writesame.offset;
4780 write_op.op.extent.length = op.writesame.length;
4781 result = do_osd_ops(ctx, write_ops);
4782 if (result < 0)
4783 derr << "do_writesame do_osd_ops failed " << result << dendl;
4784
4785 return result;
4786 }
4787
4788 // ========================================================================
4789 // low level osd ops
4790
4791 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4792 {
4793 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4794 bufferlist header, vals;
4795 int r = _get_tmap(ctx, &header, &vals);
4796 if (r < 0) {
4797 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4798 r = 0;
4799 return r;
4800 }
4801
4802 vector<OSDOp> ops(3);
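// rewrite the object in three steps: truncate away the tmap payload, then
// install the tmap header as the omap header and the tmap key/value pairs
// as omap entries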
4803
4804 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4805 ops[0].op.extent.offset = 0;
4806 ops[0].op.extent.length = 0;
4807
4808 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4809 ops[1].indata.claim(header);
4810
4811 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4812 ops[2].indata.claim(vals);
4813
4814 return do_osd_ops(ctx, ops);
4815 }
4816
4817 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4818 OSDOp& osd_op, bufferlist& bl)
4819 {
4820 // decode
4821 bufferlist header;
4822 map<string, bufferlist> m;
4823 if (bl.length()) {
4824 auto p = bl.cbegin();
4825 decode(header, p);
4826 decode(m, p);
4827 ceph_assert(p.end());
4828 }
4829
4830 // do the update(s)
4831 while (!bp.end()) {
4832 __u8 op;
4833 string key;
4834 decode(op, bp);
4835
4836 switch (op) {
4837 case CEPH_OSD_TMAP_SET: // insert key
4838 {
4839 decode(key, bp);
4840 bufferlist data;
4841 decode(data, bp);
4842 m[key] = data;
4843 }
4844 break;
4845 case CEPH_OSD_TMAP_RM: // remove key
4846 decode(key, bp);
4847 if (!m.count(key)) {
4848 return -ENOENT;
4849 }
4850 m.erase(key);
4851 break;
4852 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4853 decode(key, bp);
4854 m.erase(key);
4855 break;
4856 case CEPH_OSD_TMAP_HDR: // update header
4857 {
4858 decode(header, bp);
4859 }
4860 break;
4861 default:
4862 return -EINVAL;
4863 }
4864 }
4865
4866 // reencode
4867 bufferlist obl;
4868 encode(header, obl);
4869 encode(m, obl);
4870
4871 // write it out
4872 vector<OSDOp> nops(1);
4873 OSDOp& newop = nops[0];
4874 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4875 newop.op.extent.offset = 0;
4876 newop.op.extent.length = obl.length();
4877 newop.indata = obl;
4878 do_osd_ops(ctx, nops);
4879 osd_op.outdata.claim(newop.outdata);
4880 return 0;
4881 }
4882
4883 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4884 {
4885 bufferlist::const_iterator orig_bp = bp;
4886 int result = 0;
4887 if (bp.end()) {
4888 dout(10) << "tmapup is a no-op" << dendl;
4889 } else {
4890 // read the whole object
4891 vector<OSDOp> nops(1);
4892 OSDOp& newop = nops[0];
4893 newop.op.op = CEPH_OSD_OP_READ;
4894 newop.op.extent.offset = 0;
4895 newop.op.extent.length = 0;
4896 result = do_osd_ops(ctx, nops);
4897
4898 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4899
4900 dout(30) << " starting is \n";
4901 newop.outdata.hexdump(*_dout);
4902 *_dout << dendl;
4903
4904 auto ip = newop.outdata.cbegin();
4905 bufferlist obl;
4906
4907 dout(30) << "the update command is: \n";
4908 osd_op.indata.hexdump(*_dout);
4909 *_dout << dendl;
4910
4911 // header
4912 bufferlist header;
4913 __u32 nkeys = 0;
4914 if (newop.outdata.length()) {
4915 decode(header, ip);
4916 decode(nkeys, ip);
4917 }
4918 dout(10) << "tmapup header " << header.length() << dendl;
4919
4920 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4921 ++bp;
4922 decode(header, bp);
4923 dout(10) << "tmapup new header " << header.length() << dendl;
4924 }
4925
4926 encode(header, obl);
4927
4928 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4929
4930 // update keys
4931 bufferlist newkeydata;
4932 string nextkey, last_in_key;
4933 bufferlist nextval;
4934 bool have_next = false;
4935 if (!ip.end()) {
4936 have_next = true;
4937 decode(nextkey, ip);
4938 decode(nextval, ip);
4939 }
4940 while (!bp.end() && !result) {
4941 __u8 op;
4942 string key;
4943 try {
4944 decode(op, bp);
4945 decode(key, bp);
4946 }
4947 catch (buffer::error& e) {
4948 return -EINVAL;
4949 }
4950 if (key < last_in_key) {
4951 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4952 << "', falling back to an inefficient (unsorted) update" << dendl;
4953 bp = orig_bp;
4954 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4955 }
4956 last_in_key = key;
4957
4958 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4959
4960 // skip existing intervening keys
4961 bool key_exists = false;
4962 while (have_next && !key_exists) {
4963 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4964 if (nextkey > key)
4965 break;
4966 if (nextkey < key) {
4967 // copy untouched.
4968 encode(nextkey, newkeydata);
4969 encode(nextval, newkeydata);
4970 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4971 } else {
4972 // don't copy; discard old value. and stop.
4973 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4974 key_exists = true;
4975 nkeys--;
4976 }
4977 if (!ip.end()) {
4978 decode(nextkey, ip);
4979 decode(nextval, ip);
4980 } else {
4981 have_next = false;
4982 }
4983 }
4984
4985 if (op == CEPH_OSD_TMAP_SET) {
4986 bufferlist val;
4987 try {
4988 decode(val, bp);
4989 }
4990 catch (buffer::error& e) {
4991 return -EINVAL;
4992 }
4993 encode(key, newkeydata);
4994 encode(val, newkeydata);
4995 dout(20) << " set " << key << " " << val.length() << dendl;
4996 nkeys++;
4997 } else if (op == CEPH_OSD_TMAP_CREATE) {
4998 if (key_exists) {
4999 return -EEXIST;
5000 }
5001 bufferlist val;
5002 try {
5003 decode(val, bp);
5004 }
5005 catch (buffer::error& e) {
5006 return -EINVAL;
5007 }
5008 encode(key, newkeydata);
5009 encode(val, newkeydata);
5010 dout(20) << " create " << key << " " << val.length() << dendl;
5011 nkeys++;
5012 } else if (op == CEPH_OSD_TMAP_RM) {
5013 // do nothing.
5014 if (!key_exists) {
5015 return -ENOENT;
5016 }
5017 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5018 // do nothing
5019 } else {
5020 dout(10) << " invalid tmap op " << (int)op << dendl;
5021 return -EINVAL;
5022 }
5023 }
5024
5025 // copy remaining
5026 if (have_next) {
5027 encode(nextkey, newkeydata);
5028 encode(nextval, newkeydata);
5029 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5030 }
5031 if (!ip.end()) {
5032 bufferlist rest;
5033 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5034 dout(20) << " keep trailing " << rest.length()
5035 << " at " << newkeydata.length() << dendl;
5036 newkeydata.claim_append(rest);
5037 }
5038
5039 // encode final key count + key data
5040 dout(20) << "tmapup final nkeys " << nkeys << dendl;
5041 encode(nkeys, obl);
5042 obl.claim_append(newkeydata);
5043
5044 if (0) {
5045 dout(30) << " final is \n";
5046 obl.hexdump(*_dout);
5047 *_dout << dendl;
5048
5049 // sanity check
5050 auto tp = obl.cbegin();
5051 bufferlist h;
5052 decode(h, tp);
5053 map<string,bufferlist> d;
5054 decode(d, tp);
5055 ceph_assert(tp.end());
5056 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5057 }
5058
5059 // write it out
5060 if (!result) {
5061 dout(20) << "tmapput write " << obl.length() << dendl;
5062 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5063 newop.op.extent.offset = 0;
5064 newop.op.extent.length = obl.length();
5065 newop.indata = obl;
5066 do_osd_ops(ctx, nops);
5067 osd_op.outdata.claim(newop.outdata);
5068 }
5069 }
5070 return result;
5071 }
5072
5073 static int check_offset_and_length(uint64_t offset, uint64_t length,
5074 uint64_t max, DoutPrefixProvider *dpp)
5075 {
5076 if (offset >= max ||
5077 length > max ||
5078 offset + length > max) {
5079 ldpp_dout(dpp, 10) << __func__ << " "
5080 << "osd_max_object_size: " << max
5081 << "; Hard limit of object size is 4GB." << dendl;
5082 return -EFBIG;
5083 }
5084
5085 return 0;
5086 }
5087
5088 struct FillInVerifyExtent : public Context {
5089 ceph_le64 *r;
5090 int32_t *rval;
5091 bufferlist *outdatap;
5092 boost::optional<uint32_t> maybe_crc;
5093 uint64_t size;
5094 OSDService *osd;
5095 hobject_t soid;
5096 __le32 flags;
5097 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5098 boost::optional<uint32_t> mc, uint64_t size,
5099 OSDService *osd, hobject_t soid, __le32 flags) :
5100 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5101 size(size), osd(osd), soid(soid), flags(flags) {}
5102 void finish(int len) override {
5103 *r = len;
5104 if (len < 0) {
5105 *rval = len;
5106 return;
5107 }
5108 *rval = 0;
5109
5110 // whole object? can we verify the checksum?
5111 if (maybe_crc && *r == size) {
5112 uint32_t crc = outdatap->crc32c(-1);
5113 if (maybe_crc != crc) {
5114 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5115 << " != expected 0x" << *maybe_crc
5116 << std::dec << " on " << soid;
5117 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5118 *rval = -EIO;
5119 *r = 0;
5120 }
5121 }
5122 }
5123 }
5124 };
5125
5126 struct ToSparseReadResult : public Context {
5127 int* result;
5128 bufferlist* data_bl;
5129 uint64_t data_offset;
5130 ceph_le64* len;
5131 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5132 ceph_le64* len)
5133 : result(result), data_bl(bl), data_offset(offset), len(len) {}
5134 void finish(int r) override {
5135 if (r < 0) {
5136 *result = r;
5137 return;
5138 }
5139 *result = 0;
5140 *len = r;
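// repackage the flat read as a sparse-read style reply: a single-extent
// map {data_offset -> length} followed by the data itself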
5141 bufferlist outdata;
5142 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5143 encode(extents, outdata);
5144 ::encode_destructively(*data_bl, outdata);
5145 data_bl->swap(outdata);
5146 }
5147 };
5148
5149 template<typename V>
5150 static string list_keys(const map<string, V>& m) {
5151 string s;
5152 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5153 if (!s.empty()) {
5154 s.push_back(',');
5155 }
5156 s.append(itr->first);
5157 }
5158 return s;
5159 }
5160
5161 template<typename T>
5162 static string list_entries(const T& m) {
5163 string s;
5164 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5165 if (!s.empty()) {
5166 s.push_back(',');
5167 }
5168 s.append(*itr);
5169 }
5170 return s;
5171 }
5172
5173 void PrimaryLogPG::maybe_create_new_object(
5174 OpContext *ctx,
5175 bool ignore_transaction)
5176 {
5177 ObjectState& obs = ctx->new_obs;
5178 if (!obs.exists) {
5179 ctx->delta_stats.num_objects++;
5180 obs.exists = true;
5181 ceph_assert(!obs.oi.is_whiteout());
5182 obs.oi.new_object();
5183 if (!ignore_transaction)
5184 ctx->op_t->create(obs.oi.soid);
5185 } else if (obs.oi.is_whiteout()) {
5186 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5187 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5188 --ctx->delta_stats.num_whiteouts;
5189 }
5190 }
5191
5192 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5193 OSDOp& osd_op;
5194
5195 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5196 }
5197
5198 int execute() override {
5199 return osd_op.rval;
5200 }
5201 };
5202
5203 struct C_ChecksumRead : public Context {
5204 PrimaryLogPG *primary_log_pg;
5205 OSDOp &osd_op;
5206 Checksummer::CSumType csum_type;
5207 bufferlist init_value_bl;
5208 ceph_le64 read_length;
5209 bufferlist read_bl;
5210 Context *fill_extent_ctx;
5211
5212 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5213 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5214 boost::optional<uint32_t> maybe_crc, uint64_t size,
5215 OSDService *osd, hobject_t soid, __le32 flags)
5216 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5217 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5218 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5219 &read_bl, maybe_crc, size,
5220 osd, soid, flags)) {
5221 }
5222 ~C_ChecksumRead() override {
5223 delete fill_extent_ctx;
5224 }
5225
5226 void finish(int r) override {
5227 fill_extent_ctx->complete(r);
5228 fill_extent_ctx = nullptr;
5229
5230 if (osd_op.rval >= 0) {
5231 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5232 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5233 &init_value_bl_it, read_bl);
5234 }
5235 }
5236 };
5237
5238 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5239 bufferlist::const_iterator *bl_it)
5240 {
5241 dout(20) << __func__ << dendl;
5242
5243 auto& op = osd_op.op;
5244 if (op.checksum.chunk_size > 0) {
5245 if (op.checksum.length == 0) {
5246 dout(10) << __func__ << ": length required when chunk size provided"
5247 << dendl;
5248 return -EINVAL;
5249 }
5250 if (op.checksum.length % op.checksum.chunk_size != 0) {
5251 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5252 return -EINVAL;
5253 }
5254 }
5255
5256 auto& oi = ctx->new_obs.oi;
5257 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5258 // zeroed offset+length implies checksum whole object
5259 op.checksum.length = oi.size;
5260 } else if (op.checksum.offset >= oi.size) {
5261 // read size was trimmed to zero, do nothing
5262 // see PrimaryLogPG::do_read
5263 return 0;
5264 } else if (op.extent.offset + op.extent.length > oi.size) {
5265 op.extent.length = oi.size - op.extent.offset;
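// note: ceph_osd_op is a union, so op.extent.{offset,length} alias
// op.checksum.{offset,length}; this trims the checksum range to oi.size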
5266 if (op.checksum.chunk_size > 0 &&
5267 op.checksum.length % op.checksum.chunk_size != 0) {
5268 dout(10) << __func__ << ": length (trimmed to 0x"
5269 << std::hex << op.checksum.length
5270 << ") not aligned to chunk size 0x"
5271 << op.checksum.chunk_size << std::dec
5272 << dendl;
5273 return -EINVAL;
5274 }
5275 }
5276
5277 Checksummer::CSumType csum_type;
5278 switch (op.checksum.type) {
5279 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5280 csum_type = Checksummer::CSUM_XXHASH32;
5281 break;
5282 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5283 csum_type = Checksummer::CSUM_XXHASH64;
5284 break;
5285 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5286 csum_type = Checksummer::CSUM_CRC32C;
5287 break;
5288 default:
5289 dout(10) << __func__ << ": unknown crc type ("
5290 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5291 return -EINVAL;
5292 }
5293
5294 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5295 if (bl_it->get_remaining() < csum_init_value_size) {
5296 dout(10) << __func__ << ": init value not provided" << dendl;
5297 return -EINVAL;
5298 }
5299
5300 bufferlist init_value_bl;
5301 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5302 csum_init_value_size);
5303 bl_it->advance(csum_init_value_size);
5304
5305 if (pool.info.is_erasure() && op.checksum.length > 0) {
5306 // If there is a data digest and it is possible we are reading
5307 // entire object, pass the digest.
5308 boost::optional<uint32_t> maybe_crc;
5309 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5310 op.checksum.length >= oi.size) {
5311 maybe_crc = oi.data_digest;
5312 }
5313
5314 // async read
5315 auto& soid = oi.soid;
5316 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5317 std::move(init_value_bl), maybe_crc,
5318 oi.size, osd, soid, op.flags);
5319
5320 ctx->pending_async_reads.push_back({
5321 {op.checksum.offset, op.checksum.length, op.flags},
5322 {&checksum_ctx->read_bl, checksum_ctx}});
5323
5324 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5325 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5326 new ReadFinisher(osd_op));
5327 return -EINPROGRESS;
5328 }
5329
5330 // sync read
5331 std::vector<OSDOp> read_ops(1);
5332 auto& read_op = read_ops[0];
5333 if (op.checksum.length > 0) {
5334 read_op.op.op = CEPH_OSD_OP_READ;
5335 read_op.op.flags = op.flags;
5336 read_op.op.extent.offset = op.checksum.offset;
5337 read_op.op.extent.length = op.checksum.length;
5338 read_op.op.extent.truncate_size = 0;
5339 read_op.op.extent.truncate_seq = 0;
5340
5341 int r = do_osd_ops(ctx, read_ops);
5342 if (r < 0) {
5343 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5344 return r;
5345 }
5346 }
5347
5348 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5349 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5350 read_op.outdata);
5351 }
5352
5353 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5354 Checksummer::CSumType csum_type,
5355 bufferlist::const_iterator *init_value_bl_it,
5356 const bufferlist &read_bl) {
5357 dout(20) << __func__ << dendl;
5358
5359 auto& op = osd_op.op;
5360
5361 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5362 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5363 << op.checksum.length << dendl;
5364 return -EINVAL;
5365 }
5366
5367 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5368 op.checksum.chunk_size : read_bl.length());
5369 uint32_t csum_count = (csum_chunk_size > 0 ?
5370 read_bl.length() / csum_chunk_size : 0);
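// reply layout: csum_count followed by csum_count fixed-size checksum
// values, one per chunk (a single value covering the whole read when
// chunk_size == 0)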
5371
5372 bufferlist csum;
5373 bufferptr csum_data;
5374 if (csum_count > 0) {
5375 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5376 csum_data = buffer::create(csum_value_size * csum_count);
5377 csum_data.zero();
5378 csum.append(csum_data);
5379
5380 switch (csum_type) {
5381 case Checksummer::CSUM_XXHASH32:
5382 {
5383 Checksummer::xxhash32::init_value_t init_value;
5384 decode(init_value, *init_value_bl_it);
5385 Checksummer::calculate<Checksummer::xxhash32>(
5386 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5387 &csum_data);
5388 }
5389 break;
5390 case Checksummer::CSUM_XXHASH64:
5391 {
5392 Checksummer::xxhash64::init_value_t init_value;
5393 decode(init_value, *init_value_bl_it);
5394 Checksummer::calculate<Checksummer::xxhash64>(
5395 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5396 &csum_data);
5397 }
5398 break;
5399 case Checksummer::CSUM_CRC32C:
5400 {
5401 Checksummer::crc32c::init_value_t init_value;
5402 decode(init_value, *init_value_bl_it);
5403 Checksummer::calculate<Checksummer::crc32c>(
5404 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5405 &csum_data);
5406 }
5407 break;
5408 default:
5409 break;
5410 }
5411 }
5412
5413 encode(csum_count, osd_op.outdata);
5414 osd_op.outdata.claim_append(csum);
5415 return 0;
5416 }
5417
5418 struct C_ExtentCmpRead : public Context {
5419 PrimaryLogPG *primary_log_pg;
5420 OSDOp &osd_op;
5421 ceph_le64 read_length{};
5422 bufferlist read_bl;
5423 Context *fill_extent_ctx;
5424
5425 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5426 boost::optional<uint32_t> maybe_crc, uint64_t size,
5427 OSDService *osd, hobject_t soid, __le32 flags)
5428 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5429 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5430 &read_bl, maybe_crc, size,
5431 osd, soid, flags)) {
5432 }
5433 ~C_ExtentCmpRead() override {
5434 delete fill_extent_ctx;
5435 }
5436
5437 void finish(int r) override {
5438 if (r == -ENOENT) {
5439 osd_op.rval = 0;
5440 read_bl.clear();
5441 delete fill_extent_ctx;
5442 } else {
5443 fill_extent_ctx->complete(r);
5444 }
5445 fill_extent_ctx = nullptr;
5446
5447 if (osd_op.rval >= 0) {
5448 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5449 }
5450 }
5451 };
5452
5453 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5454 {
5455 dout(20) << __func__ << dendl;
5456 ceph_osd_op& op = osd_op.op;
5457
5458 auto& oi = ctx->new_obs.oi;
5459 uint64_t size = oi.size;
5460 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5461 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5462 size = op.extent.truncate_size;
5463 }
5464
5465 if (op.extent.offset >= size) {
5466 op.extent.length = 0;
5467 } else if (op.extent.offset + op.extent.length > size) {
5468 op.extent.length = size - op.extent.offset;
5469 }
5470
5471 if (op.extent.length == 0) {
5472 dout(20) << __func__ << " zero length extent" << dendl;
5473 return finish_extent_cmp(osd_op, bufferlist{});
5474 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5475 dout(20) << __func__ << " object DNE" << dendl;
5476 return finish_extent_cmp(osd_op, {});
5477 } else if (pool.info.is_erasure()) {
5478 // If there is a data digest and it is possible we are reading
5479 // entire object, pass the digest.
5480 boost::optional<uint32_t> maybe_crc;
5481 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5482 op.checksum.length >= oi.size) {
5483 maybe_crc = oi.data_digest;
5484 }
5485
5486 // async read
5487 auto& soid = oi.soid;
5488 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5489 osd, soid, op.flags);
5490 ctx->pending_async_reads.push_back({
5491 {op.extent.offset, op.extent.length, op.flags},
5492 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5493
5494 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5495
5496 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5497 new ReadFinisher(osd_op));
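    // Returning -EINPROGRESS parks this op; when the async read completes,
    // the op is re-executed and the ReadFinisher above finishes the
    // comparison via the op_finisher path.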
5498 return -EINPROGRESS;
5499 }
5500
5501 // sync read
5502 vector<OSDOp> read_ops(1);
5503 OSDOp& read_op = read_ops[0];
5504
5505 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5506 read_op.op.extent.offset = op.extent.offset;
5507 read_op.op.extent.length = op.extent.length;
5508 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5509 read_op.op.extent.truncate_size = op.extent.truncate_size;
5510
5511 int result = do_osd_ops(ctx, read_ops);
5512 if (result < 0) {
5513 derr << __func__ << " failed " << result << dendl;
5514 return result;
5515 }
5516 return finish_extent_cmp(osd_op, read_op.outdata);
5517 }
5518
5519 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5520 {
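  // Compare the client-supplied buffer byte-for-byte against what was read;
  // a short read is implicitly zero-padded. On the first mismatch, the
  // offset is folded into the error code as (-MAX_ERRNO - idx) so the
  // caller can recover the mismatching byte position.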
5521 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5522 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5523 if (osd_op.indata[idx] != read_byte) {
5524 return (-MAX_ERRNO - idx);
5525 }
5526 }
5527
5528 return 0;
5529 }
5530
5531 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5532 dout(20) << __func__ << dendl;
5533 auto& op = osd_op.op;
5534 auto& oi = ctx->new_obs.oi;
5535 auto& soid = oi.soid;
5536 __u32 seq = oi.truncate_seq;
5537 uint64_t size = oi.size;
5538 bool trimmed_read = false;
5539
5540 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5541 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5542 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5543 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5544
5545 // are we beyond truncate_size?
5546 if ( (seq < op.extent.truncate_seq) &&
5547 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5548 (size > op.extent.truncate_size) )
5549 size = op.extent.truncate_size;
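  // Illustrative example: if the object was truncated to 4096 bytes with
  // truncate_seq 2, a read of 0~8192 still carrying truncate_seq 1 must not
  // see data past 4096, so the effective size is clamped here.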
5550
5551 if (op.extent.length == 0) // a length of zero means read the whole object
5552 op.extent.length = size;
5553
5554 if (op.extent.offset >= size) {
5555 op.extent.length = 0;
5556 trimmed_read = true;
5557 } else if (op.extent.offset + op.extent.length > size) {
5558 op.extent.length = size - op.extent.offset;
5559 trimmed_read = true;
5560 }
5561
5562 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5563
5564 // read into a buffer
5565 int result = 0;
5566 if (trimmed_read && op.extent.length == 0) {
5567 // the read was trimmed to zero, so we deliberately do nothing here.
5568 // a client-requested read of 0 bytes means "read the whole object"
5569 // (see above), which is why the trimmed_read flag is needed
5570 } else if (pool.info.is_erasure()) {
5571 // The initialisation below is required to silence a false positive
5572 // -Wmaybe-uninitialized warning
5573 boost::optional<uint32_t> maybe_crc = boost::make_optional(false, uint32_t());
5574 // If there is a data digest and it is possible we are reading
5575 // entire object, pass the digest. FillInVerifyExtent will
5576 // check the oi.size again.
5577 if (oi.is_data_digest() && op.extent.offset == 0 &&
5578 op.extent.length >= oi.size)
5579 maybe_crc = oi.data_digest;
5580 ctx->pending_async_reads.push_back(
5581 make_pair(
5582 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5583 make_pair(&osd_op.outdata,
5584 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5585 &osd_op.outdata, maybe_crc, oi.size,
5586 osd, soid, op.flags))));
5587 dout(10) << " async_read noted for " << soid << dendl;
5588
5589 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5590 new ReadFinisher(osd_op));
5591 } else {
5592 int r = pgbackend->objects_read_sync(
5593 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5594 // whole object? can we verify the checksum?
5595 if (r >= 0 && op.extent.offset == 0 &&
5596 (uint64_t)r == oi.size && oi.is_data_digest()) {
5597 uint32_t crc = osd_op.outdata.crc32c(-1);
5598 if (oi.data_digest != crc) {
5599 osd->clog->error() << info.pgid << std::hex
5600 << " full-object read crc 0x" << crc
5601 << " != expected 0x" << oi.data_digest
5602 << std::dec << " on " << soid;
5603 r = -EIO; // try repair later
5604 }
5605 }
5606 if (r == -EIO) {
5607 r = rep_repair_primary_object(soid, ctx);
5608 }
5609 if (r >= 0)
5610 op.extent.length = r;
5611 else if (r == -EAGAIN) {
5612 result = -EAGAIN;
5613 } else {
5614 result = r;
5615 op.extent.length = 0;
5616 }
5617 dout(10) << " read got " << r << " / " << op.extent.length
5618 << " bytes from obj " << soid << dendl;
5619 }
5620 if (result >= 0) {
5621 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5622 ctx->delta_stats.num_rd++;
5623 }
5624 return result;
5625 }
5626
5627 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5628 dout(20) << __func__ << dendl;
5629 auto& op = osd_op.op;
5630 auto& oi = ctx->new_obs.oi;
5631 auto& soid = oi.soid;
5632
5633 if (op.extent.truncate_seq) {
5634 dout(0) << "sparse_read does not support a truncation sequence" << dendl;
5635 return -EINVAL;
5636 }
5637
5638 ++ctx->num_read;
5639 if (pool.info.is_erasure()) {
5640 // translate sparse read to a normal one if not supported
5641 uint64_t offset = op.extent.offset;
5642 uint64_t length = op.extent.length;
5643 if (offset > oi.size) {
5644 length = 0;
5645 } else if (offset + length > oi.size) {
5646 length = oi.size - offset;
5647 }
5648
5649 if (length > 0) {
5650 ctx->pending_async_reads.push_back(
5651 make_pair(
5652 boost::make_tuple(offset, length, op.flags),
5653 make_pair(
5654 &osd_op.outdata,
5655 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5656 &op.extent.length))));
5657 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5658
5659 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5660 new ReadFinisher(osd_op));
5661 } else {
5662 dout(10) << " sparse read ended up empty for " << soid << dendl;
5663 map<uint64_t, uint64_t> extents;
5664 encode(extents, osd_op.outdata);
5665 }
5666 } else {
5667 // read into a buffer
5668 map<uint64_t, uint64_t> m;
5669 uint32_t total_read = 0;
5670 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5671 info.pgid.shard),
5672 op.extent.offset, op.extent.length, m);
5673 if (r < 0) {
5674 return r;
5675 }
5676
5677 map<uint64_t, uint64_t>::iterator miter;
5678 bufferlist data_bl;
5679 uint64_t last = op.extent.offset;
5680 for (miter = m.begin(); miter != m.end(); ++miter) {
5681 // verify hole?
5682 if (cct->_conf->osd_verify_sparse_read_holes &&
5683 last < miter->first) {
5684 bufferlist t;
5685 uint64_t len = miter->first - last;
5686 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5687 if (r < 0) {
5688 osd->clog->error() << coll << " " << soid
5689 << " sparse-read failed to read: "
5690 << r;
5691 } else if (!t.is_zero()) {
5692 osd->clog->error() << coll << " " << soid
5693 << " sparse-read found data in hole "
5694 << last << "~" << len;
5695 }
5696 }
5697
5698 bufferlist tmpbl;
5699 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
5700 op.flags, &tmpbl);
5701 if (r == -EIO) {
5702 r = rep_repair_primary_object(soid, ctx);
5703 }
5704 if (r < 0) {
5705 return r;
5706 }
5707
5708 // this usually happens when we get an extent that exceeds the actual
5709 // object size
5710 if (r < (int)miter->second)
5711 miter->second = r;
5712 total_read += r;
5713 dout(10) << "sparse-read " << miter->first << "@" << miter->second
5714 << dendl;
5715 data_bl.claim_append(tmpbl);
5716 last = miter->first + r;
5717 }
5718
5719 // verify trailing hole?
5720 if (cct->_conf->osd_verify_sparse_read_holes) {
5721 uint64_t end = std::min<uint64_t>(op.extent.offset + op.extent.length,
5722 oi.size);
5723 if (last < end) {
5724 bufferlist t;
5725 uint64_t len = end - last;
5726 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5727 if (r < 0) {
5728 osd->clog->error() << coll << " " << soid
5729 << " sparse-read failed to read: " << r;
5730 } else if (!t.is_zero()) {
5731 osd->clog->error() << coll << " " << soid
5732 << " sparse-read found data in hole "
5733 << last << "~" << len;
5734 }
5735 }
5736 }
5737
5738 // Why does SPARSE_READ need a checksum? In fact, librbd always uses
5739 // sparse-read. At first there may not be many whole objects, but with
5740 // continued use more and more whole objects come to exist, so verifying
5741 // the data digest on sparse-read makes sense.
5742 if (total_read == oi.size && oi.is_data_digest()) {
5743 uint32_t crc = data_bl.crc32c(-1);
5744 if (oi.data_digest != crc) {
5745 osd->clog->error() << info.pgid << std::hex
5746 << " full-object read crc 0x" << crc
5747 << " != expected 0x" << oi.data_digest
5748 << std::dec << " on " << soid;
5749 r = rep_repair_primary_object(soid, ctx);
5750 if (r < 0) {
5751 return r;
5752 }
5753 }
5754 }
5755
5756 op.extent.length = total_read;
5757
5758 encode(m, osd_op.outdata); // re-encode since it might be modified
5759 ::encode_destructively(data_bl, osd_op.outdata);
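    // The reply is thus the (possibly trimmed) extent map followed by the
    // concatenated data for those extents.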
5760
5761 dout(10) << " sparse_read got " << total_read << " bytes from object "
5762 << soid << dendl;
5763 }
5764
5765 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5766 ctx->delta_stats.num_rd++;
5767 return 0;
5768 }
5769
5770 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5771 {
5772 int result = 0;
5773 SnapSetContext *ssc = ctx->obc->ssc;
5774 ObjectState& obs = ctx->new_obs;
5775 object_info_t& oi = obs.oi;
5776 const hobject_t& soid = oi.soid;
5777 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5778 osd->osd_skip_data_digest;
5779
5780 PGTransaction* t = ctx->op_t.get();
5781
5782 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5783
5784 ctx->current_osd_subop_num = 0;
5785 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5786 OSDOp& osd_op = *p;
5787 ceph_osd_op& op = osd_op.op;
5788
5789 OpFinisher* op_finisher = nullptr;
5790 {
5791 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5792 if (op_finisher_it != ctx->op_finishers.end()) {
5793 op_finisher = op_finisher_it->second.get();
5794 }
5795 }
5796
5797 // TODO: check endianness (__le32 vs uint32_t, etc.)
5798 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5799 // but the code in this function seems to treat them as native-endian. What should the
5800 // tracepoints do?
5801 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5802
5803 dout(10) << "do_osd_op " << osd_op << dendl;
5804
5805 auto bp = osd_op.indata.cbegin();
5806
5807 // user-visible modification?
5808 switch (op.op) {
5809 // non user-visible modifications
5810 case CEPH_OSD_OP_WATCH:
5811 case CEPH_OSD_OP_CACHE_EVICT:
5812 case CEPH_OSD_OP_CACHE_FLUSH:
5813 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5814 case CEPH_OSD_OP_UNDIRTY:
5815 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5816 case CEPH_OSD_OP_CACHE_PIN:
5817 case CEPH_OSD_OP_CACHE_UNPIN:
5818 case CEPH_OSD_OP_SET_REDIRECT:
5819 case CEPH_OSD_OP_TIER_PROMOTE:
5820 break;
5821 default:
5822 if (op.op & CEPH_OSD_OP_MODE_WR)
5823 ctx->user_modify = true;
5824 }
5825
5826 // munge -1 truncate to 0 truncate
5827 if (ceph_osd_op_uses_extent(op.op) &&
5828 op.extent.truncate_seq == 1 &&
5829 op.extent.truncate_size == (-1ULL)) {
5830 op.extent.truncate_size = 0;
5831 op.extent.truncate_seq = 0;
5832 }
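    // (a truncate_seq of 1 with truncate_size -1 denotes "no truncation";
    // normalize it to the seq 0 / size 0 form the code below expects.)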
5833
5834 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5835 if (op.op == CEPH_OSD_OP_ZERO &&
5836 obs.exists &&
5837 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5838 op.extent.length >= 1 &&
5839 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
5840 op.extent.offset + op.extent.length >= oi.size) {
5841 if (op.extent.offset >= oi.size) {
5842 // no-op
5843 goto fail;
5844 }
5845 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5846 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5847 op.op = CEPH_OSD_OP_TRUNCATE;
5848 }
5849
5850 switch (op.op) {
5851
5852 // --- READS ---
5853
5854 case CEPH_OSD_OP_CMPEXT:
5855 ++ctx->num_read;
5856 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5857 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5858 op.extent.length, op.extent.truncate_size,
5859 op.extent.truncate_seq);
5860
5861 if (op_finisher == nullptr) {
5862 result = do_extent_cmp(ctx, osd_op);
5863 } else {
5864 result = op_finisher->execute();
5865 }
5866 break;
5867
5868 case CEPH_OSD_OP_SYNC_READ:
5869 if (pool.info.is_erasure()) {
5870 result = -EOPNOTSUPP;
5871 break;
5872 }
5873 // fall through
5874 case CEPH_OSD_OP_READ:
5875 ++ctx->num_read;
5876 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5877 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5878 op.extent.length, op.extent.truncate_size,
5879 op.extent.truncate_seq);
5880 if (op_finisher == nullptr) {
5881 if (!ctx->data_off) {
5882 ctx->data_off = op.extent.offset;
5883 }
5884 result = do_read(ctx, osd_op);
5885 } else {
5886 result = op_finisher->execute();
5887 }
5888 break;
5889
5890 case CEPH_OSD_OP_CHECKSUM:
5891 ++ctx->num_read;
5892 {
5893 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5894 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5895 op.checksum.offset, op.checksum.length,
5896 op.checksum.chunk_size);
5897
5898 if (op_finisher == nullptr) {
5899 result = do_checksum(ctx, osd_op, &bp);
5900 } else {
5901 result = op_finisher->execute();
5902 }
5903 }
5904 break;
5905
5906 /* map extents */
5907 case CEPH_OSD_OP_MAPEXT:
5908 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5909 if (pool.info.is_erasure()) {
5910 result = -EOPNOTSUPP;
5911 break;
5912 }
5913 ++ctx->num_read;
5914 {
5915 // read into a buffer
5916 bufferlist bl;
5917 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5918 info.pgid.shard),
5919 op.extent.offset, op.extent.length, bl);
5920 osd_op.outdata.claim(bl);
5921 if (r < 0)
5922 result = r;
5923 else
5924 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
5925 ctx->delta_stats.num_rd++;
5926 dout(10) << " map_extents done on object " << soid << dendl;
5927 }
5928 break;
5929
5930 /* map extents */
5931 case CEPH_OSD_OP_SPARSE_READ:
5932 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5933 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5934 op.extent.length, op.extent.truncate_size,
5935 op.extent.truncate_seq);
5936 if (op_finisher == nullptr) {
5937 result = do_sparse_read(ctx, osd_op);
5938 } else {
5939 result = op_finisher->execute();
5940 }
5941 break;
5942
5943 case CEPH_OSD_OP_CALL:
5944 {
5945 string cname, mname;
5946 bufferlist indata;
5947 try {
5948 bp.copy(op.cls.class_len, cname);
5949 bp.copy(op.cls.method_len, mname);
5950 bp.copy(op.cls.indata_len, indata);
5951 } catch (buffer::error& e) {
5952 dout(10) << "call unable to decode class + method + indata" << dendl;
5953 dout(30) << "in dump: ";
5954 osd_op.indata.hexdump(*_dout);
5955 *_dout << dendl;
5956 result = -EINVAL;
5957 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5958 break;
5959 }
5960 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5961
5962 ClassHandler::ClassData *cls;
5963 result = osd->class_handler->open_class(cname, &cls);
5964 ceph_assert(result == 0); // init_op_flags() already verified this works.
5965
5966 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5967 if (!method) {
5968 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5969 result = -EOPNOTSUPP;
5970 break;
5971 }
5972
5973 int flags = method->get_flags();
5974 if (flags & CLS_METHOD_WR)
5975 ctx->user_modify = true;
5976
5977 bufferlist outdata;
5978 dout(10) << "call method " << cname << "." << mname << dendl;
5979 int prev_rd = ctx->num_read;
5980 int prev_wr = ctx->num_write;
5981 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5982
5983 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5984 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5985 result = -EIO;
5986 break;
5987 }
5988 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5989 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5990 result = -EIO;
5991 break;
5992 }
5993
5994 dout(10) << "method called response length=" << outdata.length() << dendl;
5995 op.extent.length = outdata.length();
5996 osd_op.outdata.claim_append(outdata);
5997 dout(30) << "out dump: ";
5998 osd_op.outdata.hexdump(*_dout);
5999 *_dout << dendl;
6000 }
6001 break;
6002
6003 case CEPH_OSD_OP_STAT:
6004 // note: stat does not require RD
6005 {
6006 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6007
6008 if (obs.exists && !oi.is_whiteout()) {
6009 encode(oi.size, osd_op.outdata);
6010 encode(oi.mtime, osd_op.outdata);
6011 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6012 } else {
6013 result = -ENOENT;
6014 dout(10) << "stat oi object does not exist" << dendl;
6015 }
6016
6017 ctx->delta_stats.num_rd++;
6018 }
6019 break;
6020
6021 case CEPH_OSD_OP_ISDIRTY:
6022 ++ctx->num_read;
6023 {
6024 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6025 bool is_dirty = obs.oi.is_dirty();
6026 encode(is_dirty, osd_op.outdata);
6027 ctx->delta_stats.num_rd++;
6028 result = 0;
6029 }
6030 break;
6031
6032 case CEPH_OSD_OP_UNDIRTY:
6033 ++ctx->num_write;
6034 {
6035 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6036 if (oi.is_dirty()) {
6037 ctx->undirty = true; // see make_writeable()
6038 ctx->modify = true;
6039 ctx->delta_stats.num_wr++;
6040 }
6041 result = 0;
6042 }
6043 break;
6044
6045 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6046 ++ctx->num_write;
6047 {
6048 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
6049 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
6050 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6051 result = -EINVAL;
6052 break;
6053 }
6054 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6055 result = -EINVAL;
6056 break;
6057 }
6058 if (!obs.exists) {
6059 result = 0;
6060 break;
6061 }
6062 if (oi.is_cache_pinned()) {
6063 dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
6064 result = -EPERM;
6065 break;
6066 }
6067 if (oi.is_dirty()) {
6068 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
6069 if (result == -EINPROGRESS)
6070 result = -EAGAIN;
6071 } else {
6072 result = 0;
6073 }
6074 }
6075 break;
6076
6077 case CEPH_OSD_OP_CACHE_FLUSH:
6078 ++ctx->num_write;
6079 {
6080 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
6081 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
6082 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6083 result = -EINVAL;
6084 break;
6085 }
6086 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6087 result = -EINVAL;
6088 break;
6089 }
6090 if (!obs.exists) {
6091 result = 0;
6092 break;
6093 }
6094 if (oi.is_cache_pinned()) {
6095 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
6096 result = -EPERM;
6097 break;
6098 }
6099 hobject_t missing;
6100 if (oi.is_dirty()) {
6101 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
6102 if (result == -EINPROGRESS)
6103 result = -EAGAIN;
6104 } else {
6105 result = 0;
6106 }
6107 // Check for the special return value, for which start_flush() has set 'missing'
6108 if (result == -ENOENT) {
6109 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
6110 ceph_assert(!missing.is_min());
6111 wait_for_unreadable_object(missing, ctx->op);
6112 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6113 result = -EAGAIN;
6114 }
6115 }
6116 break;
6117
6118 case CEPH_OSD_OP_CACHE_EVICT:
6119 ++ctx->num_write;
6120 {
6121 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6122 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6123 result = -EINVAL;
6124 break;
6125 }
6126 if (!obs.exists) {
6127 result = 0;
6128 break;
6129 }
6130 if (oi.is_cache_pinned()) {
6131 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
6132 result = -EPERM;
6133 break;
6134 }
6135 if (oi.is_dirty()) {
6136 result = -EBUSY;
6137 break;
6138 }
6139 if (!oi.watchers.empty()) {
6140 result = -EBUSY;
6141 break;
6142 }
6143 if (soid.snap == CEPH_NOSNAP) {
6144 result = _verify_no_head_clones(soid, ssc->snapset);
6145 if (result < 0)
6146 break;
6147 }
6148 result = _delete_oid(ctx, true, false);
6149 if (result >= 0) {
6150 // mark that this is a cache eviction to avoid triggering normal
6151 // make_writeable() clone creation in finish_ctx()
6152 ctx->cache_evict = true;
6153 }
6154 osd->logger->inc(l_osd_tier_evict);
6155 }
6156 break;
6157
6158 case CEPH_OSD_OP_GETXATTR:
6159 ++ctx->num_read;
6160 {
6161 string aname;
6162 bp.copy(op.xattr.name_len, aname);
6163 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6164 string name = "_" + aname;
6165 int r = getattr_maybe_cache(
6166 ctx->obc,
6167 name,
6168 &(osd_op.outdata));
6169 if (r >= 0) {
6170 op.xattr.value_len = osd_op.outdata.length();
6171 result = 0;
6172 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6173 } else
6174 result = r;
6175
6176 ctx->delta_stats.num_rd++;
6177 }
6178 break;
6179
6180 case CEPH_OSD_OP_GETXATTRS:
6181 ++ctx->num_read;
6182 {
6183 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6184 map<string, bufferlist> out;
6185 result = getattrs_maybe_cache(
6186 ctx->obc,
6187 &out);
6188
6189 bufferlist bl;
6190 encode(out, bl);
6191 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6192 ctx->delta_stats.num_rd++;
6193 osd_op.outdata.claim_append(bl);
6194 }
6195 break;
6196
6197 case CEPH_OSD_OP_CMPXATTR:
6198 ++ctx->num_read;
6199 {
6200 string aname;
6201 bp.copy(op.xattr.name_len, aname);
6202 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6203 string name = "_" + aname;
6204 name[op.xattr.name_len + 1] = 0;
6205
6206 bufferlist xattr;
6207 result = getattr_maybe_cache(
6208 ctx->obc,
6209 name,
6210 &xattr);
6211 if (result < 0 && result != -EEXIST && result != -ENODATA)
6212 break;
6213
6214 ctx->delta_stats.num_rd++;
6215 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6216
6217 switch (op.xattr.cmp_mode) {
6218 case CEPH_OSD_CMPXATTR_MODE_STRING:
6219 {
6220 string val;
6221 bp.copy(op.xattr.value_len, val);
6222 val[op.xattr.value_len] = 0;
6223 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6224 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6225 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6226 }
6227 break;
6228
6229 case CEPH_OSD_CMPXATTR_MODE_U64:
6230 {
6231 uint64_t u64val;
6232 try {
6233 decode(u64val, bp);
6234 }
6235 catch (buffer::error& e) {
6236 result = -EINVAL;
6237 goto fail;
6238 }
6239 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6240 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6241 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6242 }
6243 break;
6244
6245 default:
6246 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6247 result = -EINVAL;
6248 }
6249
6250 if (!result) {
6251 dout(10) << "comparison returned false" << dendl;
6252 result = -ECANCELED;
6253 break;
6254 }
6255 if (result < 0) {
6256 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6257 break;
6258 }
6259
6260 dout(10) << "comparison returned true" << dendl;
6261 }
6262 break;
6263
6264 case CEPH_OSD_OP_ASSERT_VER:
6265 ++ctx->num_read;
6266 {
6267 uint64_t ver = op.assert_ver.ver;
6268 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6269 if (!ver)
6270 result = -EINVAL;
6271 else if (ver < oi.user_version)
6272 result = -ERANGE;
6273 else if (ver > oi.user_version)
6274 result = -EOVERFLOW;
6275 }
6276 break;
6277
6278 case CEPH_OSD_OP_LIST_WATCHERS:
6279 ++ctx->num_read;
6280 {
6281 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6282 obj_list_watch_response_t resp;
6283
6284 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6285 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6286 ++oi_iter) {
6287 dout(20) << "key cookie=" << oi_iter->first.first
6288 << " entity=" << oi_iter->first.second << " "
6289 << oi_iter->second << dendl;
6290 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6291 ceph_assert(oi_iter->first.second.is_client());
6292
6293 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6294 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6295 resp.entries.push_back(wi);
6296 }
6297
6298 resp.encode(osd_op.outdata, ctx->get_features());
6299 result = 0;
6300
6301 ctx->delta_stats.num_rd++;
6302 break;
6303 }
6304
6305 case CEPH_OSD_OP_LIST_SNAPS:
6306 ++ctx->num_read;
6307 {
6308 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6309 obj_list_snap_response_t resp;
6310
6311 if (!ssc) {
6312 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6313 }
6314 ceph_assert(ssc);
6315 dout(20) << " snapset " << ssc->snapset << dendl;
6316
6317 int clonecount = ssc->snapset.clones.size();
6318 clonecount++; // for head
6319 resp.clones.reserve(clonecount);
6320 for (auto clone_iter = ssc->snapset.clones.begin();
6321 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6322 clone_info ci;
6323 ci.cloneid = *clone_iter;
6324
6325 hobject_t clone_oid = soid;
6326 clone_oid.snap = *clone_iter;
6327
6328 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6329 if (p == ssc->snapset.clone_snaps.end()) {
6330 osd->clog->error() << "osd." << osd->whoami
6331 << ": inconsistent clone_snaps found for oid "
6332 << soid << " clone " << *clone_iter
6333 << " snapset " << ssc->snapset;
6334 result = -EINVAL;
6335 break;
6336 }
6337 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6338 ci.snaps.push_back(*q);
6339 }
6340
6341 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6342
6343 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6344 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6345 if (coi == ssc->snapset.clone_overlap.end()) {
6346 osd->clog->error() << "osd." << osd->whoami
6347 << ": inconsistent clone_overlap found for oid "
6348 << soid << " clone " << *clone_iter;
6349 result = -EINVAL;
6350 break;
6351 }
6352 const interval_set<uint64_t> &o = coi->second;
6353 ci.overlap.reserve(o.num_intervals());
6354 for (interval_set<uint64_t>::const_iterator r = o.begin();
6355 r != o.end(); ++r) {
6356 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6357 r.get_len()));
6358 }
6359
6360 map<snapid_t, uint64_t>::const_iterator si;
6361 si = ssc->snapset.clone_size.find(ci.cloneid);
6362 if (si == ssc->snapset.clone_size.end()) {
6363 osd->clog->error() << "osd." << osd->whoami
6364 << ": inconsistent clone_size found for oid "
6365 << soid << " clone " << *clone_iter;
6366 result = -EINVAL;
6367 break;
6368 }
6369 ci.size = si->second;
6370
6371 resp.clones.push_back(ci);
6372 }
6373 if (result < 0) {
6374 break;
6375 }
6376 if (!ctx->obc->obs.oi.is_whiteout()) {
6377 ceph_assert(obs.exists);
6378 clone_info ci;
6379 ci.cloneid = CEPH_NOSNAP;
6380
6381 // Size for HEAD is oi.size
6382 ci.size = oi.size;
6383
6384 resp.clones.push_back(ci);
6385 }
6386 resp.seq = ssc->snapset.seq;
6387
6388 resp.encode(osd_op.outdata);
6389 result = 0;
6390
6391 ctx->delta_stats.num_rd++;
6392 break;
6393 }
6394
6395 case CEPH_OSD_OP_NOTIFY:
6396 ++ctx->num_read;
6397 {
6398 uint32_t timeout;
6399 bufferlist bl;
6400
6401 try {
6402 uint32_t ver; // obsolete
6403 decode(ver, bp);
6404 decode(timeout, bp);
6405 decode(bl, bp);
6406 } catch (const buffer::error &e) {
6407 timeout = 0;
6408 }
6409 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6410 if (!timeout)
6411 timeout = cct->_conf->osd_default_notify_timeout;
6412
6413 notify_info_t n;
6414 n.timeout = timeout;
6415 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6416 n.cookie = op.watch.cookie;
6417 n.bl = bl;
6418 ctx->notifies.push_back(n);
6419
6420 // return our unique notify id to the client
6421 encode(n.notify_id, osd_op.outdata);
6422 }
6423 break;
6424
6425 case CEPH_OSD_OP_NOTIFY_ACK:
6426 ++ctx->num_read;
6427 {
6428 try {
6429 uint64_t notify_id = 0;
6430 uint64_t watch_cookie = 0;
6431 decode(notify_id, bp);
6432 decode(watch_cookie, bp);
6433 bufferlist reply_bl;
6434 if (!bp.end()) {
6435 decode(reply_bl, bp);
6436 }
6437 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6438 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6439 ctx->notify_acks.push_back(ack);
6440 } catch (const buffer::error &e) {
6441 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6442 OpContext::NotifyAck ack(
6443 // op.watch.cookie is actually the notify_id for historical reasons
6444 op.watch.cookie
6445 );
6446 ctx->notify_acks.push_back(ack);
6447 }
6448 }
6449 break;
6450
6451 case CEPH_OSD_OP_SETALLOCHINT:
6452 ++ctx->num_write;
6453 {
6454 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6455 maybe_create_new_object(ctx);
6456 oi.expected_object_size = op.alloc_hint.expected_object_size;
6457 oi.expected_write_size = op.alloc_hint.expected_write_size;
6458 oi.alloc_hint_flags = op.alloc_hint.flags;
6459 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6460 op.alloc_hint.expected_write_size,
6461 op.alloc_hint.flags);
6462 result = 0;
6463 }
6464 break;
6465
6466
6467 // --- WRITES ---
6468
6469 // -- object data --
6470
6471 case CEPH_OSD_OP_WRITE:
6472 ++ctx->num_write;
6473 { // write
6474 __u32 seq = oi.truncate_seq;
6475 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6476 if (op.extent.length != osd_op.indata.length()) {
6477 result = -EINVAL;
6478 break;
6479 }
6480
6481 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6482 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6483
6484 if (pool.info.requires_aligned_append() &&
6485 (op.extent.offset % pool.info.required_alignment() != 0)) {
6486 result = -EOPNOTSUPP;
6487 break;
6488 }
6489
6490 if (!obs.exists) {
6491 if (pool.info.requires_aligned_append() && op.extent.offset) {
6492 result = -EOPNOTSUPP;
6493 break;
6494 }
6495 } else if (op.extent.offset != oi.size &&
6496 pool.info.requires_aligned_append()) {
6497 result = -EOPNOTSUPP;
6498 break;
6499 }
6500
6501 if (seq && (seq > op.extent.truncate_seq) &&
6502 (op.extent.offset + op.extent.length > oi.size)) {
6503 // old write, arrived after trimtrunc
6504 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6505 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6506 << ", adjusting write length to " << op.extent.length << dendl;
6507 bufferlist t;
6508 t.substr_of(osd_op.indata, 0, op.extent.length);
6509 osd_op.indata.swap(t);
6510 }
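      // Illustrative example: object size 4096 with truncate_seq 2; a
      // straggler write of 0~8192 carrying truncate_seq 1 arrives. Only
      // the first 4096 bytes are still meaningful, so both the length and
      // the indata buffer are trimmed to match.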
6511 if (op.extent.truncate_seq > seq) {
6512 // write arrives before trimtrunc
6513 if (obs.exists && !oi.is_whiteout()) {
6514 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6515 << ", truncating to " << op.extent.truncate_size << dendl;
6516 t->truncate(soid, op.extent.truncate_size);
6517 oi.truncate_seq = op.extent.truncate_seq;
6518 oi.truncate_size = op.extent.truncate_size;
6519 if (oi.size > op.extent.truncate_size) {
6520 interval_set<uint64_t> trim;
6521 trim.insert(op.extent.truncate_size,
6522 oi.size - op.extent.truncate_size);
6523 ctx->modified_ranges.union_of(trim);
6524 }
6525 if (op.extent.truncate_size != oi.size) {
6526 truncate_update_size_and_usage(ctx->delta_stats,
6527 oi,
6528 op.extent.truncate_size);
6529 }
6530 } else {
6531 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6532 << ", but object is new" << dendl;
6533 oi.truncate_seq = op.extent.truncate_seq;
6534 oi.truncate_size = op.extent.truncate_size;
6535 }
6536 }
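      // (When the write carries a newer truncate_seq than the object, the
      // pending truncate is applied above before the write proceeds below.)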
6537 result = check_offset_and_length(
6538 op.extent.offset, op.extent.length,
6539 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6540 if (result < 0)
6541 break;
6542
6543 maybe_create_new_object(ctx);
6544
6545 if (op.extent.length == 0) {
6546 if (op.extent.offset > oi.size) {
6547 t->truncate(
6548 soid, op.extent.offset);
6549 } else {
6550 t->nop(soid);
6551 }
6552 } else {
6553 t->write(
6554 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6555 }
6556
6557 if (op.extent.offset == 0 && op.extent.length >= oi.size
6558 && !skip_data_digest) {
6559 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6560 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6561 if (skip_data_digest) {
6562 obs.oi.clear_data_digest();
6563 } else {
6564 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6565 }
6566 } else {
6567 obs.oi.clear_data_digest();
6568 }
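      // i.e. a full overwrite recomputes the digest from scratch, an exact
      // append extends it incrementally (crc32c of the new data seeded with
      // the old digest), and any other partial overwrite invalidates it.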
6569 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6570 op.extent.offset, op.extent.length);
6571
6572 }
6573 break;
6574
6575 case CEPH_OSD_OP_WRITEFULL:
6576 ++ctx->num_write;
6577 { // write full object
6578 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6579
6580 if (op.extent.length != osd_op.indata.length()) {
6581 result = -EINVAL;
6582 break;
6583 }
6584 result = check_offset_and_length(
6585 0, op.extent.length,
6586 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6587 if (result < 0)
6588 break;
6589
6590 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6591 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6592
6593 maybe_create_new_object(ctx);
6594 if (pool.info.is_erasure()) {
6595 t->truncate(soid, 0);
6596 } else if (obs.exists && op.extent.length < oi.size) {
6597 t->truncate(soid, op.extent.length);
6598 }
6599 if (op.extent.length) {
6600 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6601 }
6602 if (!skip_data_digest) {
6603 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6604 } else {
6605 obs.oi.clear_data_digest();
6606 }
6607
6608 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6609 0, op.extent.length, true);
6610 }
6611 break;
6612
6613 case CEPH_OSD_OP_WRITESAME:
6614 ++ctx->num_write;
6615 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6616 result = do_writesame(ctx, osd_op);
6617 break;
6618
6619 case CEPH_OSD_OP_ROLLBACK :
6620 ++ctx->num_write;
6621 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6622 result = _rollback_to(ctx, op);
6623 break;
6624
6625 case CEPH_OSD_OP_ZERO:
6626 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6627 if (pool.info.requires_aligned_append()) {
6628 result = -EOPNOTSUPP;
6629 break;
6630 }
6631 ++ctx->num_write;
6632 { // zero
6633 result = check_offset_and_length(
6634 op.extent.offset, op.extent.length,
6635 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6636 if (result < 0)
6637 break;
6638
6639 ceph_assert(op.extent.length);
6640 if (obs.exists && !oi.is_whiteout()) {
6641 t->zero(soid, op.extent.offset, op.extent.length);
6642 interval_set<uint64_t> ch;
6643 ch.insert(op.extent.offset, op.extent.length);
6644 ctx->modified_ranges.union_of(ch);
6645 ctx->delta_stats.num_wr++;
6646 oi.clear_data_digest();
6647 } else {
6648 // no-op
6649 }
6650 }
6651 break;
6652 case CEPH_OSD_OP_CREATE:
6653 ++ctx->num_write;
6654 {
6655 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6656 int flags = le32_to_cpu(op.flags);
6657 if (obs.exists && !oi.is_whiteout() &&
6658 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
6659 result = -EEXIST; /* this is an exclusive create */
6660 } else {
6661 if (osd_op.indata.length()) {
6662 auto p = osd_op.indata.cbegin();
6663 string category;
6664 try {
6665 decode(category, p);
6666 }
6667 catch (buffer::error& e) {
6668 result = -EINVAL;
6669 goto fail;
6670 }
6671 // category is no longer implemented.
6672 }
6673 if (result >= 0) {
6674 maybe_create_new_object(ctx);
6675 t->nop(soid);
6676 }
6677 }
6678 }
6679 break;
6680
6681 case CEPH_OSD_OP_TRIMTRUNC:
6682 op.extent.offset = op.extent.truncate_size;
6683 // fall through
6684
6685 case CEPH_OSD_OP_TRUNCATE:
6686 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6687 if (pool.info.requires_aligned_append()) {
6688 result = -EOPNOTSUPP;
6689 break;
6690 }
6691 ++ctx->num_write;
6692 {
6693 // truncate
6694 if (!obs.exists || oi.is_whiteout()) {
6695 dout(10) << " object dne, truncate is a no-op" << dendl;
6696 break;
6697 }
6698
6699 result = check_offset_and_length(
6700 op.extent.offset, op.extent.length,
6701 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6702 if (result < 0)
6703 break;
6704
6705 if (op.extent.truncate_seq) {
6706 ceph_assert(op.extent.offset == op.extent.truncate_size);
6707 if (op.extent.truncate_seq <= oi.truncate_seq) {
6708 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6709 << ", no-op" << dendl;
6710 break; // old
6711 }
6712 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6713 << ", truncating" << dendl;
6714 oi.truncate_seq = op.extent.truncate_seq;
6715 oi.truncate_size = op.extent.truncate_size;
6716 }
6717
6718 maybe_create_new_object(ctx);
6719 t->truncate(soid, op.extent.offset);
6720 if (oi.size > op.extent.offset) {
6721 interval_set<uint64_t> trim;
6722 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6723 ctx->modified_ranges.union_of(trim);
6724 }
6725 if (op.extent.offset != oi.size) {
6726 truncate_update_size_and_usage(ctx->delta_stats,
6727 oi,
6728 op.extent.offset);
6729 }
6730 ctx->delta_stats.num_wr++;
6731 // do not set exists, or we will break the above ZERO -> TRUNCATE munging.
6732
6733 oi.clear_data_digest();
6734 }
6735 break;
6736
6737 case CEPH_OSD_OP_DELETE:
6738 ++ctx->num_write;
6739 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6740 {
6741 if (oi.has_manifest()) {
6742 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) {
6743 ctx->register_on_commit(
6744 [oi, ctx, this](){
6745 object_locator_t target_oloc(oi.manifest.redirect_target);
6746 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
6747 SnapContext(), false, NULL, 0);
6748 });
6749 } else if (oi.manifest.is_chunked()) {
6750 ctx->register_on_commit(
6751 [oi, ctx, this](){
6752 for (auto p : oi.manifest.chunk_map) {
6753 if (p.second.has_reference()) {
6754 object_locator_t target_oloc(p.second.oid);
6755 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
6756 SnapContext(), false, NULL, p.first);
6757 }
6758 }
6759 });
6760 }
6761 }
6762 result = _delete_oid(ctx, false, ctx->ignore_cache);
6763 }
6764 break;
6765
6766 case CEPH_OSD_OP_WATCH:
6767 ++ctx->num_write;
6768 {
6769 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6770 op.watch.cookie, op.watch.op);
6771 if (!obs.exists) {
6772 result = -ENOENT;
6773 break;
6774 }
6775 uint64_t cookie = op.watch.cookie;
6776 entity_name_t entity = ctx->reqid.name;
6777 ObjectContextRef obc = ctx->obc;
6778
6779 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6780 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6781 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6782 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6783 dout(10) << "watch: peer_addr="
6784 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6785
6786 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6787 if (op.watch.timeout != 0) {
6788 timeout = op.watch.timeout;
6789 }
6790
6791 watch_info_t w(cookie, timeout,
6792 ctx->op->get_req()->get_connection()->get_peer_addr());
6793 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6794 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6795 if (oi.watchers.count(make_pair(cookie, entity))) {
6796 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6797 } else {
6798 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6799 oi.watchers[make_pair(cookie, entity)] = w;
6800 t->nop(soid); // make sure we update the object_info on disk!
6801 }
6802 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6803 ctx->watch_connects.push_back(make_pair(w, will_ping));
6804 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6805 if (!oi.watchers.count(make_pair(cookie, entity))) {
6806 result = -ENOTCONN;
6807 break;
6808 }
6809 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6810 ctx->watch_connects.push_back(make_pair(w, true));
6811 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6812 /* Note: WATCH with PING doesn't cause may_write() to return true,
6813 * so if there is nothing else in the transaction, this is going
6814 * to run do_osd_op_effects, but not write out a log entry */
6815 if (!oi.watchers.count(make_pair(cookie, entity))) {
6816 result = -ENOTCONN;
6817 break;
6818 }
6819 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6820 obc->watchers.find(make_pair(cookie, entity));
6821 if (p == obc->watchers.end() ||
6822 !p->second->is_connected()) {
6823 // client needs to reconnect
6824 result = -ETIMEDOUT;
6825 break;
6826 }
6827 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6828 p->second->got_ping(ceph_clock_now());
6829 result = 0;
6830 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6831 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6832 oi.watchers.find(make_pair(cookie, entity));
6833 if (oi_iter != oi.watchers.end()) {
6834 dout(10) << " removed watch " << oi_iter->second << " by "
6835 << entity << dendl;
6836 oi.watchers.erase(oi_iter);
6837 t->nop(soid); // update oi on disk
6838 ctx->watch_disconnects.push_back(
6839 watch_disconnect_t(cookie, entity, false));
6840 } else {
6841 dout(10) << " can't remove: no watch by " << entity << dendl;
6842 }
6843 }
6844 }
6845 break;
6846
6847 case CEPH_OSD_OP_CACHE_PIN:
6848 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6849 if ((!pool.info.is_tier() ||
6850 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6851 result = -EINVAL;
6852 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6853 break;
6854 }
6855 ++ctx->num_write;
6856 {
6857 if (!obs.exists || oi.is_whiteout()) {
6858 result = -ENOENT;
6859 break;
6860 }
6861
6862 if (!oi.is_cache_pinned()) {
6863 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6864 ctx->modify = true;
6865 ctx->delta_stats.num_objects_pinned++;
6866 ctx->delta_stats.num_wr++;
6867 }
6868 result = 0;
6869 }
6870 break;
6871
6872 case CEPH_OSD_OP_CACHE_UNPIN:
6873 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6874 if ((!pool.info.is_tier() ||
6875 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6876 result = -EINVAL;
6877 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6878 break;
6879 }
6880 ++ctx->num_write;
6881 {
6882 if (!obs.exists || oi.is_whiteout()) {
6883 result = -ENOENT;
6884 break;
6885 }
6886
6887 if (oi.is_cache_pinned()) {
6888 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6889 ctx->modify = true;
6890 ctx->delta_stats.num_objects_pinned--;
6891 ctx->delta_stats.num_wr++;
6892 }
6893 result = 0;
6894 }
6895 break;
6896
6897 case CEPH_OSD_OP_SET_REDIRECT:
6898 ++ctx->num_write;
6899 {
6900 if (pool.info.is_tier()) {
6901 result = -EINVAL;
6902 break;
6903 }
6904 if (!obs.exists) {
6905 result = -ENOENT;
6906 break;
6907 }
6908 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6909 result = -EOPNOTSUPP;
6910 break;
6911 }
6912
6913 object_t target_name;
6914 object_locator_t target_oloc;
6915 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6916 version_t target_version = op.copy_from.src_version;
6917 try {
6918 decode(target_name, bp);
6919 decode(target_oloc, bp);
6920 }
6921 catch (buffer::error& e) {
6922 result = -EINVAL;
6923 goto fail;
6924 }
6925 pg_t raw_pg;
6926 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6927 hobject_t target(target_name, target_oloc.key, target_snapid,
6928 raw_pg.ps(), raw_pg.pool(),
6929 target_oloc.nspace);
6930 if (target == soid) {
6931 dout(20) << " set-redirect to self is invalid" << dendl;
6932 result = -EINVAL;
6933 break;
6934 }
6935
6936 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6937 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6938 if (has_reference) {
6939 result = -EINVAL;
6940 dout(5) << " the object is already a manifest " << dendl;
6941 break;
6942 }
6943 if (op_finisher == nullptr && need_reference) {
6944 // start
6945 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6946 new SetManifestFinisher(osd_op));
6947 RefCountCallback *fin = new RefCountCallback(
6948 this, ctx, osd_op, get_last_peering_reset());
6949 refcount_manifest(ctx->obc, target_oloc, target, SnapContext(),
6950 true, fin, 0);
6951 result = -EINPROGRESS;
6952 } else {
6953 // finish
6954 if (op_finisher) {
6955 result = op_finisher->execute();
6956 ceph_assert(result == 0);
6957 }
6958
6959 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6960 ctx->delta_stats.num_objects_manifest++;
6961
6962 oi.set_flag(object_info_t::FLAG_MANIFEST);
6963 oi.manifest.redirect_target = target;
6964 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6965 t->truncate(soid, 0);
6966 if (oi.is_omap() && pool.info.supports_omap()) {
6967 t->omap_clear(soid);
6968 obs.oi.clear_omap_digest();
6969 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6970 }
6971 ctx->delta_stats.num_bytes -= oi.size;
6972 oi.size = 0;
6973 oi.new_object();
6974 oi.user_version = target_version;
6975 ctx->user_at_version = target_version;
6976 /* rm_attrs */
6977 map<string,bufferlist> rmattrs;
6978 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
6979 if (result < 0) {
6980 return result;
6981 }
6982 map<string, bufferlist>::iterator iter;
6983 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6984 const string& name = iter->first;
6985 t->rmattr(soid, name);
6986 }
6987 if (!has_reference && need_reference) {
6988 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6989 }
6990 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6991 if (op_finisher) {
6992 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6993 }
6994 }
6995 }
6996
6997 break;
6998
6999 case CEPH_OSD_OP_SET_CHUNK:
7000 ++ctx->num_write;
7001 {
7002 if (pool.info.is_tier()) {
7003 result = -EINVAL;
7004 break;
7005 }
7006 if (!obs.exists) {
7007 result = -ENOENT;
7008 break;
7009 }
7010 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7011 result = -EOPNOTSUPP;
7012 break;
7013 }
7014
7015 object_locator_t tgt_oloc;
7016 uint64_t src_offset, src_length, tgt_offset;
7017 object_t tgt_name;
7018 try {
7019 decode(src_offset, bp);
7020 decode(src_length, bp);
7021 decode(tgt_oloc, bp);
7022 decode(tgt_name, bp);
7023 decode(tgt_offset, bp);
7024 }
7025 catch (buffer::error& e) {
7026 result = -EINVAL;
7027 goto fail;
7028 }
7029
7030 if (!src_length) {
7031 result = -EINVAL;
7032 goto fail;
7033 }
7034
7035 for (auto &p : oi.manifest.chunk_map) {
7036 if ((p.first <= src_offset && p.first + p.second.length > src_offset) ||
7037 (p.first > src_offset && p.first <= src_offset + src_length)) {
7038 dout(20) << __func__ << " overlapping chunk; offset: " << src_offset << " length: " << src_length
7039 << " chunk_info: " << p << dendl;
7040 result = -EOPNOTSUPP;
7041 goto fail;
7042 }
7043 }
7044
7045 if (!oi.manifest.is_chunked()) {
7046 oi.manifest.clear();
7047 }
7048
7049 pg_t raw_pg;
7050 chunk_info_t chunk_info;
7051 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7052 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7053 raw_pg.ps(), raw_pg.pool(),
7054 tgt_oloc.nspace);
7055 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
7056 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
7057 (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE);
7058 if (has_reference) {
7059 result = -EINVAL;
7060 dout(5) << " the object is already a manifest " << dendl;
7061 break;
7062 }
7063 if (op_finisher == nullptr && need_reference) {
7064 // start
7065 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7066 new SetManifestFinisher(osd_op));
7067 RefCountCallback *fin = new RefCountCallback(
7068 this, ctx, osd_op, get_last_peering_reset());
7069 refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(),
7070 true, fin, src_offset);
7071 result = -EINPROGRESS;
7072 } else {
7073 if (op_finisher) {
7074 result = op_finisher->execute();
7075 ceph_assert(result == 0);
7076 }
7077
7078 chunk_info_t chunk_info;
7079 chunk_info.set_flag(chunk_info_t::FLAG_MISSING);
7080 chunk_info.oid = target;
7081 chunk_info.offset = tgt_offset;
7082 chunk_info.length= src_length;
7083 oi.manifest.chunk_map[src_offset] = chunk_info;
7084 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7085 ctx->delta_stats.num_objects_manifest++;
7086 oi.set_flag(object_info_t::FLAG_MANIFEST);
7087 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7088 if (!has_reference && need_reference) {
7089 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7090 }
7091 if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) {
7092 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
7093 }
7094 ctx->modify = true;
7095
7096 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7097 << " chunk_info: " << chunk_info << dendl;
7098 if (op_finisher) {
7099 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7100 }
7101 }
7102 }
7103
7104 break;
7105
7106 case CEPH_OSD_OP_TIER_PROMOTE:
7107 ++ctx->num_write;
7108 {
7109 if (pool.info.is_tier()) {
7110 result = -EINVAL;
7111 break;
7112 }
7113 if (!obs.exists) {
7114 result = -ENOENT;
7115 break;
7116 }
7117 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7118 result = -EOPNOTSUPP;
7119 break;
7120 }
7121 if (!obs.oi.has_manifest()) {
7122 result = 0;
7123 break;
7124 }
7125
7126 if (op_finisher == nullptr) {
7127 PromoteManifestCallback *cb;
7128 object_locator_t my_oloc;
7129 hobject_t src_hoid;
7130
7131 if (obs.oi.manifest.is_chunked()) {
7132 src_hoid = obs.oi.soid;
7133 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7134 } else if (obs.oi.manifest.is_redirect()) {
7135 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7136 my_oloc = src_oloc;
7137 src_hoid = obs.oi.manifest.redirect_target;
7138 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7139 } else {
7140 ceph_abort_msg("unrecognized manifest type");
7141 }
7142 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7143 new PromoteFinisher(cb));
7144 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7145 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7146 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7147 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7148 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7149 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7150 obs.oi.soid.snap == CEPH_NOSNAP,
7151 src_fadvise_flags, 0);
7152
7153 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7154 result = -EINPROGRESS;
7155 } else {
7156 result = op_finisher->execute();
7157 ceph_assert(result == 0);
7158 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7159 }
7160 }
7161
7162 break;
7163
7164 case CEPH_OSD_OP_UNSET_MANIFEST:
7165 ++ctx->num_write;
7166 {
7167 if (pool.info.is_tier()) {
7168 result = -EINVAL;
7169 break;
7170 }
7171 if (!obs.exists) {
7172 result = -ENOENT;
7173 break;
7174 }
7175 if (!oi.has_manifest()) {
7176 result = -EOPNOTSUPP;
7177 break;
7178 }
7179 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7180 result = -EOPNOTSUPP;
7181 break;
7182 }
7183
7184 if (oi.manifest.is_redirect()) {
7185 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
7186 ctx->register_on_commit(
7187 [oi, ctx, this](){
7188 object_locator_t target_oloc(oi.manifest.redirect_target);
7189 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
7190 SnapContext(), false, NULL, 0);
7191 });
7192 }
7193 } else if (oi.manifest.is_chunked()) {
7194 ctx->register_on_commit(
7195 [oi, ctx, this](){
7196 for (auto p : oi.manifest.chunk_map) {
7197 if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) {
7198 object_locator_t target_oloc(p.second.oid);
7199 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
7200 SnapContext(), false, NULL, p.first);
7201 }
7202 }
7203 });
7204 } else {
7205 ceph_abort_msg("unrecognized manifest type");
7206 }
7207
7208 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7209 oi.manifest = object_manifest_t();
7210 ctx->delta_stats.num_objects_manifest--;
7211 ctx->delta_stats.num_wr++;
7212 ctx->modify = true;
7213 }
7214
7215 break;
7216
7217 // -- object attrs --
7218
7219 case CEPH_OSD_OP_SETXATTR:
7220 ++ctx->num_write;
7221 {
7222 if (cct->_conf->osd_max_attr_size > 0 &&
7223 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7224 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7225 result = -EFBIG;
7226 break;
7227 }
7228 unsigned max_name_len =
7229 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7230 cct->_conf->osd_max_attr_name_len);
7231 if (op.xattr.name_len > max_name_len) {
7232 result = -ENAMETOOLONG;
7233 break;
7234 }
7235 maybe_create_new_object(ctx);
7236 string aname;
7237 bp.copy(op.xattr.name_len, aname);
7238 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7239 string name = "_" + aname;
7240 bufferlist bl;
7241 bp.copy(op.xattr.value_len, bl);
7242 t->setattr(soid, name, bl);
7243 ctx->delta_stats.num_wr++;
7244 }
7245 break;
7246
7247 case CEPH_OSD_OP_RMXATTR:
7248 ++ctx->num_write;
7249 {
7250 string aname;
7251 bp.copy(op.xattr.name_len, aname);
7252 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7253 if (!obs.exists || oi.is_whiteout()) {
7254 result = -ENOENT;
7255 break;
7256 }
7257 string name = "_" + aname;
7258 t->rmattr(soid, name);
7259 ctx->delta_stats.num_wr++;
7260 }
7261 break;
7262
7263
7264 // -- fancy writers --
7265 case CEPH_OSD_OP_APPEND:
7266 {
7267 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7268 // just do it inline; this works because we are happy to execute
7269 // fancy ops on replicas as well.
7270 vector<OSDOp> nops(1);
7271 OSDOp& newop = nops[0];
7272 newop.op.op = CEPH_OSD_OP_WRITE;
7273 newop.op.extent.offset = oi.size;
7274 newop.op.extent.length = op.extent.length;
7275 newop.op.extent.truncate_seq = oi.truncate_seq;
7276 newop.indata = osd_op.indata;
7277 result = do_osd_ops(ctx, nops);
7278 osd_op.outdata.claim(newop.outdata);
7279 }
7280 break;
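// APPEND is implemented by rewriting the op as a plain WRITE at offset
// oi.size with the object's current truncate_seq, so size growth and
// modified-range accounting are shared with the ordinary write path
// rather than duplicated here.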
7281
7282 case CEPH_OSD_OP_STARTSYNC:
7283 t->nop(soid);
7284 break;
7285
7286 // -- trivial map --
7287 case CEPH_OSD_OP_TMAPGET:
7288 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7289 if (pool.info.is_erasure()) {
7290 result = -EOPNOTSUPP;
7291 break;
7292 }
7293 {
7294 vector<OSDOp> nops(1);
7295 OSDOp& newop = nops[0];
7296 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7297 newop.op.extent.offset = 0;
7298 newop.op.extent.length = 0;
7299 do_osd_ops(ctx, nops);
7300 osd_op.outdata.claim(newop.outdata);
7301 }
7302 break;
7303
7304 case CEPH_OSD_OP_TMAPPUT:
7305 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7306 if (pool.info.is_erasure()) {
7307 result = -EOPNOTSUPP;
7308 break;
7309 }
7310 {
7311 //_dout_lock.Lock();
7312 //osd_op.data.hexdump(*_dout);
7313 //_dout_lock.Unlock();
7314
7315 // verify sort order
7316 bool unsorted = false;
7317 if (true) {
7318 bufferlist header;
7319 decode(header, bp);
7320 uint32_t n;
7321 decode(n, bp);
7322 string last_key;
7323 while (n--) {
7324 string key;
7325 decode(key, bp);
7326 dout(10) << "tmapput key " << key << dendl;
7327 bufferlist val;
7328 decode(val, bp);
7329 if (key < last_key) {
7330 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7331 unsorted = true;
7332 break;
7333 }
7334 last_key = key;
7335 }
7336 }
7337
7338 // write it
7339 vector<OSDOp> nops(1);
7340 OSDOp& newop = nops[0];
7341 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7342 newop.op.extent.offset = 0;
7343 newop.op.extent.length = osd_op.indata.length();
7344 newop.indata = osd_op.indata;
7345
7346 if (unsorted) {
7347 bp = osd_op.indata.begin();
7348 bufferlist header;
7349 map<string, bufferlist> m;
7350 decode(header, bp);
7351 decode(m, bp);
7352 ceph_assert(bp.end());
7353 bufferlist newbl;
7354 encode(header, newbl);
7355 encode(m, newbl);
7356 newop.indata = newbl;
7357 }
7358 result = do_osd_ops(ctx, nops);
7359 ceph_assert(result == 0);
7360 }
7361 break;
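// The TMAP payload, as implied by the decode calls above, is a single
// buffer holding: header (bufferlist), u32 n, then n key/value pairs --
// exactly the encoding of a bufferlist followed by a
// map<string,bufferlist>. A minimal sketch of building a well-ordered
// payload client-side (illustrative only, not a helper in this file):
//
//   bufferlist header, payload;
//   std::map<std::string, bufferlist> kv;  // std::map iterates in key order
//   encode(header, payload);
//   encode(kv, payload);                   // u32 size + sorted pairs
//
// This is why the resort branch above can simply decode into a map and
// re-encode: the map's sorted iteration order restores the invariant.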
7362
7363 case CEPH_OSD_OP_TMAPUP:
7364 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7365 if (pool.info.is_erasure()) {
7366 result = -EOPNOTSUPP;
7367 break;
7368 }
7369 ++ctx->num_write;
7370 result = do_tmapup(ctx, bp, osd_op);
7371 break;
7372
7373 case CEPH_OSD_OP_TMAP2OMAP:
7374 ++ctx->num_write;
7375 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7376 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7377 break;
7378
7379 // OMAP Read ops
7380 case CEPH_OSD_OP_OMAPGETKEYS:
7381 ++ctx->num_read;
7382 {
7383 string start_after;
7384 uint64_t max_return;
7385 try {
7386 decode(start_after, bp);
7387 decode(max_return, bp);
7388 }
7389 catch (buffer::error& e) {
7390 result = -EINVAL;
7391 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7392 goto fail;
7393 }
7394 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7395 max_return = cct->_conf->osd_max_omap_entries_per_request;
7396 }
7397 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7398
7399 bufferlist bl;
7400 uint32_t num = 0;
7401 bool truncated = false;
7402 if (oi.is_omap()) {
7403 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7404 ch, ghobject_t(soid)
7405 );
7406 ceph_assert(iter);
7407 iter->upper_bound(start_after);
7408 for (num = 0; iter->valid(); ++num, iter->next()) {
7409 if (num >= max_return ||
7410 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7411 truncated = true;
7412 break;
7413 }
7414 encode(iter->key(), bl);
7415 }
7416 } // else return empty out_set
7417 encode(num, osd_op.outdata);
7418 osd_op.outdata.claim_append(bl);
7419 encode(truncated, osd_op.outdata);
7420 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7421 ctx->delta_stats.num_rd++;
7422 }
7423 break;
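// Reply layout assembled above: u32 num, num encoded keys, then a bool
// truncated flag. Results are clamped to osd_max_omap_entries_per_request
// entries and osd_max_omap_bytes_per_request bytes, so a caller must
// loop, feeding the last key it saw back in as start_after until
// truncated comes back false. A hedged decode sketch (names illustrative):
//
//   auto p = reply_bl.cbegin();
//   uint32_t num;
//   decode(num, p);
//   for (uint32_t i = 0; i < num; ++i) {
//     std::string key;
//     decode(key, p);     // keys arrive in omap (lexicographic) order
//   }
//   bool truncated;
//   decode(truncated, p);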
7424
7425 case CEPH_OSD_OP_OMAPGETVALS:
7426 ++ctx->num_read;
7427 {
7428 string start_after;
7429 uint64_t max_return;
7430 string filter_prefix;
7431 try {
7432 decode(start_after, bp);
7433 decode(max_return, bp);
7434 decode(filter_prefix, bp);
7435 }
7436 catch (buffer::error& e) {
7437 result = -EINVAL;
7438 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7439 goto fail;
7440 }
7441 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7442 max_return = cct->_conf->osd_max_omap_entries_per_request;
7443 }
7444 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7445
7446 uint32_t num = 0;
7447 bool truncated = false;
7448 bufferlist bl;
7449 if (oi.is_omap()) {
7450 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7451 ch, ghobject_t(soid)
7452 );
7453 if (!iter) {
7454 result = -ENOENT;
7455 goto fail;
7456 }
7457 iter->upper_bound(start_after);
7458 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7459 for (num = 0;
7460 iter->valid() &&
7461 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7462 ++num, iter->next()) {
7463 dout(20) << "Found key " << iter->key() << dendl;
7464 if (num >= max_return ||
7465 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7466 truncated = true;
7467 break;
7468 }
7469 encode(iter->key(), bl);
7470 encode(iter->value(), bl);
7471 }
7472 } // else return empty out_set
7473 encode(num, osd_op.outdata);
7474 osd_op.outdata.claim_append(bl);
7475 encode(truncated, osd_op.outdata);
7476 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7477 ctx->delta_stats.num_rd++;
7478 }
7479 break;
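// Iterator positioning above: upper_bound(start_after) resumes strictly
// after the last key the client saw, and when filter_prefix sorts after
// start_after, lower_bound(filter_prefix) jumps directly to the first key
// that could carry the prefix -- anything earlier cannot start with it.
// The scan then terminates at the first key whose leading bytes differ
// from filter_prefix, which is safe because omap keys are ordered
// lexicographically, so matching keys form one contiguous run.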
7480
7481 case CEPH_OSD_OP_OMAPGETHEADER:
7482 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7483 if (!oi.is_omap()) {
7484 // return empty header
7485 break;
7486 }
7487 ++ctx->num_read;
7488 {
7489 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7490 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7491 ctx->delta_stats.num_rd++;
7492 }
7493 break;
7494
7495 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7496 ++ctx->num_read;
7497 {
7498 set<string> keys_to_get;
7499 try {
7500 decode(keys_to_get, bp);
7501 }
7502 catch (buffer::error& e) {
7503 result = -EINVAL;
7504 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7505 goto fail;
7506 }
7507 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7508 map<string, bufferlist> out;
7509 if (oi.is_omap()) {
7510 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7511 } // else return empty omap entries
7512 encode(out, osd_op.outdata);
7513 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7514 ctx->delta_stats.num_rd++;
7515 }
7516 break;
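// The reply here is just an encoded map<string,bufferlist>; keys that are
// not present in the object's omap are (as far as omap_get_values() is
// concerned) simply absent from the map rather than reported as errors or
// empty values, so callers must diff the reply against their request.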
7517
7518 case CEPH_OSD_OP_OMAP_CMP:
7519 ++ctx->num_read;
7520 {
7521 if (!obs.exists || oi.is_whiteout()) {
7522 result = -ENOENT;
7523 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7524 break;
7525 }
7526 map<string, pair<bufferlist, int> > assertions;
7527 try {
7528 decode(assertions, bp);
7529 }
7530 catch (buffer::error& e) {
7531 result = -EINVAL;
7532 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7533 goto fail;
7534 }
7535 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7536
7537 map<string, bufferlist> out;
7538
7539 if (oi.is_omap()) {
7540 set<string> to_get;
7541 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7542 i != assertions.end();
7543 ++i)
7544 to_get.insert(i->first);
7545 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7546 to_get, &out);
7547 if (r < 0) {
7548 result = r;
7549 break;
7550 }
7551 } // else leave out empty
7552
7553 // should set num_rd_kb based on the encoded length of the map
7554 ctx->delta_stats.num_rd++;
7555
7556 int r = 0;
7557 bufferlist empty;
7558 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7559 i != assertions.end();
7560 ++i) {
7561 auto out_entry = out.find(i->first);
7562 bufferlist &bl = (out_entry != out.end()) ?
7563 out_entry->second : empty;
7564 switch (i->second.second) {
7565 case CEPH_OSD_CMPXATTR_OP_EQ:
7566 if (!(bl == i->second.first)) {
7567 r = -ECANCELED;
7568 }
7569 break;
7570 case CEPH_OSD_CMPXATTR_OP_LT:
7571 if (!(bl < i->second.first)) {
7572 r = -ECANCELED;
7573 }
7574 break;
7575 case CEPH_OSD_CMPXATTR_OP_GT:
7576 if (!(bl > i->second.first)) {
7577 r = -ECANCELED;
7578 }
7579 break;
7580 default:
7581 r = -EINVAL;
7582 break;
7583 }
7584 if (r < 0)
7585 break;
7586 }
7587 if (r < 0) {
7588 result = r;
7589 }
7590 }
7591 break;
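// OMAP_CMP input, per the decode above: map<key, pair<expected-value,
// comparison-op>> where the op is one of CEPH_OSD_CMPXATTR_OP_{EQ,LT,GT}.
// A key missing from the object compares as an empty bufferlist, and the
// first failing assertion fails the whole op with -ECANCELED. A sketch of
// building the request payload (client-side, illustrative):
//
//   std::map<std::string, std::pair<bufferlist, int>> assertions;
//   bufferlist expected;
//   expected.append("v1");
//   assertions["mykey"] = std::make_pair(expected, CEPH_OSD_CMPXATTR_OP_EQ);
//   bufferlist indata;
//   encode(assertions, indata);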
7592
7593 // OMAP Write ops
7594 case CEPH_OSD_OP_OMAPSETVALS:
7595 if (!pool.info.supports_omap()) {
7596 result = -EOPNOTSUPP;
7597 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7598 break;
7599 }
7600 ++ctx->num_write;
7601 {
7602 maybe_create_new_object(ctx);
7603 bufferlist to_set_bl;
7604 try {
7605 decode_str_str_map_to_bl(bp, &to_set_bl);
7606 }
7607 catch (buffer::error& e) {
7608 result = -EINVAL;
7609 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7610 goto fail;
7611 }
7612 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7613 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7614 dout(20) << "setting vals: " << dendl;
7615 map<string,bufferlist> to_set;
7616 bufferlist::const_iterator pt = to_set_bl.begin();
7617 decode(to_set, pt);
7618 for (map<string, bufferlist>::iterator i = to_set.begin();
7619 i != to_set.end();
7620 ++i) {
7621 dout(20) << "\t" << i->first << dendl;
7622 }
7623 }
7624 t->omap_setkeys(soid, to_set_bl);
7625 ctx->delta_stats.num_wr++;
7626 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7627 }
7628 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7629 obs.oi.clear_omap_digest();
7630 break;
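// decode_str_str_map_to_bl() above effectively re-extracts the raw
// encoded map bytes instead of decoding into a std::map, so the keys can
// be handed to the transaction as-is with no decode/re-encode round trip;
// the full decode only happens under debug level 20. Note the bookkeeping
// common to all omap writes here: FLAG_OMAP is set and the omap digest
// invalidated.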
7631
7632 case CEPH_OSD_OP_OMAPSETHEADER:
7633 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7634 if (!pool.info.supports_omap()) {
7635 result = -EOPNOTSUPP;
7636 break;
7637 }
7638 ++ctx->num_write;
7639 {
7640 maybe_create_new_object(ctx);
7641 t->omap_setheader(soid, osd_op.indata);
7642 ctx->delta_stats.num_wr++;
7643 }
7644 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7645 obs.oi.clear_omap_digest();
7646 break;
7647
7648 case CEPH_OSD_OP_OMAPCLEAR:
7649 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7650 if (!pool.info.supports_omap()) {
7651 result = -EOPNOTSUPP;
7652 break;
7653 }
7654 ++ctx->num_write;
7655 {
7656 if (!obs.exists || oi.is_whiteout()) {
7657 result = -ENOENT;
7658 break;
7659 }
7660 if (oi.is_omap()) {
7661 t->omap_clear(soid);
7662 ctx->delta_stats.num_wr++;
7663 obs.oi.clear_omap_digest();
7664 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7665 }
7666 }
7667 break;
7668
7669 case CEPH_OSD_OP_OMAPRMKEYS:
7670 if (!pool.info.supports_omap()) {
7671 result = -EOPNOTSUPP;
7672 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7673 break;
7674 }
7675 ++ctx->num_write;
7676 {
7677 if (!obs.exists || oi.is_whiteout()) {
7678 result = -ENOENT;
7679 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7680 break;
7681 }
7682 bufferlist to_rm_bl;
7683 try {
7684 decode_str_set_to_bl(bp, &to_rm_bl);
7685 }
7686 catch (buffer::error& e) {
7687 result = -EINVAL;
7688 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7689 goto fail;
7690 }
7691 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7692 t->omap_rmkeys(soid, to_rm_bl);
7693 ctx->delta_stats.num_wr++;
7694 }
7695 obs.oi.clear_omap_digest();
7696 break;
7697
7698 case CEPH_OSD_OP_COPY_GET:
7699 ++ctx->num_read;
7700 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7701 soid.snap.val);
7702 if (op_finisher == nullptr) {
7703 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7704 } else {
7705 result = op_finisher->execute();
7706 }
7707 break;
7708
7709 case CEPH_OSD_OP_COPY_FROM:
7710 ++ctx->num_write;
7711 {
7712 object_t src_name;
7713 object_locator_t src_oloc;
7714 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7715 version_t src_version = op.copy_from.src_version;
7716 try {
7717 decode(src_name, bp);
7718 decode(src_oloc, bp);
7719 }
7720 catch (buffer::error& e) {
7721 result = -EINVAL;
7722 tracepoint(osd,
7723 do_osd_op_pre_copy_from,
7724 soid.oid.name.c_str(),
7725 soid.snap.val,
7726 "???",
7727 0,
7728 "???",
7729 "???",
7730 0,
7731 src_snapid,
7732 src_version);
7733 goto fail;
7734 }
7735 tracepoint(osd,
7736 do_osd_op_pre_copy_from,
7737 soid.oid.name.c_str(),
7738 soid.snap.val,
7739 src_name.name.c_str(),
7740 src_oloc.pool,
7741 src_oloc.key.c_str(),
7742 src_oloc.nspace.c_str(),
7743 src_oloc.hash,
7744 src_snapid,
7745 src_version);
7746 if (op_finisher == nullptr) {
7747 // start
7748 pg_t raw_pg;
7749 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7750 hobject_t src(src_name, src_oloc.key, src_snapid,
7751 raw_pg.ps(), raw_pg.pool(),
7752 src_oloc.nspace);
7753 if (src == soid) {
7754 dout(20) << " copy from self is invalid" << dendl;
7755 result = -EINVAL;
7756 break;
7757 }
7758 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7759 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7760 new CopyFromFinisher(cb));
7761 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7762 op.copy_from.flags,
7763 false,
7764 op.copy_from.src_fadvise_flags,
7765 op.flags);
7766 result = -EINPROGRESS;
7767 } else {
7768 // finish
7769 result = op_finisher->execute();
7770 ceph_assert(result == 0);
7771
7772 // COPY_FROM cannot be executed multiple times -- it must restart
7773 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7774 }
7775 }
7776 break;
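// COPY_FROM shows the two-phase pattern used by the asynchronous ops in
// this switch: the first execution kicks off start_copy(), registers an
// OpFinisher under the current subop index, and returns -EINPROGRESS so
// the request is parked; when the copy completes the op is re-executed,
// finds the finisher, and runs execute() to apply the result. The
// finisher is then erased because, per the comment above, a replayed
// COPY_FROM must restart from scratch rather than re-run its finisher.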
7777
7778 default:
7779 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7780 dout(1) << "unrecognized osd op " << op.op
7781 << " " << ceph_osd_op_name(op.op)
7782 << dendl;
7783 result = -EOPNOTSUPP;
7784 }
7785
7786 fail:
7787 osd_op.rval = result;
7788 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7789 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7790 result != -EAGAIN && result != -EINPROGRESS)
7791 result = 0;
7792
7793 if (result < 0)
7794 break;
7795 }
7796 return result;
7797 }
7798
7799 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7800 {
7801 if (ctx->new_obs.oi.size == 0) {
7802 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7803 return -ENODATA;
7804 }
7805 vector<OSDOp> nops(1);
7806 OSDOp &newop = nops[0];
7807 newop.op.op = CEPH_OSD_OP_TMAPGET;
7808 do_osd_ops(ctx, nops);
7809 try {
7810 bufferlist::const_iterator i = newop.outdata.begin();
7811 decode(*header, i);
7812 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7813 } catch (...) {
7814 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7815 << dendl;
7816 return -EINVAL;
7817 }
7818 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7819 << dendl;
7820 return 0;
7821 }
7822
7823 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7824 const SnapSet& ss)
7825 {
7826 // verify that all clones have been evicted
7827 dout(20) << __func__ << " verifying clones are absent "
7828 << ss << dendl;
7829 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7830 p != ss.clones.end();
7831 ++p) {
7832 hobject_t clone_oid = soid;
7833 clone_oid.snap = *p;
7834 if (is_missing_object(clone_oid))
7835 return -EBUSY;
7836 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
7837 if (clone_obc && clone_obc->obs.exists) {
7838 dout(10) << __func__ << " cannot evict head before clone "
7839 << clone_oid << dendl;
7840 return -EBUSY;
7841 }
7842 if (copy_ops.count(clone_oid)) {
7843 dout(10) << __func__ << " cannot evict head, pending promote on clone "
7844 << clone_oid << dendl;
7845 return -EBUSY;
7846 }
7847 }
7848 return 0;
7849 }
7850
7851 inline int PrimaryLogPG::_delete_oid(
7852 OpContext *ctx,
7853 bool no_whiteout, // no whiteouts, no matter what.
7854 bool try_no_whiteout) // try not to whiteout
7855 {
7856 SnapSet& snapset = ctx->new_snapset;
7857 ObjectState& obs = ctx->new_obs;
7858 object_info_t& oi = obs.oi;
7859 const hobject_t& soid = oi.soid;
7860 PGTransaction* t = ctx->op_t.get();
7861
7862 // cache: set whiteout on delete?
7863 bool whiteout = false;
7864 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
7865 && !no_whiteout
7866 && !try_no_whiteout) {
7867 whiteout = true;
7868 }
7869
7870 // in luminous or later, we can't delete the head if there are
7871 // clones. we trust the caller passing no_whiteout has already
7872 // verified they don't exist.
7873 if (!snapset.clones.empty() ||
7874 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
7875 if (no_whiteout) {
7876 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
7877 << dendl;
7878 } else {
7879 dout(20) << __func__ << " has or will have clones; will whiteout"
7880 << dendl;
7881 whiteout = true;
7882 }
7883 }
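// Net effect of the checks above, summarized:
//   - cache pool with no overriding flags        -> whiteout, not delete
//   - clones exist (or this snapc will make one) -> whiteout forced,
//     unless the caller set no_whiteout, in which case it is trusted to
//     have verified (e.g. via _verify_no_head_clones()) that none remain
//   - otherwise                                  -> real delete below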
7884 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
7885 << " no_whiteout=" << (int)no_whiteout
7886 << " try_no_whiteout=" << (int)try_no_whiteout
7887 << dendl;
7888 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
7889 return -ENOENT;
7890
7891 t->remove(soid);
7892
7893 if (oi.size > 0) {
7894 interval_set<uint64_t> ch;
7895 ch.insert(0, oi.size);
7896 ctx->modified_ranges.union_of(ch);
7897 }
7898
7899 ctx->delta_stats.num_wr++;
7900 if (soid.is_snap()) {
7901 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7902 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
7903 } else {
7904 ctx->delta_stats.num_bytes -= oi.size;
7905 }
7906 oi.size = 0;
7907 oi.new_object();
7908
7909 // disconnect all watchers
7910 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
7911 oi.watchers.begin();
7912 p != oi.watchers.end();
7913 ++p) {
7914 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
7915 ctx->watch_disconnects.push_back(
7916 watch_disconnect_t(p->first.first, p->first.second, true));
7917 }
7918 oi.watchers.clear();
7919
7920 if (whiteout) {
7921 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
7922 oi.set_flag(object_info_t::FLAG_WHITEOUT);
7923 ctx->delta_stats.num_whiteouts++;
7924 t->create(soid);
7925 osd->logger->inc(l_osd_tier_whiteout);
7926 return 0;
7927 }
7928
7929 // delete the head
7930 ctx->delta_stats.num_objects--;
7931 if (soid.is_snap())
7932 ctx->delta_stats.num_object_clones--;
7933 if (oi.is_whiteout()) {
7934 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
7935 ctx->delta_stats.num_whiteouts--;
7936 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
7937 }
7938 if (oi.is_cache_pinned()) {
7939 ctx->delta_stats.num_objects_pinned--;
7940 }
7941 if (oi.has_manifest()) {
7942 ctx->delta_stats.num_objects_manifest--;
7943 }
7944 obs.exists = false;
7945 return 0;
7946 }
7947
7948 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7949 {
7950 SnapSet& snapset = ctx->new_snapset;
7951 ObjectState& obs = ctx->new_obs;
7952 object_info_t& oi = obs.oi;
7953 const hobject_t& soid = oi.soid;
7954 PGTransaction* t = ctx->op_t.get();
7955 snapid_t snapid = (uint64_t)op.snap.snapid;
7956 hobject_t missing_oid;
7957
7958 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7959
7960 ObjectContextRef rollback_to;
7961
7962 int ret = find_object_context(
7963 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7964 soid.get_namespace()),
7965 &rollback_to, false, false, &missing_oid);
7966 if (ret == -EAGAIN) {
7967 /* clone must be missing */
7968 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
7969 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7970 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7971 block_write_on_degraded_snap(missing_oid, ctx->op);
7972 return ret;
7973 }
7974 {
7975 ObjectContextRef promote_obc;
7976 cache_result_t tier_mode_result;
7977 if (obs.exists && obs.oi.has_manifest()) {
7978 tier_mode_result =
7979 maybe_handle_manifest_detail(
7980 ctx->op,
7981 true,
7982 rollback_to);
7983 } else {
7984 tier_mode_result =
7985 maybe_handle_cache_detail(
7986 ctx->op,
7987 true,
7988 rollback_to,
7989 ret,
7990 missing_oid,
7991 true,
7992 false,
7993 &promote_obc);
7994 }
7995 switch (tier_mode_result) {
7996 case cache_result_t::NOOP:
7997 break;
7998 case cache_result_t::BLOCKED_PROMOTE:
7999 ceph_assert(promote_obc);
8000 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8001 return -EAGAIN;
8002 case cache_result_t::BLOCKED_FULL:
8003 block_write_on_full_cache(soid, ctx->op);
8004 return -EAGAIN;
8005 case cache_result_t::REPLIED_WITH_EAGAIN:
8006 ceph_abort_msg("this can't happen, no rollback on replica");
8007 default:
8008 ceph_abort_msg("must promote was set, other values are not valid");
8009 return -EAGAIN;
8010 }
8011 }
8012
8013 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8014 // there's no snapshot here, or there's no object.
8015 // if there's no snapshot, we delete the object; otherwise, do nothing.
8016 dout(20) << "_rollback_to deleting head on " << soid.oid
8017 << " because got ENOENT|whiteout on find_object_context" << dendl;
8018 if (ctx->obc->obs.oi.watchers.size()) {
8019 // Cannot delete an object with watchers
8020 ret = -EBUSY;
8021 } else {
8022 _delete_oid(ctx, false, false);
8023 ret = 0;
8024 }
8025 } else if (ret) {
8026 // ummm....huh? It *can't* return anything else at time of writing.
8027 ceph_abort_msg("unexpected error code in _rollback_to");
8028 } else { //we got our context, let's use it to do the rollback!
8029 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8030 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8031 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8032 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8033 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8034 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8035 ret = -EAGAIN;
8036 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8037 // rolling back to the head; we just need to clone it.
8038 ctx->modify = true;
8039 } else {
8040 /* 1) Delete current head
8041 * 2) Clone correct snapshot into head
8042 * 3) Calculate clone_overlaps by following overlaps
8043 * forward from rollback snapshot */
8044 dout(10) << "_rollback_to deleting " << soid.oid
8045 << " and rolling back to old snap" << dendl;
8046
8047 if (obs.exists) {
8048 t->remove(soid);
8049 }
8050 t->clone(soid, rollback_to_sobject);
8051 t->add_obc(rollback_to);
8052
8053 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8054 snapset.clone_overlap.lower_bound(snapid);
8055 ceph_assert(iter != snapset.clone_overlap.end());
8056 interval_set<uint64_t> overlaps = iter->second;
8057 for ( ;
8058 iter != snapset.clone_overlap.end();
8059 ++iter)
8060 overlaps.intersection_of(iter->second);
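// clone_overlap[s] records where clone s is identical to the next newer
// object (the next clone, or head), so intersecting the entries from the
// rollback target forward yields the ranges where head already matches
// the target. Illustrative example: clone_overlap = {10: [0,100),
// 20: [0,50)}; rolling back to snap 10 gives [0,100) intersect [0,50)
// = [0,50), and only bytes outside that range count as modified below.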
8061
8062 if (obs.oi.size > 0) {
8063 interval_set<uint64_t> modified;
8064 modified.insert(0, obs.oi.size);
8065 overlaps.intersection_of(modified);
8066 modified.subtract(overlaps);
8067 ctx->modified_ranges.union_of(modified);
8068 }
8069
8070 // Adjust the cached objectcontext
8071 maybe_create_new_object(ctx, true);
8072 ctx->delta_stats.num_bytes -= obs.oi.size;
8073 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8074 obs.oi.size = rollback_to->obs.oi.size;
8075 if (rollback_to->obs.oi.is_data_digest())
8076 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8077 else
8078 obs.oi.clear_data_digest();
8079 if (rollback_to->obs.oi.is_omap_digest())
8080 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8081 else
8082 obs.oi.clear_omap_digest();
8083
8084 if (rollback_to->obs.oi.is_omap()) {
8085 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8086 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8087 } else {
8088 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8089 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8090 }
8091 }
8092 }
8093 return ret;
8094 }
8095
8096 void PrimaryLogPG::_make_clone(
8097 OpContext *ctx,
8098 PGTransaction* t,
8099 ObjectContextRef obc,
8100 const hobject_t& head, const hobject_t& coid,
8101 object_info_t *poi)
8102 {
8103 bufferlist bv;
8104 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8105
8106 t->clone(coid, head);
8107 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8108 rmattr_maybe_cache(obc, t, SS_ATTR);
8109 }
8110
8111 void PrimaryLogPG::make_writeable(OpContext *ctx)
8112 {
8113 const hobject_t& soid = ctx->obs->oi.soid;
8114 SnapContext& snapc = ctx->snapc;
8115
8116 // clone?
8117 ceph_assert(soid.snap == CEPH_NOSNAP);
8118 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8119 << " snapc=" << snapc << dendl;
8120
8121 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8122 if (ctx->new_obs.exists) {
8123 // we will mark the object dirty
8124 if (ctx->undirty && was_dirty) {
8125 dout(20) << " clearing DIRTY flag" << dendl;
8126 ceph_assert(ctx->new_obs.oi.is_dirty());
8127 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8128 --ctx->delta_stats.num_objects_dirty;
8129 osd->logger->inc(l_osd_tier_clean);
8130 } else if (!was_dirty && !ctx->undirty) {
8131 dout(20) << " setting DIRTY flag" << dendl;
8132 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8133 ++ctx->delta_stats.num_objects_dirty;
8134 osd->logger->inc(l_osd_tier_dirty);
8135 }
8136 } else {
8137 if (was_dirty) {
8138 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8139 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8140 --ctx->delta_stats.num_objects_dirty;
8141 }
8142 }
8143
8144 if ((ctx->new_obs.exists &&
8145 ctx->new_obs.oi.is_omap()) &&
8146 (!ctx->obc->obs.exists ||
8147 !ctx->obc->obs.oi.is_omap())) {
8148 ++ctx->delta_stats.num_objects_omap;
8149 }
8150 if ((!ctx->new_obs.exists ||
8151 !ctx->new_obs.oi.is_omap()) &&
8152 (ctx->obc->obs.exists &&
8153 ctx->obc->obs.oi.is_omap())) {
8154 --ctx->delta_stats.num_objects_omap;
8155 }
8156
8157 if (ctx->new_snapset.seq > snapc.seq) {
8158 dout(10) << " op snapset is old" << dendl;
8159 }
8160
8161 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8162 snapc.snaps.size() && // there are snaps
8163 !ctx->cache_evict &&
8164 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8165 // clone
8166 hobject_t coid = soid;
8167 coid.snap = snapc.seq;
8168
8169 unsigned l;
8170 for (l = 1;
8171 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8172 l++) ;
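// snapc.snaps is ordered newest-first, so this counts the leading entries
// newer than the snapset's seq -- exactly the snapshots the new clone
// must be recorded under. E.g. snapc.snaps = [8,7,5] with
// new_snapset.seq = 6 yields l == 2, and the clone gets snaps = [8,7].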
8173
8174 vector<snapid_t> snaps(l);
8175 for (unsigned i=0; i<l; i++)
8176 snaps[i] = snapc.snaps[i];
8177
8178 // prepare clone
8179 object_info_t static_snap_oi(coid);
8180 object_info_t *snap_oi;
8181 if (is_primary()) {
8182 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8183 ctx->clone_obc->destructor_callback =
8184 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8185 ctx->clone_obc->obs.oi = static_snap_oi;
8186 ctx->clone_obc->obs.exists = true;
8187 ctx->clone_obc->ssc = ctx->obc->ssc;
8188 ctx->clone_obc->ssc->ref++;
8189 if (pool.info.is_erasure())
8190 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8191 snap_oi = &ctx->clone_obc->obs.oi;
8192 bool got = ctx->lock_manager.get_write_greedy(
8193 coid,
8194 ctx->clone_obc,
8195 ctx->op);
8196 ceph_assert(got);
8197 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8198 } else {
8199 snap_oi = &static_snap_oi;
8200 }
8201 snap_oi->version = ctx->at_version;
8202 snap_oi->prior_version = ctx->obs->oi.version;
8203 snap_oi->copy_user_bits(ctx->obs->oi);
8204
8205 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8206
8207 ctx->delta_stats.num_objects++;
8208 if (snap_oi->is_dirty()) {
8209 ctx->delta_stats.num_objects_dirty++;
8210 osd->logger->inc(l_osd_tier_dirty);
8211 }
8212 if (snap_oi->is_omap())
8213 ctx->delta_stats.num_objects_omap++;
8214 if (snap_oi->is_cache_pinned())
8215 ctx->delta_stats.num_objects_pinned++;
8216 if (snap_oi->has_manifest())
8217 ctx->delta_stats.num_objects_manifest++;
8218 ctx->delta_stats.num_object_clones++;
8219 ctx->new_snapset.clones.push_back(coid.snap);
8220 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8221 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8222
8223 // clone_overlap should contain an entry for each clone
8224 // (an empty interval_set if there is no overlap)
8225 ctx->new_snapset.clone_overlap[coid.snap];
8226 if (ctx->obs->oi.size)
8227 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8228
8229 // log clone
8230 dout(10) << " cloning v " << ctx->obs->oi.version
8231 << " to " << coid << " v " << ctx->at_version
8232 << " snaps=" << snaps
8233 << " snapset=" << ctx->new_snapset << dendl;
8234 ctx->log.push_back(pg_log_entry_t(
8235 pg_log_entry_t::CLONE, coid, ctx->at_version,
8236 ctx->obs->oi.version,
8237 ctx->obs->oi.user_version,
8238 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8239 encode(snaps, ctx->log.back().snaps);
8240
8241 ctx->at_version.version++;
8242 }
8243
8244 // update most recent clone_overlap and usage stats
8245 if (ctx->new_snapset.clones.size() > 0) {
8246 // clone_overlap tracks the byte ranges shared between head and the
8247 // most recent clone. if that clone has been evicted, it no longer
8248 // contributes to the stats, but its clone_overlap entry still exists
8249 // in the snapset, so we must keep the entry consistent with this
8250 // write regardless.
8251 hobject_t last_clone_oid = soid;
8252 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8253 interval_set<uint64_t> &newest_overlap =
8254 ctx->new_snapset.clone_overlap.rbegin()->second;
8255 ctx->modified_ranges.intersection_of(newest_overlap);
8256 if (is_present_clone(last_clone_oid)) {
8257 // modified_ranges is still in use by the clone
8258 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8259 }
8260 newest_overlap.subtract(ctx->modified_ranges);
8261 }
8262
8263 if (snapc.seq > ctx->new_snapset.seq) {
8264 // update snapset with latest snap context
8265 ctx->new_snapset.seq = snapc.seq;
8266 ctx->new_snapset.snaps = snapc.snaps;
8267 }
8268 dout(20) << "make_writeable " << soid
8269 << " done, snapset=" << ctx->new_snapset << dendl;
8270 }
8271
8272
8273 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8274 interval_set<uint64_t>& modified, uint64_t offset,
8275 uint64_t length, bool write_full)
8276 {
8277 interval_set<uint64_t> ch;
8278 if (write_full) {
8279 if (oi.size)
8280 ch.insert(0, oi.size);
8281 } else if (length)
8282 ch.insert(offset, length);
8283 modified.union_of(ch);
8284 if (write_full ||
8285 (offset + length > oi.size && length)) {
8286 uint64_t new_size = offset + length;
8287 delta_stats.num_bytes -= oi.size;
8288 delta_stats.num_bytes += new_size;
8289 oi.size = new_size;
8290 }
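// Example of the accounting above: writing 8192 bytes at offset 4096 into
// an 8192-byte object gives new_size = 12288 and adjusts num_bytes by
// -8192 + 12288 = +4096; a write wholly inside the existing extent leaves
// oi.size and num_bytes untouched.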
8291
8292 if (oi.has_manifest() && oi.manifest.is_chunked()) {
8293 for (auto &p : oi.manifest.chunk_map) {
8294 if ((p.first <= offset && p.first + p.second.length > offset) ||
8295 (p.first > offset && p.first <= offset + length)) {
8296 p.second.clear_flag(chunk_info_t::FLAG_MISSING);
8297 p.second.set_flag(chunk_info_t::FLAG_DIRTY);
8298 }
8299 }
8300 }
8301 delta_stats.num_wr++;
8302 delta_stats.num_wr_kb += shift_round_up(length, 10);
8303 }
8304
8305 void PrimaryLogPG::truncate_update_size_and_usage(
8306 object_stat_sum_t& delta_stats,
8307 object_info_t& oi,
8308 uint64_t truncate_size)
8309 {
8310 if (oi.size != truncate_size) {
8311 delta_stats.num_bytes -= oi.size;
8312 delta_stats.num_bytes += truncate_size;
8313 oi.size = truncate_size;
8314 }
8315 }
8316
8317 void PrimaryLogPG::complete_disconnect_watches(
8318 ObjectContextRef obc,
8319 const list<watch_disconnect_t> &to_disconnect)
8320 {
8321 for (list<watch_disconnect_t>::const_iterator i =
8322 to_disconnect.begin();
8323 i != to_disconnect.end();
8324 ++i) {
8325 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8326 auto watchers_entry = obc->watchers.find(watcher);
8327 if (watchers_entry != obc->watchers.end()) {
8328 WatchRef watch = watchers_entry->second;
8329 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8330 obc->watchers.erase(watcher);
8331 watch->remove(i->send_disconnect);
8332 } else {
8333 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8334 << watcher << dendl;
8335 }
8336 }
8337 }
8338
8339 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8340 {
8341 entity_name_t entity = ctx->reqid.name;
8342 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8343
8344 // disconnects first
8345 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8346
8347 ceph_assert(conn);
8348
8349 auto session = conn->get_priv();
8350 if (!session)
8351 return;
8352
8353 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8354 i != ctx->watch_connects.end();
8355 ++i) {
8356 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8357 dout(15) << "do_osd_op_effects applying watch connect on session "
8358 << session.get() << " watcher " << watcher << dendl;
8359 WatchRef watch;
8360 if (ctx->obc->watchers.count(watcher)) {
8361 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8362 << dendl;
8363 watch = ctx->obc->watchers[watcher];
8364 } else {
8365 dout(15) << "do_osd_op_effects new watcher " << watcher
8366 << dendl;
8367 watch = Watch::makeWatchRef(
8368 this, osd, ctx->obc, i->first.timeout_seconds,
8369 i->first.cookie, entity, conn->get_peer_addr());
8370 ctx->obc->watchers.insert(
8371 make_pair(
8372 watcher,
8373 watch));
8374 }
8375 watch->connect(conn, i->second);
8376 }
8377
8378 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8379 p != ctx->notifies.end();
8380 ++p) {
8381 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8382 ConnectionRef conn(ctx->op->get_req()->get_connection());
8383 NotifyRef notif(
8384 Notify::makeNotifyRef(
8385 conn,
8386 ctx->reqid.name.num(),
8387 p->bl,
8388 p->timeout,
8389 p->cookie,
8390 p->notify_id,
8391 ctx->obc->obs.oi.user_version,
8392 osd));
8393 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8394 ctx->obc->watchers.begin();
8395 i != ctx->obc->watchers.end();
8396 ++i) {
8397 dout(10) << "starting notify on watch " << i->first << dendl;
8398 i->second->start_notify(notif);
8399 }
8400 notif->init();
8401 }
8402
8403 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8404 p != ctx->notify_acks.end();
8405 ++p) {
8406 if (p->watch_cookie)
8407 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
8408 else
8409 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8410 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8411 ctx->obc->watchers.begin();
8412 i != ctx->obc->watchers.end();
8413 ++i) {
8414 if (i->first.second != entity) continue;
8415 if (p->watch_cookie &&
8416 p->watch_cookie.get() != i->first.first) continue;
8417 dout(10) << "acking notify on watch " << i->first << dendl;
8418 i->second->notify_ack(p->notify_id, p->reply_bl);
8419 }
8420 }
8421 }
8422
8423 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8424 {
8425 ostringstream ss;
8426 ss << "temp_" << info.pgid << "_" << get_role()
8427 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8428 hobject_t hoid = target.make_temp_hobject(ss.str());
8429 dout(20) << __func__ << " " << hoid << dendl;
8430 return hoid;
8431 }
8432
8433 hobject_t PrimaryLogPG::get_temp_recovery_object(
8434 const hobject_t& target,
8435 eversion_t version)
8436 {
8437 ostringstream ss;
8438 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8439 << "_" << version
8440 << "_" << info.history.same_interval_since
8441 << "_" << target.snap;
8442 // pgid + version + interval + snapid is unique, and short
8443 hobject_t hoid = target.make_temp_hobject(ss.str());
8444 dout(20) << __func__ << " " << hoid << dendl;
8445 return hoid;
8446 }
8447
8448 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8449 {
8450 ceph_assert(!ctx->ops->empty());
8451
8452 // valid snap context?
8453 if (!ctx->snapc.is_valid()) {
8454 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8455 return -EINVAL;
8456 }
8457
8458 // prepare the actual mutation
8459 int result = do_osd_ops(ctx, *ctx->ops);
8460 if (result < 0) {
8461 if (ctx->op->may_write() &&
8462 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
8463 // need to save the error code in the pg log, to detect dup ops,
8464 // but do nothing else
8465 ctx->update_log_only = true;
8466 }
8467 return result;
8468 }
8469
8470 // read-op? write-op noop? done?
8471 if (ctx->op_t->empty() && !ctx->modify) {
8472 if (ctx->pending_async_reads.empty())
8473 unstable_stats.add(ctx->delta_stats);
8474 if (ctx->op->may_write() &&
8475 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
8476 ctx->update_log_only = true;
8477 }
8478 return result;
8479 }
8480
8481 // check for full
8482 if ((ctx->delta_stats.num_bytes > 0 ||
8483 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8484 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
8485 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
8486 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
8487 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8488 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8489 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8490 << dendl;
8491 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8492 // they tried, they failed.
8493 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8494 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8495 } else {
8496 // drop request
8497 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8498 return -EAGAIN;
8499 }
8500 }
8501
8502 const hobject_t& soid = ctx->obs->oi.soid;
8503 // clone, if necessary
8504 if (soid.snap == CEPH_NOSNAP)
8505 make_writeable(ctx);
8506
8507 finish_ctx(ctx,
8508 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8509 pg_log_entry_t::DELETE);
8510
8511 return result;
8512 }
8513
8514 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type)
8515 {
8516 const hobject_t& soid = ctx->obs->oi.soid;
8517 dout(20) << __func__ << " " << soid << " " << ctx
8518 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8519 << dendl;
8520 utime_t now = ceph_clock_now();
8521
8522 // finish and log the op.
8523 if (ctx->user_modify) {
8524 // update the user_version for any modify ops, except for the watch op
8525 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8526 /* In order for new clients and old clients to interoperate properly
8527 * when exchanging versions, we need to lower bound the user_version
8528 * (which our new clients pay proper attention to)
8529 * by the at_version (which is all the old clients can ever see). */
8530 if (ctx->at_version.version > ctx->user_at_version)
8531 ctx->user_at_version = ctx->at_version.version;
8532 ctx->new_obs.oi.user_version = ctx->user_at_version;
8533 }
8534 ctx->bytes_written = ctx->op_t->get_bytes_written();
8535
8536 if (ctx->new_obs.exists) {
8537 ctx->new_obs.oi.version = ctx->at_version;
8538 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8539 ctx->new_obs.oi.last_reqid = ctx->reqid;
8540 if (ctx->mtime != utime_t()) {
8541 ctx->new_obs.oi.mtime = ctx->mtime;
8542 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8543 ctx->new_obs.oi.local_mtime = now;
8544 } else {
8545 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8546 }
8547
8548 // object_info_t
8549 map <string, bufferlist> attrs;
8550 bufferlist bv(sizeof(ctx->new_obs.oi));
8551 encode(ctx->new_obs.oi, bv,
8552 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8553 attrs[OI_ATTR].claim(bv);
8554
8555 // snapset
8556 if (soid.snap == CEPH_NOSNAP) {
8557 dout(10) << " final snapset " << ctx->new_snapset
8558 << " in " << soid << dendl;
8559 bufferlist bss;
8560 encode(ctx->new_snapset, bss);
8561 attrs[SS_ATTR].claim(bss);
8562 } else {
8563 dout(10) << " no snapset (this is a clone)" << dendl;
8564 }
8565 ctx->op_t->setattrs(soid, attrs);
8566 } else {
8567 // reset cached oi
8568 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8569 }
8570
8571 // append to log
8572 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
8573 ctx->obs->oi.version,
8574 ctx->user_at_version, ctx->reqid,
8575 ctx->mtime, 0));
8576 if (soid.snap < CEPH_NOSNAP) {
8577 switch (log_op_type) {
8578 case pg_log_entry_t::MODIFY:
8579 case pg_log_entry_t::PROMOTE:
8580 case pg_log_entry_t::CLEAN:
8581 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8582 << dendl;
8583 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8584 break;
8585 default:
8586 break;
8587 }
8588 }
8589
8590 if (!ctx->extra_reqids.empty()) {
8591 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
8592 << ctx->extra_reqid_return_codes << dendl;
8593 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
8594 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
8595 }
8596
8597 // apply new object state.
8598 ctx->obc->obs = ctx->new_obs;
8599
8600 if (soid.is_head() && !ctx->obc->obs.exists) {
8601 ctx->obc->ssc->exists = false;
8602 ctx->obc->ssc->snapset = SnapSet();
8603 } else {
8604 ctx->obc->ssc->exists = true;
8605 ctx->obc->ssc->snapset = ctx->new_snapset;
8606 }
8607 }
8608
8609 void PrimaryLogPG::apply_stats(
8610 const hobject_t &soid,
8611 const object_stat_sum_t &delta_stats) {
8612
8613 info.stats.stats.add(delta_stats);
8614 info.stats.stats.floor(0);
8615
8616 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
8617 i != backfill_targets.end();
8618 ++i) {
8619 pg_shard_t bt = *i;
8620 pg_info_t& pinfo = peer_info[bt];
8621 if (soid <= pinfo.last_backfill)
8622 pinfo.stats.stats.add(delta_stats);
8623 else if (soid <= last_backfill_started)
8624 pending_backfill_updates[soid].stats.add(delta_stats);
8625 }
8626
8627 if (is_primary() && scrubber.active) {
8628 if (soid < scrubber.start) {
8629 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
8630 << "," << scrubber.end << ")" << dendl;
8631 scrub_cstat.add(delta_stats);
8632 } else {
8633 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
8634 << "," << scrubber.end << ")" << dendl;
8635 }
8636 }
8637 }
8638
8639 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8640 {
8641 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
8642 ceph_assert(ctx->async_reads_complete());
8643
8644 for (vector<OSDOp>::iterator p = ctx->ops->begin();
8645 p != ctx->ops->end() && result >= 0; ++p) {
8646 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8647 result = p->rval;
8648 break;
8649 }
8650 ctx->bytes_read += p->outdata.length();
8651 }
8652 ctx->reply->claim_op_out_data(*ctx->ops);
8653 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
8654
8655 MOSDOpReply *reply = ctx->reply;
8656 ctx->reply = nullptr;
8657
8658 if (result >= 0) {
8659 if (!ctx->ignore_log_op_stats) {
8660 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8661
8662 publish_stats_to_osd();
8663 }
8664
8665 // on read, return the current object version
8666 if (ctx->obs) {
8667 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8668 } else {
8669 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8670 }
8671 } else if (result == -ENOENT) {
8672 // on ENOENT, set a floor for what the next user version will be.
8673 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8674 }
8675
8676 reply->set_result(result);
8677 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8678 osd->send_message_osd_client(reply, m->get_connection());
8679 close_op_ctx(ctx);
8680 }
8681
8682 // ========================================================================
8683 // copyfrom
8684
8685 struct C_Copyfrom : public Context {
8686 PrimaryLogPGRef pg;
8687 hobject_t oid;
8688 epoch_t last_peering_reset;
8689 ceph_tid_t tid;
8690 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8691 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8692 const PrimaryLogPG::CopyOpRef& c)
8693 : pg(p), oid(o), last_peering_reset(lpr),
8694 tid(0), cop(c)
8695 {}
8696 void finish(int r) override {
8697 if (r == -ECANCELED)
8698 return;
8699 pg->lock();
8700 if (last_peering_reset == pg->get_last_peering_reset()) {
8701 pg->process_copy_chunk(oid, tid, r);
8702 cop.reset();
8703 }
8704 pg->unlock();
8705 }
8706 };
8707
8708 struct C_CopyFrom_AsyncReadCb : public Context {
8709 OSDOp *osd_op;
8710 object_copy_data_t reply_obj;
8711 uint64_t features;
8712 size_t len;
8713 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8714 osd_op(osd_op), features(features), len(0) {}
8715 void finish(int r) override {
8716 osd_op->rval = r;
8717 if (r < 0) {
8718 return;
8719 }
8720
8721 ceph_assert(len > 0);
8722 ceph_assert(len <= reply_obj.data.length());
8723 bufferlist bl;
8724 bl.substr_of(reply_obj.data, 0, len);
8725 reply_obj.data.swap(bl);
8726 encode(reply_obj, osd_op->outdata, features);
8727 }
8728 };
8729
8730 struct C_CopyChunk : public Context {
8731 PrimaryLogPGRef pg;
8732 hobject_t oid;
8733 epoch_t last_peering_reset;
8734 ceph_tid_t tid;
8735 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8736 uint64_t offset = 0;
8737 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8738 const PrimaryLogPG::CopyOpRef& c)
8739 : pg(p), oid(o), last_peering_reset(lpr),
8740 tid(0), cop(c)
8741 {}
8742 void finish(int r) override {
8743 if (r == -ECANCELED)
8744 return;
8745 pg->lock();
8746 if (last_peering_reset == pg->get_last_peering_reset()) {
8747 pg->process_copy_chunk_manifest(oid, tid, r, offset);
8748 cop.reset();
8749 }
8750 pg->unlock();
8751 }
8752 };
8753
8754 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
8755 OSDOp& osd_op, ObjectContextRef &obc)
8756 {
8757 object_info_t& oi = obc->obs.oi;
8758 hobject_t& soid = oi.soid;
8759 int result = 0;
8760 object_copy_cursor_t cursor;
8761 uint64_t out_max;
8762 try {
8763 decode(cursor, bp);
8764 decode(out_max, bp);
8765 }
8766 catch (buffer::error& e) {
8767 result = -EINVAL;
8768 return result;
8769 }
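// COPY_GET is cursor-driven: the client sends (cursor, out_max) and the
// reply is filled in a fixed order -- attrs, then object data, then omap
// header/keys -- until the out_max byte budget runs out, with the
// advanced cursor echoed back for the next round; reqids are attached
// only on the final chunk. A hedged sketch of the driving loop
// (illustrative; compare _copy_some() below):
//
//   object_copy_cursor_t cursor;              // starts in is_initial()
//   while (!cursor.is_complete()) {
//     ObjectOperation op;
//     op.copy_get(&cursor, chunk_size, ...);  // out-params elided
//     /* submit, then consume attrs/data/omap from the reply */
//   }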
8770
8771 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8772 uint64_t features = op->get_features();
8773
8774 bool async_read_started = false;
8775 object_copy_data_t _reply_obj;
8776 C_CopyFrom_AsyncReadCb *cb = nullptr;
8777 if (pool.info.is_erasure()) {
8778 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8779 }
8780 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8781 // size, mtime
8782 reply_obj.size = oi.size;
8783 reply_obj.mtime = oi.mtime;
8784 ceph_assert(obc->ssc);
8785 if (soid.snap < CEPH_NOSNAP) {
8786 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8787 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8788 reply_obj.snaps = p->second;
8789 } else {
8790 reply_obj.snap_seq = obc->ssc->snapset.seq;
8791 }
8792 if (oi.is_data_digest()) {
8793 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8794 reply_obj.data_digest = oi.data_digest;
8795 }
8796 if (oi.is_omap_digest()) {
8797 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8798 reply_obj.omap_digest = oi.omap_digest;
8799 }
8800 reply_obj.truncate_seq = oi.truncate_seq;
8801 reply_obj.truncate_size = oi.truncate_size;
8802
8803 // attrs
8804 map<string,bufferlist>& out_attrs = reply_obj.attrs;
8805 if (!cursor.attr_complete) {
8806 result = getattrs_maybe_cache(
8807 ctx->obc,
8808 &out_attrs);
8809 if (result < 0) {
8810 if (cb) {
8811 delete cb;
8812 }
8813 return result;
8814 }
8815 cursor.attr_complete = true;
8816 dout(20) << " got attrs" << dendl;
8817 }
8818
8819 int64_t left = out_max - osd_op.outdata.length();
8820
8821 // data
8822 bufferlist& bl = reply_obj.data;
8823 if (left > 0 && !cursor.data_complete) {
8824 if (cursor.data_offset < oi.size) {
8825 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
8826 if (cb) {
8827 async_read_started = true;
8828 ctx->pending_async_reads.push_back(
8829 make_pair(
8830 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
8831 make_pair(&bl, cb)));
8832 cb->len = max_read;
8833
8834 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8835 new ReadFinisher(osd_op));
8836 result = -EINPROGRESS;
8837
8838 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
8839 } else {
8840 result = pgbackend->objects_read_sync(
8841 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
8842 if (result < 0)
8843 return result;
8844 }
8845 left -= max_read;
8846 cursor.data_offset += max_read;
8847 }
8848 if (cursor.data_offset == oi.size) {
8849 cursor.data_complete = true;
8850 dout(20) << " got data" << dendl;
8851 }
8852 ceph_assert(cursor.data_offset <= oi.size);
8853 }
8854
8855 // omap
8856 uint32_t omap_keys = 0;
8857 if (!pool.info.supports_omap() || !oi.is_omap()) {
8858 cursor.omap_complete = true;
8859 } else {
8860 if (left > 0 && !cursor.omap_complete) {
8861 ceph_assert(cursor.data_complete);
8862 if (cursor.omap_offset.empty()) {
8863 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
8864 &reply_obj.omap_header);
8865 }
8866 bufferlist omap_data;
8867 ObjectMap::ObjectMapIterator iter =
8868 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
8869 ceph_assert(iter);
8870 iter->upper_bound(cursor.omap_offset);
8871 for (; iter->valid(); iter->next()) {
8872 ++omap_keys;
8873 encode(iter->key(), omap_data);
8874 encode(iter->value(), omap_data);
8875 left -= iter->key().length() + 4 + iter->value().length() + 4;
8876 if (left <= 0)
8877 break;
8878 }
8879 if (omap_keys) {
8880 encode(omap_keys, reply_obj.omap_data);
8881 reply_obj.omap_data.claim_append(omap_data);
8882 }
8883 if (iter->valid()) {
8884 cursor.omap_offset = iter->key();
8885 } else {
8886 cursor.omap_complete = true;
8887 dout(20) << " got omap" << dendl;
8888 }
8889 }
8890 }
8891
8892 if (cursor.is_complete()) {
8893 // include reqids only in the final step. this is a bit fragile
8894 // but it works...
8895 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
8896 &reply_obj.reqids,
8897 &reply_obj.reqid_return_codes);
8898 dout(20) << " got reqids" << dendl;
8899 }
8900
8901 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8902 << " " << out_attrs.size() << " attrs"
8903 << " " << bl.length() << " bytes"
8904 << " " << reply_obj.omap_header.length() << " omap header bytes"
8905 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8906 << omap_keys << " keys"
8907 << " " << reply_obj.reqids.size() << " reqids"
8908 << dendl;
8909 reply_obj.cursor = cursor;
8910 if (!async_read_started) {
8911 encode(reply_obj, osd_op.outdata, features);
8912 }
8913 if (cb && !async_read_started) {
8914 delete cb;
8915 }
8916
8917 if (result > 0) {
8918 result = 0;
8919 }
8920 return result;
8921 }
8922
8923 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8924 OSDOp& osd_op)
8925 {
8926 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8927 // be careful not to modify anything else that will upset a racing
8928 // operator<<
8929 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
8930 uint64_t features = m->get_features();
8931 object_copy_data_t reply_obj;
8932
8933 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
8934 &reply_obj.reqid_return_codes);
8935 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8936 encode(reply_obj, osd_op.outdata, features);
8937 osd_op.rval = -ENOENT;
8938 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
8939 reply->claim_op_out_data(m->ops);
8940 reply->set_result(-ENOENT);
8941 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8942 osd->send_message_osd_client(reply, m->get_connection());
8943 }
8944
8945 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8946 hobject_t src, object_locator_t oloc,
8947 version_t version, unsigned flags,
8948 bool mirror_snapset,
8949 unsigned src_obj_fadvise_flags,
8950 unsigned dest_obj_fadvise_flags)
8951 {
8952 const hobject_t& dest = obc->obs.oi.soid;
8953 dout(10) << __func__ << " " << dest
8954 << " from " << src << " " << oloc << " v" << version
8955 << " flags " << flags
8956 << (mirror_snapset ? " mirror_snapset" : "")
8957 << dendl;
8958
8959 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
8960
8961 // cancel a previous in-progress copy?
8962 if (copy_ops.count(dest)) {
8963 // FIXME: if the src etc match, we could avoid restarting from the
8964 // beginning.
8965 CopyOpRef cop = copy_ops[dest];
8966 vector<ceph_tid_t> tids;
8967 cancel_copy(cop, false, &tids);
8968 osd->objecter->op_cancel(tids, -ECANCELED);
8969 }
8970
8971 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8972 mirror_snapset, src_obj_fadvise_flags,
8973 dest_obj_fadvise_flags));
8974 copy_ops[dest] = cop;
8975 obc->start_block();
8976
8977 if (!obc->obs.oi.has_manifest()) {
8978 _copy_some(obc, cop);
8979 } else {
8980 if (obc->obs.oi.manifest.is_redirect()) {
8981 _copy_some(obc, cop);
8982 } else if (obc->obs.oi.manifest.is_chunked()) {
8983 auto p = obc->obs.oi.manifest.chunk_map.begin();
8984 _copy_some_manifest(obc, cop, p->first);
8985 } else {
8986 ceph_abort_msg("unrecognized manifest type");
8987 }
8988 }
8989 }
8990
8991 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8992 {
8993 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
8994
8995 unsigned flags = 0;
8996 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8997 flags |= CEPH_OSD_FLAG_FLUSH;
8998 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8999 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9000 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9001 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9002 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9003 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9004 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9005 flags |= CEPH_OSD_FLAG_RWORDERED;
9006
9007 C_GatherBuilder gather(cct);
9008
9009 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9010 // list snaps too.
9011 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9012 ObjectOperation op;
9013 op.list_snaps(&cop->results.snapset, NULL);
9014 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9015 CEPH_SNAPDIR, NULL,
9016 flags, gather.new_sub(), NULL);
9017 cop->objecter_tid2 = tid;
9018 }
9019
9020 ObjectOperation op;
9021 if (cop->results.user_version) {
9022 op.assert_version(cop->results.user_version);
9023 } else {
9024 // we should learn the version after the first chunk, if we didn't know
9025 // it already!
9026 ceph_assert(cop->cursor.is_initial());
9027 }
9028 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9029 &cop->results.object_size, &cop->results.mtime,
9030 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9031 &cop->results.snaps, &cop->results.snap_seq,
9032 &cop->results.flags,
9033 &cop->results.source_data_digest,
9034 &cop->results.source_omap_digest,
9035 &cop->results.reqids,
9036 &cop->results.reqid_return_codes,
9037 &cop->results.truncate_seq,
9038 &cop->results.truncate_size,
9039 &cop->rval);
9040 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9041
9042 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9043 get_last_peering_reset(), cop);
9044 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
9045 gather.set_finisher(new C_OnFinisher(fin,
9046 osd->objecter_finishers[n]));
9047
9048 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9049 cop->src.snap, NULL,
9050 flags,
9051 gather.new_sub(),
9052 // discover the object version if we don't know it yet
9053 cop->results.user_version ? NULL : &cop->results.user_version);
9054 fin->tid = tid;
9055 cop->objecter_tid = tid;
9056 gather.activate();
9057 }
9058
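/*
 * Chunked-manifest counterpart of _copy_some(): walk chunk_map from
 * start_offset, batching chunks until their combined size exceeds
 * get_copy_chunk_size(), and issue one objecter read per chunk. Each
 * read completes via C_CopyChunk into process_copy_chunk_manifest().
 */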
9059 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9060 {
9061 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9062
9063 unsigned flags = 0;
9064 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9065 flags |= CEPH_OSD_FLAG_FLUSH;
9066 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9067 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9068 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9069 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9070 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9071 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9072 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9073 flags |= CEPH_OSD_FLAG_RWORDERED;
9074
9075 int num_chunks = 0;
9076 uint64_t last_offset = 0, chunks_size = 0;
9077 object_manifest_t *manifest = &obc->obs.oi.manifest;
9078 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9079 for (;iter != manifest->chunk_map.end(); ++iter) {
9080 num_chunks++;
9081 chunks_size += iter->second.length;
9082 last_offset = iter->first;
9083 if (get_copy_chunk_size() < chunks_size) {
9084 break;
9085 }
9086 }
9087
9088 cop->num_chunk = num_chunks;
9089 cop->start_offset = start_offset;
9090 cop->last_offset = last_offset;
9091 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9092 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9093 << " last_offset: " << last_offset << dendl;
9094
9095 iter = manifest->chunk_map.find(start_offset);
9096 for (;iter != manifest->chunk_map.end(); ++iter) {
9097 uint64_t obj_offset = iter->first;
9098 uint64_t length = manifest->chunk_map[iter->first].length;
9099 hobject_t soid = manifest->chunk_map[iter->first].oid;
9100 object_locator_t oloc(soid);
9101 CopyCallback * cb = NULL;
9102 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9103 cop->results.user_version, cop->flags, cop->mirror_snapset,
9104 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9105 sub_cop->cursor.data_offset = obj_offset;
9106 cop->chunk_cops[obj_offset] = sub_cop;
9107
9108 int s = sub_cop->chunk_ops.size();
9109 sub_cop->chunk_ops.resize(s+1);
9110 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9111 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9112 sub_cop->chunk_ops[s].op.extent.length = length;
9113
9114 ObjectOperation op;
9115 op.dup(sub_cop->chunk_ops);
9116
9117 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9118 << manifest->chunk_map[iter->first].offset
9119 << " length: " << length << " pool id: " << oloc.pool << dendl;
9120
9121 if (cop->results.user_version) {
9122 op.assert_version(cop->results.user_version);
9123 } else {
9124 // we should learn the version after the first chunk, if we didn't know
9125 // it already!
9126 ceph_assert(cop->cursor.is_initial());
9127 }
9128 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9129
9130 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9131 get_last_peering_reset(), cop);
9132 fin->offset = obj_offset;
9133 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
9134
9135 ceph_tid_t tid = osd->objecter->read(soid.oid, oloc, op,
9136 sub_cop->src.snap, NULL,
9137 flags,
9138 new C_OnFinisher(fin, osd->objecter_finishers[n]),
9139 // discover the object version if we don't know it yet
9140 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9141 fin->tid = tid;
9142 sub_cop->objecter_tid = tid;
9143 if (last_offset < iter->first) {
9144 break;
9145 }
9146 }
9147 }
9148
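/*
 * Completion for a single copy-get chunk: fold the received data and omap
 * into the running digests, stash attrs, and either write the partial
 * result to a temp object and fetch more, or, on the final chunk, verify
 * the digests against the source and prepare fill_in_final_tx to commit.
 */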
9149 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9150 {
9151 dout(10) << __func__ << " " << oid << " tid " << tid
9152 << " " << cpp_strerror(r) << dendl;
9153 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9154 if (p == copy_ops.end()) {
9155 dout(10) << __func__ << " no copy_op found" << dendl;
9156 return;
9157 }
9158 CopyOpRef cop = p->second;
9159 if (tid != cop->objecter_tid) {
9160 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9161 << " tid " << cop->objecter_tid << dendl;
9162 return;
9163 }
9164
9165 if (cop->omap_data.length() || cop->omap_header.length())
9166 cop->results.has_omap = true;
9167
9168 if (r >= 0 && !pool.info.supports_omap() &&
9169 (cop->omap_data.length() || cop->omap_header.length())) {
9170 r = -EOPNOTSUPP;
9171 }
9172 cop->objecter_tid = 0;
9173 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9174 ObjectContextRef& cobc = cop->obc;
9175
9176 if (r < 0)
9177 goto out;
9178
9179 ceph_assert(cop->rval >= 0);
9180
9181 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9182 // verify snap hasn't been deleted
9183 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9184 while (p != cop->results.snaps.end()) {
9185 if (pool.info.is_removed_snap(*p)) {
9186 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9187 << dendl;
9188 for (vector<snapid_t>::iterator q = p + 1;
9189 q != cop->results.snaps.end();
9190 ++q)
9191 *(q - 1) = *q;
9192 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9193 } else {
9194 ++p;
9195 }
9196 }
9197 if (cop->results.snaps.empty()) {
9198 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9199 r = -ENOENT;
9200 goto out;
9201 }
9202 }
9203
9204 ceph_assert(cop->rval >= 0);
9205
9206 if (!cop->temp_cursor.data_complete) {
9207 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9208 }
9209 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9210 if (cop->omap_header.length()) {
9211 cop->results.omap_digest =
9212 cop->omap_header.crc32c(cop->results.omap_digest);
9213 }
9214 if (cop->omap_data.length()) {
9215 bufferlist keys;
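// omap_data is an encoded map<string,bufferlist>; skip the leading
// 4-byte key count so the digest covers only the keys and values.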
9216 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9217 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9218 }
9219 }
9220
9221 if (!cop->temp_cursor.attr_complete) {
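// copy-get returns user xattrs with the ObjectStore's '_' prefix already
// stripped; re-add it here so the attrs can later be applied verbatim.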
9222 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9223 p != cop->attrs.end();
9224 ++p) {
9225 cop->results.attrs[string("_") + p->first] = p->second;
9226 }
9227 cop->attrs.clear();
9228 }
9229
9230 if (!cop->cursor.is_complete()) {
9231 // write out what we have so far
9232 if (cop->temp_cursor.is_initial()) {
9233 ceph_assert(!cop->results.started_temp_obj);
9234 cop->results.started_temp_obj = true;
9235 cop->results.temp_oid = generate_temp_object(oid);
9236 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9237 }
9238 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9239 OpContextUPtr ctx = simple_opc_create(tempobc);
9240 if (cop->temp_cursor.is_initial()) {
9241 ctx->new_temp_oid = cop->results.temp_oid;
9242 }
9243 _write_copy_chunk(cop, ctx->op_t.get());
9244 simple_opc_submit(std::move(ctx));
9245 dout(10) << __func__ << " fetching more" << dendl;
9246 _copy_some(cobc, cop);
9247 return;
9248 }
9249
9250 // verify digests?
9251 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9252 dout(20) << __func__ << std::hex
9253 << " got digest: rx data 0x" << cop->results.data_digest
9254 << " omap 0x" << cop->results.omap_digest
9255 << ", source: data 0x" << cop->results.source_data_digest
9256 << " omap 0x" << cop->results.source_omap_digest
9257 << std::dec
9258 << " flags " << cop->results.flags
9259 << dendl;
9260 }
9261 if (cop->results.is_data_digest() &&
9262 cop->results.data_digest != cop->results.source_data_digest) {
9263 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9264 << " != source 0x" << cop->results.source_data_digest << std::dec
9265 << dendl;
9266 osd->clog->error() << info.pgid << " copy from " << cop->src
9267 << " to " << cop->obc->obs.oi.soid << std::hex
9268 << " data digest 0x" << cop->results.data_digest
9269 << " != source 0x" << cop->results.source_data_digest
9270 << std::dec;
9271 r = -EIO;
9272 goto out;
9273 }
9274 if (cop->results.is_omap_digest() &&
9275 cop->results.omap_digest != cop->results.source_omap_digest) {
9276 derr << __func__ << std::hex
9277 << " omap digest 0x" << cop->results.omap_digest
9278 << " != source 0x" << cop->results.source_omap_digest
9279 << std::dec << dendl;
9280 osd->clog->error() << info.pgid << " copy from " << cop->src
9281 << " to " << cop->obc->obs.oi.soid << std::hex
9282 << " omap digest 0x" << cop->results.omap_digest
9283 << " != source 0x" << cop->results.source_omap_digest
9284 << std::dec;
9285 r = -EIO;
9286 goto out;
9287 }
9288 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9289 derr << __func__ << " injecting copyfrom failure" << dendl;
9290 r = -EIO;
9291 goto out;
9292 }
9293
9294 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9295 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9296 ObjectState& obs = cop->obc->obs;
9297 if (cop->temp_cursor.is_initial()) {
9298 dout(20) << "fill_in_final_tx: writing "
9299 << "directly to final object" << dendl;
9300 // write directly to final object
9301 cop->results.temp_oid = obs.oi.soid;
9302 _write_copy_chunk(cop, t);
9303 } else {
9304 // finish writing to temp object, then move into place
9305 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9306 _write_copy_chunk(cop, t);
9307 t->rename(obs.oi.soid, cop->results.temp_oid);
9308 }
9309 t->setattrs(obs.oi.soid, cop->results.attrs);
9310 });
9311
9312 dout(20) << __func__ << " success; committing" << dendl;
9313
9314 out:
9315 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9316 CopyCallbackResults results(r, &cop->results);
9317 cop->cb->complete(results);
9318
9319 copy_ops.erase(cobc->obs.oi.soid);
9320 cobc->stop_block();
9321
9322 if (r < 0 && cop->results.started_temp_obj) {
9323 dout(10) << __func__ << " deleting partial temp object "
9324 << cop->results.temp_oid << dendl;
9325 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9326 OpContextUPtr ctx = simple_opc_create(tempobc);
9327 ctx->op_t->remove(cop->results.temp_oid);
9328 ctx->discard_temp_oid = cop->results.temp_oid;
9329 simple_opc_submit(std::move(ctx));
9330 }
9331
9332 // cancel and requeue proxy ops on this object
9333 if (!r) {
9334 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9335 }
9336
9337 kick_object_context_blocked(cobc);
9338 }
9339
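/*
 * Completion for one chunk read of a chunked-manifest copy. Any error
 * fails the whole object copy; once the last outstanding chunk arrives we
 * take the write lock, commit every fetched chunk in one transaction, and
 * either start the next chunk batch or fall through to finish the op.
 */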
9340 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9341 {
9342 dout(10) << __func__ << " " << oid << " tid " << tid
9343 << " " << cpp_strerror(r) << dendl;
9344 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9345 if (p == copy_ops.end()) {
9346 dout(10) << __func__ << " no copy_op found" << dendl;
9347 return;
9348 }
9349 CopyOpRef obj_cop = p->second;
9350 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9351
9352 if (tid != chunk_cop->objecter_tid) {
9353 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9354 << " tid " << chunk_cop->objecter_tid << dendl;
9355 return;
9356 }
9357
9358 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9359 r = -EOPNOTSUPP;
9360 }
9361
9362 chunk_cop->objecter_tid = 0;
9363 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9364 ObjectContextRef& cobc = obj_cop->obc;
9365 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9366
9367 if (r < 0) {
9368 obj_cop->failed = true;
9369 goto out;
9370 }
9371
9372 if (obj_cop->failed) {
9373 return;
9374 }
9375 if (!chunk_data.outdata.length()) {
9376 r = -EIO;
9377 obj_cop->failed = true;
9378 goto out;
9379 }
9380
9381 obj_cop->num_chunk--;
9382
9383 /* check whether all of the copy ops have completed */
9384 if (obj_cop->num_chunk) {
9385 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9386 return;
9387 }
9388
9389 {
9390 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9391 if (!ctx->lock_manager.take_write_lock(
9392 obj_cop->obc->obs.oi.soid,
9393 obj_cop->obc)) {
9394 // a recovery op may hold the read lock,
9395 // so we need to wait for recovery to complete
9396 r = -EAGAIN;
9397 obj_cop->failed = true;
9398 close_op_ctx(ctx.release());
9399 goto out;
9400 }
9401 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9402
9403 PGTransaction *t = ctx->op_t.get();
9404 ObjectState& obs = ctx->new_obs;
9405 for (auto p : obj_cop->chunk_cops) {
9406 OSDOp &sub_chunk = p.second->chunk_ops[0];
9407 t->write(cobc->obs.oi.soid,
9408 p.second->cursor.data_offset,
9409 sub_chunk.outdata.length(),
9410 sub_chunk.outdata,
9411 p.second->dest_obj_fadvise_flags);
9412 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9413 << " length: " << sub_chunk.outdata.length() << dendl;
9414 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9415 p.second->cursor.data_offset, sub_chunk.outdata.length());
9416 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY);
9417 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9418 sub_chunk.outdata.clear();
9419 }
9420 obs.oi.clear_data_digest();
9421 ctx->at_version = get_next_version();
9422 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9423 simple_opc_submit(std::move(ctx));
9424
9425 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9426 /* check remaining work */
9427 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9428 if (obj_cop->last_offset >= p->first + p->second.length) {
9429 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9430 if (obj_cop->last_offset < en.first) {
9431 _copy_some_manifest(cobc, obj_cop, en.first);
9432 return;
9433 }
9434 }
9435 }
9436 }
9437 }
9438
9439 out:
9440 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9441 CopyCallbackResults results(r, &obj_cop->results);
9442 obj_cop->cb->complete(results);
9443
9444 copy_ops.erase(cobc->obs.oi.soid);
9445 cobc->stop_block();
9446
9447 // cancel and requeue proxy ops on this object
9448 if (!r) {
9449 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9450 }
9451
9452 kick_object_context_blocked(cobc);
9453 }
9454
9455 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9456 vector<ceph_tid_t> tids;
9457 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9458 it != proxyread_ops.end();) {
9459 if (it->second->soid == oid) {
9460 cancel_proxy_read((it++)->second, &tids);
9461 } else {
9462 ++it;
9463 }
9464 }
9465 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9466 it != proxywrite_ops.end();) {
9467 if (it->second->soid == oid) {
9468 cancel_proxy_write((it++)->second, &tids);
9469 } else {
9470 ++it;
9471 }
9472 }
9473 osd->objecter->op_cancel(tids, -ECANCELED);
9474 kick_proxy_ops_blocked(oid);
9475 }
9476
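/*
 * Stage everything fetched so far into the given transaction against
 * results.temp_oid: create on the first pass, append data (trimming any
 * unaligned tail for pools requiring alignment), and set the omap header
 * and keys; finally advance temp_cursor to match cursor.
 */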
9477 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9478 {
9479 dout(20) << __func__ << " " << cop
9480 << " " << cop->attrs.size() << " attrs"
9481 << " " << cop->data.length() << " bytes"
9482 << " " << cop->omap_header.length() << " omap header bytes"
9483 << " " << cop->omap_data.length() << " omap data bytes"
9484 << dendl;
9485 if (!cop->temp_cursor.attr_complete) {
9486 t->create(cop->results.temp_oid);
9487 }
9488 if (!cop->temp_cursor.data_complete) {
9489 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9490 cop->cursor.data_offset);
9491 if (pool.info.required_alignment() &&
9492 !cop->cursor.data_complete) {
9493 /**
9494 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9495 * to pick it up on the next pass.
9496 */
9497 ceph_assert(cop->temp_cursor.data_offset %
9498 pool.info.required_alignment() == 0);
9499 if (cop->data.length() % pool.info.required_alignment() != 0) {
9500 uint64_t to_trim =
9501 cop->data.length() % pool.info.required_alignment();
9502 bufferlist bl;
9503 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9504 cop->data.swap(bl);
9505 cop->cursor.data_offset -= to_trim;
9506 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9507 cop->cursor.data_offset);
9508 }
9509 }
9510 if (cop->data.length()) {
9511 t->write(
9512 cop->results.temp_oid,
9513 cop->temp_cursor.data_offset,
9514 cop->data.length(),
9515 cop->data,
9516 cop->dest_obj_fadvise_flags);
9517 }
9518 cop->data.clear();
9519 }
9520 if (pool.info.supports_omap()) {
9521 if (!cop->temp_cursor.omap_complete) {
9522 if (cop->omap_header.length()) {
9523 t->omap_setheader(
9524 cop->results.temp_oid,
9525 cop->omap_header);
9526 cop->omap_header.clear();
9527 }
9528 if (cop->omap_data.length()) {
9529 map<string,bufferlist> omap;
9530 bufferlist::const_iterator p = cop->omap_data.begin();
9531 decode(omap, p);
9532 t->omap_setkeys(cop->results.temp_oid, omap);
9533 cop->omap_data.clear();
9534 }
9535 }
9536 } else {
9537 ceph_assert(cop->omap_header.length() == 0);
9538 ceph_assert(cop->omap_data.length() == 0);
9539 }
9540 cop->temp_cursor = cop->cursor;
9541 }
9542
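/*
 * Apply a completed copy-from to the destination object state: remove any
 * existing object, adopt the source's digests, truncate_seq/size, reqids
 * and omap flag, clear a cache whiteout if present, and update usage stats.
 */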
9543 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9544 {
9545 OpContext *ctx = cb->ctx;
9546 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9547
9548 ObjectState& obs = ctx->new_obs;
9549 if (obs.exists) {
9550 dout(20) << __func__ << ": exists, removing" << dendl;
9551 ctx->op_t->remove(obs.oi.soid);
9552 } else {
9553 ctx->delta_stats.num_objects++;
9554 obs.exists = true;
9555 }
9556 if (cb->is_temp_obj_used()) {
9557 ctx->discard_temp_oid = cb->results->temp_oid;
9558 }
9559 cb->results->fill_in_final_tx(ctx->op_t.get());
9560
9561 // CopyFromCallback fills this in for us
9562 obs.oi.user_version = ctx->user_at_version;
9563
9564 if (cb->results->is_data_digest()) {
9565 obs.oi.set_data_digest(cb->results->data_digest);
9566 } else {
9567 obs.oi.clear_data_digest();
9568 }
9569 if (cb->results->is_omap_digest()) {
9570 obs.oi.set_omap_digest(cb->results->omap_digest);
9571 } else {
9572 obs.oi.clear_omap_digest();
9573 }
9574
9575 obs.oi.truncate_seq = cb->results->truncate_seq;
9576 obs.oi.truncate_size = cb->results->truncate_size;
9577
9578 ctx->extra_reqids = cb->results->reqids;
9579 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9580
9581 // cache: clear whiteout?
9582 if (obs.oi.is_whiteout()) {
9583 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9584 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9585 --ctx->delta_stats.num_whiteouts;
9586 }
9587
9588 if (cb->results->has_omap) {
9589 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9590 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9591 } else {
9592 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9593 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9594 }
9595
9596 interval_set<uint64_t> ch;
9597 if (obs.oi.size > 0)
9598 ch.insert(0, obs.oi.size);
9599 ctx->modified_ranges.union_of(ch);
9600
9601 if (cb->get_data_size() != obs.oi.size) {
9602 ctx->delta_stats.num_bytes -= obs.oi.size;
9603 obs.oi.size = cb->get_data_size();
9604 ctx->delta_stats.num_bytes += obs.oi.size;
9605 }
9606 ctx->delta_stats.num_wr++;
9607 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
9608
9609 osd->logger->inc(l_osd_copyfrom);
9610 }
9611
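/*
 * Finish promoting an object into this (cache) tier. Snap handling: the
 * clone's snaps may need to be reconstructed from snap_seq and filtered;
 * -ENOENT on a clone means it was trimmed and is removed from the snapset;
 * -ENOENT on a head becomes a whiteout. Otherwise commit the copied data
 * via fill_in_final_tx and update the tiering agent.
 */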
9612 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9613 ObjectContextRef obc)
9614 {
9615 const hobject_t& soid = obc->obs.oi.soid;
9616 dout(10) << __func__ << " " << soid << " r=" << r
9617 << " uv" << results->user_version << dendl;
9618
9619 if (r == -ECANCELED) {
9620 return;
9621 }
9622
9623 if (r != -ENOENT && soid.is_snap()) {
9624 if (results->snaps.empty()) {
9625 // we must have read "snap" content from the head object in
9626 // the base pool. use snap_seq to construct what snaps should
9627 // be for this clone (what it was before we evicted the clean
9628 // clone from this pool, and what it will be when we flush and
9629 // the clone eventually happens in the base pool).
9630 SnapSet& snapset = obc->ssc->snapset;
9631 vector<snapid_t>::iterator p = snapset.snaps.begin();
9632 while (p != snapset.snaps.end() && *p > soid.snap)
9633 ++p;
9634 while (p != snapset.snaps.end() && *p > results->snap_seq) {
9635 results->snaps.push_back(*p);
9636 ++p;
9637 }
9638 }
9639
9640 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9641 filter_snapc(results->snaps);
9642
9643 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9644 if (results->snaps.empty()) {
9645 dout(20) << __func__
9646 << " snaps are empty, clone is invalid,"
9647 << " setting r to ENOENT" << dendl;
9648 r = -ENOENT;
9649 }
9650 }
9651
9652 if (r < 0 && results->started_temp_obj) {
9653 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9654 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
9655 ceph_assert(tempobc);
9656 OpContextUPtr ctx = simple_opc_create(tempobc);
9657 ctx->op_t->remove(results->temp_oid);
9658 simple_opc_submit(std::move(ctx));
9659 results->started_temp_obj = false;
9660 }
9661
9662 if (r == -ENOENT && soid.is_snap()) {
9663 dout(10) << __func__
9664 << ": enoent while trying to promote clone, " << soid
9665 << " must have been trimmed, removing from snapset"
9666 << dendl;
9667 hobject_t head(soid.get_head());
9668 ObjectContextRef obc = get_object_context(head, false);
9669 ceph_assert(obc);
9670
9671 OpContextUPtr tctx = simple_opc_create(obc);
9672 tctx->at_version = get_next_version();
9673 filter_snapc(tctx->new_snapset.snaps);
9674 vector<snapid_t> new_clones;
9675 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9676 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9677 i != tctx->new_snapset.clones.end();
9678 ++i) {
9679 if (*i != soid.snap) {
9680 new_clones.push_back(*i);
9681 auto p = tctx->new_snapset.clone_snaps.find(*i);
9682 if (p != tctx->new_snapset.clone_snaps.end()) {
9683 new_clone_snaps[*i] = p->second;
9684 }
9685 }
9686 }
9687 tctx->new_snapset.clones.swap(new_clones);
9688 tctx->new_snapset.clone_overlap.erase(soid.snap);
9689 tctx->new_snapset.clone_size.erase(soid.snap);
9690 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9691
9692 // take RWWRITE lock for duration of our local write. ignore starvation.
9693 if (!tctx->lock_manager.take_write_lock(
9694 head,
9695 obc)) {
9696 ceph_abort_msg("problem!");
9697 }
9698 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9699
9700 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9701
9702 simple_opc_submit(std::move(tctx));
9703 return;
9704 }
9705
9706 bool whiteout = false;
9707 if (r == -ENOENT) {
9708 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
9709 dout(10) << __func__ << " whiteout " << soid << dendl;
9710 whiteout = true;
9711 }
9712
9713 if (r < 0 && !whiteout) {
9714 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9715 // pass error to everyone blocked on this object
9716 // FIXME: this is pretty sloppy, but at this point we got
9717 // something unexpected and don't have many other options.
9718 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9719 waiting_for_blocked_object.find(soid);
9720 if (blocked_iter != waiting_for_blocked_object.end()) {
9721 while (!blocked_iter->second.empty()) {
9722 osd->reply_op_error(blocked_iter->second.front(), r);
9723 blocked_iter->second.pop_front();
9724 }
9725 waiting_for_blocked_object.erase(blocked_iter);
9726 }
9727 return;
9728 }
9729
9730 osd->promote_finish(results->object_size);
9731
9732 OpContextUPtr tctx = simple_opc_create(obc);
9733 tctx->at_version = get_next_version();
9734
9735 if (!obc->obs.oi.has_manifest()) {
9736 ++tctx->delta_stats.num_objects;
9737 }
9738 if (soid.snap < CEPH_NOSNAP)
9739 ++tctx->delta_stats.num_object_clones;
9740 tctx->new_obs.exists = true;
9741
9742 tctx->extra_reqids = results->reqids;
9743 tctx->extra_reqid_return_codes = results->reqid_return_codes;
9744
9745 if (whiteout) {
9746 // create a whiteout
9747 tctx->op_t->create(soid);
9748 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9749 ++tctx->delta_stats.num_whiteouts;
9750 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9751 osd->logger->inc(l_osd_tier_whiteout);
9752 } else {
9753 if (results->has_omap) {
9754 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9755 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9756 ++tctx->delta_stats.num_objects_omap;
9757 }
9758
9759 results->fill_in_final_tx(tctx->op_t.get());
9760 if (results->started_temp_obj) {
9761 tctx->discard_temp_oid = results->temp_oid;
9762 }
9763 tctx->new_obs.oi.size = results->object_size;
9764 tctx->new_obs.oi.user_version = results->user_version;
9765 if (results->is_data_digest()) {
9766 tctx->new_obs.oi.set_data_digest(results->data_digest);
9767 } else {
9768 tctx->new_obs.oi.clear_data_digest();
9769 }
9770 if (results->is_omap_digest()) {
9771 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
9772 } else {
9773 tctx->new_obs.oi.clear_omap_digest();
9774 }
9775 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
9776 tctx->new_obs.oi.truncate_size = results->truncate_size;
9777
9778 if (soid.snap != CEPH_NOSNAP) {
9779 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
9780 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
9781 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
9782 results->object_size);
9783 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
9784
9785 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
9786 } else {
9787 tctx->delta_stats.num_bytes += results->object_size;
9788 }
9789 }
9790
9791 if (results->mirror_snapset) {
9792 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
9793 tctx->new_snapset.from_snap_set(
9794 results->snapset,
9795 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
9796 }
9797 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
9798
9799 // take RWWRITE lock for duration of our local write. ignore starvation.
9800 if (!tctx->lock_manager.take_write_lock(
9801 obc->obs.oi.soid,
9802 obc)) {
9803 ceph_abort_msg("problem!");
9804 }
9805 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9806
9807 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9808
9809 simple_opc_submit(std::move(tctx));
9810
9811 osd->logger->inc(l_osd_tier_promote);
9812
9813 if (agent_state &&
9814 agent_state->is_idle())
9815 agent_choose_mode();
9816 }
9817
9818 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
9819 ObjectContextRef obc)
9820 {
9821 const hobject_t& soid = obc->obs.oi.soid;
9822 dout(10) << __func__ << " " << soid << " r=" << r
9823 << " uv" << results->user_version << dendl;
9824
9825 if (r == -ECANCELED || r == -EAGAIN) {
9826 return;
9827 }
9828
9829 if (r < 0) {
9830 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9831 // pass error to everyone blocked on this object
9832 // FIXME: this is pretty sloppy, but at this point we got
9833 // something unexpected and don't have many other options.
9834 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9835 waiting_for_blocked_object.find(soid);
9836 if (blocked_iter != waiting_for_blocked_object.end()) {
9837 while (!blocked_iter->second.empty()) {
9838 osd->reply_op_error(blocked_iter->second.front(), r);
9839 blocked_iter->second.pop_front();
9840 }
9841 waiting_for_blocked_object.erase(blocked_iter);
9842 }
9843 return;
9844 }
9845
9846 osd->promote_finish(results->object_size);
9847 osd->logger->inc(l_osd_tier_promote);
9848
9849 if (agent_state &&
9850 agent_state->is_idle())
9851 agent_choose_mode();
9852 }
9853
9854 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
9855 vector<ceph_tid_t> *tids)
9856 {
9857 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
9858 << " from " << cop->src << " " << cop->oloc
9859 << " v" << cop->results.user_version << dendl;
9860
9861 // cancel objecter op, if we can
9862 if (cop->objecter_tid) {
9863 tids->push_back(cop->objecter_tid);
9864 cop->objecter_tid = 0;
9865 if (cop->objecter_tid2) {
9866 tids->push_back(cop->objecter_tid2);
9867 cop->objecter_tid2 = 0;
9868 }
9869 }
9870
9871 copy_ops.erase(cop->obc->obs.oi.soid);
9872 cop->obc->stop_block();
9873
9874 kick_object_context_blocked(cop->obc);
9875 cop->results.should_requeue = requeue;
9876 CopyCallbackResults result(-ECANCELED, &cop->results);
9877 cop->cb->complete(result);
9878
9879 // There may still be an objecter callback referencing this copy op.
9880 // That callback will not need the obc since it's been canceled, and
9881 // we need the obc reference to go away prior to flush.
9882 cop->obc = ObjectContextRef();
9883 }
9884
9885 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
9886 {
9887 dout(10) << __func__ << dendl;
9888 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
9889 while (p != copy_ops.end()) {
9890 // requeue this op? can I queue up all of them?
9891 cancel_copy((p++)->second, requeue, tids);
9892 }
9893 }
9894
9895
9896 // ========================================================================
9897 // flush
9898 //
9899 // Flush a dirty object in the cache tier by writing it back to the
9900 // base tier. The sequence looks like:
9901 //
9902 // * send a copy-from operation to the base tier to copy the current
9903 // version of the object
9904 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9905 // * on completion, we check if the object has been modified. if so,
9906 // just reply with -EAGAIN.
9907 // * try to take a write lock so we can clear the dirty flag. if this
9908 // fails, wait and retry
9909 // * start a repop that clears the bit.
9910 //
9911 // If we have to wait, we will retry by coming back through the
9912 // start_flush method. We check if a flush is already in progress
9913 // and, if so, try to finish it by rechecking the version and trying
9914 // to clear the dirty bit.
9915 //
9916 // In order for the cache-flush (a write op) to not block the copy-get
9917 // from reading the object, the client *must* set the SKIPRWLOCKS
9918 // flag.
9919 //
9920 // NOTE: normally writes are strictly ordered for the client, but
9921 // flushes are special in that they can be reordered with respect to
9922 // other writes. In particular, we can't have a flush request block
9923 // an update to the cache pool object!
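//
// Purely as a hedged illustration (client-side librados names, not used
// in this file), a try-flush honoring the SKIPRWLOCKS requirement above
// could look like:
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   ioctx.aio_operate(oid, completion, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_SKIPRWLOCKS, nullptr);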
9924
9925 struct C_Flush : public Context {
9926 PrimaryLogPGRef pg;
9927 hobject_t oid;
9928 epoch_t last_peering_reset;
9929 ceph_tid_t tid;
9930 utime_t start;
9931 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
9932 : pg(p), oid(o), last_peering_reset(lpr),
9933 tid(0), start(ceph_clock_now())
9934 {}
9935 void finish(int r) override {
9936 if (r == -ECANCELED)
9937 return;
9938 pg->lock();
9939 if (last_peering_reset == pg->get_last_peering_reset()) {
9940 pg->finish_flush(oid, tid, r);
9941 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
9942 }
9943 pg->unlock();
9944 }
9945 };
9946
9947 int PrimaryLogPG::start_flush(
9948 OpRequestRef op, ObjectContextRef obc,
9949 bool blocking, hobject_t *pmissing,
9950 boost::optional<std::function<void()>> &&on_flush)
9951 {
9952 const object_info_t& oi = obc->obs.oi;
9953 const hobject_t& soid = oi.soid;
9954 dout(10) << __func__ << " " << soid
9955 << " v" << oi.version
9956 << " uv" << oi.user_version
9957 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
9958 << dendl;
9959
9960 // get a filtered snapset: strip any snaps that have since been removed
9961 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
9962
9963 // check that there are no older dirty clones
9964 {
9965 dout(20) << " snapset " << snapset << dendl;
9966 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
9967 while (p != snapset.clones.rend() && *p >= soid.snap)
9968 ++p;
9969 if (p != snapset.clones.rend()) {
9970 hobject_t next = soid;
9971 next.snap = *p;
9972 ceph_assert(next.snap < soid.snap);
9973 if (pg_log.get_missing().is_missing(next)) {
9974 dout(10) << __func__ << " missing clone is " << next << dendl;
9975 if (pmissing)
9976 *pmissing = next;
9977 return -ENOENT;
9978 }
9979 ObjectContextRef older_obc = get_object_context(next, false);
9980 if (older_obc) {
9981 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
9982 << dendl;
9983 if (older_obc->obs.oi.is_dirty()) {
9984 dout(10) << __func__ << " next oldest clone is dirty: "
9985 << older_obc->obs.oi << dendl;
9986 return -EBUSY;
9987 }
9988 } else {
9989 dout(20) << __func__ << " next oldest clone " << next
9990 << " is not present; implicitly clean" << dendl;
9991 }
9992 } else {
9993 dout(20) << __func__ << " no older clones" << dendl;
9994 }
9995 }
9996
9997 if (blocking)
9998 obc->start_block();
9999
10000 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10001 if (p != flush_ops.end()) {
10002 FlushOpRef fop = p->second;
10003 if (fop->op == op) {
10004 // we couldn't take the write lock on a cache-try-flush before;
10005 // now we are trying again for the lock.
10006 return try_flush_mark_clean(fop);
10007 }
10008 if (fop->flushed_version == obc->obs.oi.user_version &&
10009 (fop->blocking || !blocking)) {
10010 // nonblocking can join anything
10011 // blocking can only join a blocking flush
10012 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10013 if (op)
10014 fop->dup_ops.push_back(op);
10015 return -EAGAIN; // clean up this ctx; op will retry later
10016 }
10017
10018 // cancel current flush since it will fail anyway, or because we
10019 // are blocking and the existing flush is nonblocking.
10020 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10021 if (fop->op)
10022 osd->reply_op_error(fop->op, -EBUSY);
10023 while (!fop->dup_ops.empty()) {
10024 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10025 fop->dup_ops.pop_front();
10026 }
10027 vector<ceph_tid_t> tids;
10028 cancel_flush(fop, false, &tids);
10029 osd->objecter->op_cancel(tids, -ECANCELED);
10030 }
10031
10032 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10033 int r = start_manifest_flush(op, obc, blocking, std::move(on_flush));
10034 if (r != -EINPROGRESS) {
10035 if (blocking)
10036 obc->stop_block();
10037 }
10038 return r;
10039 }
10040
10041 /**
10042 * In general, we need to send a delete and a copyfrom.
10043 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10044 * where 4 is marked as clean. To flush 10, we have to:
10045 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10046 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10047 *
10048 * There is a complicating case. Suppose there had been a clone 7 for
10049 * snaps [7, 6] which has since been trimmed because they no longer exist.
10050 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10051 * the delete, the snap will be promoted to 5, and the head will become
10052 * a whiteout. When the copy-from goes through, we'll end up with
10053 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10054 *
10055 * Another complication is the case where there is an interval change
10056 * after doing the delete and the flush but before marking the object
10057 * clean. We'll happily delete head and then recreate it at the same
10058 * sequence number, which works out ok.
10059 */
10060
10061 SnapContext snapc, dsnapc;
10062 if (snapset.seq != 0) {
10063 if (soid.snap == CEPH_NOSNAP) {
10064 snapc.seq = snapset.seq;
10065 snapc.snaps = snapset.snaps;
10066 } else {
10067 snapid_t min_included_snap;
10068 auto p = snapset.clone_snaps.find(soid.snap);
10069 ceph_assert(p != snapset.clone_snaps.end());
10070 min_included_snap = p->second.back();
10071 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10072 }
10073
10074 snapid_t prev_snapc = 0;
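// find the newest clone strictly older than the one being flushed; the
// preceding delete is issued in that clone's snap context (dsnapc) so it
// logically lands between the two clones.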
10075 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10076 citer != snapset.clones.rend();
10077 ++citer) {
10078 if (*citer < soid.snap) {
10079 prev_snapc = *citer;
10080 break;
10081 }
10082 }
10083
10084 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10085 }
10086
10087 object_locator_t base_oloc(soid);
10088 base_oloc.pool = pool.info.tier_of;
10089
10090 if (dsnapc.seq < snapc.seq) {
10091 ObjectOperation o;
10092 o.remove();
10093 osd->objecter->mutate(
10094 soid.oid,
10095 base_oloc,
10096 o,
10097 dsnapc,
10098 ceph::real_clock::from_ceph_timespec(oi.mtime),
10099 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10100 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10101 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10102 }
10103
10104 FlushOpRef fop(std::make_shared<FlushOp>());
10105 fop->obc = obc;
10106 fop->flushed_version = oi.user_version;
10107 fop->blocking = blocking;
10108 fop->on_flush = std::move(on_flush);
10109 fop->op = op;
10110
10111 ObjectOperation o;
10112 if (oi.is_whiteout()) {
10113 fop->removal = true;
10114 o.remove();
10115 } else {
10116 object_locator_t oloc(soid);
10117 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10118 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10119 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10120 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10121 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10122 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10123
10124 // hint that the base tier need not cache the data after this flush
10125 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10126 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10127 }
10128 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10129
10130 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
10131 ceph_tid_t tid = osd->objecter->mutate(
10132 soid.oid, base_oloc, o, snapc,
10133 ceph::real_clock::from_ceph_timespec(oi.mtime),
10134 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10135 new C_OnFinisher(fin,
10136 osd->objecter_finishers[n]));
10137 /* we're under the pg lock, and fin->finish() grabs that same lock, so it can't run before fin->tid is set */
10138 fin->tid = tid;
10139 fop->objecter_tid = tid;
10140
10141 flush_ops[soid] = fop;
10142 info.stats.stats.sum.num_flush++;
10143 info.stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10144 return -EINPROGRESS;
10145 }
10146
10147 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10148 {
10149 dout(10) << __func__ << " " << oid << " tid " << tid
10150 << " " << cpp_strerror(r) << dendl;
10151 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10152 if (p == flush_ops.end()) {
10153 dout(10) << __func__ << " no flush_op found" << dendl;
10154 return;
10155 }
10156 FlushOpRef fop = p->second;
10157 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10158 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10159 << " tid " << fop->objecter_tid << dendl;
10160 return;
10161 }
10162 ObjectContextRef obc = fop->obc;
10163 fop->objecter_tid = 0;
10164
10165 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10166 if (fop->op)
10167 osd->reply_op_error(fop->op, -EBUSY);
10168 if (fop->blocking) {
10169 obc->stop_block();
10170 kick_object_context_blocked(obc);
10171 }
10172
10173 if (!fop->dup_ops.empty()) {
10174 dout(20) << __func__ << " requeueing dups" << dendl;
10175 requeue_ops(fop->dup_ops);
10176 }
10177 if (fop->on_flush) {
10178 (*(fop->on_flush))();
10179 fop->on_flush = boost::none;
10180 }
10181 flush_ops.erase(oid);
10182 return;
10183 }
10184
10185 r = try_flush_mark_clean(fop);
10186 if (r == -EBUSY && fop->op) {
10187 osd->reply_op_error(fop->op, r);
10188 }
10189 }
10190
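/*
 * Try to mark the object clean now that the flush has completed: bail with
 * -EBUSY if the object changed (or vanished) mid-flush, retry later
 * (-EAGAIN) when scrub or the write lock blocks an op we can requeue, and
 * otherwise submit a repop clearing FLAG_DIRTY -- possibly evicting the
 * now-clean object (returning 0) or punching out flushed manifest chunks.
 */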
10191 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
10192 {
10193 ObjectContextRef obc = fop->obc;
10194 const hobject_t& oid = obc->obs.oi.soid;
10195
10196 if (fop->blocking) {
10197 obc->stop_block();
10198 kick_object_context_blocked(obc);
10199 }
10200
10201 if (fop->flushed_version != obc->obs.oi.user_version ||
10202 !obc->obs.exists) {
10203 if (obc->obs.exists)
10204 dout(10) << __func__ << " flushed_version " << fop->flushed_version
10205 << " != current " << obc->obs.oi.user_version
10206 << dendl;
10207 else
10208 dout(10) << __func__ << " object no longer exists" << dendl;
10209
10210 if (!fop->dup_ops.empty()) {
10211 dout(20) << __func__ << " requeueing dups" << dendl;
10212 requeue_ops(fop->dup_ops);
10213 }
10214 if (fop->on_flush) {
10215 (*(fop->on_flush))();
10216 fop->on_flush = boost::none;
10217 }
10218 flush_ops.erase(oid);
10219 if (fop->blocking)
10220 osd->logger->inc(l_osd_tier_flush_fail);
10221 else
10222 osd->logger->inc(l_osd_tier_try_flush_fail);
10223 return -EBUSY;
10224 }
10225
10226 if (!fop->blocking &&
10227 write_blocked_by_scrub(oid)) {
10228 if (fop->op) {
10229 dout(10) << __func__ << " blocked by scrub" << dendl;
10230 requeue_op(fop->op);
10231 requeue_ops(fop->dup_ops);
10232 return -EAGAIN; // will retry
10233 } else {
10234 osd->logger->inc(l_osd_tier_try_flush_fail);
10235 vector<ceph_tid_t> tids;
10236 cancel_flush(fop, false, &tids);
10237 osd->objecter->op_cancel(tids, -ECANCELED);
10238 return -ECANCELED;
10239 }
10240 }
10241
10242 // successfully flushed, can we evict this object?
10243 if (!obc->obs.oi.has_manifest() && !fop->op &&
10244 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
10245 agent_maybe_evict(obc, true)) {
10246 osd->logger->inc(l_osd_tier_clean);
10247 if (fop->on_flush) {
10248 (*(fop->on_flush))();
10249 fop->on_flush = boost::none;
10250 }
10251 flush_ops.erase(oid);
10252 return 0;
10253 }
10254
10255 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
10256 OpContextUPtr ctx = simple_opc_create(fop->obc);
10257
10258 // successfully flushed; can we clear the dirty bit?
10259 // try to take the lock manually, since we don't
10260 // have a ctx yet.
10261 if (ctx->lock_manager.get_lock_type(
10262 ObjectContext::RWState::RWWRITE,
10263 oid,
10264 obc,
10265 fop->op)) {
10266 dout(20) << __func__ << " took write lock" << dendl;
10267 } else if (fop->op) {
10268 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
10269 << fop->dup_ops << dendl;
10270 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10271 for (auto op : fop->dup_ops) {
10272 bool locked = ctx->lock_manager.get_lock_type(
10273 ObjectContext::RWState::RWWRITE,
10274 oid,
10275 obc,
10276 op);
10277 ceph_assert(!locked);
10278 }
10279 close_op_ctx(ctx.release());
10280 return -EAGAIN; // will retry
10281 } else {
10282 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
10283 close_op_ctx(ctx.release());
10284 osd->logger->inc(l_osd_tier_try_flush_fail);
10285 vector<ceph_tid_t> tids;
10286 cancel_flush(fop, false, &tids);
10287 osd->objecter->op_cancel(tids, -ECANCELED);
10288 return -ECANCELED;
10289 }
10290
10291 if (fop->on_flush) {
10292 ctx->register_on_finish(*(fop->on_flush));
10293 fop->on_flush = boost::none;
10294 }
10295
10296 ctx->at_version = get_next_version();
10297
10298 ctx->new_obs = obc->obs;
10299 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10300 --ctx->delta_stats.num_objects_dirty;
10301 if (fop->obc->obs.oi.has_manifest()) {
10302 ceph_assert(obc->obs.oi.manifest.is_chunked());
10303 PGTransaction* t = ctx->op_t.get();
10304 uint64_t chunks_size = 0;
10305 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10306 chunks_size += p.second.length;
10307 }
10308 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
10309 t->omap_clear(oid);
10310 ctx->new_obs.oi.clear_omap_digest();
10311 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10312 }
10313 if (obc->obs.oi.size == chunks_size) {
10314 t->truncate(oid, 0);
10315 interval_set<uint64_t> trim;
10316 trim.insert(0, ctx->new_obs.oi.size);
10317 ctx->modified_ranges.union_of(trim);
10318 truncate_update_size_and_usage(ctx->delta_stats,
10319 ctx->new_obs.oi,
10320 0);
10321 ctx->new_obs.oi.new_object();
10322 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10323 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10324 p.second.set_flag(chunk_info_t::FLAG_MISSING);
10325 }
10326 } else {
10327 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10328 if (p.second.is_dirty()) {
10329 dout(20) << __func__ << " offset: " << p.second.offset
10330 << " length: " << p.second.length << dendl;
10331 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10332 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
10333 }
10334 }
10335 }
10336 }
10337
10338 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10339
10340 osd->logger->inc(l_osd_tier_clean);
10341
10342 if (!fop->dup_ops.empty() || fop->op) {
10343 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
10344 list<OpRequestRef> ls;
10345 if (fop->op)
10346 ls.push_back(fop->op);
10347 ls.splice(ls.end(), fop->dup_ops);
10348 requeue_ops(ls);
10349 }
10350
10351 simple_opc_submit(std::move(ctx));
10352
10353 flush_ops.erase(oid);
10354
10355 if (fop->blocking)
10356 osd->logger->inc(l_osd_tier_flush);
10357 else
10358 osd->logger->inc(l_osd_tier_try_flush);
10359
10360 return -EINPROGRESS;
10361 }
10362
10363 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10364 vector<ceph_tid_t> *tids)
10365 {
10366 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10367 << fop->objecter_tid << dendl;
10368 if (fop->objecter_tid) {
10369 tids->push_back(fop->objecter_tid);
10370 fop->objecter_tid = 0;
10371 }
10372 if (fop->io_tids.size()) {
10373 for (auto &p : fop->io_tids) {
10374 tids->push_back(p.second);
10375 p.second = 0;
10376 }
10377 }
10378 if (fop->blocking && fop->obc->is_blocked()) {
10379 fop->obc->stop_block();
10380 kick_object_context_blocked(fop->obc);
10381 }
10382 if (requeue) {
10383 if (fop->op)
10384 requeue_op(fop->op);
10385 requeue_ops(fop->dup_ops);
10386 }
10387 if (fop->on_flush) {
10388 (*(fop->on_flush))();
10389 fop->on_flush = boost::none;
10390 }
10391 flush_ops.erase(fop->obc->obs.oi.soid);
10392 }
10393
10394 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
10395 {
10396 dout(10) << __func__ << dendl;
10397 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10398 while (p != flush_ops.end()) {
10399 cancel_flush((p++)->second, requeue, tids);
10400 }
10401 }
10402
10403 bool PrimaryLogPG::is_present_clone(hobject_t coid)
10404 {
10405 if (!pool.info.allow_incomplete_clones())
10406 return true;
10407 if (is_missing_object(coid))
10408 return true;
10409 ObjectContextRef obc = get_object_context(coid, false);
10410 return obc && obc->obs.exists;
10411 }
10412
10413 // ========================================================================
10414 // rep op gather
10415
10416 class C_OSD_RepopCommit : public Context {
10417 PrimaryLogPGRef pg;
10418 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
10419 public:
10420 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
10421 : pg(pg), repop(repop) {}
10422 void finish(int) override {
10423 pg->repop_all_committed(repop.get());
10424 }
10425 };
10426
10427 void PrimaryLogPG::repop_all_committed(RepGather *repop)
10428 {
10429 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10430 << dendl;
10431 repop->all_committed = true;
10432 if (!repop->rep_aborted) {
10433 if (repop->v != eversion_t()) {
10434 last_update_ondisk = repop->v;
10435 last_complete_ondisk = repop->pg_local_last_complete;
10436 }
10437 eval_repop(repop);
10438 }
10439 }
10440
10441 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
10442 {
10443 dout(10) << "op_applied version " << applied_version << dendl;
10444 ceph_assert(applied_version != eversion_t());
10445 ceph_assert(applied_version <= info.last_update);
10446 last_update_applied = applied_version;
10447 if (is_primary()) {
10448 if (scrubber.active) {
10449 if (last_update_applied >= scrubber.subset_last_update) {
10450 requeue_scrub(ops_blocked_by_scrub());
10451 }
10452 } else {
10453 ceph_assert(scrubber.start == scrubber.end);
10454 }
10455 }
10456 }
10457
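/*
 * If this repop has fully committed, run its on_committed callbacks, send
 * any queued duplicate-op replies in order, and retire every all-committed
 * repop at the front of repop_queue so clients observe commit order.
 */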
10458 void PrimaryLogPG::eval_repop(RepGather *repop)
10459 {
10460 const MOSDOp *m = NULL;
10461 if (repop->op)
10462 m = static_cast<const MOSDOp *>(repop->op->get_req());
10463
10464 if (m)
10465 dout(10) << "eval_repop " << *repop << dendl;
10466 else
10467 dout(10) << "eval_repop " << *repop << " (no op)" << dendl;
10468
10469 // ondisk?
10470 if (repop->all_committed) {
10471 dout(10) << " commit: " << *repop << dendl;
10472 for (auto p = repop->on_committed.begin();
10473 p != repop->on_committed.end();
10474 repop->on_committed.erase(p++)) {
10475 (*p)();
10476 }
10477 // send dup commits, in order
10478 auto it = waiting_for_ondisk.find(repop->v);
10479 if (it != waiting_for_ondisk.end()) {
10480 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
10481 for (auto& i : it->second) {
10482 int return_code = repop->r;
10483 if (return_code >= 0) {
10484 return_code = std::get<2>(i);
10485 }
10486 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
10487 std::get<1>(i));
10488 }
10489 waiting_for_ondisk.erase(it);
10490 }
10491
10492 publish_stats_to_osd();
10493 calc_min_last_complete_ondisk();
10494
10495 dout(10) << " removing " << *repop << dendl;
10496 ceph_assert(!repop_queue.empty());
10497 dout(20) << " q front is " << *repop_queue.front() << dendl;
10498 if (repop_queue.front() == repop) {
10499 RepGather *to_remove = nullptr;
10500 while (!repop_queue.empty() &&
10501 (to_remove = repop_queue.front())->all_committed) {
10502 repop_queue.pop_front();
10503 for (auto p = to_remove->on_success.begin();
10504 p != to_remove->on_success.end();
10505 to_remove->on_success.erase(p++)) {
10506 (*p)();
10507 }
10508 remove_repop(to_remove);
10509 }
10510 }
10511 }
10512 }
10513
10514 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
10515 {
10516 FUNCTRACE(cct);
10517 const hobject_t& soid = ctx->obs->oi.soid;
10518 dout(7) << "issue_repop rep_tid " << repop->rep_tid
10519 << " o " << soid
10520 << dendl;
10521
10522 repop->v = ctx->at_version;
10523 if (ctx->at_version > eversion_t()) {
10524 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
10525 i != acting_recovery_backfill.end();
10526 ++i) {
10527 if (*i == get_primary()) continue;
10528 pg_info_t &pinfo = peer_info[*i];
10529 // keep peer_info up to date
10530 if (pinfo.last_complete == pinfo.last_update)
10531 pinfo.last_complete = ctx->at_version;
10532 pinfo.last_update = ctx->at_version;
10533 }
10534 }
10535
10536 ctx->op_t->add_obc(ctx->obc);
10537 if (ctx->clone_obc) {
10538 ctx->op_t->add_obc(ctx->clone_obc);
10539 }
10540 if (ctx->head_obc) {
10541 ctx->op_t->add_obc(ctx->head_obc);
10542 }
10543
10544 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
10545 if (!(ctx->log.empty())) {
10546 ceph_assert(ctx->at_version >= projected_last_update);
10547 projected_last_update = ctx->at_version;
10548 }
10549 for (auto &&entry: ctx->log) {
10550 projected_log.add(entry);
10551 }
10552
10553 bool requires_missing_loc = false;
10554 for (set<pg_shard_t>::iterator i = async_recovery_targets.begin();
10555 i != async_recovery_targets.end();
10556 ++i) {
10557 if (*i == get_primary() || !peer_missing[*i].is_missing(soid)) continue;
10558 requires_missing_loc = true;
10559 for (auto &&entry: ctx->log) {
10560 peer_missing[*i].add_next_event(entry);
10561 }
10562 }
10563
10564 if (requires_missing_loc) {
10565 for (auto &&entry: ctx->log) {
10566 dout(30) << __func__ << " missing_loc before: "
10567 << missing_loc.get_locations(entry.soid) << dendl;
10568 missing_loc.add_missing(entry.soid, entry.version,
10569 eversion_t(), entry.is_delete());
10570 // clear out missing_loc
10571 missing_loc.clear_location(entry.soid);
10572 for (auto &i: actingset) {
10573 if (!peer_missing[i].is_missing(entry.soid))
10574 missing_loc.add_location(entry.soid, i);
10575 }
10576 dout(30) << __func__ << " missing_loc after: "
10577 << missing_loc.get_locations(entry.soid) << dendl;
10578 }
10579 }
10580
10581 pgbackend->submit_transaction(
10582 soid,
10583 ctx->delta_stats,
10584 ctx->at_version,
10585 std::move(ctx->op_t),
10586 pg_trim_to,
10587 min_last_complete_ondisk,
10588 ctx->log,
10589 ctx->updated_hset_history,
10590 on_all_commit,
10591 repop->rep_tid,
10592 ctx->reqid,
10593 ctx->op);
10594 }
10595
10596 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
10597 OpContext *ctx, ObjectContextRef obc,
10598 ceph_tid_t rep_tid)
10599 {
10600 if (ctx->op)
10601 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
10602 else
10603 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
10604
10605 RepGather *repop = new RepGather(
10606 ctx, rep_tid, info.last_complete);
10607
10608 repop->start = ceph_clock_now();
10609
10610 repop_queue.push_back(&repop->queue_item);
10611 repop->get();
10612
10613 osd->logger->inc(l_osd_op_wip);
10614
10615 dout(10) << __func__ << ": " << *repop << dendl;
10616 return repop;
10617 }
10618
10619 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
10620 eversion_t version,
10621 int r,
10622 ObcLockManager &&manager,
10623 OpRequestRef &&op,
10624 boost::optional<std::function<void(void)> > &&on_complete)
10625 {
10626 RepGather *repop = new RepGather(
10627 std::move(manager),
10628 std::move(op),
10629 std::move(on_complete),
10630 osd->get_tid(),
10631 info.last_complete,
10632 r);
10633 repop->v = version;
10634
10635 repop->start = ceph_clock_now();
10636
10637 repop_queue.push_back(&repop->queue_item);
10638
10639 osd->logger->inc(l_osd_op_wip);
10640
10641 dout(10) << __func__ << ": " << *repop << dendl;
10642 return boost::intrusive_ptr<RepGather>(repop);
10643 }
10644
10645 void PrimaryLogPG::remove_repop(RepGather *repop)
10646 {
10647 dout(20) << __func__ << " " << *repop << dendl;
10648
10649 for (auto p = repop->on_finish.begin();
10650 p != repop->on_finish.end();
10651 repop->on_finish.erase(p++)) {
10652 (*p)();
10653 }
10654
10655 release_object_locks(
10656 repop->lock_manager);
10657 repop->put();
10658
10659 osd->logger->dec(l_osd_op_wip);
10660 }
10661
10662 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
10663 {
10664 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
10665 ceph_tid_t rep_tid = osd->get_tid();
10666 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
10667 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
10668 ctx->op_t.reset(new PGTransaction());
10669 ctx->mtime = ceph_clock_now();
10670 return ctx;
10671 }
10672
10673 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
10674 {
10675 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
10676 dout(20) << __func__ << " " << repop << dendl;
10677 issue_repop(repop, ctx.get());
10678 eval_repop(repop);
10679 if (hard_limit_pglog())
10680 calc_trim_to_aggressive();
10681 else
10682 calc_trim_to();
10683 repop->put();
10684 }
10685
10686
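/*
 * Durably record log entries that are not tied to an object write (e.g.
 * marking unfound objects lost), replicating them to the acting/recovery/
 * backfill shards via MOSDPGUpdateLogMissing (full MOSDPGLog pre-jewel)
 * and completing once all shards, including ourselves, commit.
 */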
10687 void PrimaryLogPG::submit_log_entries(
10688 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
10689 ObcLockManager &&manager,
10690 boost::optional<std::function<void(void)> > &&_on_complete,
10691 OpRequestRef op,
10692 int r)
10693 {
10694 dout(10) << __func__ << " " << entries << dendl;
10695 ceph_assert(is_primary());
10696
10697 eversion_t version;
10698 if (!entries.empty()) {
10699 ceph_assert(entries.rbegin()->version >= projected_last_update);
10700 version = projected_last_update = entries.rbegin()->version;
10701 }
10702
10703 boost::intrusive_ptr<RepGather> repop;
10704 boost::optional<std::function<void(void)> > on_complete;
10705 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10706 repop = new_repop(
10707 version,
10708 r,
10709 std::move(manager),
10710 std::move(op),
10711 std::move(_on_complete));
10712 } else {
10713 on_complete = std::move(_on_complete);
10714 }
10715
10716 pgbackend->call_write_ordered(
10717 [this, entries, repop, on_complete]() {
10718 ObjectStore::Transaction t;
10719 eversion_t old_last_update = info.last_update;
10720 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
10721
10722
10723 set<pg_shard_t> waiting_on;
10724 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
10725 i != acting_recovery_backfill.end();
10726 ++i) {
10727 pg_shard_t peer(*i);
10728 if (peer == pg_whoami) continue;
10729 ceph_assert(peer_missing.count(peer));
10730 ceph_assert(peer_info.count(peer));
10731 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10732 ceph_assert(repop);
10733 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
10734 entries,
10735 spg_t(info.pgid.pgid, i->shard),
10736 pg_whoami.shard,
10737 get_osdmap_epoch(),
10738 last_peering_reset,
10739 repop->rep_tid,
10740 pg_trim_to,
10741 min_last_complete_ondisk);
10742 osd->send_message_osd_cluster(
10743 peer.osd, m, get_osdmap_epoch());
10744 waiting_on.insert(peer);
10745 } else {
10746 MOSDPGLog *m = new MOSDPGLog(
10747 peer.shard, pg_whoami.shard,
10748 info.last_update.epoch,
10749 info, last_peering_reset);
10750 m->log.log = entries;
10751 m->log.tail = old_last_update;
10752 m->log.head = info.last_update;
10753 osd->send_message_osd_cluster(
10754 peer.osd, m, get_osdmap_epoch());
10755 }
10756 }
10757 ceph_tid_t rep_tid = repop->rep_tid;
10758 waiting_on.insert(pg_whoami);
10759 log_entry_update_waiting_on.insert(
10760 make_pair(
10761 rep_tid,
10762 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
10763 ));
10764 struct OnComplete : public Context {
10765 PrimaryLogPGRef pg;
10766 ceph_tid_t rep_tid;
10767 epoch_t epoch;
10768 OnComplete(
10769 PrimaryLogPGRef pg,
10770 ceph_tid_t rep_tid,
10771 epoch_t epoch)
10772 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
10773 void finish(int) override {
10774 pg->lock();
10775 if (!pg->pg_has_reset_since(epoch)) {
10776 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
10777 ceph_assert(it != pg->log_entry_update_waiting_on.end());
10778 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
10779 ceph_assert(it2 != it->second.waiting_on.end());
10780 it->second.waiting_on.erase(it2);
10781 if (it->second.waiting_on.empty()) {
10782 pg->repop_all_committed(it->second.repop.get());
10783 pg->log_entry_update_waiting_on.erase(it);
10784 }
10785 }
10786 pg->unlock();
10787 }
10788 };
10789 t.register_on_commit(
10790 new OnComplete{this, rep_tid, get_osdmap_epoch()});
10791 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
10792 ceph_assert(r == 0);
10793 op_applied(info.last_update);
10794 });
10795
10796 if (hard_limit_pglog())
10797 calc_trim_to_aggressive();
10798 else
10799 calc_trim_to();
10800 }
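// Sketch of the (post-jewel) round trip that submit_log_entries() drives,
// illustrative only:
//
//   primary                                 replica
//   merge_new_log_entries()
//   MOSDPGUpdateLogMissing       ------>    do_update_log_missing():
//                                             append_log_entries_update_missing()
//   do_update_log_missing_reply() <------   MOSDPGUpdateLogMissingReply
//
// The local commit (OnComplete above) erases pg_whoami from waiting_on;
// once waiting_on drains, repop_all_committed() fires for the repop.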
10801
10802 void PrimaryLogPG::cancel_log_updates()
10803 {
10804 // get rid of all the LogUpdateCtx so their references to repops are
10805 // dropped
10806 log_entry_update_waiting_on.clear();
10807 }
10808
10809 // -------------------------------------------------------
10810
10811 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
10812 {
10813 lock();
10814 pair<hobject_t, ObjectContextRef> i;
10815 while (object_contexts.get_next(i.first, &i)) {
10816 ObjectContextRef obc(i.second);
10817 get_obc_watchers(obc, *ls);
10818 }
10819 unlock();
10820 }
10821
10822 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
10823 {
10824 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10825 obc->watchers.begin();
10826 j != obc->watchers.end();
10827 ++j) {
10828 obj_watch_item_t owi;
10829
10830 owi.obj = obc->obs.oi.soid;
10831 owi.wi.addr = j->second->get_peer_addr();
10832 owi.wi.name = j->second->get_entity();
10833 owi.wi.cookie = j->second->get_cookie();
10834 owi.wi.timeout_seconds = j->second->get_timeout();
10835
10836 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
10837 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
10838
10839 pg_watchers.push_back(owi);
10840 }
10841 }
10842
10843 void PrimaryLogPG::check_blacklisted_watchers()
10844 {
10845 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
10846 pair<hobject_t, ObjectContextRef> i;
10847 while (object_contexts.get_next(i.first, &i))
10848 check_blacklisted_obc_watchers(i.second);
10849 }
10850
10851 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
10852 {
10853 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
10854 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
10855 obc->watchers.begin();
10856 k != obc->watchers.end();
10857 ) {
10858 // Advance iterator now so handle_watch_timeout() can erase the element
10859 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
10860 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
10861 entity_addr_t ea = j->second->get_peer_addr();
10862 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
10863 if (get_osdmap()->is_blacklisted(ea)) {
10864 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
10865 ceph_assert(j->second->get_pg() == this);
10866 j->second->unregister_cb();
10867 handle_watch_timeout(j->second);
10868 }
10869 }
10870 }
10871
10872 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
10873 {
10874 ceph_assert(is_active());
10875 auto it_objects = pg_log.get_log().objects.find(obc->obs.oi.soid);
10876 ceph_assert((recovering.count(obc->obs.oi.soid) ||
10877 !is_missing_object(obc->obs.oi.soid)) ||
10878 (it_objects != pg_log.get_log().objects.end() && // or this is a revert... see recover_primary()
10879 it_objects->second->op ==
10880 pg_log_entry_t::LOST_REVERT &&
10881 it_objects->second->reverting_to ==
10882 obc->obs.oi.version));
10883
10884 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
10885 ceph_assert(obc->watchers.empty());
10886 // populate unconnected_watchers
10887 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
10888 obc->obs.oi.watchers.begin();
10889 p != obc->obs.oi.watchers.end();
10890 ++p) {
10891 utime_t expire = info.stats.last_became_active;
10892 expire += p->second.timeout_seconds;
10893 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
10894 WatchRef watch(
10895 Watch::makeWatchRef(
10896 this, osd, obc, p->second.timeout_seconds, p->first.first,
10897 p->first.second, p->second.addr));
10898 watch->disconnect();
10899 obc->watchers.insert(
10900 make_pair(
10901 make_pair(p->first.first, p->first.second),
10902 watch));
10903 }
10904 // Look for watchers from blacklisted clients and drop them
10905 check_blacklisted_obc_watchers(obc);
10906 }
10907
10908 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
10909 {
10910 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
10911 dout(10) << "handle_watch_timeout obc " << obc << dendl;
10912
10913 if (!is_active()) {
10914 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
10915 return;
10916 }
10917 if (!obc->obs.exists) {
10918 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
10919 return;
10920 }
10921 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
10922 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
10923 watch->get_delayed_cb()
10924 );
10925 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10926 << obc->obs.oi.soid
10927 << dendl;
10928 return;
10929 }
10930
10931 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
10932 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10933 << obc->obs.oi.soid
10934 << dendl;
10935 scrubber.add_callback(
10936 watch->get_delayed_cb() // re-run this watch timeout once the scrub completes
10937 );
10938 return;
10939 }
10940
10941 OpContextUPtr ctx = simple_opc_create(obc);
10942 ctx->at_version = get_next_version();
10943
10944 object_info_t& oi = ctx->new_obs.oi;
10945 oi.watchers.erase(make_pair(watch->get_cookie(),
10946 watch->get_entity()));
10947
10948 list<watch_disconnect_t> watch_disconnects = {
10949 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
10950 };
10951 ctx->register_on_success(
10952 [this, obc, watch_disconnects]() {
10953 complete_disconnect_watches(obc, watch_disconnects);
10954 });
10955
10956
10957 PGTransaction *t = ctx->op_t.get();
10958 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
10959 ctx->at_version,
10960 oi.version,
10961 0,
10962 osd_reqid_t(), ctx->mtime, 0));
10963
10964 oi.prior_version = obc->obs.oi.version;
10965 oi.version = ctx->at_version;
10966 bufferlist bl;
10967 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
10968 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
10969
10970 // apply new object state.
10971 ctx->obc->obs = ctx->new_obs;
10972
10973 // no ctx->delta_stats
10974 simple_opc_submit(std::move(ctx));
10975 }
10976
10977 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
10978 SnapSetContext *ssc)
10979 {
10980 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
10981 ceph_assert(obc->destructor_callback == NULL);
10982 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
10983 obc->obs.oi = oi;
10984 obc->obs.exists = false;
10985 obc->ssc = ssc;
10986 if (ssc)
10987 register_snapset_context(ssc);
10988 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
10989 if (is_active())
10990 populate_obc_watchers(obc);
10991 return obc;
10992 }
10993
10994 ObjectContextRef PrimaryLogPG::get_object_context(
10995 const hobject_t& soid,
10996 bool can_create,
10997 const map<string, bufferlist> *attrs)
10998 {
10999 auto it_objects = pg_log.get_log().objects.find(soid);
11000 ceph_assert(
11001 attrs || !pg_log.get_missing().is_missing(soid) ||
11002 // or this is a revert... see recover_primary()
11003 (it_objects != pg_log.get_log().objects.end() &&
11004 it_objects->second->op ==
11005 pg_log_entry_t::LOST_REVERT));
11006 ObjectContextRef obc = object_contexts.lookup(soid);
11007 osd->logger->inc(l_osd_object_ctx_cache_total);
11008 if (obc) {
11009 osd->logger->inc(l_osd_object_ctx_cache_hit);
11010 dout(10) << __func__ << ": found obc in cache: " << obc
11011 << dendl;
11012 } else {
11013 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11014 // check disk
11015 bufferlist bv;
11016 if (attrs) {
11017 auto it_oi = attrs->find(OI_ATTR);
11018 ceph_assert(it_oi != attrs->end());
11019 bv = it_oi->second;
11020 } else {
11021 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11022 if (r < 0) {
11023 if (!can_create) {
11024 dout(10) << __func__ << ": no obc for soid "
11025 << soid << " and !can_create"
11026 << dendl;
11027 return ObjectContextRef(); // -ENOENT!
11028 }
11029
11030 dout(10) << __func__ << ": no obc for soid "
11031 << soid << " but can_create"
11032 << dendl;
11033 // new object.
11034 object_info_t oi(soid);
11035 SnapSetContext *ssc = get_snapset_context(
11036 soid, true, 0, false);
11037 ceph_assert(ssc);
11038 obc = create_object_context(oi, ssc);
11039 dout(10) << __func__ << ": " << obc << " " << soid
11040 << " " << obc->rwstate
11041 << " oi: " << obc->obs.oi
11042 << " ssc: " << obc->ssc
11043 << " snapset: " << obc->ssc->snapset << dendl;
11044 return obc;
11045 }
11046 }
11047
11048 object_info_t oi;
11049 try {
11050 bufferlist::const_iterator bliter = bv.begin();
11051 decode(oi, bliter);
11052 } catch (...) {
11053 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11054 return ObjectContextRef(); // -ENOENT!
11055 }
11056
11057 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11058
11059 obc = object_contexts.lookup_or_create(oi.soid);
11060 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11061 obc->obs.oi = oi;
11062 obc->obs.exists = true;
11063
11064 obc->ssc = get_snapset_context(
11065 soid, true,
11066 soid.has_snapset() ? attrs : 0);
11067
11068 if (is_active())
11069 populate_obc_watchers(obc);
11070
11071 if (pool.info.is_erasure()) {
11072 if (attrs) {
11073 obc->attr_cache = *attrs;
11074 } else {
11075 int r = pgbackend->objects_get_attrs(
11076 soid,
11077 &obc->attr_cache);
11078 ceph_assert(r == 0);
11079 }
11080 }
11081
11082 dout(10) << __func__ << ": creating obc from disk: " << obc
11083 << dendl;
11084 }
11085
11086 // XXX: Caller doesn't expect this
11087 if (obc->ssc == NULL) {
11088 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11089 return ObjectContextRef(); // -ENOENT!
11090 }
11091
11092 dout(10) << __func__ << ": " << obc << " " << soid
11093 << " " << obc->rwstate
11094 << " oi: " << obc->obs.oi
11095 << " exists: " << (int)obc->obs.exists
11096 << " ssc: " << obc->ssc
11097 << " snapset: " << obc->ssc->snapset << dendl;
11098 return obc;
11099 }
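// Resolution order used by get_object_context() above (an illustrative
// sketch, not part of the build):
//
//   1. object_contexts.lookup(soid)            in-memory cache hit
//   2. caller-supplied attrs[OI_ATTR]          e.g. during recovery
//   3. pgbackend->objects_get_attr(OI_ATTR)    read object_info_t from disk
//
// If all three miss: can_create yields a fresh obc with obs.exists = false,
// otherwise a null ref is returned and callers treat it as -ENOENT.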
11100
11101 void PrimaryLogPG::context_registry_on_change()
11102 {
11103 pair<hobject_t, ObjectContextRef> i;
11104 while (object_contexts.get_next(i.first, &i)) {
11105 ObjectContextRef obc(i.second);
11106 if (obc) {
11107 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11108 obc->watchers.begin();
11109 j != obc->watchers.end();
11110 obc->watchers.erase(j++)) {
11111 j->second->discard();
11112 }
11113 }
11114 }
11115 }
11116
11117
11118 /*
11119 * If we return an error, and set *pmissing, then promoting that
11120 * object may help.
11121 *
11122 * If we return -EAGAIN, we will always set *pmissing to the missing
11123 * object to wait for.
11124 *
11125 * If we return an error but do not set *pmissing, then we know the
11126 * object does not exist.
11127 */
11128 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11129 ObjectContextRef *pobc,
11130 bool can_create,
11131 bool map_snapid_to_clone,
11132 hobject_t *pmissing)
11133 {
11134 FUNCTRACE(cct);
11135 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11136 // want the head?
11137 if (oid.snap == CEPH_NOSNAP) {
11138 ObjectContextRef obc = get_object_context(oid, can_create);
11139 if (!obc) {
11140 if (pmissing)
11141 *pmissing = oid;
11142 return -ENOENT;
11143 }
11144 dout(10) << __func__ << " " << oid
11145 << " @" << oid.snap
11146 << " oi=" << obc->obs.oi
11147 << dendl;
11148 *pobc = obc;
11149
11150 return 0;
11151 }
11152
11153 hobject_t head = oid.get_head();
11154
11155 // we want a snap
11156 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
11157 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
11158 return -ENOENT;
11159 }
11160
11161 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11162 if (!ssc || !(ssc->exists || can_create)) {
11163 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11164 if (pmissing)
11165 *pmissing = head; // start by getting the head
11166 if (ssc)
11167 put_snapset_context(ssc);
11168 return -ENOENT;
11169 }
11170
11171 if (map_snapid_to_clone) {
11172 dout(10) << __func__ << " " << oid << " @" << oid.snap
11173 << " snapset " << ssc->snapset
11174 << " map_snapid_to_clone=true" << dendl;
11175 if (oid.snap > ssc->snapset.seq) {
11176 // the head must already be readable
11177 ObjectContextRef obc = get_object_context(head, false);
11178 dout(10) << __func__ << " " << oid << " @" << oid.snap
11179 << " snapset " << ssc->snapset
11180 << " maps to head" << dendl;
11181 *pobc = obc;
11182 put_snapset_context(ssc);
11183 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11184 } else {
11185 vector<snapid_t>::const_iterator citer = std::find(
11186 ssc->snapset.clones.begin(),
11187 ssc->snapset.clones.end(),
11188 oid.snap);
11189 if (citer == ssc->snapset.clones.end()) {
11190 dout(10) << __func__ << " " << oid << " @" << oid.snap
11191 << " snapset " << ssc->snapset
11192 << " maps to nothing" << dendl;
11193 put_snapset_context(ssc);
11194 return -ENOENT;
11195 }
11196
11197 dout(10) << __func__ << " " << oid << " @" << oid.snap
11198 << " snapset " << ssc->snapset
11199 << " maps to " << oid << dendl;
11200
11201 if (pg_log.get_missing().is_missing(oid)) {
11202 dout(10) << __func__ << " " << oid << " @" << oid.snap
11203 << " snapset " << ssc->snapset
11204 << " " << oid << " is missing" << dendl;
11205 if (pmissing)
11206 *pmissing = oid;
11207 put_snapset_context(ssc);
11208 return -EAGAIN;
11209 }
11210
11211 ObjectContextRef obc = get_object_context(oid, false);
11212 if (!obc || !obc->obs.exists) {
11213 dout(10) << __func__ << " " << oid << " @" << oid.snap
11214 << " snapset " << ssc->snapset
11215 << " " << oid << " is not present" << dendl;
11216 if (pmissing)
11217 *pmissing = oid;
11218 put_snapset_context(ssc);
11219 return -ENOENT;
11220 }
11221 dout(10) << __func__ << " " << oid << " @" << oid.snap
11222 << " snapset " << ssc->snapset
11223 << " " << oid << " HIT" << dendl;
11224 *pobc = obc;
11225 put_snapset_context(ssc);
11226 return 0;
11227 }
11228 ceph_abort(); //unreachable
11229 }
11230
11231 dout(10) << __func__ << " " << oid << " @" << oid.snap
11232 << " snapset " << ssc->snapset << dendl;
11233
11234 // head?
11235 if (oid.snap > ssc->snapset.seq) {
11236 ObjectContextRef obc = get_object_context(head, false);
11237 dout(10) << __func__ << " " << head
11238 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11239 << " -- HIT " << obc->obs
11240 << dendl;
11241 if (!obc->ssc)
11242 obc->ssc = ssc;
11243 else {
11244 ceph_assert(ssc == obc->ssc);
11245 put_snapset_context(ssc);
11246 }
11247 *pobc = obc;
11248 return 0;
11249 }
11250
11251 // which clone would it be?
11252 unsigned k = 0;
11253 while (k < ssc->snapset.clones.size() &&
11254 ssc->snapset.clones[k] < oid.snap)
11255 k++;
11256 if (k == ssc->snapset.clones.size()) {
11257 dout(10) << __func__ << " no clones with last >= oid.snap "
11258 << oid.snap << " -- DNE" << dendl;
11259 put_snapset_context(ssc);
11260 return -ENOENT;
11261 }
11262 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
11263 info.pgid.pool(), oid.get_namespace());
11264
11265 if (pg_log.get_missing().is_missing(soid)) {
11266 dout(20) << __func__ << " " << soid << " missing, try again later"
11267 << dendl;
11268 if (pmissing)
11269 *pmissing = soid;
11270 put_snapset_context(ssc);
11271 return -EAGAIN;
11272 }
11273
11274 ObjectContextRef obc = get_object_context(soid, false);
11275 if (!obc || !obc->obs.exists) {
11276 if (pmissing)
11277 *pmissing = soid;
11278 put_snapset_context(ssc);
11279 if (is_degraded_or_backfilling_object(soid)) {
11280 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
11281 return -EAGAIN;
11282 } else if (is_degraded_on_async_recovery_target(soid)) {
11283 dout(20) << __func__ << " clone is recovering " << soid << dendl;
11284 return -EAGAIN;
11285 } else {
11286 dout(20) << __func__ << " missing clone " << soid << dendl;
11287 return -ENOENT;
11288 }
11289 }
11290
11291 if (!obc->ssc) {
11292 obc->ssc = ssc;
11293 } else {
11294 ceph_assert(obc->ssc == ssc);
11295 put_snapset_context(ssc);
11296 }
11297 ssc = 0;
11298
11299 // clone
11300 dout(20) << __func__ << " " << soid
11301 << " snapset " << obc->ssc->snapset
11302 << dendl;
11303 snapid_t first, last;
11304 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
11305 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
11306 if (p->second.empty()) {
11307 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
11308 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
11309 return -ENOENT;
11310 }
11311 first = p->second.back();
11312 last = p->second.front();
11313 if (first <= oid.snap) {
11314 dout(20) << __func__ << " " << soid << " [" << first << "," << last
11315 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
11316 *pobc = obc;
11317 return 0;
11318 } else {
11319 dout(20) << __func__ << " " << soid << " [" << first << "," << last
11320 << "] does not contain " << oid.snap << " -- DNE" << dendl;
11321 return -ENOENT;
11322 }
11323 }
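// How a caller is expected to interpret find_object_context() per the
// contract documented above; an illustrative sketch, not a verbatim caller:
//
//   hobject_t missing_oid;
//   ObjectContextRef obc;
//   int r = find_object_context(oid, &obc, can_create, false, &missing_oid);
//   if (r == -EAGAIN) {
//     // missing_oid is always set: wait for it to be recovered
//   } else if (r < 0 && missing_oid != hobject_t()) {
//     // promoting/recovering missing_oid may help
//   } else if (r < 0) {
//     // the object definitely does not exist
//   }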
11324
11325 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11326 {
11327 if (obc->ssc)
11328 put_snapset_context(obc->ssc);
11329 }
11330
11331 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
11332 {
11333 object_info_t& oi = obc->obs.oi;
11334
11335 dout(10) << __func__ << " " << oi.soid << dendl;
11336 ceph_assert(!oi.soid.is_snapdir());
11337
11338 object_stat_sum_t stat;
11339 stat.num_objects++;
11340 if (oi.is_dirty())
11341 stat.num_objects_dirty++;
11342 if (oi.is_whiteout())
11343 stat.num_whiteouts++;
11344 if (oi.is_omap())
11345 stat.num_objects_omap++;
11346 if (oi.is_cache_pinned())
11347 stat.num_objects_pinned++;
11348 if (oi.has_manifest())
11349 stat.num_objects_manifest++;
11350
11351 if (oi.soid.is_snap()) {
11352 stat.num_object_clones++;
11353
11354 if (!obc->ssc)
11355 obc->ssc = get_snapset_context(oi.soid, false);
11356 ceph_assert(obc->ssc);
11357 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
11358 } else {
11359 stat.num_bytes += oi.size;
11360 }
11361
11362 // add it in
11363 pgstat->stats.sum.add(stat);
11364 }
11365
11366 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
11367 {
11368 const hobject_t& soid = obc->obs.oi.soid;
11369 if (obc->is_blocked()) {
11370 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
11371 return;
11372 }
11373
11374 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
11375 if (p != waiting_for_blocked_object.end()) {
11376 list<OpRequestRef>& ls = p->second;
11377 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
11378 requeue_ops(ls);
11379 waiting_for_blocked_object.erase(p);
11380 }
11381
11382 map<hobject_t, ObjectContextRef>::iterator i =
11383 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
11384 if (i != objects_blocked_on_snap_promotion.end()) {
11385 ceph_assert(i->second == obc);
11386 objects_blocked_on_snap_promotion.erase(i);
11387 }
11388
11389 if (obc->requeue_scrub_on_unblock) {
11390 obc->requeue_scrub_on_unblock = false;
11391 requeue_scrub();
11392 }
11393 }
11394
11395 SnapSetContext *PrimaryLogPG::get_snapset_context(
11396 const hobject_t& oid,
11397 bool can_create,
11398 const map<string, bufferlist> *attrs,
11399 bool oid_existed)
11400 {
11401 std::lock_guard l(snapset_contexts_lock);
11402 SnapSetContext *ssc;
11403 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
11404 oid.get_snapdir());
11405 if (p != snapset_contexts.end()) {
11406 if (can_create || p->second->exists) {
11407 ssc = p->second;
11408 } else {
11409 return NULL;
11410 }
11411 } else {
11412 bufferlist bv;
11413 if (!attrs) {
11414 int r = -ENOENT;
11415 if (!(oid.is_head() && !oid_existed)) {
11416 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
11417 }
11418 if (r < 0 && !can_create)
11419 return NULL;
11420 } else {
11421 auto it_ss = attrs->find(SS_ATTR);
11422 ceph_assert(it_ss != attrs->end());
11423 bv = it_ss->second;
11424 }
11425 ssc = new SnapSetContext(oid.get_snapdir());
11426 _register_snapset_context(ssc);
11427 if (bv.length()) {
11428 bufferlist::const_iterator bvp = bv.begin();
11429 try {
11430 ssc->snapset.decode(bvp);
11431 } catch (buffer::error& e) {
11432 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
11433 return NULL;
11434 }
11435 ssc->exists = true;
11436 } else {
11437 ssc->exists = false;
11438 }
11439 }
11440 ceph_assert(ssc);
11441 ssc->ref++;
11442 return ssc;
11443 }
11444
11445 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11446 {
11447 std::lock_guard l(snapset_contexts_lock);
11448 --ssc->ref;
11449 if (ssc->ref == 0) {
11450 if (ssc->registered)
11451 snapset_contexts.erase(ssc->oid);
11452 delete ssc;
11453 }
11454 }
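// SnapSetContext is manually refcounted under snapset_contexts_lock: every
// successful get_snapset_context() must be balanced by put_snapset_context(),
// or the reference handed to an obc (obc->ssc), which is released later via
// object_context_destructor_callback(). Illustrative sketch:
//
//   SnapSetContext *ssc = get_snapset_context(oid, false);
//   if (ssc) {
//     // ... read ssc->snapset ...
//     put_snapset_context(ssc);  // balances the ref taken above
//   }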
11455
11456 /*
11457 * Return values:
11458 * NONE - didn't pull anything
11459 * YES - pulled what the caller wanted
11460 * HEAD - needed to pull head first
11461 */
11462 enum { PULL_NONE, PULL_HEAD, PULL_YES };
11463
11464 int PrimaryLogPG::recover_missing(
11465 const hobject_t &soid, eversion_t v,
11466 int priority,
11467 PGBackend::RecoveryHandle *h)
11468 {
11469 if (missing_loc.is_unfound(soid)) {
11470 dout(7) << __func__ << " " << soid
11471 << " v " << v
11472 << " but it is unfound" << dendl;
11473 return PULL_NONE;
11474 }
11475
11476 if (missing_loc.is_deleted(soid)) {
11477 start_recovery_op(soid);
11478 ceph_assert(!recovering.count(soid));
11479 recovering.insert(make_pair(soid, ObjectContextRef()));
11480 epoch_t cur_epoch = get_osdmap_epoch();
11481 remove_missing_object(soid, v, new FunctionContext(
11482 [=](int) {
11483 lock();
11484 if (!pg_has_reset_since(cur_epoch)) {
11485 bool object_missing = false;
11486 for (const auto& shard : acting_recovery_backfill) {
11487 if (shard == pg_whoami)
11488 continue;
11489 if (peer_missing[shard].is_missing(soid)) {
11490 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
11491 object_missing = true;
11492 break;
11493 }
11494 }
11495 if (!object_missing) {
11496 object_stat_sum_t stat_diff;
11497 stat_diff.num_objects_recovered = 1;
11498 if (scrub_after_recovery)
11499 stat_diff.num_objects_repaired = 1;
11500 on_global_recover(soid, stat_diff, true);
11501 } else {
11502 auto recovery_handle = pgbackend->open_recovery_op();
11503 pgbackend->recover_delete_object(soid, v, recovery_handle);
11504 pgbackend->run_recovery_op(recovery_handle, priority);
11505 }
11506 }
11507 unlock();
11508 }));
11509 return PULL_YES;
11510 }
11511
11512 // is this a snapped object? if so, consult the snapset; we may not need the entire object!
11513 ObjectContextRef obc;
11514 ObjectContextRef head_obc;
11515 if (soid.snap && soid.snap < CEPH_NOSNAP) {
11516 // do we have the head?
11517 hobject_t head = soid.get_head();
11518 if (pg_log.get_missing().is_missing(head)) {
11519 if (recovering.count(head)) {
11520 dout(10) << " missing but already recovering head " << head << dendl;
11521 return PULL_NONE;
11522 } else {
11523 int r = recover_missing(
11524 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
11525 h);
11526 if (r != PULL_NONE)
11527 return PULL_HEAD;
11528 return PULL_NONE;
11529 }
11530 }
11531 head_obc = get_object_context(
11532 head,
11533 false,
11534 0);
11535 ceph_assert(head_obc);
11536 }
11537 start_recovery_op(soid);
11538 ceph_assert(!recovering.count(soid));
11539 recovering.insert(make_pair(soid, obc));
11540 int r = pgbackend->recover_object(
11541 soid,
11542 v,
11543 head_obc,
11544 obc,
11545 h);
11546 // This is only a pull, which shouldn't return an error
11547 ceph_assert(r >= 0);
11548 return PULL_YES;
11549 }
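// Illustrative sketch of how the PULL_* results above drive a caller such
// as recover_primary() (not a verbatim excerpt): PULL_HEAD means the head
// object was queued instead, so the clone must be retried once it arrives.
//
//   switch (recover_missing(soid, v, prio, h)) {
//   case PULL_YES:  ++started; break;  // pull for soid is in flight
//   case PULL_HEAD: ++started; break;  // head queued first; clone retried later
//   case PULL_NONE: break;             // unfound, or head already recovering
//   }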
11550
11551 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
11552 eversion_t v, Context *on_complete)
11553 {
11554 dout(20) << __func__ << " " << soid << " " << v << dendl;
11555 ceph_assert(on_complete != nullptr);
11556 // delete locally
11557 ObjectStore::Transaction t;
11558 remove_snap_mapped_object(t, soid);
11559
11560 ObjectRecoveryInfo recovery_info;
11561 recovery_info.soid = soid;
11562 recovery_info.version = v;
11563
11564 epoch_t cur_epoch = get_osdmap_epoch();
11565 t.register_on_complete(new FunctionContext(
11566 [=](int) {
11567 lock();
11568 if (!pg_has_reset_since(cur_epoch)) {
11569 ObjectStore::Transaction t2;
11570 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
11571 t2.register_on_complete(on_complete);
11572 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
11573 ceph_assert(r == 0);
11574 unlock();
11575 } else {
11576 unlock();
11577 on_complete->complete(-EAGAIN);
11578 }
11579 }));
11580 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
11581 ceph_assert(r == 0);
11582 }
11583
11584 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
11585 {
11586 dout(10) << __func__ << " " << oid << dendl;
11587 if (callbacks_for_degraded_object.count(oid)) {
11588 list<Context*> contexts;
11589 contexts.swap(callbacks_for_degraded_object[oid]);
11590 callbacks_for_degraded_object.erase(oid);
11591 for (list<Context*>::iterator i = contexts.begin();
11592 i != contexts.end();
11593 ++i) {
11594 (*i)->complete(0);
11595 }
11596 }
11597 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
11598 oid.get_head());
11599 if (i != objects_blocked_on_degraded_snap.end() &&
11600 i->second == oid.snap)
11601 objects_blocked_on_degraded_snap.erase(i);
11602 }
11603
11604 void PrimaryLogPG::_committed_pushed_object(
11605 epoch_t epoch, eversion_t last_complete)
11606 {
11607 lock();
11608 if (!pg_has_reset_since(epoch)) {
11609 dout(10) << __func__ << " last_complete " << last_complete << " now ondisk" << dendl;
11610 last_complete_ondisk = last_complete;
11611
11612 if (last_complete_ondisk == info.last_update) {
11613 if (!is_primary()) {
11614 // Either we are a replica or a backfill target.
11615 // We are fully up to date; tell the primary!
11616 osd->send_message_osd_cluster(
11617 get_primary().osd,
11618 new MOSDPGTrim(
11619 get_osdmap_epoch(),
11620 spg_t(info.pgid.pgid, get_primary().shard),
11621 last_complete_ondisk),
11622 get_osdmap_epoch());
11623 } else {
11624 calc_min_last_complete_ondisk();
11625 }
11626 }
11627
11628 } else {
11629 dout(10) << __func__ << " pg has changed, not touching last_complete_ondisk" << dendl;
11630 }
11631
11632 unlock();
11633 }
11634
11635 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
11636 {
11637 dout(20) << __func__ << dendl;
11638 if (obc) {
11639 dout(20) << "obc = " << *obc << dendl;
11640 }
11641 ceph_assert(active_pushes >= 1);
11642 --active_pushes;
11643
11644 // requeue an active chunky scrub waiting on recovery ops
11645 if (!deleting && active_pushes == 0
11646 && scrubber.is_chunky_scrub_active()) {
11647 requeue_scrub(ops_blocked_by_scrub());
11648 }
11649 }
11650
11651 void PrimaryLogPG::_applied_recovered_object_replica()
11652 {
11653 dout(20) << __func__ << dendl;
11654 ceph_assert(active_pushes >= 1);
11655 --active_pushes;
11656
11657 // requeue an active chunky scrub waiting on recovery ops
11658 if (!deleting && active_pushes == 0 &&
11659 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
11660 scrubber.active_rep_scrub->get_req())->chunky) {
11661 auto& op = scrubber.active_rep_scrub;
11662 osd->enqueue_back(
11663 OpQueueItem(
11664 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
11665 op->get_req()->get_cost(),
11666 op->get_req()->get_priority(),
11667 op->get_req()->get_recv_stamp(),
11668 op->get_req()->get_source().num(),
11669 get_osdmap_epoch()));
11670 scrubber.active_rep_scrub.reset();
11671 }
11672 }
11673
11674 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
11675 {
11676 dout(10) << "got missing " << oid << " v " << v << dendl;
11677 pg_log.recover_got(oid, v, info);
11678 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
11679 dout(10) << "last_complete now " << info.last_complete
11680 << " log.complete_to " << pg_log.get_log().complete_to->version
11681 << dendl;
11682 } else {
11683 dout(10) << "last_complete now " << info.last_complete
11684 << " log.complete_to at end" << dendl;
11685 // below is not true in the repair case.
11686 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
11687 ceph_assert(info.last_complete == info.last_update);
11688 }
11689 }
11690
11691 void PrimaryLogPG::primary_failed(const hobject_t &soid)
11692 {
11693 list<pg_shard_t> fl = { pg_whoami };
11694 failed_push(fl, soid);
11695 }
11696
11697 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
11698 {
11699 dout(20) << __func__ << ": " << soid << dendl;
11700 ceph_assert(recovering.count(soid));
11701 auto obc = recovering[soid];
11702 if (obc) {
11703 list<OpRequestRef> blocked_ops;
11704 obc->drop_recovery_read(&blocked_ops);
11705 requeue_ops(blocked_ops);
11706 }
11707 recovering.erase(soid);
11708 for (auto&& i : from)
11709 missing_loc.remove_location(soid, i);
11710 dout(0) << __func__ << " " << soid << " from shard " << from
11711 << ", reps on " << missing_loc.get_locations(soid)
11712 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
11713 finish_recovery_op(soid); // close out this attempt
11714 }
11715
11716 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
11717 {
11718 eversion_t v;
11719 pg_missing_item pmi;
11720 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
11721 ceph_assert(is_missing);
11722 v = pmi.have;
11723 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
11724
11725 ceph_assert(!acting_recovery_backfill.empty());
11726 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
11727 i != acting_recovery_backfill.end();
11728 ++i) {
11729 if (*i == get_primary()) continue;
11730 pg_shard_t peer = *i;
11731 if (!peer_missing[peer].is_missing(oid)) {
11732 continue;
11733 }
11734 eversion_t h = peer_missing[peer].get_items().at(oid).have;
11735 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
11736 if (h > v)
11737 v = h;
11738 }
11739
11740 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
11741 return v;
11742 }
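// Worked example for pick_newest_available() (illustrative): if the local
// missing entry records have = 5'10 while two peers report have = 5'12 and
// have = 5'8 for the same oid, the chosen revert target is 5'12, i.e. the
// newest version that some shard in acting_recovery_backfill still has.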
11743
11744 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
11745 {
11746 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
11747 op->get_req());
11748 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
11749 ObjectStore::Transaction t;
11750 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
11751 if (m->pg_trim_to != eversion_t())
11752 op_trim_to = m->pg_trim_to;
11753 if (m->pg_roll_forward_to != eversion_t())
11754 op_roll_forward_to = m->pg_roll_forward_to;
11755
11756 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
11757
11758 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
11759 eversion_t new_lcod = info.last_complete;
11760
11761 Context *complete = new FunctionContext(
11762 [=](int) {
11763 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
11764 op->get_req());
11765 lock();
11766 if (!pg_has_reset_since(msg->get_epoch())) {
11767 update_last_complete_ondisk(new_lcod);
11768 MOSDPGUpdateLogMissingReply *reply =
11769 new MOSDPGUpdateLogMissingReply(
11770 spg_t(info.pgid.pgid, primary_shard().shard),
11771 pg_whoami.shard,
11772 msg->get_epoch(),
11773 msg->min_epoch,
11774 msg->get_tid(),
11775 new_lcod);
11776 reply->set_priority(CEPH_MSG_PRIO_HIGH);
11777 msg->get_connection()->send_message(reply);
11778 }
11779 unlock();
11780 });
11781
11782 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
11783 t.register_on_commit(complete);
11784 } else {
11785 /* Hack to work around the fact that ReplicatedBackend sends
11786 * ack+commit if commit happens first
11787 *
11788 * This behavior is no longer necessary, but we preserve it so old
11789 * primaries can keep their repops in order */
11790 if (pool.info.is_erasure()) {
11791 t.register_on_complete(complete);
11792 } else {
11793 t.register_on_commit(complete);
11794 }
11795 }
11796 int tr = osd->store->queue_transaction(
11797 ch,
11798 std::move(t),
11799 nullptr);
11800 ceph_assert(tr == 0);
11801 op_applied(info.last_update);
11802 }
11803
11804 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
11805 {
11806 const MOSDPGUpdateLogMissingReply *m =
11807 static_cast<const MOSDPGUpdateLogMissingReply*>(
11808 op->get_req());
11809 dout(20) << __func__ << " got reply from "
11810 << m->get_from() << dendl;
11811
11812 auto it = log_entry_update_waiting_on.find(m->get_tid());
11813 if (it != log_entry_update_waiting_on.end()) {
11814 if (it->second.waiting_on.count(m->get_from())) {
11815 it->second.waiting_on.erase(m->get_from());
11816 if (m->last_complete_ondisk != eversion_t()) {
11817 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
11818 }
11819 } else {
11820 osd->clog->error()
11821 << info.pgid << " got reply "
11822 << *m << " from shard we are not waiting for "
11823 << m->get_from();
11824 }
11825
11826 if (it->second.waiting_on.empty()) {
11827 repop_all_committed(it->second.repop.get());
11828 log_entry_update_waiting_on.erase(it);
11829 }
11830 } else {
11831 osd->clog->error()
11832 << info.pgid << " got reply "
11833 << *m << " on unknown tid " << m->get_tid();
11834 }
11835 }
11836
11837 /* Mark all unfound objects as lost.
11838 */
11839 void PrimaryLogPG::mark_all_unfound_lost(
11840 int what,
11841 ConnectionRef con,
11842 ceph_tid_t tid)
11843 {
11844 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
11845 list<hobject_t> oids;
11846
11847 dout(30) << __func__ << ": log before:\n";
11848 pg_log.get_log().print(*_dout);
11849 *_dout << dendl;
11850
11851 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
11852
11853 utime_t mtime = ceph_clock_now();
11854 map<hobject_t, pg_missing_item>::const_iterator m =
11855 missing_loc.get_needs_recovery().begin();
11856 map<hobject_t, pg_missing_item>::const_iterator mend =
11857 missing_loc.get_needs_recovery().end();
11858
11859 ObcLockManager manager;
11860 eversion_t v = get_next_version();
11861 v.epoch = get_osdmap_epoch();
11862 uint64_t num_unfound = missing_loc.num_unfound();
11863 while (m != mend) {
11864 const hobject_t &oid(m->first);
11865 if (!missing_loc.is_unfound(oid)) {
11866 // We only care about unfound objects
11867 ++m;
11868 continue;
11869 }
11870
11871 ObjectContextRef obc;
11872 eversion_t prev;
11873
11874 switch (what) {
11875 case pg_log_entry_t::LOST_MARK:
11876 ceph_abort_msg("actually, not implemented yet!");
11877 break;
11878
11879 case pg_log_entry_t::LOST_REVERT:
11880 prev = pick_newest_available(oid);
11881 if (prev > eversion_t()) {
11882 // log it
11883 pg_log_entry_t e(
11884 pg_log_entry_t::LOST_REVERT, oid, v,
11885 m->second.need, 0, osd_reqid_t(), mtime, 0);
11886 e.reverting_to = prev;
11887 e.mark_unrollbackable();
11888 log_entries.push_back(e);
11889 dout(10) << e << dendl;
11890
11891 // we are now missing the new version; recovery code will sort it out.
11892 ++v.version;
11893 ++m;
11894 break;
11895 }
11896
11897 case pg_log_entry_t::LOST_DELETE:
11898 {
11899 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
11900 0, osd_reqid_t(), mtime, 0);
11901 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
11902 if (pool.info.require_rollback()) {
11903 e.mod_desc.try_rmobject(v.version);
11904 } else {
11905 e.mark_unrollbackable();
11906 }
11907 } // otherwise, just do what we used to do
11908 dout(10) << e << dendl;
11909 log_entries.push_back(e);
11910 oids.push_back(oid);
11911
11912 // If a context is found, mark the object as deleted in case
11913 // we are racing with a new creation. This can happen if the
11914 // object was lost and the primary saw EIO.
11915 obc = object_contexts.lookup(oid);
11916 if (obc)
11917 obc->obs.exists = false;
11918
11919 ++v.version;
11920 ++m;
11921 }
11922 break;
11923
11924 default:
11925 ceph_abort();
11926 }
11927 }
11928
11929 info.stats.stats_invalid = true;
11930
11931 submit_log_entries(
11932 log_entries,
11933 std::move(manager),
11934 boost::optional<std::function<void(void)> >(
11935 [this, oids, con, num_unfound, tid]() {
11936 if (perform_deletes_during_peering()) {
11937 for (auto oid : oids) {
11938 // clear old locations - merge_new_log_entries will have
11939 // handled rebuilding missing_loc for each of these
11940 // objects if we have the RECOVERY_DELETES flag
11941 missing_loc.recovered(oid);
11942 }
11943 }
11944
11945 if (is_recovery_unfound()) {
11946 queue_peering_event(
11947 PGPeeringEventRef(
11948 std::make_shared<PGPeeringEvent>(
11949 get_osdmap_epoch(),
11950 get_osdmap_epoch(),
11951 DoRecovery())));
11952 } else if (is_backfill_unfound()) {
11953 queue_peering_event(
11954 PGPeeringEventRef(
11955 std::make_shared<PGPeeringEvent>(
11956 get_osdmap_epoch(),
11957 get_osdmap_epoch(),
11958 RequestBackfill())));
11959 } else {
11960 queue_recovery();
11961 }
11962
11963 stringstream ss;
11964 ss << "pg has " << num_unfound
11965 << " objects unfound and apparently lost marking";
11966 string rs = ss.str();
11967 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
11968 osd->clog->info() << rs;
11969 if (con) {
11970 MCommandReply *reply = new MCommandReply(0, rs);
11971 reply->set_tid(tid);
11972 con->send_message(reply);
11973 }
11974 }),
11975 OpRequestRef());
11976 }
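// Summary of the modes handled above (illustrative): LOST_REVERT reverts
// each unfound object to the newest version still available on some shard
// (pick_newest_available()); when no prior version exists it falls through
// to LOST_DELETE, which logs a delete so the object is removed everywhere.
// LOST_MARK remains unimplemented and aborts.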
11977
11978 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
11979 {
11980 ceph_assert(repop_queue.empty());
11981 }
11982
11983 /*
11984 * pg status change notification
11985 */
11986
11987 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11988 {
11989 list<OpRequestRef> rq;
11990
11991 // apply all repops
11992 while (!repop_queue.empty()) {
11993 RepGather *repop = repop_queue.front();
11994 repop_queue.pop_front();
11995 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11996 repop->rep_aborted = true;
11997 repop->on_committed.clear();
11998 repop->on_success.clear();
11999
12000 if (requeue) {
12001 if (repop->op) {
12002 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12003 rq.push_back(repop->op);
12004 repop->op = OpRequestRef();
12005 }
12006
12007 // also requeue any dups, interleaved into position
12008 auto p = waiting_for_ondisk.find(repop->v);
12009 if (p != waiting_for_ondisk.end()) {
12010 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
12011 for (auto& i : p->second) {
12012 rq.push_back(std::get<0>(i));
12013 }
12014 waiting_for_ondisk.erase(p);
12015 }
12016 }
12017
12018 remove_repop(repop);
12019 }
12020
12021 ceph_assert(repop_queue.empty());
12022
12023 if (requeue) {
12024 requeue_ops(rq);
12025 if (!waiting_for_ondisk.empty()) {
12026 for (auto& i : waiting_for_ondisk) {
12027 for (auto& j : i.second) {
12028 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12029 << " waiting on " << i.first << dendl;
12030 }
12031 }
12032 ceph_assert(waiting_for_ondisk.empty());
12033 }
12034 }
12035
12036 waiting_for_ondisk.clear();
12037 }
12038
12039 void PrimaryLogPG::on_flushed()
12040 {
12041 ceph_assert(flushes_in_progress > 0);
12042 flushes_in_progress--;
12043 if (flushes_in_progress == 0) {
12044 requeue_ops(waiting_for_flush);
12045 }
12046 if (!is_peered() || !is_primary()) {
12047 pair<hobject_t, ObjectContextRef> i;
12048 while (object_contexts.get_next(i.first, &i)) {
12049 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12050 }
12051 ceph_assert(object_contexts.empty());
12052 }
12053 }
12054
12055 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
12056 {
12057 dout(10) << __func__ << dendl;
12058
12059 // adjust info to backfill
12060 info.set_last_backfill(hobject_t());
12061 pg_log.reset_backfill();
12062 dirty_info = true;
12063
12064 // clear log
12065 PGLogEntryHandler rollbacker{this, t};
12066 pg_log.roll_forward(&rollbacker);
12067
12068 on_shutdown();
12069 }
12070
12071 void PrimaryLogPG::clear_async_reads()
12072 {
12073 dout(10) << __func__ << dendl;
12074 for(auto& i : in_progress_async_reads) {
12075 dout(10) << "clear ctx: "
12076 << "OpRequestRef " << i.first
12077 << " OpContext " << i.second
12078 << dendl;
12079 close_op_ctx(i.second);
12080 }
12081 }
12082
12083 void PrimaryLogPG::clear_cache()
12084 {
12085 object_contexts.clear();
12086 }
12087
12088 void PrimaryLogPG::on_shutdown()
12089 {
12090 dout(10) << __func__ << dendl;
12091
12092 // handles queue races
12093 deleting = true;
12094
12095 if (recovery_queued) {
12096 recovery_queued = false;
12097 osd->clear_queued_recovery(this);
12098 }
12099
12100 clear_scrub_reserved();
12101 scrub_clear_state();
12102
12103 unreg_next_scrub();
12104
12105 vector<ceph_tid_t> tids;
12106 cancel_copy_ops(false, &tids);
12107 cancel_flush_ops(false, &tids);
12108 cancel_proxy_ops(false, &tids);
12109 osd->objecter->op_cancel(tids, -ECANCELED);
12110
12111 apply_and_flush_repops(false);
12112 cancel_log_updates();
12113 // we must remove PGRefs, so do this prior to the release_backoffs() callers
12114 clear_backoffs();
12115 // clean up snap trim references
12116 snap_trimmer_machine.process_event(Reset());
12117
12118 pgbackend->on_change();
12119
12120 context_registry_on_change();
12121 object_contexts.clear();
12122
12123 clear_async_reads();
12124
12125 osd->remote_reserver.cancel_reservation(info.pgid);
12126 osd->local_reserver.cancel_reservation(info.pgid);
12127
12128 clear_primary_state();
12129 cancel_recovery();
12130
12131 if (is_primary()) {
12132 osd->clear_ready_to_merge(this);
12133 }
12134 }
12135
12136 void PrimaryLogPG::on_activate()
12137 {
12138 // all clean?
12139 if (needs_recovery()) {
12140 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12141 queue_peering_event(
12142 PGPeeringEventRef(
12143 std::make_shared<PGPeeringEvent>(
12144 get_osdmap_epoch(),
12145 get_osdmap_epoch(),
12146 DoRecovery())));
12147 } else if (needs_backfill()) {
12148 dout(10) << "activate queueing backfill" << dendl;
12149 queue_peering_event(
12150 PGPeeringEventRef(
12151 std::make_shared<PGPeeringEvent>(
12152 get_osdmap_epoch(),
12153 get_osdmap_epoch(),
12154 RequestBackfill())));
12155 } else {
12156 dout(10) << "activate all replicas clean, no recovery" << dendl;
12157 eio_errors_to_process = false;
12158 queue_peering_event(
12159 PGPeeringEventRef(
12160 std::make_shared<PGPeeringEvent>(
12161 get_osdmap_epoch(),
12162 get_osdmap_epoch(),
12163 AllReplicasRecovered())));
12164 }
12165
12166 publish_stats_to_osd();
12167
12168 if (!backfill_targets.empty()) {
12169 last_backfill_started = earliest_backfill();
12170 new_backfill = true;
12171 ceph_assert(!last_backfill_started.is_max());
12172 dout(5) << __func__ << ": bft=" << backfill_targets
12173 << " from " << last_backfill_started << dendl;
12174 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12175 i != backfill_targets.end();
12176 ++i) {
12177 dout(5) << "target shard " << *i
12178 << " from " << peer_info[*i].last_backfill
12179 << dendl;
12180 }
12181 }
12182
12183 hit_set_setup();
12184 agent_setup();
12185 }
12186
12187 void PrimaryLogPG::_on_new_interval()
12188 {
12189 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
12190 if (!pg_log.get_missing().may_include_deletes &&
12191 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
12192 pg_log.rebuild_missing_set_with_deletes(osd->store, ch, info);
12193 }
12194 ceph_assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
12195 }
12196
12197 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
12198 {
12199 dout(10) << __func__ << dendl;
12200
12201 if (hit_set && hit_set->insert_count() == 0) {
12202 dout(20) << " discarding empty hit_set" << dendl;
12203 hit_set_clear();
12204 }
12205
12206 if (recovery_queued) {
12207 recovery_queued = false;
12208 osd->clear_queued_recovery(this);
12209 }
12210
12211 // requeue everything in the reverse of the order in which it
12212 // should be reexamined.
12213 requeue_ops(waiting_for_peered);
12214 requeue_ops(waiting_for_flush);
12215 requeue_ops(waiting_for_active);
12216
12217 clear_scrub_reserved();
12218
12219 vector<ceph_tid_t> tids;
12220 cancel_copy_ops(is_primary(), &tids);
12221 cancel_flush_ops(is_primary(), &tids);
12222 cancel_proxy_ops(is_primary(), &tids);
12223 osd->objecter->op_cancel(tids, -ECANCELED);
12224
12225 // requeue object waiters
12226 for (auto& p : waiting_for_unreadable_object) {
12227 release_backoffs(p.first);
12228 }
12229 if (is_primary()) {
12230 requeue_object_waiters(waiting_for_unreadable_object);
12231 } else {
12232 waiting_for_unreadable_object.clear();
12233 }
12234 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12235 p != waiting_for_degraded_object.end();
12236 waiting_for_degraded_object.erase(p++)) {
12237 release_backoffs(p->first);
12238 if (is_primary())
12239 requeue_ops(p->second);
12240 else
12241 p->second.clear();
12242 finish_degraded_object(p->first);
12243 }
12244
12245 // requeues waiting_for_scrub
12246 scrub_clear_state();
12247
12248 for (auto p = waiting_for_blocked_object.begin();
12249 p != waiting_for_blocked_object.end();
12250 waiting_for_blocked_object.erase(p++)) {
12251 if (is_primary())
12252 requeue_ops(p->second);
12253 else
12254 p->second.clear();
12255 }
12256 for (auto i = callbacks_for_degraded_object.begin();
12257 i != callbacks_for_degraded_object.end();
12258 ) {
12259 finish_degraded_object((i++)->first);
12260 }
12261 ceph_assert(callbacks_for_degraded_object.empty());
12262
12263 if (is_primary()) {
12264 requeue_ops(waiting_for_cache_not_full);
12265 } else {
12266 waiting_for_cache_not_full.clear();
12267 }
12268 objects_blocked_on_cache_full.clear();
12269
12270 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12271 in_progress_async_reads.begin();
12272 i != in_progress_async_reads.end();
12273 in_progress_async_reads.erase(i++)) {
12274 close_op_ctx(i->second);
12275 if (is_primary())
12276 requeue_op(i->first);
12277 }
12278
12279 // this will requeue ops we were working on but didn't finish, and
12280 // any dups
12281 apply_and_flush_repops(is_primary());
12282 cancel_log_updates();
12283
12284 // do this *after* apply_and_flush_repops so that we catch any newly
12285 // registered watches.
12286 context_registry_on_change();
12287
12288 pgbackend->on_change_cleanup(t);
12289 scrubber.cleanup_store(t);
12290 pgbackend->on_change();
12291
12292 // clear snap_trimmer state
12293 snap_trimmer_machine.process_event(Reset());
12294
12295 debug_op_order.clear();
12296 unstable_stats.clear();
12297
12298 // we don't want to cache object_contexts through the interval change
12299 // NOTE: we actually assert that all currently live references are dead
12300 // by the time the flush for the next interval completes.
12301 object_contexts.clear();
12302
12303 // should have been cleared above by finishing all of the degraded objects
12304 ceph_assert(objects_blocked_on_degraded_snap.empty());
12305 }
12306
12307 void PrimaryLogPG::on_role_change()
12308 {
12309 dout(10) << __func__ << dendl;
12310 if (get_role() != 0 && hit_set) {
12311 dout(10) << " clearing hit set" << dendl;
12312 hit_set_clear();
12313 }
12314 }
12315
12316 void PrimaryLogPG::on_pool_change()
12317 {
12318 dout(10) << __func__ << dendl;
12319 // requeue cache full waiters just in case the cache_mode is
12320 // changing away from writeback mode. note that if we are not
12321 // active the normal requeuing machinery is sufficient (and properly
12322 // ordered).
12323 if (is_active() &&
12324 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12325 !waiting_for_cache_not_full.empty()) {
12326 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12327 << dendl;
12328 requeue_ops(waiting_for_cache_not_full);
12329 objects_blocked_on_cache_full.clear();
12330 }
12331 hit_set_setup();
12332 agent_setup();
12333 }
12334
12335 // clear state. called on recovery completion AND cancellation.
12336 void PrimaryLogPG::_clear_recovery_state()
12337 {
12338 missing_loc.clear();
12339 #ifdef DEBUG_RECOVERY_OIDS
12340 recovering_oids.clear();
12341 #endif
12342 last_backfill_started = hobject_t();
12343 set<hobject_t>::iterator i = backfills_in_flight.begin();
12344 while (i != backfills_in_flight.end()) {
12345 ceph_assert(recovering.count(*i));
12346 backfills_in_flight.erase(i++);
12347 }
12348
12349 list<OpRequestRef> blocked_ops;
12350 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12351 i != recovering.end();
12352 recovering.erase(i++)) {
12353 if (i->second) {
12354 i->second->drop_recovery_read(&blocked_ops);
12355 requeue_ops(blocked_ops);
12356 }
12357 }
12358 ceph_assert(backfills_in_flight.empty());
12359 pending_backfill_updates.clear();
12360 ceph_assert(recovering.empty());
12361 pgbackend->clear_recovery_state();
12362 }
12363
12364 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12365 {
12366 dout(20) << __func__ << ": " << soid << dendl;
12367 ceph_assert(recovering.count(soid));
12368 ObjectContextRef obc = recovering[soid];
12369 if (obc) {
12370 list<OpRequestRef> blocked_ops;
12371 obc->drop_recovery_read(&blocked_ops);
12372 requeue_ops(blocked_ops);
12373 }
12374 recovering.erase(soid);
12375 finish_recovery_op(soid);
12376 release_backoffs(soid);
12377 if (waiting_for_degraded_object.count(soid)) {
12378 dout(20) << " kicking degraded waiters on " << soid << dendl;
12379 requeue_ops(waiting_for_degraded_object[soid]);
12380 waiting_for_degraded_object.erase(soid);
12381 }
12382 if (waiting_for_unreadable_object.count(soid)) {
12383 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12384 requeue_ops(waiting_for_unreadable_object[soid]);
12385 waiting_for_unreadable_object.erase(soid);
12386 }
12387 if (is_missing_object(soid))
12388 pg_log.set_last_requested(0); // get recover_primary to start over
12389 finish_degraded_object(soid);
12390 }
12391
12392 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12393 {
12394 /*
12395 * check that any peers we are planning to (or currently) pulling
12396 * objects from are dealt with.
12397 */
12398 missing_loc.check_recovery_sources(osdmap);
12399 pgbackend->check_recovery_sources(osdmap);
12400
12401 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
12402 i != peer_log_requested.end();
12403 ) {
12404 if (!osdmap->is_up(i->osd)) {
12405 dout(10) << "peer_log_requested removing " << *i << dendl;
12406 peer_log_requested.erase(i++);
12407 } else {
12408 ++i;
12409 }
12410 }
12411
12412 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
12413 i != peer_missing_requested.end();
12414 ) {
12415 if (!osdmap->is_up(i->osd)) {
12416 dout(10) << "peer_missing_requested removing " << *i << dendl;
12417 peer_missing_requested.erase(i++);
12418 } else {
12419 ++i;
12420 }
12421 }
12422 }
12423
12424 bool PrimaryLogPG::start_recovery_ops(
12425 uint64_t max,
12426 ThreadPool::TPHandle &handle,
12427 uint64_t *ops_started)
12428 {
12429 uint64_t& started = *ops_started;
12430 started = 0;
12431 bool work_in_progress = false;
12432 bool recovery_started = false;
12433 ceph_assert(is_primary());
12434 ceph_assert(is_peered());
12435 ceph_assert(!is_deleting());
12436
12437 ceph_assert(recovery_queued);
12438 recovery_queued = false;
12439
12440 if (!state_test(PG_STATE_RECOVERING) &&
12441 !state_test(PG_STATE_BACKFILLING)) {
12442 /* TODO: I think this case is broken and will make do_recovery()
12443 * unhappy since we're returning false */
12444 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
12445 return have_unfound();
12446 }
12447
12448 const auto &missing = pg_log.get_missing();
12449
12450 unsigned int num_missing = missing.num_missing();
12451 uint64_t num_unfound = get_num_unfound();
12452
12453 if (num_missing == 0) {
12454 info.last_complete = info.last_update;
12455 }
12456
12457 if (num_missing == num_unfound) {
12458 // All of the missing objects we have are unfound.
12459 // Recover the replicas.
12460 started = recover_replicas(max, handle, &recovery_started);
12461 }
12462 if (!started) {
12463 // We still have missing objects that we should grab from replicas.
12464 started += recover_primary(max, handle);
12465 }
12466 if (!started && num_unfound != get_num_unfound()) {
12467 // second chance to recover replicas
12468 started = recover_replicas(max, handle, &recovery_started);
12469 }
12470
12471 if (started || recovery_started)
12472 work_in_progress = true;
12473
12474 bool deferred_backfill = false;
12475 if (recovering.empty() &&
12476 state_test(PG_STATE_BACKFILLING) &&
12477 !backfill_targets.empty() && started < max &&
12478 missing.num_missing() == 0 &&
12479 waiting_on_backfill.empty()) {
12480 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12481 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12482 deferred_backfill = true;
12483 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12484 !is_degraded()) {
12485 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12486 deferred_backfill = true;
12487 } else if (!backfill_reserved) {
12488 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12489 if (!backfill_reserving) {
12490 dout(10) << "queueing RequestBackfill" << dendl;
12491 backfill_reserving = true;
12492 queue_peering_event(
12493 PGPeeringEventRef(
12494 std::make_shared<PGPeeringEvent>(
12495 get_osdmap_epoch(),
12496 get_osdmap_epoch(),
12497 RequestBackfill())));
12498 }
12499 deferred_backfill = true;
12500 } else {
12501 started += recover_backfill(max - started, handle, &work_in_progress);
12502 }
12503 }
12504
12505 dout(10) << " started " << started << dendl;
12506 osd->logger->inc(l_osd_rop, started);
12507
12508 if (!recovering.empty() ||
12509 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
12510 return !work_in_progress && have_unfound();
12511
12512 ceph_assert(recovering.empty());
12513 ceph_assert(recovery_ops_active == 0);
12514
12515 dout(10) << __func__ << " needs_recovery: "
12516 << missing_loc.get_needs_recovery()
12517 << dendl;
12518 dout(10) << __func__ << " missing_loc: "
12519 << missing_loc.get_missing_locs()
12520 << dendl;
12521 int unfound = get_num_unfound();
12522 if (unfound) {
12523 dout(10) << " still have " << unfound << " unfound" << dendl;
12524 return true;
12525 }
12526
12527 if (missing.num_missing() > 0) {
12528 // this shouldn't happen!
12529 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12530 << missing.num_missing() << ": " << missing.get_items();
12531 return false;
12532 }
12533
12534 if (needs_recovery()) {
12535 // this shouldn't happen!
12536 // We already checked num_missing() so we must have missing replicas
12537 osd->clog->error() << info.pgid
12538 << " Unexpected Error: recovery ending with missing replicas";
12539 return false;
12540 }
12541
12542 if (state_test(PG_STATE_RECOVERING)) {
12543 state_clear(PG_STATE_RECOVERING);
12544 state_clear(PG_STATE_FORCED_RECOVERY);
12545 if (needs_backfill()) {
12546 dout(10) << "recovery done, queuing backfill" << dendl;
12547 queue_peering_event(
12548 PGPeeringEventRef(
12549 std::make_shared<PGPeeringEvent>(
12550 get_osdmap_epoch(),
12551 get_osdmap_epoch(),
12552 RequestBackfill())));
12553 } else {
12554 dout(10) << "recovery done, no backfill" << dendl;
12555 eio_errors_to_process = false;
12556 state_clear(PG_STATE_FORCED_BACKFILL);
12557 queue_peering_event(
12558 PGPeeringEventRef(
12559 std::make_shared<PGPeeringEvent>(
12560 get_osdmap_epoch(),
12561 get_osdmap_epoch(),
12562 AllReplicasRecovered())));
12563 }
12564 } else { // backfilling
12565 state_clear(PG_STATE_BACKFILLING);
12566 state_clear(PG_STATE_FORCED_BACKFILL);
12567 state_clear(PG_STATE_FORCED_RECOVERY);
12568 dout(10) << "recovery done, backfill done" << dendl;
12569 eio_errors_to_process = false;
12570 queue_peering_event(
12571 PGPeeringEventRef(
12572 std::make_shared<PGPeeringEvent>(
12573 get_osdmap_epoch(),
12574 get_osdmap_epoch(),
12575 Backfilled())));
12576 }
12577
12578 return false;
12579 }
12580
12581 /**
12582 * start up to max recovery ops for objects missing on the primary.
12583 * return the number of ops started.
12584 */
12585 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12586 {
12587 ceph_assert(is_primary());
12588
12589 const auto &missing = pg_log.get_missing();
12590
12591 dout(10) << __func__ << " recovering " << recovering.size()
12592 << " in pg,"
12593 << " missing " << missing << dendl;
12594
12595 dout(25) << __func__ << " " << missing.get_items() << dendl;
12596
12597 // look at log!
12598 pg_log_entry_t *latest = 0;
12599 unsigned started = 0;
12600 int skipped = 0;
12601
12602 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12603 map<version_t, hobject_t>::const_iterator p =
12604 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
12605 while (p != missing.get_rmissing().end()) {
12606 handle.reset_tp_timeout();
12607 hobject_t soid;
12608 version_t v = p->first;
12609
12610 auto it_objects = pg_log.get_log().objects.find(p->second);
12611 if (it_objects != pg_log.get_log().objects.end()) {
12612 latest = it_objects->second;
12613 ceph_assert(latest->is_update() || latest->is_delete());
12614 soid = latest->soid;
12615 } else {
12616 latest = 0;
12617 soid = p->second;
12618 }
12619 const pg_missing_item& item = missing.get_items().find(p->second)->second;
12620 ++p;
12621
12622 hobject_t head = soid.get_head();
12623
12624 eversion_t need = item.need;
12625
12626 dout(10) << __func__ << " "
12627 << soid << " " << item.need
12628 << (missing.is_missing(soid) ? " (missing)":"")
12629 << (missing.is_missing(head) ? " (missing head)":"")
12630 << (recovering.count(soid) ? " (recovering)":"")
12631 << (recovering.count(head) ? " (recovering head)":"")
12632 << dendl;
12633
12634 if (latest) {
12635 switch (latest->op) {
12636 case pg_log_entry_t::CLONE:
12637 /*
12638 * Handling for this special case removed for now, until we
12639 * can correctly construct an accurate SnapSet from the old
12640 * one.
12641 */
12642 break;
12643
12644 case pg_log_entry_t::LOST_REVERT:
12645 {
12646 if (item.have == latest->reverting_to) {
12647 ObjectContextRef obc = get_object_context(soid, true);
12648
12649 if (obc->obs.oi.version == latest->version) {
12650 // I'm already reverting
12651 dout(10) << " already reverting " << soid << dendl;
12652 } else {
12653 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
12654 obc->obs.oi.version = latest->version;
12655
12656 ObjectStore::Transaction t;
12657 bufferlist b2;
12658 obc->obs.oi.encode(
12659 b2,
12660 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12661 ceph_assert(!pool.info.require_rollback());
12662 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
12663
12664 recover_got(soid, latest->version);
12665 missing_loc.add_location(soid, pg_whoami);
12666
12667 ++active_pushes;
12668
12669 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
12670 t.register_on_commit(new C_OSD_CommittedPushedObject(
12671 this,
12672 get_osdmap_epoch(),
12673 info.last_complete));
12674 osd->store->queue_transaction(ch, std::move(t));
12675 continue;
12676 }
12677 } else {
12678 /*
12679 * Pull the old version of the object. Update missing_loc here to have the location
12680 * of the version we want.
12681 *
12682 * This doesn't use the usual missing_loc paths, but that's okay:
12683 * - if we have it locally, we hit the case above, and go from there.
12684 * - if we don't, we always pass through this case during recovery and set up the location
12685 * properly.
12686 * - this way we don't need to mangle the missing code to be general about needing an old
12687 * version...
12688 */
12689 eversion_t alternate_need = latest->reverting_to;
12690 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
12691
12692 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
12693 p != peer_missing.end();
12694 ++p)
12695 if (p->second.is_missing(soid, need) &&
12696 p->second.get_items().at(soid).have == alternate_need) {
12697 missing_loc.add_location(soid, p->first);
12698 }
12699 dout(10) << " will pull " << alternate_need << " or " << need
12700 << " from one of " << missing_loc.get_locations(soid)
12701 << dendl;
12702 }
12703 }
12704 break;
12705 }
12706 }
12707
12708 if (!recovering.count(soid)) {
12709 if (recovering.count(head)) {
12710 ++skipped;
12711 } else {
12712 int r = recover_missing(
12713 soid, need, get_recovery_op_priority(), h);
12714 switch (r) {
12715 case PULL_YES:
12716 ++started;
12717 break;
12718 case PULL_HEAD:
12719 ++started; // fall through: the head was pulled, this object is still skipped
12720 case PULL_NONE:
12721 ++skipped;
12722 break;
12723 default:
12724 ceph_abort();
12725 }
12726 if (started >= max)
12727 break;
12728 }
12729 }
12730
12731 // only advance last_requested if we haven't skipped anything
12732 if (!skipped)
12733 pg_log.set_last_requested(v);
12734 }
12735
12736 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12737 return started;
12738 }
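// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the last_requested cursor used by recover_primary()
// above. rmissing is keyed by version, so a pass resumes at the cursor via
// lower_bound(), and the cursor only advances while nothing has been
// skipped. Names below are hypothetical.
#if 0
#include <cstdint>
#include <functional>
#include <map>
#include <string>

using version_t = uint64_t;

version_t request_some(
    const std::map<version_t, std::string>& rmissing,  // version -> oid
    version_t last_requested, unsigned max,
    const std::function<bool(const std::string&)>& busy) {
  unsigned started = 0, skipped = 0;
  for (auto p = rmissing.lower_bound(last_requested);
       p != rmissing.end() && started < max; ) {
    version_t v = p->first;
    const std::string& oid = p->second;
    ++p;
    if (busy(oid)) {
      ++skipped;
      continue;
    }
    ++started;
    if (!skipped)
      last_requested = v;  // safe: every version <= v has been handled
  }
  return last_requested;
}
#endif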
12739
12740 bool PrimaryLogPG::primary_error(
12741 const hobject_t& soid, eversion_t v)
12742 {
12743 pg_log.missing_add(soid, v, eversion_t());
12744 pg_log.set_last_requested(0);
12745 missing_loc.remove_location(soid, pg_whoami);
12746 bool uhoh = true;
12747 ceph_assert(!acting_recovery_backfill.empty());
12748 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
12749 i != acting_recovery_backfill.end();
12750 ++i) {
12751 if (*i == get_primary()) continue;
12752 pg_shard_t peer = *i;
12753 if (!peer_missing[peer].is_missing(soid, v)) {
12754 missing_loc.add_location(soid, peer);
12755 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
12756 << ", there should be a copy on shard " << peer << dendl;
12757 uhoh = false;
12758 }
12759 }
12760 if (uhoh)
12761 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
12762 else
12763 osd->clog->error() << info.pgid << " missing primary copy of " << soid
12764 << ", will try copies on " << missing_loc.get_locations(soid);
12765 return uhoh;
12766 }
12767
12768 int PrimaryLogPG::prep_object_replica_deletes(
12769 const hobject_t& soid, eversion_t v,
12770 PGBackend::RecoveryHandle *h,
12771 bool *work_started)
12772 {
12773 ceph_assert(is_primary());
12774 dout(10) << __func__ << ": on " << soid << dendl;
12775
12776 ObjectContextRef obc = get_object_context(soid, false);
12777 if (obc) {
12778 if (!obc->get_recovery_read()) {
12779 dout(20) << "replica delete delayed on " << soid
12780 << "; could not get rw_manager lock" << dendl;
12781 *work_started = true;
12782 return 0;
12783 } else {
12784 dout(20) << "replica delete got recovery read lock on " << soid
12785 << dendl;
12786 }
12787 }
12788
12789 start_recovery_op(soid);
12790 ceph_assert(!recovering.count(soid));
12791 if (!obc)
12792 recovering.insert(make_pair(soid, ObjectContextRef()));
12793 else
12794 recovering.insert(make_pair(soid, obc));
12795
12796 pgbackend->recover_delete_object(soid, v, h);
12797 return 1;
12798 }
12799
12800 int PrimaryLogPG::prep_object_replica_pushes(
12801 const hobject_t& soid, eversion_t v,
12802 PGBackend::RecoveryHandle *h,
12803 bool *work_started)
12804 {
12805 ceph_assert(is_primary());
12806 dout(10) << __func__ << ": on " << soid << dendl;
12807
12808 // NOTE: we know we will get a valid oloc off disk here.
12809 ObjectContextRef obc = get_object_context(soid, false);
12810 if (!obc) {
12811 primary_error(soid, v);
12812 return 0;
12813 }
12814
12815 if (!obc->get_recovery_read()) {
12816 dout(20) << "recovery delayed on " << soid
12817 << "; could not get rw_manager lock" << dendl;
12818 *work_started = true;
12819 return 0;
12820 } else {
12821 dout(20) << "recovery got recovery read lock on " << soid
12822 << dendl;
12823 }
12824
12825 start_recovery_op(soid);
12826 ceph_assert(!recovering.count(soid));
12827 recovering.insert(make_pair(soid, obc));
12828
12829 /* We need this in case there is an in-progress write on the object. In fact,
12830 * the only possible write is an update to the xattr due to a lost_revert --
12831 * a client write would be blocked since the object is degraded.
12832 * In almost all cases, therefore, this lock should be uncontended.
12833 */
12834 int r = pgbackend->recover_object(
12835 soid,
12836 v,
12837 ObjectContextRef(),
12838 obc, // has snapset context
12839 h);
12840 if (r < 0) {
12841 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
12842 primary_failed(soid);
12843 primary_error(soid, v);
12844 return 0;
12845 }
12846 return 1;
12847 }
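// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the try-lock-or-defer shape of get_recovery_read()
// as used above. Recovery must not block behind an in-flight write, so a
// contended lock defers the push rather than waiting. std::shared_mutex
// stands in for the object's rw manager; names are hypothetical.
#if 0
#include <shared_mutex>

bool try_start_push(std::shared_mutex& obj_lock, bool* work_started) {
  if (!obj_lock.try_lock_shared()) {
    *work_started = true;   // a writer holds it; retry on the next pass
    return false;
  }
  // ... queue the push; the completion callback would release the lock ...
  obj_lock.unlock_shared();
  return true;
}
#endif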
12848
12849 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
12850 bool *work_started)
12851 {
12852 dout(10) << __func__ << "(" << max << ")" << dendl;
12853 uint64_t started = 0;
12854
12855 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12856
12857 // this is FAR from an optimal recovery order. pretty lame, really.
12858 ceph_assert(!acting_recovery_backfill.empty());
12859 // choose replicas to recover: the replica with the shortest missing list
12860 // goes first, so we can bring it back to normal ASAP
12861 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
12862 async_by_num_missing;
12863 replicas_by_num_missing.reserve(acting_recovery_backfill.size() - 1);
12864 for (auto &p: acting_recovery_backfill) {
12865 if (p == get_primary()) {
12866 continue;
12867 }
12868 auto pm = peer_missing.find(p);
12869 ceph_assert(pm != peer_missing.end());
12870 auto nm = pm->second.num_missing();
12871 if (nm != 0) {
12872 if (async_recovery_targets.count(p)) {
12873 async_by_num_missing.push_back(make_pair(nm, p));
12874 } else {
12875 replicas_by_num_missing.push_back(make_pair(nm, p));
12876 }
12877 }
12878 }
12879 // sort by number of missing objects, in ascending order.
12880 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
12881 const std::pair<unsigned int, pg_shard_t> &rhs) {
12882 return lhs.first < rhs.first;
12883 };
12884 // acting goes first
12885 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
12886 // then async_recovery_targets
12887 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
12888 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
12889 async_by_num_missing.begin(), async_by_num_missing.end());
12890 for (auto &replica: replicas_by_num_missing) {
12891 pg_shard_t &peer = replica.second;
12892 ceph_assert(peer != get_primary());
12893 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
12894 ceph_assert(pm != peer_missing.end());
12895 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
12896 ceph_assert(pi != peer_info.end());
12897 size_t m_sz = pm->second.num_missing();
12898
12899 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
12900 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
12901
12902 // oldest first!
12903 const pg_missing_t &m(pm->second);
12904 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
12905 p != m.get_rmissing().end() && started < max;
12906 ++p) {
12907 handle.reset_tp_timeout();
12908 const hobject_t soid(p->second);
12909
12910 if (missing_loc.is_unfound(soid)) {
12911 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
12912 continue;
12913 }
12914
12915 if (soid > pi->second.last_backfill) {
12916 if (!recovering.count(soid)) {
12917 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
12918 derr << __func__ << ": object added to missing set for backfill, but "
12919 << "is not in recovering, error!" << dendl;
12920 ceph_abort();
12921 }
12922 continue;
12923 }
12924
12925 if (recovering.count(soid)) {
12926 dout(10) << __func__ << ": already recovering " << soid << dendl;
12927 continue;
12928 }
12929
12930 if (missing_loc.is_deleted(soid)) {
12931 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
12932 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12933 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
12934 continue;
12935 }
12936
12937 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
12938 dout(10) << __func__ << ": " << soid.get_head()
12939 << " still missing on primary" << dendl;
12940 continue;
12941 }
12942
12943 if (pg_log.get_missing().is_missing(soid)) {
12944 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
12945 continue;
12946 }
12947
12948 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
12949 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12950 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
12951 }
12952 }
12953
12954 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12955 return started;
12956 }
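// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the replica ordering built in recover_replicas()
// above. Shards are sorted by ascending missing-count so the
// closest-to-clean replica heals first, and acting shards are recovered
// before async recovery targets. Types are simplified and names are
// hypothetical.
#if 0
#include <algorithm>
#include <utility>
#include <vector>

using shard_t = int;
using by_missing_t = std::vector<std::pair<unsigned, shard_t>>;  // {num_missing, shard}

std::vector<shard_t> order_replicas(by_missing_t acting, by_missing_t async) {
  auto cmp = [](const auto& l, const auto& r) { return l.first < r.first; };
  std::sort(acting.begin(), acting.end(), cmp);
  std::sort(async.begin(), async.end(), cmp);
  acting.insert(acting.end(), async.begin(), async.end());  // acting goes first
  std::vector<shard_t> out;
  out.reserve(acting.size());
  for (const auto& p : acting)
    out.push_back(p.second);
  return out;
}
#endif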
12957
12958 hobject_t PrimaryLogPG::earliest_peer_backfill() const
12959 {
12960 hobject_t e = hobject_t::get_max();
12961 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
12962 i != backfill_targets.end();
12963 ++i) {
12964 pg_shard_t peer = *i;
12965 map<pg_shard_t, BackfillInterval>::const_iterator iter =
12966 peer_backfill_info.find(peer);
12967 ceph_assert(iter != peer_backfill_info.end());
12968 if (iter->second.begin < e)
12969 e = iter->second.begin;
12970 }
12971 return e;
12972 }
12973
12974 bool PrimaryLogPG::all_peer_done() const
12975 {
12976 // Primary hasn't got any more objects
12977 ceph_assert(backfill_info.empty());
12978
12979 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
12980 i != backfill_targets.end();
12981 ++i) {
12982 pg_shard_t bt = *i;
12983 map<pg_shard_t, BackfillInterval>::const_iterator piter =
12984 peer_backfill_info.find(bt);
12985 ceph_assert(piter != peer_backfill_info.end());
12986 const BackfillInterval& pbi = piter->second;
12987 // See if peer has more to process
12988 if (!pbi.extends_to_end() || !pbi.empty())
12989 return false;
12990 }
12991 return true;
12992 }
12993
12994 /**
12995 * recover_backfill
12996 *
12997 * Invariants:
12998 *
12999 * backfilled: fully pushed to replica or present in replica's missing set (both
13000 * our copy and theirs).
13001 *
13002 * All objects on a backfill_target in
13003 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13004 * objects have been actually deleted and all logically-valid objects are replicated.
13005 * There may be PG objects in this interval yet to be backfilled.
13006 *
13007 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13008 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13009 *
13010 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13011 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13012 * interval remain on the backfill target.
13013 *
13014 * For a backfill target, all objects <= peer_info[target].last_backfill
13015 * have been backfilled to target
13016 *
13017 * There *MAY* be missing/outdated objects between last_backfill_started and
13018 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13019 * io created objects since the last scan. For this reason, we call
13020 * update_range() again before continuing backfill.
13021 */
13022 uint64_t PrimaryLogPG::recover_backfill(
13023 uint64_t max,
13024 ThreadPool::TPHandle &handle, bool *work_started)
13025 {
13026 dout(10) << __func__ << " (" << max << ")"
13027 << " bft=" << backfill_targets
13028 << " last_backfill_started " << last_backfill_started
13029 << (new_backfill ? " new_backfill":"")
13030 << dendl;
13031 ceph_assert(!backfill_targets.empty());
13032
13033 // Initialize from prior backfill state
13034 if (new_backfill) {
13035 // on_activate() was called prior to getting here
13036 ceph_assert(last_backfill_started == earliest_backfill());
13037 new_backfill = false;
13038
13039 // initialize BackfillIntervals
13040 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13041 i != backfill_targets.end();
13042 ++i) {
13043 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
13044 }
13045 backfill_info.reset(last_backfill_started);
13046
13047 backfills_in_flight.clear();
13048 pending_backfill_updates.clear();
13049 }
13050
13051 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13052 i != backfill_targets.end();
13053 ++i) {
13054 dout(10) << "peer osd." << *i
13055 << " info " << peer_info[*i]
13056 << " interval " << peer_backfill_info[*i].begin
13057 << "-" << peer_backfill_info[*i].end
13058 << " " << peer_backfill_info[*i].objects.size() << " objects"
13059 << dendl;
13060 }
13061
13062 // update our local interval to cope with recent changes
13063 backfill_info.begin = last_backfill_started;
13064 update_range(&backfill_info, handle);
13065
13066 unsigned ops = 0;
13067 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13068 set<hobject_t> add_to_stat;
13069
13070 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13071 i != backfill_targets.end();
13072 ++i) {
13073 peer_backfill_info[*i].trim_to(
13074 std::max(peer_info[*i].last_backfill, last_backfill_started));
13075 }
13076 backfill_info.trim_to(last_backfill_started);
13077
13078 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13079 while (ops < max) {
13080 if (backfill_info.begin <= earliest_peer_backfill() &&
13081 !backfill_info.extends_to_end() && backfill_info.empty()) {
13082 hobject_t next = backfill_info.end;
13083 backfill_info.reset(next);
13084 backfill_info.end = hobject_t::get_max();
13085 update_range(&backfill_info, handle);
13086 backfill_info.trim();
13087 }
13088
13089 dout(20) << " my backfill interval " << backfill_info << dendl;
13090
13091 bool sent_scan = false;
13092 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13093 i != backfill_targets.end();
13094 ++i) {
13095 pg_shard_t bt = *i;
13096 BackfillInterval& pbi = peer_backfill_info[bt];
13097
13098 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13099 if (pbi.begin <= backfill_info.begin &&
13100 !pbi.extends_to_end() && pbi.empty()) {
13101 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13102 epoch_t e = get_osdmap_epoch();
13103 MOSDPGScan *m = new MOSDPGScan(
13104 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
13105 spg_t(info.pgid.pgid, bt.shard),
13106 pbi.end, hobject_t());
13107 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13108 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13109 waiting_on_backfill.insert(bt);
13110 sent_scan = true;
13111 }
13112 }
13113
13114 // Count simultaneous scans as a single op and let those complete
13115 if (sent_scan) {
13116 ops++;
13117 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13118 break;
13119 }
13120
13121 if (backfill_info.empty() && all_peer_done()) {
13122 dout(10) << " reached end for both local and all peers" << dendl;
13123 break;
13124 }
13125
13126 // Get the earliest object among the peers to operate on, and
13127 // the set of targets to which that object applies.
13128 hobject_t check = earliest_peer_backfill();
13129
13130 if (check < backfill_info.begin) {
13131
13132 set<pg_shard_t> check_targets;
13133 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13134 i != backfill_targets.end();
13135 ++i) {
13136 pg_shard_t bt = *i;
13137 BackfillInterval& pbi = peer_backfill_info[bt];
13138 if (pbi.begin == check)
13139 check_targets.insert(bt);
13140 }
13141 ceph_assert(!check_targets.empty());
13142
13143 dout(20) << " BACKFILL removing " << check
13144 << " from peers " << check_targets << dendl;
13145 for (set<pg_shard_t>::iterator i = check_targets.begin();
13146 i != check_targets.end();
13147 ++i) {
13148 pg_shard_t bt = *i;
13149 BackfillInterval& pbi = peer_backfill_info[bt];
13150 ceph_assert(pbi.begin == check);
13151
13152 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13153 pbi.pop_front();
13154 }
13155
13156 last_backfill_started = check;
13157
13158 // Don't increment ops here: deletions are cheap and, unlike
13159 // real recovery ops, are not replied to, and we can't
13160 // increment ops without requeueing ourselves
13161 // for recovery.
13162 } else {
13163 eversion_t& obj_v = backfill_info.objects.begin()->second;
13164
13165 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13166 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13167 i != backfill_targets.end();
13168 ++i) {
13169 pg_shard_t bt = *i;
13170 BackfillInterval& pbi = peer_backfill_info[bt];
13171 // Find all check peers that have the wrong version
13172 if (check == backfill_info.begin && check == pbi.begin) {
13173 if (pbi.objects.begin()->second != obj_v) {
13174 need_ver_targs.push_back(bt);
13175 } else {
13176 keep_ver_targs.push_back(bt);
13177 }
13178 } else {
13179 pg_info_t& pinfo = peer_info[bt];
13180
13181 // Only include peers whose backfill line we've caught up to;
13182 // otherwise they only appear to be missing this object
13183 // because their pbi.begin > backfill_info.begin.
13184 if (backfill_info.begin > pinfo.last_backfill)
13185 missing_targs.push_back(bt);
13186 else
13187 skip_targs.push_back(bt);
13188 }
13189 }
13190
13191 if (!keep_ver_targs.empty()) {
13192 // These peers have version obj_v
13193 dout(20) << " BACKFILL keeping " << check
13194 << " with ver " << obj_v
13195 << " on peers " << keep_ver_targs << dendl;
13196 //assert(!waiting_for_degraded_object.count(check));
13197 }
13198 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13199 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13200 ceph_assert(obc);
13201 if (obc->get_recovery_read()) {
13202 if (!need_ver_targs.empty()) {
13203 dout(20) << " BACKFILL replacing " << check
13204 << " with ver " << obj_v
13205 << " to peers " << need_ver_targs << dendl;
13206 }
13207 if (!missing_targs.empty()) {
13208 dout(20) << " BACKFILL pushing " << backfill_info.begin
13209 << " with ver " << obj_v
13210 << " to peers " << missing_targs << dendl;
13211 }
13212 vector<pg_shard_t> all_push = need_ver_targs;
13213 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13214
13215 handle.reset_tp_timeout();
13216 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13217 if (r < 0) {
13218 *work_started = true;
13219 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13220 break;
13221 }
13222 ops++;
13223 } else {
13224 *work_started = true;
13225 dout(20) << "backfill blocking on " << backfill_info.begin
13226 << "; could not get rw_manager lock" << dendl;
13227 break;
13228 }
13229 }
13230 dout(20) << "need_ver_targs=" << need_ver_targs
13231 << " keep_ver_targs=" << keep_ver_targs << dendl;
13232 dout(20) << "backfill_targets=" << backfill_targets
13233 << " missing_targs=" << missing_targs
13234 << " skip_targs=" << skip_targs << dendl;
13235
13236 last_backfill_started = backfill_info.begin;
13237 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13238 backfill_info.pop_front();
13239 vector<pg_shard_t> check_targets = need_ver_targs;
13240 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13241 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13242 i != check_targets.end();
13243 ++i) {
13244 pg_shard_t bt = *i;
13245 BackfillInterval& pbi = peer_backfill_info[bt];
13246 pbi.pop_front();
13247 }
13248 }
13249 }
13250
13251 hobject_t backfill_pos =
13252 std::min(backfill_info.begin, earliest_peer_backfill());
13253
13254 for (set<hobject_t>::iterator i = add_to_stat.begin();
13255 i != add_to_stat.end();
13256 ++i) {
13257 ObjectContextRef obc = get_object_context(*i, false);
13258 ceph_assert(obc);
13259 pg_stat_t stat;
13260 add_object_context_to_pg_stat(obc, &stat);
13261 pending_backfill_updates[*i] = stat;
13262 }
13263 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13264 for (unsigned i = 0; i < to_remove.size(); ++i) {
13265 handle.reset_tp_timeout();
13266 const hobject_t& oid = to_remove[i].get<0>();
13267 eversion_t v = to_remove[i].get<1>();
13268 pg_shard_t peer = to_remove[i].get<2>();
13269 MOSDPGBackfillRemove *m;
13270 auto it = reqs.find(peer);
13271 if (it != reqs.end()) {
13272 m = it->second;
13273 } else {
13274 m = reqs[peer] = new MOSDPGBackfillRemove(
13275 spg_t(info.pgid.pgid, peer.shard),
13276 get_osdmap_epoch());
13277 }
13278 m->ls.push_back(make_pair(oid, v));
13279
13280 if (oid <= last_backfill_started)
13281 pending_backfill_updates[oid]; // add empty stat!
13282 }
13283 for (auto p : reqs) {
13284 osd->send_message_osd_cluster(p.first.osd, p.second,
13285 get_osdmap_epoch());
13286 }
13287
13288 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13289
13290 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13291 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13292 i != backfills_in_flight.end();
13293 ++i) {
13294 dout(20) << *i << " is still in flight" << dendl;
13295 }
13296
13297 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13298 backfill_pos : *(backfills_in_flight.begin());
13299 hobject_t new_last_backfill = earliest_backfill();
13300 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13301 for (map<hobject_t, pg_stat_t>::iterator i =
13302 pending_backfill_updates.begin();
13303 i != pending_backfill_updates.end() &&
13304 i->first < next_backfill_to_complete;
13305 pending_backfill_updates.erase(i++)) {
13306 dout(20) << " pending_backfill_update " << i->first << dendl;
13307 ceph_assert(i->first > new_last_backfill);
13308 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
13309 j != backfill_targets.end();
13310 ++j) {
13311 pg_shard_t bt = *j;
13312 pg_info_t& pinfo = peer_info[bt];
13313 // Add stats to all peers that were missing the object
13314 if (i->first > pinfo.last_backfill)
13315 pinfo.stats.add(i->second);
13316 }
13317 new_last_backfill = i->first;
13318 }
13319 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13320
13321 ceph_assert(!pending_backfill_updates.empty() ||
13322 new_last_backfill == last_backfill_started);
13323 if (pending_backfill_updates.empty() &&
13324 backfill_pos.is_max()) {
13325 ceph_assert(backfills_in_flight.empty());
13326 new_last_backfill = backfill_pos;
13327 last_backfill_started = backfill_pos;
13328 }
13329 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13330
13331 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13332 // all the backfill targets. Otherwise, we will move last_backfill up on
13333 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
13334 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13335 i != backfill_targets.end();
13336 ++i) {
13337 pg_shard_t bt = *i;
13338 pg_info_t& pinfo = peer_info[bt];
13339
13340 if (new_last_backfill > pinfo.last_backfill) {
13341 pinfo.set_last_backfill(new_last_backfill);
13342 epoch_t e = get_osdmap_epoch();
13343 MOSDPGBackfill *m = NULL;
13344 if (pinfo.last_backfill.is_max()) {
13345 m = new MOSDPGBackfill(
13346 MOSDPGBackfill::OP_BACKFILL_FINISH,
13347 e,
13348 last_peering_reset,
13349 spg_t(info.pgid.pgid, bt.shard));
13350 // Use default priority here, must match sub_op priority
13351 /* pinfo.stats might be wrong if we did log-based recovery on the
13352 * backfilled portion in addition to continuing backfill.
13353 */
13354 pinfo.stats = info.stats;
13355 start_recovery_op(hobject_t::get_max());
13356 } else {
13357 m = new MOSDPGBackfill(
13358 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13359 e,
13360 last_peering_reset,
13361 spg_t(info.pgid.pgid, bt.shard));
13362 // Use default priority here, must match sub_op priority
13363 }
13364 m->last_backfill = pinfo.last_backfill;
13365 m->stats = pinfo.stats;
13366 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13367 dout(10) << " peer " << bt
13368 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13369 << " / " << info.stats.stats.sum.num_objects << dendl;
13370 }
13371 }
13372
13373 if (ops)
13374 *work_started = true;
13375 return ops;
13376 }
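// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the position arithmetic recover_backfill() relies
// on above. Everything strictly below min(backfill_info.begin, earliest
// peer begin) is fully backfilled, so that minimum bounds how far
// last_backfill may advance (in-flight pushes aside). Plain ints stand in
// for hobject_t, with INT_MAX as hobject_t::get_max(); names are
// hypothetical.
#if 0
#include <algorithm>
#include <climits>
#include <vector>

int backfill_pos(int local_begin, const std::vector<int>& peer_begins) {
  int e = INT_MAX;                    // earliest_peer_backfill()
  for (int b : peer_begins)
    e = std::min(e, b);
  return std::min(local_begin, e);    // everything below this is done
}
#endif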
13377
13378 int PrimaryLogPG::prep_backfill_object_push(
13379 hobject_t oid, eversion_t v,
13380 ObjectContextRef obc,
13381 vector<pg_shard_t> peers,
13382 PGBackend::RecoveryHandle *h)
13383 {
13384 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
13385 ceph_assert(!peers.empty());
13386
13387 backfills_in_flight.insert(oid);
13388 for (unsigned int i = 0 ; i < peers.size(); ++i) {
13389 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
13390 ceph_assert(bpm != peer_missing.end());
13391 bpm->second.add(oid, eversion_t(), eversion_t(), false);
13392 }
13393
13394 ceph_assert(!recovering.count(oid));
13395
13396 start_recovery_op(oid);
13397 recovering.insert(make_pair(oid, obc));
13398
13399 // We need to take the read_lock here in order to flush in-progress writes
13400 int r = pgbackend->recover_object(
13401 oid,
13402 v,
13403 ObjectContextRef(),
13404 obc,
13405 h);
13406 if (r < 0) {
13407 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
13408 primary_failed(oid);
13409 primary_error(oid, v);
13410 backfills_in_flight.erase(oid);
13411 missing_loc.add_missing(oid, v, eversion_t());
13412 }
13413 return r;
13414 }
13415
13416 void PrimaryLogPG::update_range(
13417 BackfillInterval *bi,
13418 ThreadPool::TPHandle &handle)
13419 {
13420 int local_min = cct->_conf->osd_backfill_scan_min;
13421 int local_max = cct->_conf->osd_backfill_scan_max;
13422
13423 if (bi->version < info.log_tail) {
13424 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13425 << dendl;
13426 bi->version = info.last_update;
13427 scan_range(local_min, local_max, bi, handle);
13428 }
13429
13430 if (bi->version >= projected_last_update) {
13431 dout(10) << __func__<< ": bi is current " << dendl;
13432 ceph_assert(bi->version == projected_last_update);
13433 } else if (bi->version >= info.log_tail) {
13434 if (pg_log.get_log().empty() && projected_log.empty()) {
13435 /* Because we don't move log_tail on split, the log might be
13436 * empty even if log_tail != last_update. However, the only
13437 * way to get here with an empty log is if log_tail is actually
13438 * eversion_t(), because otherwise the entry which changed
13439 * last_update since the last scan would have to be present.
13440 */
13441 ceph_assert(bi->version == eversion_t());
13442 return;
13443 }
13444
13445 dout(10) << __func__<< ": bi is old, (" << bi->version
13446 << ") can be updated with log to projected_last_update "
13447 << projected_last_update << dendl;
13448
13449 auto func = [&](const pg_log_entry_t &e) {
13450 dout(10) << __func__ << ": updating from version " << e.version
13451 << dendl;
13452 const hobject_t &soid = e.soid;
13453 if (soid >= bi->begin &&
13454 soid < bi->end) {
13455 if (e.is_update()) {
13456 dout(10) << __func__ << ": " << e.soid << " updated to version "
13457 << e.version << dendl;
13458 bi->objects.erase(e.soid);
13459 bi->objects.insert(
13460 make_pair(
13461 e.soid,
13462 e.version));
13463 } else if (e.is_delete()) {
13464 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13465 bi->objects.erase(e.soid);
13466 }
13467 }
13468 };
13469 dout(10) << "scanning pg log first" << dendl;
13470 pg_log.get_log().scan_log_after(bi->version, func);
13471 dout(10) << "scanning projected log" << dendl;
13472 projected_log.scan_log_after(bi->version, func);
13473 bi->version = projected_last_update;
13474 } else {
13475 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13476 }
13477 }
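// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the log-replay step of update_range() above.
// Entries inside the scanned interval overwrite or erase the recorded
// object version; everything else is ignored. Names are hypothetical.
#if 0
#include <map>
#include <string>

struct LogEntry {
  std::string oid;
  int version;
  bool is_delete;
};

void apply_to_interval(std::map<std::string, int>& objects,
                       const std::string& begin, const std::string& end,
                       const LogEntry& e) {
  if (e.oid < begin || !(e.oid < end))
    return;                      // outside [begin, end): nothing to do
  if (e.is_delete)
    objects.erase(e.oid);        // removed since the scan
  else
    objects[e.oid] = e.version;  // newest version wins
}
#endif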
13478
13479 void PrimaryLogPG::scan_range(
13480 int min, int max, BackfillInterval *bi,
13481 ThreadPool::TPHandle &handle)
13482 {
13483 ceph_assert(is_locked());
13484 dout(10) << "scan_range from " << bi->begin << dendl;
13485 bi->clear_objects();
13486
13487 vector<hobject_t> ls;
13488 ls.reserve(max);
13489 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
13490 ceph_assert(r >= 0);
13491 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13492 dout(20) << ls << dendl;
13493
13494 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13495 handle.reset_tp_timeout();
13496 ObjectContextRef obc;
13497 if (is_primary())
13498 obc = object_contexts.lookup(*p);
13499 if (obc) {
13500 bi->objects[*p] = obc->obs.oi.version;
13501 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13502 } else {
13503 bufferlist bl;
13504 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
13505
13506 /* If the object does not exist here, it must have been removed
13507 * between the collection_list_partial and here. This can happen
13508 * for the first item in the range, which is usually last_backfill.
13509 */
13510 if (r == -ENOENT)
13511 continue;
13512
13513 ceph_assert(r >= 0);
13514 object_info_t oi(bl);
13515 bi->objects[*p] = oi.version;
13516 dout(20) << " " << *p << " " << oi.version << dendl;
13517 }
13518 }
13519 }
13520
13521
13522 /** check_local
13523 *
13524 * verifies that stray objects have been deleted
13525 */
13526 void PrimaryLogPG::check_local()
13527 {
13528 dout(10) << __func__ << dendl;
13529
13530 ceph_assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
13531
13532 if (!cct->_conf->osd_debug_verify_stray_on_activate)
13533 return;
13534
13535 // just scan the log.
13536 set<hobject_t> did;
13537 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
13538 p != pg_log.get_log().log.rend();
13539 ++p) {
13540 if (did.count(p->soid))
13541 continue;
13542 did.insert(p->soid);
13543
13544 if (p->is_delete() && !is_missing_object(p->soid)) {
13545 dout(10) << " checking " << p->soid
13546 << " at " << p->version << dendl;
13547 struct stat st;
13548 int r = osd->store->stat(
13549 ch,
13550 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13551 &st);
13552 if (r != -ENOENT) {
13553 derr << __func__ << " " << p->soid << " exists, but should have been "
13554 << "deleted" << dendl;
13555 ceph_abort_msg("erroneously present object");
13556 }
13557 } else {
13558 // ignore old(+missing) objects
13559 }
13560 }
13561 }
13562
13563
13564
13565 // ===========================
13566 // hit sets
13567
13568 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13569 {
13570 ostringstream ss;
13571 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13572 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13573 info.pgid.ps(), info.pgid.pool(),
13574 cct->_conf->osd_hit_set_namespace);
13575 dout(20) << __func__ << " " << hoid << dendl;
13576 return hoid;
13577 }
13578
13579 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13580 utime_t end,
13581 bool using_gmt)
13582 {
13583 ostringstream ss;
13584 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13585 if (using_gmt) {
13586 start.gmtime(ss) << "_";
13587 end.gmtime(ss);
13588 } else {
13589 start.localtime(ss) << "_";
13590 end.localtime(ss);
13591 }
13592 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13593 info.pgid.ps(), info.pgid.pool(),
13594 cct->_conf->osd_hit_set_namespace);
13595 dout(20) << __func__ << " " << hoid << dendl;
13596 return hoid;
13597 }
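// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the archive naming scheme above, with
// std::put_time standing in for utime_t's gmtime()/localtime() stream
// helpers (the real timestamp format differs). The "1.0" pgid is a
// hypothetical placeholder.
#if 0
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>

std::string archive_name(std::time_t start, std::time_t end, bool use_gmt) {
  std::ostringstream ss;
  ss << "hit_set_" << "1.0" << "_archive_";
  std::tm ts = use_gmt ? *std::gmtime(&start) : *std::localtime(&start);
  std::tm te = use_gmt ? *std::gmtime(&end) : *std::localtime(&end);
  ss << std::put_time(&ts, "%Y-%m-%d %H:%M:%S") << "_"
     << std::put_time(&te, "%Y-%m-%d %H:%M:%S");
  return ss.str();
}
#endif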
13598
13599 void PrimaryLogPG::hit_set_clear()
13600 {
13601 dout(20) << __func__ << dendl;
13602 hit_set.reset();
13603 hit_set_start_stamp = utime_t();
13604 }
13605
13606 void PrimaryLogPG::hit_set_setup()
13607 {
13608 if (!is_active() ||
13609 !is_primary()) {
13610 hit_set_clear();
13611 return;
13612 }
13613
13614 if (is_active() && is_primary() &&
13615 (!pool.info.hit_set_count ||
13616 !pool.info.hit_set_period ||
13617 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
13618 hit_set_clear();
13619
13620 // only primary is allowed to remove all the hit set objects
13621 hit_set_remove_all();
13622 return;
13623 }
13624
13625 // FIXME: discard any previous data for now
13626 hit_set_create();
13627
13628 // include any writes we know about from the pg log. this doesn't
13629 // capture reads, but it is better than nothing!
13630 hit_set_apply_log();
13631 }
13632
13633 void PrimaryLogPG::hit_set_remove_all()
13634 {
13635 // If any archives are degraded we skip this
13636 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13637 p != info.hit_set.history.end();
13638 ++p) {
13639 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13640
13641 // Once we hit a degraded object just skip
13642 if (is_degraded_or_backfilling_object(aoid))
13643 return;
13644 if (write_blocked_by_scrub(aoid))
13645 return;
13646 }
13647
13648 if (!info.hit_set.history.empty()) {
13649 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
13650 ceph_assert(p != info.hit_set.history.rend());
13651 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13652 ceph_assert(!is_degraded_or_backfilling_object(oid));
13653 ObjectContextRef obc = get_object_context(oid, false);
13654 ceph_assert(obc);
13655
13656 OpContextUPtr ctx = simple_opc_create(obc);
13657 ctx->at_version = get_next_version();
13658 ctx->updated_hset_history = info.hit_set;
13659 utime_t now = ceph_clock_now();
13660 ctx->mtime = now;
13661 hit_set_trim(ctx, 0);
13662 simple_opc_submit(std::move(ctx));
13663 }
13664
13665 info.hit_set = pg_hit_set_history_t();
13666 if (agent_state) {
13667 agent_state->discard_hit_sets();
13668 }
13669 }
13670
13671 void PrimaryLogPG::hit_set_create()
13672 {
13673 utime_t now = ceph_clock_now();
13674 // make a copy of the params to modify
13675 HitSet::Params params(pool.info.hit_set_params);
13676
13677 dout(20) << __func__ << " " << params << dendl;
13678 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
13679 BloomHitSet::Params *p =
13680 static_cast<BloomHitSet::Params*>(params.impl.get());
13681
13682 // convert false positive rate so it holds up across the full period
13683 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
13684 if (p->get_fpp() <= 0.0)
13685 p->set_fpp(.01); // fpp cannot be zero!
13686
13687 // if we don't have a specified size, estimate target size based on the
13688 // previous bin!
13689 if (p->target_size == 0 && hit_set) {
13690 utime_t dur = now - hit_set_start_stamp;
13691 unsigned unique = hit_set->approx_unique_insert_count();
13692 dout(20) << __func__ << " previous set had approx " << unique
13693 << " unique items over " << dur << " seconds" << dendl;
13694 p->target_size = (double)unique * (double)pool.info.hit_set_period
13695 / (double)dur;
13696 }
13697 if (p->target_size <
13698 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
13699 p->target_size = cct->_conf->osd_hit_set_min_size;
13700
13701 if (p->target_size
13702 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
13703 p->target_size = cct->_conf->osd_hit_set_max_size;
13704
13705 p->seed = now.sec();
13706
13707 dout(10) << __func__ << " target_size " << p->target_size
13708 << " fpp " << p->get_fpp() << dendl;
13709 }
13710 hit_set.reset(new HitSet(params));
13711 hit_set_start_stamp = now;
13712 }
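// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the bloom parameter scaling in hit_set_create()
// above. The pool-wide false-positive budget is split across
// hit_set_count sets, and target_size is extrapolated from the previous
// set's unique-insert rate, then clamped. Names are hypothetical.
#if 0
#include <algorithm>
#include <cstdint>

struct BloomParams {
  double fpp;
  uint64_t target_size;
};

BloomParams scale_bloom(double pool_fpp, unsigned hit_set_count,
                        unsigned prev_unique, double prev_secs,
                        double period_secs,
                        uint64_t min_size, uint64_t max_size) {
  BloomParams p;
  p.fpp = pool_fpp / hit_set_count;
  if (p.fpp <= 0.0)
    p.fpp = 0.01;                                  // fpp cannot be zero
  // e.g. 1000 unique inserts over 300s with a 600s period -> 2000 slots
  p.target_size =
      static_cast<uint64_t>(prev_unique * period_secs / prev_secs);
  p.target_size = std::clamp(p.target_size, min_size, max_size);
  return p;
}
#endif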
13713
13714 /**
13715 * apply log entries to set
13716 *
13717 * this would only happen after peering, to at least capture writes
13718 * during an interval that was potentially lost.
13719 */
13720 bool PrimaryLogPG::hit_set_apply_log()
13721 {
13722 if (!hit_set)
13723 return false;
13724
13725 eversion_t to = info.last_update;
13726 eversion_t from = info.hit_set.current_last_update;
13727 if (to <= from) {
13728 dout(20) << __func__ << " no update" << dendl;
13729 return false;
13730 }
13731
13732 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
13733 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
13734 while (p != pg_log.get_log().log.rend() && p->version > to)
13735 ++p;
13736 while (p != pg_log.get_log().log.rend() && p->version > from) {
13737 hit_set->insert(p->soid);
13738 ++p;
13739 }
13740
13741 return true;
13742 }
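// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the half-open replay window (from, to] walked
// newest-first in hit_set_apply_log() above. Names are hypothetical.
#if 0
#include <list>
#include <set>
#include <string>

struct Entry {
  int version;
  std::string oid;
};

void replay(const std::list<Entry>& log,  // ordered oldest..newest
            int from, int to, std::set<std::string>& hits) {
  auto p = log.rbegin();
  while (p != log.rend() && p->version > to)
    ++p;                     // skip entries newer than the window
  while (p != log.rend() && p->version > from) {
    hits.insert(p->oid);     // record each write in (from, to]
    ++p;
  }
}
#endif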
13743
13744 void PrimaryLogPG::hit_set_persist()
13745 {
13746 dout(10) << __func__ << dendl;
13747 bufferlist bl;
13748 unsigned max = pool.info.hit_set_count;
13749
13750 utime_t now = ceph_clock_now();
13751 hobject_t oid;
13752
13753 // If any archives are degraded we skip this persist request
13754 // account for the additional entry being added below
13755 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13756 p != info.hit_set.history.end();
13757 ++p) {
13758 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13759
13760 // Once we hit a degraded object just skip further trim
13761 if (is_degraded_or_backfilling_object(aoid))
13762 return;
13763 if (write_blocked_by_scrub(aoid))
13764 return;
13765 }
13766
13767 // If backfill is in progress and we could possibly overlap with the
13768 // hit_set_* objects, back off. Since these all have
13769 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13770 // look just at that. This is necessary because our transactions
13771 // may include a modify of the new hit_set *and* a delete of the
13772 // old one, and this may span the backfill boundary.
13773 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
13774 p != backfill_targets.end();
13775 ++p) {
13776 ceph_assert(peer_info.count(*p));
13777 const pg_info_t& pi = peer_info[*p];
13778 if (pi.last_backfill == hobject_t() ||
13779 pi.last_backfill.get_hash() == info.pgid.ps()) {
13780 dout(10) << __func__ << " backfill target osd." << *p
13781 << " last_backfill has not progressed past pgid ps"
13782 << dendl;
13783 return;
13784 }
13785 }
13786
13787
13788 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
13789 new_hset.begin = hit_set_start_stamp;
13790 new_hset.end = now;
13791 oid = get_hit_set_archive_object(
13792 new_hset.begin,
13793 new_hset.end,
13794 new_hset.using_gmt);
13795
13796 // If writes to the current object are blocked by scrub we skip this persist request
13797 if (write_blocked_by_scrub(oid))
13798 return;
13799
13800 hit_set->seal();
13801 encode(*hit_set, bl);
13802 dout(20) << __func__ << " archive " << oid << dendl;
13803
13804 if (agent_state) {
13805 agent_state->add_hit_set(new_hset.begin, hit_set);
13806 uint32_t size = agent_state->hit_set_map.size();
13807 if (size >= pool.info.hit_set_count) {
13808 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
13809 }
13810 hit_set_in_memory_trim(size);
13811 }
13812
13813 ObjectContextRef obc = get_object_context(oid, true);
13814 OpContextUPtr ctx = simple_opc_create(obc);
13815
13816 ctx->at_version = get_next_version();
13817 ctx->updated_hset_history = info.hit_set;
13818 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
13819
13820 updated_hit_set_hist.current_last_update = info.last_update;
13821 new_hset.version = ctx->at_version;
13822
13823 updated_hit_set_hist.history.push_back(new_hset);
13824 hit_set_create();
13825
13826 // fabricate an object_info_t and SnapSet
13827 obc->obs.oi.version = ctx->at_version;
13828 obc->obs.oi.mtime = now;
13829 obc->obs.oi.size = bl.length();
13830 obc->obs.exists = true;
13831 obc->obs.oi.set_data_digest(bl.crc32c(-1));
13832
13833 ctx->new_obs = obc->obs;
13834
13835 ctx->new_snapset = obc->ssc->snapset;
13836
13837 ctx->delta_stats.num_objects++;
13838 ctx->delta_stats.num_objects_hit_set_archive++;
13839
13840 ctx->delta_stats.num_bytes += bl.length();
13841 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
13842
13843 bufferlist bss;
13844 encode(ctx->new_snapset, bss);
13845 bufferlist boi(sizeof(ctx->new_obs.oi));
13846 encode(ctx->new_obs.oi, boi,
13847 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13848
13849 ctx->op_t->create(oid);
13850 if (bl.length()) {
13851 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
13852 }
13853 map <string, bufferlist> attrs;
13854 attrs[OI_ATTR].claim(boi);
13855 attrs[SS_ATTR].claim(bss);
13856 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
13857 ctx->log.push_back(
13858 pg_log_entry_t(
13859 pg_log_entry_t::MODIFY,
13860 oid,
13861 ctx->at_version,
13862 eversion_t(),
13863 0,
13864 osd_reqid_t(),
13865 ctx->mtime,
13866 0)
13867 );
13868
13869 hit_set_trim(ctx, max);
13870
13871 simple_opc_submit(std::move(ctx));
13872 }
13873
13874 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
13875 {
13876 ceph_assert(ctx->updated_hset_history);
13877 pg_hit_set_history_t &updated_hit_set_hist =
13878 *(ctx->updated_hset_history);
13879 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
13880 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
13881 ceph_assert(p != updated_hit_set_hist.history.end());
13882 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13883
13884 ceph_assert(!is_degraded_or_backfilling_object(oid));
13885
13886 dout(20) << __func__ << " removing " << oid << dendl;
13887 ++ctx->at_version.version;
13888 ctx->log.push_back(
13889 pg_log_entry_t(pg_log_entry_t::DELETE,
13890 oid,
13891 ctx->at_version,
13892 p->version,
13893 0,
13894 osd_reqid_t(),
13895 ctx->mtime,
13896 0));
13897
13898 ctx->op_t->remove(oid);
13899 updated_hit_set_hist.history.pop_front();
13900
13901 ObjectContextRef obc = get_object_context(oid, false);
13902 ceph_assert(obc);
13903 --ctx->delta_stats.num_objects;
13904 --ctx->delta_stats.num_objects_hit_set_archive;
13905 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
13906 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
13907 }
13908 }
13909
13910 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
13911 {
13912 while (agent_state->hit_set_map.size() > max_in_memory) {
13913 agent_state->remove_oldest_hit_set();
13914 }
13915 }
13916
13917
13918 // =======================================
13919 // cache agent
13920
13921 void PrimaryLogPG::agent_setup()
13922 {
13923 ceph_assert(is_locked());
13924 if (!is_active() ||
13925 !is_primary() ||
13926 state_test(PG_STATE_PREMERGE) ||
13927 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13928 pool.info.tier_of < 0 ||
13929 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13930 agent_clear();
13931 return;
13932 }
13933 if (!agent_state) {
13934 agent_state.reset(new TierAgentState);
13935
13936 // choose random starting position
13937 agent_state->position = hobject_t();
13938 agent_state->position.pool = info.pgid.pool();
13939 agent_state->position.set_hash(pool.info.get_random_pg_position(
13940 info.pgid.pgid,
13941 rand()));
13942 agent_state->start = agent_state->position;
13943
13944 dout(10) << __func__ << " allocated new state, position "
13945 << agent_state->position << dendl;
13946 } else {
13947 dout(10) << __func__ << " keeping existing state" << dendl;
13948 }
13949
13950 if (info.stats.stats_invalid) {
13951 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13952 }
13953
13954 agent_choose_mode();
13955 }
13956
13957 void PrimaryLogPG::agent_clear()
13958 {
13959 agent_stop();
13960 agent_state.reset(NULL);
13961 }
13962
13963 // Return false if no objects were operated on since the start of the object hash space
13964 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13965 {
13966 lock();
13967 if (!agent_state) {
13968 dout(10) << __func__ << " no agent state, stopping" << dendl;
13969 unlock();
13970 return true;
13971 }
13972
13973 ceph_assert(!deleting);
13974
13975 if (agent_state->is_idle()) {
13976 dout(10) << __func__ << " idle, stopping" << dendl;
13977 unlock();
13978 return true;
13979 }
13980
13981 osd->logger->inc(l_osd_agent_wake);
13982
13983 dout(10) << __func__
13984 << " max " << start_max
13985 << ", flush " << agent_state->get_flush_mode_name()
13986 << ", evict " << agent_state->get_evict_mode_name()
13987 << ", pos " << agent_state->position
13988 << dendl;
13989 ceph_assert(is_primary());
13990 ceph_assert(is_active());
13991
13992 agent_load_hit_sets();
13993
13994 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13995 ceph_assert(base_pool);
13996
13997 int ls_min = 1;
13998 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13999
14000 // list some objects. this conveniently lists clones (oldest to
14001 // newest) before heads... the same order we want to flush in.
14002 //
14003 // NOTE: do not flush the Sequencer. We will assume that the
14004 // listing we get back is imprecise.
14005 vector<hobject_t> ls;
14006 hobject_t next;
14007 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
14008 &ls, &next);
14009 ceph_assert(r >= 0);
14010 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
14011 int started = 0;
14012 for (vector<hobject_t>::iterator p = ls.begin();
14013 p != ls.end();
14014 ++p) {
14015 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
14016 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
14017 osd->logger->inc(l_osd_agent_skip);
14018 continue;
14019 }
14020 if (is_degraded_or_backfilling_object(*p)) {
14021 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
14022 osd->logger->inc(l_osd_agent_skip);
14023 continue;
14024 }
14025 if (is_missing_object(p->get_head())) {
14026 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
14027 osd->logger->inc(l_osd_agent_skip);
14028 continue;
14029 }
14030 ObjectContextRef obc = get_object_context(*p, false, NULL);
14031 if (!obc) {
14032 // we didn't flush; we may miss something here.
14033 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
14034 osd->logger->inc(l_osd_agent_skip);
14035 continue;
14036 }
14037 if (!obc->obs.exists) {
14038 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
14039 osd->logger->inc(l_osd_agent_skip);
14040 continue;
14041 }
14042 if (range_intersects_scrub(obc->obs.oi.soid,
14043 obc->obs.oi.soid.get_head())) {
14044 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14045 osd->logger->inc(l_osd_agent_skip);
14046 continue;
14047 }
14048 if (obc->is_blocked()) {
14049 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14050 osd->logger->inc(l_osd_agent_skip);
14051 continue;
14052 }
14053 if (obc->is_request_pending()) {
14054 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
14055 osd->logger->inc(l_osd_agent_skip);
14056 continue;
14057 }
14058
14059 // be careful flushing omap to an EC pool.
14060 if (!base_pool->supports_omap() &&
14061 obc->obs.oi.is_omap()) {
14062 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14063 osd->logger->inc(l_osd_agent_skip);
14064 continue;
14065 }
14066
14067 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14068 agent_maybe_evict(obc, false))
14069 ++started;
14070 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14071 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14072 ++started;
14073 --agent_flush_quota;
14074 }
14075 if (started >= start_max) {
14076 // If finishing early, set "next" to the next object
14077 if (++p != ls.end())
14078 next = *p;
14079 break;
14080 }
14081 }
14082
14083 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14084 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14085 agent_state->hist_age = 0;
14086 agent_state->temp_hist.decay();
14087 }
14088
14089 // Total objects operated on so far
14090 int total_started = agent_state->started + started;
14091 bool need_delay = false;
14092
14093 dout(20) << __func__ << " start pos " << agent_state->position
14094 << " next start pos " << next
14095 << " started " << total_started << dendl;
14096
14097 // See if we've made a full pass over the object hash space
14098 // This might check at most ls_max objects a second time to notice that
14099 // we've checked every object at least once.
14100 if (agent_state->position < agent_state->start &&
14101 next >= agent_state->start) {
14102 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14103 if (total_started == 0)
14104 need_delay = true;
14105 else
14106 total_started = 0;
14107 agent_state->start = next;
14108 }
14109 agent_state->started = total_started;
14110
14111 // See if we are starting from the beginning
14112 if (next.is_max())
14113 agent_state->position = hobject_t();
14114 else
14115 agent_state->position = next;
14116
14117 // Discard old in-memory HitSets
14118 hit_set_in_memory_trim(pool.info.hit_set_count);
14119
14120 if (need_delay) {
14121 ceph_assert(agent_state->delaying == false);
14122 agent_delay();
14123 unlock();
14124 return false;
14125 }
14126 agent_choose_mode();
14127 unlock();
14128 return true;
14129 }
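// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the lap accounting at the end of agent_work()
// above. The agent walks the hashed keyspace circularly; a lap completes
// when the cursor was below the recorded start and the next position
// reaches or passes it, and a lap that started zero operations asks for a
// delay instead of spinning. Names are hypothetical.
#if 0
struct AgentPos {
  unsigned position = 0;  // where this call began
  unsigned start = 0;     // where the current lap began
  unsigned started = 0;   // ops started so far this lap
};

// Returns true when the agent should delay before the next pass.
bool account_pass(AgentPos& s, unsigned next, unsigned started_now) {
  unsigned total = s.started + started_now;
  if (s.position < s.start && next >= s.start) {  // wrapped past start
    s.start = next;
    if (total == 0)
      return true;                                // full idle lap: delay
    total = 0;                                    // begin a new lap
  }
  s.started = total;
  return false;
}
#endif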
14130
14131 void PrimaryLogPG::agent_load_hit_sets()
14132 {
14133 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14134 return;
14135 }
14136
14137 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14138 dout(10) << __func__ << dendl;
14139 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
14140 p != info.hit_set.history.end(); ++p) {
14141 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14142 dout(10) << __func__ << " loading " << p->begin << "-"
14143 << p->end << dendl;
14144 if (!pool.info.is_replicated()) {
14145 // FIXME: EC not supported here yet
14146 derr << __func__ << " on non-replicated pool" << dendl;
14147 break;
14148 }
14149
14150 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14151 if (is_unreadable_object(oid)) {
14152 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14153 break;
14154 }
14155
14156 ObjectContextRef obc = get_object_context(oid, false);
14157 if (!obc) {
14158 derr << __func__ << ": could not load hitset " << oid << dendl;
14159 break;
14160 }
14161
14162 bufferlist bl;
14163 {
14164 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14165 ceph_assert(r >= 0);
14166 }
14167 HitSetRef hs(new HitSet);
14168 bufferlist::const_iterator pbl = bl.begin();
14169 decode(*hs, pbl);
14170 agent_state->add_hit_set(p->begin.sec(), hs);
14171 }
14172 }
14173 }
14174 }
14175
14176 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14177 {
14178 if (!obc->obs.oi.is_dirty()) {
14179 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14180 osd->logger->inc(l_osd_agent_skip);
14181 return false;
14182 }
14183 if (obc->obs.oi.is_cache_pinned()) {
14184 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14185 osd->logger->inc(l_osd_agent_skip);
14186 return false;
14187 }
14188
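  // Age is judged against local_mtime when available: it is stamped with
  // the OSD's own clock at write time, so agent decisions are not skewed
  // by possibly inaccurate client-supplied mtimes (mtime is only a
  // fallback).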
14189 utime_t now = ceph_clock_now();
14190 utime_t ob_local_mtime;
14191 if (obc->obs.oi.local_mtime != utime_t()) {
14192 ob_local_mtime = obc->obs.oi.local_mtime;
14193 } else {
14194 ob_local_mtime = obc->obs.oi.mtime;
14195 }
14196 bool evict_mode_full =
14197 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14198 if (!evict_mode_full &&
14199 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14200 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14201 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14202 osd->logger->inc(l_osd_agent_skip);
14203 return false;
14204 }
14205
14206 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14207 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14208 osd->logger->inc(l_osd_agent_skip);
14209 return false;
14210 }
14211
14212 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14213
14214 // FIXME: for now we flush anything dirty, regardless of what
14215 // distribution of ages we expect.
14216
14217 hobject_t oid = obc->obs.oi.soid;
14218 osd->agent_start_op(oid);
14219 // no need to capture a pg ref, can't outlive fop or ctx
14220 std::function<void()> on_flush = [this, oid]() {
14221 osd->agent_finish_op(oid);
14222 };
14223
14224 int result = start_flush(
14225 OpRequestRef(), obc, false, NULL,
14226 on_flush);
14227 if (result != -EINPROGRESS) {
14228 on_flush();
14229 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14230 << " with " << result << dendl;
14231 osd->logger->inc(l_osd_agent_skip);
14232 return false;
14233 }
14234
14235 osd->logger->inc(l_osd_agent_flush);
14236 return true;
14237 }
14238
14239 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14240 {
14241 const hobject_t& soid = obc->obs.oi.soid;
14242 if (!after_flush && obc->obs.oi.is_dirty()) {
14243 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14244 return false;
14245 }
14246 if (!obc->obs.oi.watchers.empty()) {
14247 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14248 return false;
14249 }
14250 if (obc->is_blocked()) {
14251 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14252 return false;
14253 }
14254 if (obc->obs.oi.is_cache_pinned()) {
14255 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14256 return false;
14257 }
14258
14259 if (soid.snap == CEPH_NOSNAP) {
14260 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14261 if (result < 0) {
14262 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14263 return false;
14264 }
14265 }
14266
14267 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14268 // is this object older than cache_min_evict_age?
14269 utime_t now = ceph_clock_now();
14270 utime_t ob_local_mtime;
14271 if (obc->obs.oi.local_mtime != utime_t()) {
14272 ob_local_mtime = obc->obs.oi.local_mtime;
14273 } else {
14274 ob_local_mtime = obc->obs.oi.mtime;
14275 }
14276 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14277 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14278 osd->logger->inc(l_osd_agent_skip);
14279 return false;
14280 }
14281 // is this object old and/or cold enough?
14282 int temp = 0;
14283 uint64_t temp_upper = 0, temp_lower = 0;
14284 if (hit_set)
14285 agent_estimate_temp(soid, &temp);
14286 agent_state->temp_hist.add(temp);
14287 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14288
14289 dout(20) << __func__
14290 << " temp " << temp
14291 << " pos " << temp_lower << "-" << temp_upper
14292 << ", evict_effort " << agent_state->evict_effort
14293 << dendl;
14294 dout(30) << "agent_state:\n";
14295 Formatter *f = Formatter::create("");
14296 f->open_object_section("agent_state");
14297 agent_state->dump(f);
14298 f->close_section();
14299 f->flush(*_dout);
14300 delete f;
14301 *_dout << dendl;
14302
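    // Decision sketch: temp_upper is this object's upper percentile (in
    // millionths) within the temperature histogram, so 1000000 - temp_upper
    // is the fraction of recorded objects at least as warm. Keep the object
    // unless that fraction falls within evict_effort; e.g. with
    // temp_upper = 300000, the object survives unless evict_effort
    // exceeds 700000 (0.7).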
14303 if (1000000 - temp_upper >= agent_state->evict_effort)
14304 return false;
14305 }
14306
14307 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14308 OpContextUPtr ctx = simple_opc_create(obc);
14309
14310 auto null_op_req = OpRequestRef();
14311 if (!ctx->lock_manager.get_lock_type(
14312 ObjectContext::RWState::RWWRITE,
14313 obc->obs.oi.soid,
14314 obc,
14315 null_op_req)) {
14316 close_op_ctx(ctx.release());
14317 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14318 return false;
14319 }
14320
14321 osd->agent_start_evict_op();
14322 ctx->register_on_finish(
14323 [this]() {
14324 osd->agent_finish_evict_op();
14325 });
14326
14327 ctx->at_version = get_next_version();
14328 ceph_assert(ctx->new_obs.exists);
14329 int r = _delete_oid(ctx.get(), true, false);
14330 if (obc->obs.oi.is_omap())
14331 ctx->delta_stats.num_objects_omap--;
14332 ctx->delta_stats.num_evict++;
14333 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
14334 if (obc->obs.oi.is_dirty())
14335 --ctx->delta_stats.num_objects_dirty;
14336 ceph_assert(r == 0);
14337 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
14338 simple_opc_submit(std::move(ctx));
14339 osd->logger->inc(l_osd_tier_evict);
14340 osd->logger->inc(l_osd_agent_evict);
14341 return true;
14342 }
14343
14344 void PrimaryLogPG::agent_stop()
14345 {
14346 dout(20) << __func__ << dendl;
14347 if (agent_state && !agent_state->is_idle()) {
14348 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14349 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14350 osd->agent_disable_pg(this, agent_state->evict_effort);
14351 }
14352 }
14353
14354 void PrimaryLogPG::agent_delay()
14355 {
14356 dout(20) << __func__ << dendl;
14357 if (agent_state && !agent_state->is_idle()) {
14358 ceph_assert(agent_state->delaying == false);
14359 agent_state->delaying = true;
14360 osd->agent_disable_pg(this, agent_state->evict_effort);
14361 }
14362 }
14363
14364 void PrimaryLogPG::agent_choose_mode_restart()
14365 {
14366 dout(20) << __func__ << dendl;
14367 lock();
14368 if (agent_state && agent_state->delaying) {
14369 agent_state->delaying = false;
14370 agent_choose_mode(true);
14371 }
14372 unlock();
14373 }
14374
14375 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14376 {
14377 bool requeued = false;
14378 // Let delay play out
14379 if (agent_state->delaying) {
14380 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
14381 return requeued;
14382 }
14383
14384 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14385 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14386 unsigned evict_effort = 0;
14387
14388 if (info.stats.stats_invalid) {
14389 // idle; stats can't be trusted until we scrub.
14390 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14391 goto skip_calc;
14392 }
14393
14394 {
14395 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
14396 ceph_assert(divisor > 0);
14397
14398 // adjust (effective) user objects down based on the number
14399 // of HitSet objects, which should not count toward our total since
14400 // they cannot be flushed.
14401 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14402
14403 // also exclude omap objects if the backing pool is EC
14404 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14405 ceph_assert(base_pool);
14406 if (!base_pool->supports_omap())
14407 unflushable += info.stats.stats.sum.num_objects_omap;
14408
14409 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14410 if (num_user_objects > unflushable)
14411 num_user_objects -= unflushable;
14412 else
14413 num_user_objects = 0;
14414
14415 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14416 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14417 num_user_bytes -= unflushable_bytes;
14418 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14419 num_user_bytes += num_overhead_bytes;
14420
14421 // also reduce the num_dirty by num_objects_omap
14422 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14423 if (!base_pool->supports_omap()) {
14424 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14425 num_dirty -= info.stats.stats.sum.num_objects_omap;
14426 else
14427 num_dirty = 0;
14428 }
14429
14430 dout(10) << __func__
14431 << " flush_mode: "
14432 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14433 << " evict_mode: "
14434 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14435 << " num_objects: " << info.stats.stats.sum.num_objects
14436 << " num_bytes: " << info.stats.stats.sum.num_bytes
14437 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14438 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14439 << " num_dirty: " << num_dirty
14440 << " num_user_objects: " << num_user_objects
14441 << " num_user_bytes: " << num_user_bytes
14442 << " num_overhead_bytes: " << num_overhead_bytes
14443 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14444 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14445 << dendl;
14446
14447 // get dirty, full ratios
14448 uint64_t dirty_micro = 0;
14449 uint64_t full_micro = 0;
14450 if (pool.info.target_max_bytes && num_user_objects > 0) {
14451 uint64_t avg_size = num_user_bytes / num_user_objects;
14452 dirty_micro =
14453 num_dirty * avg_size * 1000000 /
14454 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14455 full_micro =
14456 num_user_objects * avg_size * 1000000 /
14457 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14458 }
14459 if (pool.info.target_max_objects > 0) {
14460 uint64_t dirty_objects_micro =
14461 num_dirty * 1000000 /
14462 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14463 if (dirty_objects_micro > dirty_micro)
14464 dirty_micro = dirty_objects_micro;
14465 uint64_t full_objects_micro =
14466 num_user_objects * 1000000 /
14467 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14468 if (full_objects_micro > full_micro)
14469 full_micro = full_objects_micro;
14470 }
14471 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14472 << " full " << ((float)full_micro / 1000000.0)
14473 << dendl;
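    // Worked example with illustrative numbers: target_max_bytes = 64 GiB
    // over divisor = 64 PGs gives a 1 GiB per-PG target. With
    // num_user_objects = 1000 and num_user_bytes = 512 MiB, avg_size is
    // 512 KiB; num_dirty = 400 then yields
    //   dirty_micro = 400 * 512 KiB * 1000000 / 1 GiB  ~= 195000 (0.195)
    //   full_micro  = 1000 * 512 KiB * 1000000 / 1 GiB ~= 488000 (0.488)
    // target_max_objects feeds the same ratios by object count, and the
    // larger of the byte- and object-based figures wins.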
14474
14475 // flush mode
14476 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14477 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14478 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14479 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14480 flush_target += flush_slop;
14481 flush_high_target += flush_slop;
14482 } else {
14483 flush_target -= std::min(flush_target, flush_slop);
14484 flush_high_target -= std::min(flush_high_target, flush_slop);
14485 }
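    // The slop produces hysteresis: e.g. with flush_target = 400000 (0.4)
    // and osd_agent_slop = 0.02, an idle agent only starts flushing once
    // dirty_micro passes 408000, while an already-active agent keeps
    // flushing until it drops below 392000, preventing rapid mode flapping.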
14486
14487 if (dirty_micro > flush_high_target) {
14488 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
14489 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
14490 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14491 }
14492
14493 // evict mode
14494 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14495 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14496 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14497 evict_target += evict_slop;
14498 else
14499 evict_target -= std::min(evict_target, evict_slop);
14500
14501 if (full_micro > 1000000) {
14502 // evict anything clean
14503 evict_mode = TierAgentState::EVICT_MODE_FULL;
14504 evict_effort = 1000000;
14505 } else if (full_micro > evict_target) {
14506 // set effort in [0..1] range based on where we are between the evict target and full
14507 evict_mode = TierAgentState::EVICT_MODE_SOME;
14508 uint64_t over = full_micro - evict_target;
14509 uint64_t span = 1000000 - evict_target;
14510 evict_effort = std::max(over * 1000000 / span,
14511 uint64_t(1000000.0 *
14512 cct->_conf->osd_agent_min_evict_effort));
14513
14514 // quantize effort to avoid too much reordering in the agent_queue.
14515 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
14516 ceph_assert(inc > 0);
14517 uint64_t was = evict_effort;
14518 evict_effort -= evict_effort % inc;
14519 if (evict_effort < inc)
14520 evict_effort = inc;
14521 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
14522 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
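      // Worked example: with evict_target = 800000 and full_micro = 850000,
      // over = 50000 and span = 200000, so the raw effort is 250000 (0.25),
      // already above a 0.1 osd_agent_min_evict_effort floor. Quantizing
      // with osd_agent_quantize_effort at 0.1 (inc = 100000) rounds it down
      // to 200000, keeping agent_queue ordering coarse and stable.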
14523 }
14524 }
14525
14526 skip_calc:
14527 bool old_idle = agent_state->is_idle();
14528 if (flush_mode != agent_state->flush_mode) {
14529 dout(5) << __func__ << " flush_mode "
14530 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14531 << " -> "
14532 << TierAgentState::get_flush_mode_name(flush_mode)
14533 << dendl;
14534 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14535 osd->agent_inc_high_count();
14536 info.stats.stats.sum.num_flush_mode_high = 1;
14537 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14538 info.stats.stats.sum.num_flush_mode_low = 1;
14539 }
14540 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14541 osd->agent_dec_high_count();
14542 info.stats.stats.sum.num_flush_mode_high = 0;
14543 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14544 info.stats.stats.sum.num_flush_mode_low = 0;
14545 }
14546 agent_state->flush_mode = flush_mode;
14547 }
14548 if (evict_mode != agent_state->evict_mode) {
14549 dout(5) << __func__ << " evict_mode "
14550 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14551 << " -> "
14552 << TierAgentState::get_evict_mode_name(evict_mode)
14553 << dendl;
14554 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14555 is_active()) {
14556 if (op)
14557 requeue_op(op);
14558 requeue_ops(waiting_for_flush);
14559 requeue_ops(waiting_for_active);
14560 requeue_ops(waiting_for_scrub);
14561 requeue_ops(waiting_for_cache_not_full);
14562 objects_blocked_on_cache_full.clear();
14563 requeued = true;
14564 }
14565 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14566 info.stats.stats.sum.num_evict_mode_some = 1;
14567 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14568 info.stats.stats.sum.num_evict_mode_full = 1;
14569 }
14570 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14571 info.stats.stats.sum.num_evict_mode_some = 0;
14572 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14573 info.stats.stats.sum.num_evict_mode_full = 0;
14574 }
14575 agent_state->evict_mode = evict_mode;
14576 }
14577 uint64_t old_effort = agent_state->evict_effort;
14578 if (evict_effort != agent_state->evict_effort) {
14579 dout(5) << __func__ << " evict_effort "
14580 << ((float)agent_state->evict_effort / 1000000.0)
14581 << " -> "
14582 << ((float)evict_effort / 1000000.0)
14583 << dendl;
14584 agent_state->evict_effort = evict_effort;
14585 }
14586
14587 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14588 // (including flush). This is probably fine (they should be
14589 // correlated) but it is not precisely correct.
14590 if (agent_state->is_idle()) {
14591 if (!restart && !old_idle) {
14592 osd->agent_disable_pg(this, old_effort);
14593 }
14594 } else {
14595 if (restart || old_idle) {
14596 osd->agent_enable_pg(this, agent_state->evict_effort);
14597 } else if (old_effort != agent_state->evict_effort) {
14598 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
14599 }
14600 }
14601 return requeued;
14602 }
14603
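// Estimate an object's "temperature" from hit-set membership: presence in
// the live hit_set is worth 1000000 outright, and each of the most recent
// hit_set_search_last_n archived HitSets that contains the object adds
// pool.info.get_grade(i) for its age rank i. For instance, an object seen
// in the live set and in the newest archived set scores
// 1000000 + get_grade(0).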
14604 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
14605 {
14606 ceph_assert(hit_set);
14607 ceph_assert(temp);
14608 *temp = 0;
14609 if (hit_set->contains(oid))
14610 *temp = 1000000;
14611 unsigned i = 0;
14612 int last_n = pool.info.hit_set_search_last_n;
14613 for (map<time_t,HitSetRef>::reverse_iterator p =
14614 agent_state->hit_set_map.rbegin(); last_n > 0 &&
14615 p != agent_state->hit_set_map.rend(); ++p, ++i) {
14616 if (p->second->contains(oid)) {
14617 *temp += pool.info.get_grade(i);
14618 --last_n;
14619 }
14620 }
14621 }
14622
14623 // Dup op detection
14624
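// already_complete(v): the repop_queue is ordered by version, so walk it
// in order; ops with an empty version (copy-from-temp) are skipped, and
// the scan can stop at the first entry past v. A duplicate client op at
// version v is complete iff every queued repop at or below v has been
// committed on all replicas (all_committed).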
14625 bool PrimaryLogPG::already_complete(eversion_t v)
14626 {
14627 dout(20) << __func__ << ": " << v << dendl;
14628 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14629 !i.end();
14630 ++i) {
14631 dout(20) << __func__ << ": " << **i << dendl;
14632 // skip copy from temp object ops
14633 if ((*i)->v == eversion_t()) {
14634 dout(20) << __func__ << ": " << **i
14635 << " version is empty" << dendl;
14636 continue;
14637 }
14638 if ((*i)->v > v) {
14639 dout(20) << __func__ << ": " << **i
14640 << " (*i)->v past v" << dendl;
14641 break;
14642 }
14643 if (!(*i)->all_committed) {
14644 dout(20) << __func__ << ": " << **i
14645 << " not committed, returning false"
14646 << dendl;
14647 return false;
14648 }
14649 }
14650 dout(20) << __func__ << ": returning true" << dendl;
14651 return true;
14652 }
14653
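// already_ack(v): note that the loop below only skips empty-version
// entries or breaks past v and never returns false, so the function
// answers true unconditionally; it is kept structurally parallel to
// already_complete().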
14654 bool PrimaryLogPG::already_ack(eversion_t v)
14655 {
14656 dout(20) << __func__ << ": " << v << dendl;
14657 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14658 !i.end();
14659 ++i) {
14660 // skip copy from temp object ops
14661 if ((*i)->v == eversion_t()) {
14662 dout(20) << __func__ << ": " << **i
14663 << " version is empty" << dendl;
14664 continue;
14665 }
14666 if ((*i)->v > v) {
14667 dout(20) << __func__ << ": " << **i
14668 << " (*i)->v past v" << dendl;
14669 break;
14670 }
14671 }
14672 dout(20) << __func__ << ": returning true" << dendl;
14673 return true;
14674 }
14675
14676
14677 // ==========================================================================================
14678 // SCRUB
14679
14680
14681 bool PrimaryLogPG::_range_available_for_scrub(
14682 const hobject_t &begin, const hobject_t &end)
14683 {
14684 pair<hobject_t, ObjectContextRef> next;
14685 next.second = object_contexts.lookup(begin);
14686 next.first = begin;
14687 bool more = true;
14688 while (more && next.first < end) {
14689 if (next.second && next.second->is_blocked()) {
14690 next.second->requeue_scrub_on_unblock = true;
14691 dout(10) << __func__ << ": scrub delayed, "
14692 << next.first << " is blocked"
14693 << dendl;
14694 return false;
14695 }
14696 more = object_contexts.get_next(next.first, &next);
14697 }
14698 return true;
14699 }
14700
14701 static bool doing_clones(const boost::optional<SnapSet> &snapset,
14702 const vector<snapid_t>::reverse_iterator &curclone) {
14703 return snapset && curclone != snapset.get().clones.rend();
14704 }
14705
14706 void PrimaryLogPG::log_missing(unsigned missing,
14707 const boost::optional<hobject_t> &head,
14708 LogChannelRef clog,
14709 const spg_t &pgid,
14710 const char *func,
14711 const char *mode,
14712 bool allow_incomplete_clones)
14713 {
14714 ceph_assert(head);
14715 if (allow_incomplete_clones) {
14716 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
14717 << " skipped " << missing << " clone(s) in cache tier" << dendl;
14718 } else {
14719 clog->info() << mode << " " << pgid << " " << head.get()
14720 << " : " << missing << " missing clone(s)";
14721 }
14722 }
14723
14724 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
14725 const boost::optional<SnapSet> &snapset,
14726 LogChannelRef clog,
14727 const spg_t &pgid,
14728 const char *mode,
14729 bool allow_incomplete_clones,
14730 boost::optional<snapid_t> target,
14731 vector<snapid_t>::reverse_iterator *curclone,
14732 inconsistent_snapset_wrapper &e)
14733 {
14734 ceph_assert(head);
14735 ceph_assert(snapset);
14736 unsigned missing = 0;
14737
14738 // NOTE: clones are iterated in descending order, hence the **curclone > target test here
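  // Example: with snapset clones [1, 2, 4, 6], *curclone visits 6, 4, 2, 1
  // in turn. Given target = 4, clone 6 is counted (and logged unless
  // incomplete clones are allowed) and the iterator stops at 4; with
  // target = all_clones (boost::none) every remaining clone is consumed.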
14739 hobject_t next_clone(head.get());
14740 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
14741 ++missing;
14742 // it is okay to be missing one or more clones in a cache tier.
14743 // skip higher-numbered clones in the list.
14744 if (!allow_incomplete_clones) {
14745 next_clone.snap = **curclone;
14746 clog->error() << mode << " " << pgid << " " << head.get()
14747 << " : expected clone " << next_clone << " " << missing
14748 << " missing";
14749 ++scrubber.shallow_errors;
14750 e.set_clone_missing(next_clone.snap);
14751 }
14752 // Clones are descending
14753 ++(*curclone);
14754 }
14755 return missing;
14756 }
14757
14758 /*
14759 * Validate consistency of the object info and snap sets.
14760 *
14761 * We are effectively comparing two lists. The main loop is over
14762 * scrubmap.objects, but each object is compared against multiple
14763 * snapset.clones lists; between clone lists we expect a head.
14764 *
14765 * Example
14766 *
14767 * objects expected
14768 * ======= =======
14769 * obj1 snap 1 head, unexpected obj1 snap 1
14770 * obj2 head head, match
14771 * [SnapSet clones 6 4 2 1]
14772 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14773 * obj2 snap 6 obj2 snap 6, match
14774 * obj2 snap 4 obj2 snap 4, match
14775 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14776 * [Snapset clones 3 1]
14777 * obj3 snap 3 obj3 snap 3 match
14778 * obj3 snap 1 obj3 snap 1 match
14779 * obj4 head head, match
14780 * [Snapset clones 4]
14781 * EOL obj4 snap 4, (expected)
14782 */
14783 void PrimaryLogPG::scrub_snapshot_metadata(
14784 ScrubMap &scrubmap,
14785 const map<hobject_t,
14786 pair<boost::optional<uint32_t>,
14787 boost::optional<uint32_t>>> &missing_digest)
14788 {
14789 dout(10) << __func__ << dendl;
14790
14791 bool repair = state_test(PG_STATE_REPAIR);
14792 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14793 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14794 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
14795
14796 // traverse in reverse order.
14797 boost::optional<hobject_t> head;
14798 boost::optional<SnapSet> snapset; // If this is initialized, head (above) will be too
14799 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
14800 unsigned missing = 0;
14801 inconsistent_snapset_wrapper soid_error, head_error;
14802 unsigned soid_error_count = 0;
14803
14804 for (map<hobject_t,ScrubMap::object>::reverse_iterator
14805 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
14806 const hobject_t& soid = p->first;
14807 ceph_assert(!soid.is_snapdir());
14808 soid_error = inconsistent_snapset_wrapper{soid};
14809 object_stat_sum_t stat;
14810 boost::optional<object_info_t> oi;
14811
14812 stat.num_objects++;
14813
14814 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14815 stat.num_objects_hit_set_archive++;
14816
14817 if (soid.is_snap()) {
14818 // it's a clone
14819 stat.num_object_clones++;
14820 }
14821
14822 // basic checks.
14823 if (p->second.attrs.count(OI_ATTR) == 0) {
14824 oi = boost::none;
14825 osd->clog->error() << mode << " " << info.pgid << " " << soid
14826 << " : no '" << OI_ATTR << "' attr";
14827 ++scrubber.shallow_errors;
14828 soid_error.set_info_missing();
14829 } else {
14830 bufferlist bv;
14831 bv.push_back(p->second.attrs[OI_ATTR]);
14832 try {
14833 oi = object_info_t(); // Initialize optional<> before decoding into it
14834 oi.get().decode(bv);
14835 } catch (buffer::error& e) {
14836 oi = boost::none;
14837 osd->clog->error() << mode << " " << info.pgid << " " << soid
14838 << " : can't decode '" << OI_ATTR << "' attr " << e.what();
14839 ++scrubber.shallow_errors;
14840 soid_error.set_info_corrupted();
14841 soid_error.set_info_missing(); // Not available too
14842 }
14843 }
14844
14845 if (oi) {
14846 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
14847 osd->clog->error() << mode << " " << info.pgid << " " << soid
14848 << " : on disk size (" << p->second.size
14849 << ") does not match object info size ("
14850 << oi->size << ") adjusted for ondisk to ("
14851 << pgbackend->be_get_ondisk_size(oi->size)
14852 << ")";
14853 soid_error.set_size_mismatch();
14854 ++scrubber.shallow_errors;
14855 }
14856
14857 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
14858
14859 // A clone's num_bytes will be added later, once we have the snapset
14860 if (!soid.is_snap()) {
14861 stat.num_bytes += oi->size;
14862 }
14863 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14864 stat.num_bytes_hit_set_archive += oi->size;
14865
14866 if (oi->is_dirty())
14867 ++stat.num_objects_dirty;
14868 if (oi->is_whiteout())
14869 ++stat.num_whiteouts;
14870 if (oi->is_omap())
14871 ++stat.num_objects_omap;
14872 if (oi->is_cache_pinned())
14873 ++stat.num_objects_pinned;
14874 if (oi->has_manifest())
14875 ++stat.num_objects_manifest;
14876 }
14877
14878 // Check for any problems while processing clones
14879 if (doing_clones(snapset, curclone)) {
14880 boost::optional<snapid_t> target;
14881 // Expecting an object with snap for current head
14882 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
14883
14884 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
14885 << soid << " while processing " << head.get() << dendl;
14886
14887 target = all_clones;
14888 } else {
14889 ceph_assert(soid.is_snap());
14890 target = soid.snap;
14891 }
14892
14893 // Log any clones we were expecting to be there up to target
14894 // This will add to missing, but will be a no-op if soid.snap == *curclone.
14895 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14896 pool.info.allow_incomplete_clones(), target, &curclone,
14897 head_error);
14898 }
14899 bool expected;
14900 // Check doing_clones() again in case we ran process_clones_to()
14901 if (doing_clones(snapset, curclone)) {
14902 // A head would have processed all clones above
14903 // or all greater than *curclone.
14904 ceph_assert(soid.is_snap() && *curclone <= soid.snap);
14905
14906 // After the processing above, the clone's snap should match the expected curclone
14907 expected = (*curclone == soid.snap);
14908 } else {
14909 // If we aren't doing clones any longer, then we expect a head
14910 expected = soid.has_snapset();
14911 }
14912 if (!expected) {
14913 // If we couldn't read the head's snapset, just ignore clones
14914 if (head && !snapset) {
14915 osd->clog->error() << mode << " " << info.pgid << " " << soid
14916 << " : clone ignored due to missing snapset";
14917 } else {
14918 osd->clog->error() << mode << " " << info.pgid << " " << soid
14919 << " : is an unexpected clone";
14920 }
14921 ++scrubber.shallow_errors;
14922 soid_error.set_headless();
14923 scrubber.store->add_snap_error(pool.id, soid_error);
14924 ++soid_error_count;
14925 if (head && soid.get_head() == head->get_head())
14926 head_error.set_clone(soid.snap);
14927 continue;
14928 }
14929
14930 // new snapset?
14931 if (soid.has_snapset()) {
14932
14933 if (missing) {
14934 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14935 pool.info.allow_incomplete_clones());
14936 }
14937
14938 // Save previous head error information
14939 if (head && (head_error.errors || soid_error_count))
14940 scrubber.store->add_snap_error(pool.id, head_error);
14941 // Set this as a new head object
14942 head = soid;
14943 missing = 0;
14944 head_error = soid_error;
14945 soid_error_count = 0;
14946
14947 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14948
14949 if (p->second.attrs.count(SS_ATTR) == 0) {
14950 osd->clog->error() << mode << " " << info.pgid << " " << soid
14951 << " : no '" << SS_ATTR << "' attr";
14952 ++scrubber.shallow_errors;
14953 snapset = boost::none;
14954 head_error.set_snapset_missing();
14955 } else {
14956 bufferlist bl;
14957 bl.push_back(p->second.attrs[SS_ATTR]);
14958 auto blp = bl.cbegin();
14959 try {
14960 snapset = SnapSet(); // Initialize optional<> before decoding into it
14961 decode(snapset.get(), blp);
14962 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14963 } catch (buffer::error& e) {
14964 snapset = boost::none;
14965 osd->clog->error() << mode << " " << info.pgid << " " << soid
14966 << " : can't decode '" << SS_ATTR << "' attr " << e.what();
14967 ++scrubber.shallow_errors;
14968 head_error.set_snapset_corrupted();
14969 }
14970 }
14971
14972 if (snapset) {
14973 // what will be next?
14974 curclone = snapset->clones.rbegin();
14975
14976 if (!snapset->clones.empty()) {
14977 dout(20) << " snapset " << snapset.get() << dendl;
14978 if (snapset->seq == 0) {
14979 osd->clog->error() << mode << " " << info.pgid << " " << soid
14980 << " : snaps.seq not set";
14981 ++scrubber.shallow_errors;
14982 head_error.set_snapset_error();
14983 }
14984 }
14985 }
14986 } else {
14987 ceph_assert(soid.is_snap());
14988 ceph_assert(head);
14989 ceph_assert(snapset);
14990 ceph_assert(soid.snap == *curclone);
14991
14992 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14993
14994 if (snapset->clone_size.count(soid.snap) == 0) {
14995 osd->clog->error() << mode << " " << info.pgid << " " << soid
14996 << " : is missing in clone_size";
14997 ++scrubber.shallow_errors;
14998 soid_error.set_size_mismatch();
14999 } else {
15000 if (oi && oi->size != snapset->clone_size[soid.snap]) {
15001 osd->clog->error() << mode << " " << info.pgid << " " << soid
15002 << " : size " << oi->size << " != clone_size "
15003 << snapset->clone_size[*curclone];
15004 ++scrubber.shallow_errors;
15005 soid_error.set_size_mismatch();
15006 }
15007
15008 if (snapset->clone_overlap.count(soid.snap) == 0) {
15009 osd->clog->error() << mode << " " << info.pgid << " " << soid
15010 << " : is missing in clone_overlap";
15011 ++scrubber.shallow_errors;
15012 soid_error.set_size_mismatch();
15013 } else {
15014 // This checking is based on get_clone_bytes(). The first 2 asserts
15015 // can't happen because we know we have a clone_size and
15016 // a clone_overlap. Now we check that the interval_set won't
15017 // cause the last assert.
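        // Example: clone_size = 4096 with overlap extents of length 3000
        // and 2000: the first subtraction leaves 1096, the second would
        // underflow (2000 > 1096), so the overlap is flagged as a bad
        // interval_set instead of tripping the assert inside
        // get_clone_bytes().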
15018 uint64_t size = snapset->clone_size.find(soid.snap)->second;
15019 const interval_set<uint64_t> &overlap =
15020 snapset->clone_overlap.find(soid.snap)->second;
15021 bool bad_interval_set = false;
15022 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
15023 i != overlap.end(); ++i) {
15024 if (size < i.get_len()) {
15025 bad_interval_set = true;
15026 break;
15027 }
15028 size -= i.get_len();
15029 }
15030
15031 if (bad_interval_set) {
15032 osd->clog->error() << mode << " " << info.pgid << " " << soid
15033 << " : bad interval_set in clone_overlap";
15034 ++scrubber.shallow_errors;
15035 soid_error.set_size_mismatch();
15036 } else {
15037 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
15038 }
15039 }
15040 }
15041
15042 // what's next?
15043 ++curclone;
15044 if (soid_error.errors) {
15045 scrubber.store->add_snap_error(pool.id, soid_error);
15046 ++soid_error_count;
15047 }
15048 }
15049
15050 scrub_cstat.add(stat);
15051 }
15052
15053 if (doing_clones(snapset, curclone)) {
15054 dout(10) << __func__ << " " << mode << " " << info.pgid
15055 << " No more objects while processing " << head.get() << dendl;
15056
15057 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
15058 pool.info.allow_incomplete_clones(), all_clones, &curclone,
15059 head_error);
15060 }
15061 // There could be missing clones found by the test above, or even
15062 // from before we dropped out of the loop for the last head.
15063 if (missing) {
15064 log_missing(missing, head, osd->clog, info.pgid, __func__,
15065 mode, pool.info.allow_incomplete_clones());
15066 }
15067 if (head && (head_error.errors || soid_error_count))
15068 scrubber.store->add_snap_error(pool.id, head_error);
15069
15070 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
15071 ceph_assert(!p->first.is_snapdir());
15072 dout(10) << __func__ << " recording digests for " << p->first << dendl;
15073 ObjectContextRef obc = get_object_context(p->first, false);
15074 if (!obc) {
15075 osd->clog->error() << info.pgid << " " << mode
15076 << " cannot get object context for object "
15077 << p->first;
15078 continue;
15079 } else if (obc->obs.oi.soid != p->first) {
15080 osd->clog->error() << info.pgid << " " << mode
15081 << " " << p->first
15082 << " : object has a valid oi attr with a mismatched name, "
15083 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
15084 continue;
15085 }
15086 OpContextUPtr ctx = simple_opc_create(obc);
15087 ctx->at_version = get_next_version();
15088 ctx->mtime = utime_t(); // do not update mtime
15089 if (p->second.first) {
15090 ctx->new_obs.oi.set_data_digest(*p->second.first);
15091 } else {
15092 ctx->new_obs.oi.clear_data_digest();
15093 }
15094 if (p->second.second) {
15095 ctx->new_obs.oi.set_omap_digest(*p->second.second);
15096 } else {
15097 ctx->new_obs.oi.clear_omap_digest();
15098 }
15099 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
15100
15101 ctx->register_on_success(
15102 [this]() {
15103 dout(20) << "updating scrub digest" << dendl;
15104 if (--scrubber.num_digest_updates_pending == 0) {
15105 requeue_scrub();
15106 }
15107 });
15108
15109 simple_opc_submit(std::move(ctx));
15110 ++scrubber.num_digest_updates_pending;
15111 }
15112
15113 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
15114 }
15115
15116 void PrimaryLogPG::_scrub_clear_state()
15117 {
15118 scrub_cstat = object_stat_collection_t();
15119 }
15120
15121 void PrimaryLogPG::_scrub_finish()
15122 {
15123 bool repair = state_test(PG_STATE_REPAIR);
15124 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
15125 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
15126
15127 if (info.stats.stats_invalid) {
15128 info.stats.stats = scrub_cstat;
15129 info.stats.stats_invalid = false;
15130
15131 if (agent_state)
15132 agent_choose_mode();
15133 }
15134
15135 dout(10) << mode << " got "
15136 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15137 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15138 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15139 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15140 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15141 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15142 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15143 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15144 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
15145 << dendl;
15146
15147 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
15148 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
15149 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
15150 !info.stats.dirty_stats_invalid) ||
15151 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
15152 !info.stats.omap_stats_invalid) ||
15153 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
15154 !info.stats.pin_stats_invalid) ||
15155 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
15156 !info.stats.hitset_stats_invalid) ||
15157 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
15158 !info.stats.hitset_bytes_stats_invalid) ||
15159 (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
15160 !info.stats.manifest_stats_invalid) ||
15161 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
15162 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
15163 osd->clog->error() << info.pgid << " " << mode
15164 << " : stat mismatch, got "
15165 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15166 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15167 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15168 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15169 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15170 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15171 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
15172 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15173 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15174 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
15175 ++scrubber.shallow_errors;
15176
15177 if (repair) {
15178 ++scrubber.fixed;
15179 info.stats.stats = scrub_cstat;
15180 info.stats.dirty_stats_invalid = false;
15181 info.stats.omap_stats_invalid = false;
15182 info.stats.hitset_stats_invalid = false;
15183 info.stats.hitset_bytes_stats_invalid = false;
15184 info.stats.pin_stats_invalid = false;
15185 info.stats.manifest_stats_invalid = false;
15186 publish_stats_to_osd();
15187 share_pg_info();
15188 }
15189 }
15190 // Clear object context cache to get repair information
15191 if (repair)
15192 object_contexts.clear();
15193 }
15194
15195 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
15196 {
15197 return osd->check_osdmap_full(missing_on);
15198 }
15199
15200 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15201 {
15202 OpRequestRef op = ctx->op;
15203 // Only supports replicated pools
15204 ceph_assert(!pool.info.is_erasure());
15205 ceph_assert(is_primary());
15206
15207 dout(10) << __func__ << " " << soid
15208 << " peers osd.{" << acting_recovery_backfill << "}" << dendl;
15209
15210 if (!is_clean()) {
15211 block_for_clean(soid, op);
15212 return -EAGAIN;
15213 }
15214
15215 ceph_assert(!pg_log.get_missing().is_missing(soid));
15216 auto& oi = ctx->new_obs.oi;
15217 eversion_t v = oi.version;
15218
15219 missing_loc.add_missing(soid, v, eversion_t());
15220 if (primary_error(soid, v)) {
15221 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15222 // XXX: If we knew that no down OSD could contain this object, it
15223 // would be nice to return EIO here.
15224 // If a "never fail" flag were available, rbd could use it to NOT
15225 // return EIO until the object is marked lost.
15226
15227 // Drop through to save this op in case an OSD comes up with the object.
15228 }
15229
15230 // Restart the op after object becomes readable again
15231 waiting_for_unreadable_object[soid].push_back(op);
15232 op->mark_delayed("waiting for missing object");
15233
15234 if (!eio_errors_to_process) {
15235 eio_errors_to_process = true;
15236 ceph_assert(is_clean());
15237 state_set(PG_STATE_REPAIR);
15238 queue_peering_event(
15239 PGPeeringEventRef(
15240 std::make_shared<PGPeeringEvent>(
15241 get_osdmap_epoch(),
15242 get_osdmap_epoch(),
15243 DoRecovery())));
15244 } else {
15245 // A prior error must have already cleared clean state and queued recovery
15246 // or a map change has triggered re-peering.
15247 // We deliberately do not kick recovery inline via maybe_kick_recovery(soid);
15248 dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
15249 }
15250
15251 return -EAGAIN;
15252 }
15253
15254 /*---SnapTrimmer Logging---*/
15255 #undef dout_prefix
15256 #define dout_prefix pg->gen_prefix(*_dout)
15257
15258 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15259 {
15260 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15261 }
15262
15263 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15264 {
15265 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15266 }
15267
15268 /*---SnapTrimmer states---*/
15269 #undef dout_prefix
15270 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15271 << "SnapTrimmer state<" << get_state_name() << ">: ")
15272
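// Transition sketch for the states visible below: NotTrimming reacts to
// KickTrim and moves to WaitScrub (if a scrub is active) or Trimming;
// inside Trimming, WaitReservation advances to AwaitAsyncWork once
// SnapTrimReserved arrives, and AwaitAsyncWork either finishes the snap
// (back to NotTrimming) or issues trims and waits in WaitRepops.
// WaitRWLock covers the -ENOLCK path when an object's write lock is
// unavailable. (The state definitions live in PrimaryLogPG.h.)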
15273 /* NotTrimming */
15274 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15275 : my_base(ctx),
15276 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
15277 {
15278 context< SnapTrimmer >().log_enter(state_name);
15279 }
15280
15281 void PrimaryLogPG::NotTrimming::exit()
15282 {
15283 context< SnapTrimmer >().log_exit(state_name, enter_time);
15284 }
15285
15286 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15287 {
15288 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15289 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15290
15291 if (!(pg->is_primary() && pg->is_active())) {
15292 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15293 return discard_event();
15294 }
15295 if (!pg->is_clean() ||
15296 pg->snap_trimq.empty()) {
15297 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15298 return discard_event();
15299 }
15300 if (pg->scrubber.active) {
15301 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15302 return transit< WaitScrub >();
15303 } else {
15304 return transit< Trimming >();
15305 }
15306 }
15307
15308 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15309 {
15310 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15311 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15312
15313 pending = nullptr;
15314 if (!context< SnapTrimmer >().can_trim()) {
15315 post_event(KickTrim());
15316 return transit< NotTrimming >();
15317 }
15318
15319 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15320 ldout(pg->cct, 10) << "NotTrimming: trimming "
15321 << pg->snap_trimq.range_start()
15322 << dendl;
15323 return transit< AwaitAsyncWork >();
15324 }
15325
15326 /* AwaitAsyncWork */
15327 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15328 : my_base(ctx),
15329 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
15330 {
15331 auto *pg = context< SnapTrimmer >().pg;
15332 context< SnapTrimmer >().log_enter(state_name);
15333 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15334 pg->state_set(PG_STATE_SNAPTRIM);
15335 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15336 pg->publish_stats_to_osd();
15337 }
15338
15339 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15340 {
15341 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15342 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15343 auto &in_flight = context<Trimming>().in_flight;
15344 ceph_assert(in_flight.empty());
15345
15346 ceph_assert(pg->is_primary() && pg->is_active());
15347 if (!context< SnapTrimmer >().can_trim()) {
15348 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15349 post_event(KickTrim());
15350 return transit< NotTrimming >();
15351 }
15352
15353 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15354
15355 vector<hobject_t> to_trim;
15356 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15357 to_trim.reserve(max);
15358 int r = pg->snap_mapper.get_next_objects_to_trim(
15359 snap_to_trim,
15360 max,
15361 &to_trim);
15362 if (r != 0 && r != -ENOENT) {
15363 lderr(pg->cct) << "get_next_objects_to_trim returned "
15364 << cpp_strerror(r) << dendl;
15365 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15366 } else if (r == -ENOENT) {
15367 // Done!
15368 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15369
15370 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15371 << " to purged_snaps"
15372 << dendl;
15373 pg->info.purged_snaps.insert(snap_to_trim);
15374 pg->snap_trimq.erase(snap_to_trim);
15375 ldout(pg->cct, 10) << "purged_snaps now "
15376 << pg->info.purged_snaps << ", snap_trimq now "
15377 << pg->snap_trimq << dendl;
15378
15379 ObjectStore::Transaction t;
15380 pg->dirty_big_info = true;
15381 pg->write_if_dirty(t);
15382 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15383 ceph_assert(tr == 0);
15384
15385 pg->share_pg_info();
15386 post_event(KickTrim());
15387 return transit< NotTrimming >();
15388 }
15389 ceph_assert(!to_trim.empty());
15390
15391 for (auto &&object: to_trim) {
15392 // Get next
15393 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15394 OpContextUPtr ctx;
15395 int error = pg->trim_object(in_flight.empty(), object, &ctx);
15396 if (error) {
15397 if (error == -ENOLCK) {
15398 ldout(pg->cct, 10) << "could not get write lock on obj "
15399 << object << dendl;
15400 } else {
15401 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15402 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15403 }
15404 if (!in_flight.empty()) {
15405 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15406 return transit< WaitRepops >();
15407 }
15408 if (error == -ENOLCK) {
15409 ldout(pg->cct, 10) << "waiting for it to clear"
15410 << dendl;
15411 return transit< WaitRWLock >();
15412 } else {
15413 return transit< NotTrimming >();
15414 }
15415 }
15416
15417 in_flight.insert(object);
15418 ctx->register_on_success(
15419 [pg, object, &in_flight]() {
15420 ceph_assert(in_flight.find(object) != in_flight.end());
15421 in_flight.erase(object);
15422 if (in_flight.empty()) {
15423 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15424 pg->snap_trimmer_machine.process_event(Reset());
15425 } else {
15426 pg->snap_trimmer_machine.process_event(RepopsComplete());
15427 }
15428 }
15429 });
15430
15431 pg->simple_opc_submit(std::move(ctx));
15432 }
15433
15434 return transit< WaitRepops >();
15435 }
15436
15437 void PrimaryLogPG::setattr_maybe_cache(
15438 ObjectContextRef obc,
15439 PGTransaction *t,
15440 const string &key,
15441 bufferlist &val)
15442 {
15443 t->setattr(obc->obs.oi.soid, key, val);
15444 }
15445
15446 void PrimaryLogPG::setattrs_maybe_cache(
15447 ObjectContextRef obc,
15448 PGTransaction *t,
15449 map<string, bufferlist> &attrs)
15450 {
15451 t->setattrs(obc->obs.oi.soid, attrs);
15452 }
15453
15454 void PrimaryLogPG::rmattr_maybe_cache(
15455 ObjectContextRef obc,
15456 PGTransaction *t,
15457 const string &key)
15458 {
15459 t->rmattr(obc->obs.oi.soid, key);
15460 }
15461
15462 int PrimaryLogPG::getattr_maybe_cache(
15463 ObjectContextRef obc,
15464 const string &key,
15465 bufferlist *val)
15466 {
15467 if (pool.info.is_erasure()) {
15468 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15469 if (i != obc->attr_cache.end()) {
15470 if (val)
15471 *val = i->second;
15472 return 0;
15473 } else {
15474 return -ENODATA;
15475 }
15476 }
15477 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15478 }
15479
15480 int PrimaryLogPG::getattrs_maybe_cache(
15481 ObjectContextRef obc,
15482 map<string, bufferlist> *out)
15483 {
15484 int r = 0;
15485 ceph_assert(out);
15486 if (pool.info.is_erasure()) {
15487 *out = obc->attr_cache;
15488 } else {
15489 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15490 }
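  // User xattrs are stored on disk with a leading '_' prefix; keep only
  // those and strip the prefix, which also filters out internal attrs
  // such as the bare "_" object-info key (size == 1) and unprefixed keys
  // like the snapset attr.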
15491 map<string, bufferlist> tmp;
15492 for (map<string, bufferlist>::iterator i = out->begin();
15493 i != out->end();
15494 ++i) {
15495 if (i->first.size() > 1 && i->first[0] == '_')
15496 tmp[i->first.substr(1, i->first.size())].claim(i->second);
15497 }
15498 tmp.swap(*out);
15499 return r;
15500 }
15501
15502 bool PrimaryLogPG::check_failsafe_full() {
15503 return osd->check_failsafe_full(get_dpp());
15504 }
15505
15506 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15507 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15508
15509 #ifdef PG_DEBUG_REFS
15510 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15511 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15512 #endif
15513
15514 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15515 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }