// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

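/*
 * "Blessed" completion wrappers: each takes the PG lock before running
 * the wrapped callback and silently drops it if the PG has been reset
 * since the epoch at which the callback was created, so completions
 * from a prior interval never run against the new one.
 */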
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    if (r < 0)
      opcontext->async_read_result = r;
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
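// Queue the reads accumulated in pending_async_reads on the backend;
// OnReadComplete (above) funnels each result back into finish_read(),
// which completes the ctx once all in-flight reads have returned.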
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();
    pg->complete_read_ctx(async_read_result, this);
  }
}

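// Completion for copy-from: on success, resume executing the client's
// op context; on failure (other than a cancel, where the client will
// resend) reply with the error and close the op context.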
class CopyFromCallback: public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results;
  int retval;
  PrimaryLogPG::OpContext *ctx;
  explicit CopyFromCallback(PrimaryLogPG::OpContext *ctx_)
    : results(NULL),
      retval(0),
      ctx(ctx_) {}
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();
    retval = r;

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    }
    ctx->copy_cb = NULL;
    if (r < 0) {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  int get_result() {
    return retval;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    assert(obc);
    obc->obs.exists = true;
    obc->ondisk_write_lock();

    bool got = obc->get_recovery_read();
    assert(got);

    assert(recovering.count(obc->obs.oi.soid));
    recovering[obc->obs.oi.soid] = obc;
    obc->obs.oi = recovery_info.oi;  // may have been updated above


    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
    t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
    if (pg_log.get_missing().get_items().size() == 0) {
      requeue_ops(waiting_for_all_missing);
      waiting_for_all_missing.clear();
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

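// Called once an object has been pushed to every replica that needed
// it: drop the recovery read lock, requeue any waiters, and retire the
// object from the recovering/backfilling bookkeeping.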
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  // recover missing won't have had an obc, but it gets filled in
  // during on_local_recover
  assert(i->second);
  list<OpRequestRef> requeue_list;
  i->second->drop_recovery_read(&requeue_list);
  requeue_ops(requeue_list);

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

void PrimaryLogPG::wait_for_all_missing(OpRequestRef op)
{
  waiting_for_all_missing.push_back(op);
  op->mark_delayed("waiting for all missing");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

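// If the PG log has grown close to osd_max_pg_log_entries while the PG
// is degraded or recovering/backfilling, kick recovery of the oldest
// missing object (on the primary or any actingbackfill peer) so log
// trimming is not held up indefinitely.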
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILL |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

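// Evaluate a PGLS filter against an object: if the filter registered
// interest in an xattr, fetch it first and hand it to the filter.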
bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}


// ==========================================================

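// Handler for pg commands ("ceph pg <pgid> ...") routed to this PG:
// "query", "mark_unfound_lost", and "list_missing".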
int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const pg_missing_t &missing = pg_log.get_missing();
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}

// ==========================================================

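// Execute pg-scoped ops (PGLS/PGNLS listing, hit_set inspection, scrub
// error listing) directly; these never touch an individual object's
// context.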
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

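// Handle CEPH_OSD_OP_SCRUBLS: list the snapset or object errors the
// scrubber recorded for the current interval.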
int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

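// Pick a new pg_trim_to for the PG log: aim for osd_min_pg_log_entries
// (or osd_max_pg_log_entries while degraded/recovering/backfilling),
// never trim past min_last_complete_ondisk or the log's
// can_rollback_to bound, and only act once at least
// osd_pg_log_trim_min entries can go.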
void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILL |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

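// Handle a client's ack of a backoff: clamp the acked range to this
// PG's bounds and clear it from the session's backoff map.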
void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

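// Top-level dispatch for everything queued to this PG: apply ordering
// barriers first (waiting_for_map, in-flight flushes, peering), then
// pg-wide backoffs, then route the message by type.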
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

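// Return the smallest last_backfill across all backfill targets;
// objects at or below this bound exist on every backfill peer.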
hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

1800 /** do_op - do an op
1801 * pg lock will be held (if multithreaded)
1802 * osd_lock NOT held.
1803 */
1804 void PrimaryLogPG::do_op(OpRequestRef& op)
1805 {
1806 FUNCTRACE();
1807 // NOTE: take a non-const pointer here; we must be careful not to
1808 // change anything that will break other reads on m (operator<<).
1809 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1810 assert(m->get_type() == CEPH_MSG_OSD_OP);
1811 if (m->finish_decode()) {
1812 op->reset_desc(); // for TrackedOp
1813 m->clear_payload();
1814 }
1815
1816 dout(20) << __func__ << ": op " << *m << dendl;
1817
1818 hobject_t head = m->get_hobj();
1819 head.snap = CEPH_NOSNAP;
1820
1821 if (!info.pgid.pgid.contains(
1822 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1823 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1824 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1825 << std::hex << head.get_hash() << std::dec << dendl;
1826 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1827 << " op " << *m;
1828 assert(!cct->_conf->osd_debug_misdirected_ops);
1829 return;
1830 }
1831
1832 bool can_backoff =
1833 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1834 SessionRef session;
1835 if (can_backoff) {
1836 session = static_cast<Session*>(m->get_connection()->get_priv());
1837 if (!session.get()) {
1838 dout(10) << __func__ << " no session" << dendl;
1839 return;
1840 }
1841 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1842
1843 if (session->check_backoff(cct, info.pgid, head, m)) {
1844 return;
1845 }
1846 }
1847
1848 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1849 // not implemented.
1850 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1851 osd->reply_op_error(op, -EINVAL);
1852 return;
1853 }
1854
1855 if (op->rmw_flags == 0) {
1856 int r = osd->osd->init_op_flags(op);
1857 if (r) {
1858 osd->reply_op_error(op, r);
1859 return;
1860 }
1861 }
1862
1863 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1864 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1865 op->may_read() &&
1866 !(op->may_write() || op->may_cache())) {
1867 // balanced reads; any replica will do
1868 if (!(is_primary() || is_replica())) {
1869 osd->handle_misdirected_op(this, op);
1870 return;
1871 }
1872 } else {
1873 // normal case; must be primary
1874 if (!is_primary()) {
1875 osd->handle_misdirected_op(this, op);
1876 return;
1877 }
1878 }
1879
1880 if (!op_has_sufficient_caps(op)) {
1881 osd->reply_op_error(op, -EPERM);
1882 return;
1883 }
1884
1885 if (op->includes_pg_op()) {
1886 return do_pg_op(op);
1887 }
1888
1889 // object name too long?
1890 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1891 dout(4) << "do_op name is longer than "
1892 << cct->_conf->osd_max_object_name_len
1893 << " bytes" << dendl;
1894 osd->reply_op_error(op, -ENAMETOOLONG);
1895 return;
1896 }
1897 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1898 dout(4) << "do_op locator is longer than "
1899 << cct->_conf->osd_max_object_name_len
1900 << " bytes" << dendl;
1901 osd->reply_op_error(op, -ENAMETOOLONG);
1902 return;
1903 }
1904 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1905 dout(4) << "do_op namespace is longer than "
1906 << cct->_conf->osd_max_object_namespace_len
1907 << " bytes" << dendl;
1908 osd->reply_op_error(op, -ENAMETOOLONG);
1909 return;
1910 }
1911
1912 if (int r = osd->store->validate_hobject_key(head)) {
1913 dout(4) << "do_op object " << head << " invalid for backing store: "
1914 << r << dendl;
1915 osd->reply_op_error(op, r);
1916 return;
1917 }
1918
1919 // blacklisted?
1920 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1921 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1922 osd->reply_op_error(op, -EBLACKLISTED);
1923 return;
1924 }
1925
1926 // order this op as a write?
1927 bool write_ordered = op->rwordered();
1928
1929 // discard due to cluster full transition? (we discard any op that
1930 // originates before the cluster or pool is marked full; the client
1931 // will resend after the full flag is removed or if they expect the
1932 // op to succeed despite being full). The exceptions are FULL_FORCE and
1933 // FULL_TRY ops, which there is no reason to discard because they
1934 // bypass all full checks anyway. If this op isn't write-ordered,
1935 // we skip the check.
1936 // FIXME: we exclude mds writes for now.
1937 if (write_ordered && !(m->get_source().is_mds() ||
1938 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1939 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1940 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1941 dout(10) << __func__ << " discarding op sent before full " << m << " "
1942 << *m << dendl;
1943 return;
1944 }
1945 // The MDS should have stopped writing before this point.
1946 // We can't allow the OSD to become non-startable even if the MDS
1947 // could be writing as part of file removals.
1948 ostringstream ss;
1949 if (write_ordered && osd->check_failsafe_full(ss)) {
1950 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1951 << ss.str()
1952 << dendl;
1953 return;
1954 }
1955 int64_t poolid = get_pgid().pool();
1956 if (op->may_write()) {
1957
1958 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1959 if (!pi) {
1960 return;
1961 }
1962
1963 // invalid?
1964 if (m->get_snapid() != CEPH_NOSNAP) {
1965 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1966 osd->reply_op_error(op, -EINVAL);
1967 return;
1968 }
1969
1970 // too big?
1971 if (cct->_conf->osd_max_write_size &&
1972 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1973 // journal can't hold commit!
1974 derr << "do_op msg data len " << m->get_data_len()
1975 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1976 << " on " << *m << dendl;
1977 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1978 return;
1979 }
1980 }
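// Editorial worked example: osd_max_write_size is expressed in MiB, so
// the << 20 above converts it to bytes before comparing against the
// message payload, e.g. with the common default of 90:
//
//   90 << 20 == 94371840   // 90 MiB
//
// so a single client write larger than that fails with -OSD_WRITETOOBIG
// instead of overrunning the journal.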
1981
1982 dout(10) << "do_op " << *m
1983 << (op->may_write() ? " may_write" : "")
1984 << (op->may_read() ? " may_read" : "")
1985 << (op->may_cache() ? " may_cache" : "")
1986 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
1987 << " flags " << ceph_osd_flag_string(m->get_flags())
1988 << dendl;
1989
1990 // missing object?
1991 if (is_unreadable_object(head)) {
1992 if (can_backoff &&
1993 (g_conf->osd_backoff_on_degraded ||
1994 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
1995 add_backoff(session, head, head);
1996 maybe_kick_recovery(head);
1997 } else {
1998 wait_for_unreadable_object(head, op);
1999 }
2000 return;
2001 }
2002
2003 // degraded object?
2004 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2005 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2006 add_backoff(session, head, head);
2007 } else {
2008 wait_for_degraded_object(head, op);
2009 }
2010 return;
2011 }
2012
2013 if (write_ordered &&
2014 scrubber.write_blocked_by_scrub(head)) {
2015 dout(20) << __func__ << ": waiting for scrub" << dendl;
2016 waiting_for_scrub.push_back(op);
2017 op->mark_delayed("waiting for scrub");
2018 return;
2019 }
2020
2021 // blocked on snap?
2022 map<hobject_t, snapid_t>::iterator blocked_iter =
2023 objects_blocked_on_degraded_snap.find(head);
2024 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2025 hobject_t to_wait_on(head);
2026 to_wait_on.snap = blocked_iter->second;
2027 wait_for_degraded_object(to_wait_on, op);
2028 return;
2029 }
2030 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2031 objects_blocked_on_snap_promotion.find(head);
2032 if (write_ordered &&
2033 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2034 wait_for_blocked_object(
2035 blocked_snap_promote_iter->second->obs.oi.soid,
2036 op);
2037 return;
2038 }
2039 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2040 block_write_on_full_cache(head, op);
2041 return;
2042 }
2043
2044 // missing snapdir?
2045 hobject_t snapdir = head.get_snapdir();
2046
2047 if (is_unreadable_object(snapdir)) {
2048 wait_for_unreadable_object(snapdir, op);
2049 return;
2050 }
2051
2052 // degraded object?
2053 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2054 wait_for_degraded_object(snapdir, op);
2055 return;
2056 }
2057
2058 // dup/resent?
2059 if (op->may_write() || op->may_cache()) {
2060 // warning: we will get back *a* request for this reqid, but not
2061 // necessarily the most recent. this happens with flush and
2062 // promote ops, but we can't possibly have both in our log while
2063 // the original request is still not stable on disk, so for our
2064 // purposes here it doesn't matter which one we get.
2065 eversion_t version;
2066 version_t user_version;
2067 int return_code = 0;
2068 bool got = check_in_progress_op(
2069 m->get_reqid(), &version, &user_version, &return_code);
2070 if (got) {
2071 dout(3) << __func__ << " dup " << m->get_reqid()
2072 << " version " << version << dendl;
2073 if (already_complete(version)) {
2074 osd->reply_op_error(op, return_code, version, user_version);
2075 } else {
2076 dout(10) << " waiting for " << version << " to commit" << dendl;
2077 // always queue ondisk waiters, so that we can requeue if needed
2078 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2079 op->mark_delayed("waiting for ondisk");
2080 }
2081 return;
2082 }
2083 }
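// Editorial note: only mutating ops are checked for dups because a
// resent read is harmless to re-execute. A client resend carries the
// same osd_reqid_t -- entity name, incarnation, and tid -- which is what
// check_in_progress_op() keys on; e.g. a hypothetical
//
//   osd_reqid_t(entity_name_t::CLIENT(4123), 0, 57)
//
// stays identical across every retransmission of that client's tid 57.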
2084
2085 ObjectContextRef obc;
2086 bool can_create = op->may_write() || op->may_cache();
2087 hobject_t missing_oid;
2088 const hobject_t& oid = m->get_hobj();
2089
2090 // io blocked on obc?
2091 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2092 maybe_await_blocked_snapset(oid, op)) {
2093 return;
2094 }
2095
2096 int r = find_object_context(
2097 oid, &obc, can_create,
2098 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2099 &missing_oid);
2100
2101 if (r == -EAGAIN) {
2102 // If we're not the primary for this PG, we just return -EAGAIN; otherwise,
2103 // we have to wait for the object.
2104 if (is_primary()) {
2105 // missing the specific snap we need; requeue and wait.
2106 assert(!op->may_write()); // only happens on a read/cache
2107 wait_for_unreadable_object(missing_oid, op);
2108 return;
2109 }
2110 } else if (r == 0) {
2111 if (is_unreadable_object(obc->obs.oi.soid)) {
2112 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2113 << " is unreadable, waiting" << dendl;
2114 wait_for_unreadable_object(obc->obs.oi.soid, op);
2115 return;
2116 }
2117
2118 // degraded object? (the check above was for head; this could be a clone)
2119 if (write_ordered &&
2120 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2121 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2122 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2123 << " is degraded, waiting" << dendl;
2124 wait_for_degraded_object(obc->obs.oi.soid, op);
2125 return;
2126 }
2127 }
2128
2129 bool in_hit_set = false;
2130 if (hit_set) {
2131 if (obc.get()) {
2132 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2133 in_hit_set = true;
2134 } else {
2135 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2136 in_hit_set = true;
2137 }
2138 if (!op->hitset_inserted) {
2139 hit_set->insert(oid);
2140 op->hitset_inserted = true;
2141 if (hit_set->is_full() ||
2142 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2143 hit_set_persist();
2144 }
2145 }
2146 }
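// Editorial restatement of the persist trigger above: a hit set is
// flushed to an archived object either when it fills up or when its
// configured period expires, i.e. (same names as the code)
//
//   hit_set->is_full() ||
//   hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()
//
// so a lightly used pool still rolls its hit sets over on schedule.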
2147
2148 if (agent_state) {
2149 if (agent_choose_mode(false, op))
2150 return;
2151 }
2152
2153 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2154 if (maybe_handle_manifest(op,
2155 write_ordered,
2156 obc))
2157 return;
2158 }
2159
2160 if (maybe_handle_cache(op,
2161 write_ordered,
2162 obc,
2163 r,
2164 missing_oid,
2165 false,
2166 in_hit_set))
2167 return;
2168
2169 if (r && (r != -ENOENT || !obc)) {
2170 // copy the reqids for copy get on ENOENT
2171 if (r == -ENOENT &&
2172 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2173 fill_in_copy_get_noent(op, oid, m->ops[0]);
2174 return;
2175 }
2176 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2177 if (op->may_write() &&
2178 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2179 record_write_error(op, oid, nullptr, r);
2180 } else {
2181 osd->reply_op_error(op, r);
2182 }
2183 return;
2184 }
2185
2186 // make sure locator is consistent
2187 object_locator_t oloc(obc->obs.oi.soid);
2188 if (m->get_object_locator() != oloc) {
2189 dout(10) << " provided locator " << m->get_object_locator()
2190 << " != object's " << obc->obs.oi.soid << dendl;
2191 osd->clog->warn() << "bad locator " << m->get_object_locator()
2192 << " on object " << oloc
2193 << " op " << *m;
2194 }
2195
2196 // io blocked on obc?
2197 if (obc->is_blocked() &&
2198 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2199 wait_for_blocked_object(obc->obs.oi.soid, op);
2200 return;
2201 }
2202
2203 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2204
2205 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2206 OSDOp& osd_op = *p;
2207
2208 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2209 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2210 m->get_snapid() != CEPH_SNAPDIR) {
2211 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2212 osd->reply_op_error(op, -EINVAL);
2213 return;
2214 }
2215 }
2216
2217 OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, obc, this);
2218
2219 if (!obc->obs.exists)
2220 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2221
2222 /* Due to obc caching, we might have a cached non-existent snapset_obc
2223 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2224 * do_op pipeline make decisions based on whether snapset_obc is
2225 * populated.
2226 */
2227 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2228 ctx->snapset_obc = ObjectContextRef();
2229
2230 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2231 dout(20) << __func__ << ": skipping rw locks" << dendl;
2232 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2233 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2234
2235 // verify there is in fact a flush in progress
2236 // FIXME: we could make this a stronger test.
2237 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2238 if (p == flush_ops.end()) {
2239 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2240 reply_ctx(ctx, -EINVAL);
2241 return;
2242 }
2243 } else if (!get_rw_locks(write_ordered, ctx)) {
2244 dout(20) << __func__ << " waiting for rw locks " << dendl;
2245 op->mark_delayed("waiting for rw locks");
2246 close_op_ctx(ctx);
2247 return;
2248 }
2249 dout(20) << __func__ << " obc " << *obc << dendl;
2250
2251 if (r) {
2252 dout(20) << __func__ << " returned an error: " << r << dendl;
2253 close_op_ctx(ctx);
2254 if (op->may_write() &&
2255 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2256 record_write_error(op, oid, nullptr, r);
2257 } else {
2258 osd->reply_op_error(op, r);
2259 }
2260 return;
2261 }
2262
2263 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2264 ctx->ignore_cache = true;
2265 }
2266
2267 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2268 // This object is lost. Reading from it returns an error.
2269 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2270 << " is lost" << dendl;
2271 reply_ctx(ctx, -ENFILE);
2272 return;
2273 }
2274 if (!op->may_write() &&
2275 !op->may_cache() &&
2276 (!obc->obs.exists ||
2277 ((m->get_snapid() != CEPH_SNAPDIR) &&
2278 obc->obs.oi.is_whiteout()))) {
2279 // copy the reqids for copy get on ENOENT
2280 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2281 fill_in_copy_get_noent(op, oid, m->ops[0]);
2282 close_op_ctx(ctx);
2283 return;
2284 }
2285 reply_ctx(ctx, -ENOENT);
2286 return;
2287 }
2288
2289 op->mark_started();
2290
2291 execute_ctx(ctx);
2292 utime_t prepare_latency = ceph_clock_now();
2293 prepare_latency -= op->get_dequeued_time();
2294 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2295 if (op->may_read() && op->may_write()) {
2296 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2297 } else if (op->may_read()) {
2298 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2299 } else if (op->may_write() || op->may_cache()) {
2300 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2301 }
2302
2303 // force recovery of the oldest missing object if the log has grown too long
2304 maybe_force_recovery();
2305 }
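// Editorial summary (not part of the original file): do_op() above is a
// gauntlet -- PG membership, backoff, caps, name limits, blacklist,
// full/failsafe checks, unreadable/degraded waits, scrub and snap blocks,
// dup detection, obc lookup, manifest/cache handling, and rw locks --
// and only an op that clears every gate reaches execute_ctx().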
2306 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2307 OpRequestRef op,
2308 bool write_ordered,
2309 ObjectContextRef obc)
2310 {
2311 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2312 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2313 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2314 return cache_result_t::NOOP;
2315 }
2316
2317 if (obc)
2318 dout(10) << __func__ << " " << obc->obs.oi << " "
2319 << (obc->obs.exists ? "exists" : "DNE")
2320 << dendl;
2321
2322 // if it is write-ordered and blocked, stop now
2323 if (obc.get() && obc->is_blocked() && write_ordered) {
2324 // we're already doing something with this object
2325 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2326 return cache_result_t::NOOP;
2327 }
2328
2329 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2330 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2331 OSDOp& osd_op = *p;
2332 ceph_osd_op& op = osd_op.op;
2333 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2334 return cache_result_t::NOOP;
2335 }
2336 }
2337
2338 switch (obc->obs.oi.manifest.type) {
2339 case object_manifest_t::TYPE_REDIRECT:
2340 if (op->may_write() || write_ordered) {
2341 do_proxy_write(op, obc->obs.oi.soid, obc);
2342 } else {
2343 do_proxy_read(op, obc);
2344 }
2345 return cache_result_t::HANDLED_PROXY;
2346 case object_manifest_t::TYPE_CHUNKED:
2347 default:
2348 assert(0 == "unrecognized manifest type");
2349 }
2350
2351 return cache_result_t::NOOP;
2352 }
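// Editorial note (hypothetical client-side usage, not part of this
// file): a TYPE_REDIRECT manifest like the one handled above is normally
// installed with CEPH_OSD_OP_SET_REDIRECT, e.g. via librados (signature
// assumed):
//
//   librados::ObjectWriteOperation wop;
//   wop.set_redirect("target_oid", target_ioctx, 0 /* tgt_version */);
//   ioctx.operate("source_oid", &wop);
//
// after which reads and writes on source_oid are proxied to the target
// as in the switch above.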
2353
2354 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2355 MOSDOpReply *orig_reply, int r)
2356 {
2357 dout(20) << __func__ << " r=" << r << dendl;
2358 assert(op->may_write());
2359 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2360 ObjectContextRef obc;
2361 mempool::osd_pglog::list<pg_log_entry_t> entries;
2362 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2363 get_next_version(), eversion_t(), 0,
2364 reqid, utime_t(), r));
2365
2366 struct OnComplete {
2367 PrimaryLogPG *pg;
2368 OpRequestRef op;
2369 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2370 int r;
2371 OnComplete(
2372 PrimaryLogPG *pg,
2373 OpRequestRef op,
2374 MOSDOpReply *orig_reply,
2375 int r)
2376 : pg(pg), op(op),
2377 orig_reply(orig_reply, false /* take over ref */), r(r)
2378 {}
2379 void operator()() {
2380 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2381 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2382 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2383 MOSDOpReply *reply = orig_reply.detach();
2384 if (reply == nullptr) {
2385 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2386 flags, true);
2387 }
2388 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2389 pg->osd->send_message_osd_client(reply, m->get_connection());
2390 }
2391 };
2392
2393 ObcLockManager lock_manager;
2394 submit_log_entries(
2395 entries,
2396 std::move(lock_manager),
2397 boost::optional<std::function<void(void)> >(
2398 OnComplete(this, op, orig_reply, r)),
2399 op,
2400 r);
2401 }
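// Editorial note: the pg_log_entry_t::ERROR entry appended above is what
// makes a failed write's error code replayable -- a client resend with
// the same reqid hits the dup check in do_op() and gets the recorded
// return code back instead of re-executing, and the saved reply (if any)
// is sent once the log entry commits.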
2402
2403 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2404 OpRequestRef op,
2405 bool write_ordered,
2406 ObjectContextRef obc,
2407 int r, hobject_t missing_oid,
2408 bool must_promote,
2409 bool in_hit_set,
2410 ObjectContextRef *promote_obc)
2411 {
2412 if (op &&
2413 op->get_req() &&
2414 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2415 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2416 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2417 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2418 return cache_result_t::NOOP;
2419 }
2420 // return quickly if caching is not enabled
2421 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2422 return cache_result_t::NOOP;
2423
2424 must_promote = must_promote || op->need_promote();
2425
2426 if (obc)
2427 dout(25) << __func__ << " " << obc->obs.oi << " "
2428 << (obc->obs.exists ? "exists" : "DNE")
2429 << " missing_oid " << missing_oid
2430 << " must_promote " << (int)must_promote
2431 << " in_hit_set " << (int)in_hit_set
2432 << dendl;
2433 else
2434 dout(25) << __func__ << " (no obc)"
2435 << " missing_oid " << missing_oid
2436 << " must_promote " << (int)must_promote
2437 << " in_hit_set " << (int)in_hit_set
2438 << dendl;
2439
2440 // if it is write-ordered and blocked, stop now
2441 if (obc.get() && obc->is_blocked() && write_ordered) {
2442 // we're already doing something with this object
2443 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2444 return cache_result_t::NOOP;
2445 }
2446
2447 if (r == -ENOENT && missing_oid == hobject_t()) {
2448 // we know this object is logically absent (e.g., an undefined clone)
2449 return cache_result_t::NOOP;
2450 }
2451
2452 if (obc.get() && obc->obs.exists) {
2453 osd->logger->inc(l_osd_op_cache_hit);
2454 return cache_result_t::NOOP;
2455 }
2456
2457 if (missing_oid == hobject_t() && obc.get()) {
2458 missing_oid = obc->obs.oi.soid;
2459 }
2460
2461 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2462 const object_locator_t oloc = m->get_object_locator();
2463
2464 if (op->need_skip_handle_cache()) {
2465 return cache_result_t::NOOP;
2466 }
2467
2468 // older versions do not proxy the feature bits.
2469 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2470 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2471 OpRequestRef promote_op;
2472
2473 switch (pool.info.cache_mode) {
2474 case pg_pool_t::CACHEMODE_WRITEBACK:
2475 if (agent_state &&
2476 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2477 if (!op->may_write() && !op->may_cache() &&
2478 !write_ordered && !must_promote) {
2479 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2480 do_proxy_read(op);
2481 return cache_result_t::HANDLED_PROXY;
2482 }
2483 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2484 block_write_on_full_cache(missing_oid, op);
2485 return cache_result_t::BLOCKED_FULL;
2486 }
2487
2488 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2489 promote_object(obc, missing_oid, oloc, op, promote_obc);
2490 return cache_result_t::BLOCKED_PROMOTE;
2491 }
2492
2493 if (op->may_write() || op->may_cache()) {
2494 if (can_proxy_write) {
2495 do_proxy_write(op, missing_oid);
2496 } else {
2497 // promote if we can't proxy the write
2498 promote_object(obc, missing_oid, oloc, op, promote_obc);
2499 return cache_result_t::BLOCKED_PROMOTE;
2500 }
2501
2502 // Promote too?
2503 if (!op->need_skip_promote() &&
2504 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2505 pool.info.min_write_recency_for_promote,
2506 OpRequestRef(),
2507 promote_obc)) {
2508 return cache_result_t::BLOCKED_PROMOTE;
2509 }
2510 return cache_result_t::HANDLED_PROXY;
2511 } else {
2512 do_proxy_read(op);
2513
2514 // Avoid duplicate promotion
2515 if (obc.get() && obc->is_blocked()) {
2516 if (promote_obc)
2517 *promote_obc = obc;
2518 return cache_result_t::BLOCKED_PROMOTE;
2519 }
2520
2521 // Promote too?
2522 if (!op->need_skip_promote()) {
2523 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2524 pool.info.min_read_recency_for_promote,
2525 promote_op, promote_obc);
2526 }
2527
2528 return cache_result_t::HANDLED_PROXY;
2529 }
2530 assert(0 == "unreachable");
2531 return cache_result_t::NOOP;
2532
2533 case pg_pool_t::CACHEMODE_FORWARD:
2534 // FIXME: this mode allows requests to be reordered.
2535 do_cache_redirect(op);
2536 return cache_result_t::HANDLED_REDIRECT;
2537
2538 case pg_pool_t::CACHEMODE_READONLY:
2539 // TODO: clean this case up
2540 if (!obc.get() && r == -ENOENT) {
2541 // we don't have the object and op's a read
2542 promote_object(obc, missing_oid, oloc, op, promote_obc);
2543 return cache_result_t::BLOCKED_PROMOTE;
2544 }
2545 if (!r) { // it must be a write
2546 do_cache_redirect(op);
2547 return cache_result_t::HANDLED_REDIRECT;
2548 }
2549 // crap, there was a failure of some kind
2550 return cache_result_t::NOOP;
2551
2552 case pg_pool_t::CACHEMODE_READFORWARD:
2553 // Do writeback to the cache tier for writes
2554 if (op->may_write() || write_ordered || must_promote) {
2555 if (agent_state &&
2556 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2557 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2558 block_write_on_full_cache(missing_oid, op);
2559 return cache_result_t::BLOCKED_FULL;
2560 }
2561 promote_object(obc, missing_oid, oloc, op, promote_obc);
2562 return cache_result_t::BLOCKED_PROMOTE;
2563 }
2564
2565 // This is a read; we do not promote on reads here, so forward it to the base tier
2566 do_cache_redirect(op);
2567 return cache_result_t::HANDLED_REDIRECT;
2568
2569 case pg_pool_t::CACHEMODE_PROXY:
2570 if (!must_promote) {
2571 if (op->may_write() || op->may_cache() || write_ordered) {
2572 if (can_proxy_write) {
2573 do_proxy_write(op, missing_oid);
2574 return cache_result_t::HANDLED_PROXY;
2575 }
2576 } else {
2577 do_proxy_read(op);
2578 return cache_result_t::HANDLED_PROXY;
2579 }
2580 }
2581 // ugh, we're forced to promote.
2582 if (agent_state &&
2583 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2584 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2585 block_write_on_full_cache(missing_oid, op);
2586 return cache_result_t::BLOCKED_FULL;
2587 }
2588 promote_object(obc, missing_oid, oloc, op, promote_obc);
2589 return cache_result_t::BLOCKED_PROMOTE;
2590
2591 case pg_pool_t::CACHEMODE_READPROXY:
2592 // Do writeback to the cache tier for writes
2593 if (op->may_write() || write_ordered || must_promote) {
2594 if (agent_state &&
2595 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2596 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2597 block_write_on_full_cache(missing_oid, op);
2598 return cache_result_t::BLOCKED_FULL;
2599 }
2600 promote_object(obc, missing_oid, oloc, op, promote_obc);
2601 return cache_result_t::BLOCKED_PROMOTE;
2602 }
2603
2604 // This is a read; we do not promote on reads here, so proxy it to the base tier
2605 do_proxy_read(op);
2606 return cache_result_t::HANDLED_PROXY;
2607
2608 default:
2609 assert(0 == "unrecognized cache_mode");
2610 }
2611 return cache_result_t::NOOP;
2612 }
2613
2614 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2615 const hobject_t& missing_oid,
2616 const object_locator_t& oloc,
2617 bool in_hit_set,
2618 uint32_t recency,
2619 OpRequestRef promote_op,
2620 ObjectContextRef *promote_obc)
2621 {
2622 dout(20) << __func__ << " missing_oid " << missing_oid
2623 << " in_hit_set " << in_hit_set << dendl;
2624
2625 switch (recency) {
2626 case 0:
2627 break;
2628 case 1:
2629 // Check if in the current hit set
2630 if (in_hit_set) {
2631 break;
2632 } else {
2633 // not promoting
2634 return false;
2635 }
2636 break;
2637 default:
2638 {
2639 unsigned count = (int)in_hit_set;
2640 if (count) {
2641 // Check if in other hit sets
2642 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2643 for (map<time_t,HitSetRef>::reverse_iterator itor =
2644 agent_state->hit_set_map.rbegin();
2645 itor != agent_state->hit_set_map.rend();
2646 ++itor) {
2647 if (!itor->second->contains(oid)) {
2648 break;
2649 }
2650 ++count;
2651 if (count >= recency) {
2652 break;
2653 }
2654 }
2655 }
2656 if (count >= recency) {
2657 break;
2658 }
2659 return false; // not promoting
2660 }
2661 break;
2662 }
2663
2664 if (osd->promote_throttle()) {
2665 dout(10) << __func__ << " promote throttled" << dendl;
2666 return false;
2667 }
2668 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2669 return true;
2670 }
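// Editorial worked example of the recency gate above: with recency == 3,
// a read promotes only if the object is in the current in-memory hit set
// (count = 1) and in the two most recent archived hit sets consecutively
// (count reaches 3); the first archived set that misses breaks the
// streak, the loop exits, and the op stays proxied.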
2671
2672 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2673 {
2674 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2675 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2676 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2677 get_osdmap()->get_epoch(), flags, false);
2678 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2679 reply->set_redirect(redir);
2680 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2681 << op << dendl;
2682 m->get_connection()->send_message(reply);
2683 return;
2684 }
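// Editorial note: the redirect reply above carries -ENOENT plus a
// request_redirect_t naming the base pool (pool.info.tier_of); a
// redirect-aware Objecter re-targets and resends the op itself, so the
// application never observes the ENOENT.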
2685
2686 struct C_ProxyRead : public Context {
2687 PrimaryLogPGRef pg;
2688 hobject_t oid;
2689 epoch_t last_peering_reset;
2690 ceph_tid_t tid;
2691 PrimaryLogPG::ProxyReadOpRef prdop;
2692 utime_t start;
2693 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2694 const PrimaryLogPG::ProxyReadOpRef& prd)
2695 : pg(p), oid(o), last_peering_reset(lpr),
2696 tid(0), prdop(prd), start(ceph_clock_now())
2697 {}
2698 void finish(int r) override {
2699 if (prdop->canceled)
2700 return;
2701 pg->lock();
2702 if (prdop->canceled) {
2703 pg->unlock();
2704 return;
2705 }
2706 if (last_peering_reset == pg->get_last_peering_reset()) {
2707 pg->finish_proxy_read(oid, tid, r);
2708 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2709 }
2710 pg->unlock();
2711 }
2712 };
2713
2714 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2715 {
2716 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2717 // stash the result in the request's OSDOp vector
2718 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2719 object_locator_t oloc;
2720 hobject_t soid;
2721 /* extensible tier */
2722 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2723 switch (obc->obs.oi.manifest.type) {
2724 case object_manifest_t::TYPE_REDIRECT:
2725 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2726 soid = obc->obs.oi.manifest.redirect_target;
2727 break;
2728 case object_manifest_t::TYPE_CHUNKED:
2729 default:
2730 assert(0 == "unrecognized manifest type");
2731 }
2732 } else {
2733 /* proxy */
2734 soid = m->get_hobj();
2735 oloc = object_locator_t(m->get_object_locator());
2736 oloc.pool = pool.info.tier_of;
2737 }
2738 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2739
2740 // pass through some original flags that make sense.
2741 // - leave out redirection and balancing flags since we are
2742 // already proxying through the primary
2743 // - leave off read/write/exec flags that are derived from the op
2744 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2745 CEPH_OSD_FLAG_ORDERSNAP |
2746 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2747 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2748
2749 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2750
2751 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2752
2753 ObjectOperation obj_op;
2754 obj_op.dup(prdop->ops);
2755
2756 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2757 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2758 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2759 ceph_osd_op op = obj_op.ops[i].op;
2760 switch (op.op) {
2761 case CEPH_OSD_OP_READ:
2762 case CEPH_OSD_OP_SYNC_READ:
2763 case CEPH_OSD_OP_SPARSE_READ:
2764 case CEPH_OSD_OP_CHECKSUM:
2765 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2766 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2767 }
2768 }
2769 }
2770
2771 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2772 prdop);
2773 ceph_tid_t tid = osd->objecter->read(
2774 soid.oid, oloc, obj_op,
2775 m->get_snapid(), NULL,
2776 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2777 &prdop->user_version,
2778 &prdop->data_offset,
2779 m->get_features());
2780 fin->tid = tid;
2781 prdop->objecter_tid = tid;
2782 proxyread_ops[tid] = prdop;
2783 in_progress_proxy_ops[soid].push_back(op);
2784 }
2785
2786 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2787 {
2788 dout(10) << __func__ << " " << oid << " tid " << tid
2789 << " " << cpp_strerror(r) << dendl;
2790
2791 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2792 if (p == proxyread_ops.end()) {
2793 dout(10) << __func__ << " no proxyread_op found" << dendl;
2794 return;
2795 }
2796 ProxyReadOpRef prdop = p->second;
2797 if (tid != prdop->objecter_tid) {
2798 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2799 << " tid " << prdop->objecter_tid << dendl;
2800 return;
2801 }
2802 if (oid != prdop->soid) {
2803 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2804 << " soid " << prdop->soid << dendl;
2805 return;
2806 }
2807 proxyread_ops.erase(tid);
2808
2809 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2810 if (q == in_progress_proxy_ops.end()) {
2811 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2812 return;
2813 }
2814 assert(q->second.size());
2815 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2816 q->second.end(),
2817 prdop->op);
2818 assert(it != q->second.end());
2819 OpRequestRef op = *it;
2820 q->second.erase(it);
2821 if (q->second.size() == 0) {
2822 in_progress_proxy_ops.erase(oid);
2823 }
2824
2825 osd->logger->inc(l_osd_tier_proxy_read);
2826
2827 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2828 OpContext *ctx = new OpContext(op, m->get_reqid(), prdop->ops, this);
2829 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2830 ctx->user_at_version = prdop->user_version;
2831 ctx->data_off = prdop->data_offset;
2832 ctx->ignore_log_op_stats = true;
2833 complete_read_ctx(r, ctx);
2834 }
2835
2836 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2837 {
2838 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2839 if (p == in_progress_proxy_ops.end())
2840 return;
2841
2842 list<OpRequestRef>& ls = p->second;
2843 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2844 requeue_ops(ls);
2845 in_progress_proxy_ops.erase(p);
2846 }
2847
2848 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2849 {
2850 dout(10) << __func__ << " " << prdop->soid << dendl;
2851 prdop->canceled = true;
2852
2853 // cancel objecter op, if we can
2854 if (prdop->objecter_tid) {
2855 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2856 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2857 prdop->ops[i].outdata.clear();
2858 }
2859 proxyread_ops.erase(prdop->objecter_tid);
2860 prdop->objecter_tid = 0;
2861 }
2862 }
2863
2864 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2865 {
2866 dout(10) << __func__ << dendl;
2867
2868 // cancel proxy reads
2869 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2870 while (p != proxyread_ops.end()) {
2871 cancel_proxy_read((p++)->second);
2872 }
2873
2874 // cancel proxy writes
2875 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2876 while (q != proxywrite_ops.end()) {
2877 cancel_proxy_write((q++)->second);
2878 }
2879
2880 if (requeue) {
2881 map<hobject_t, list<OpRequestRef>>::iterator p =
2882 in_progress_proxy_ops.begin();
2883 while (p != in_progress_proxy_ops.end()) {
2884 list<OpRequestRef>& ls = p->second;
2885 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2886 << " requests" << dendl;
2887 requeue_ops(ls);
2888 in_progress_proxy_ops.erase(p++);
2889 }
2890 } else {
2891 in_progress_proxy_ops.clear();
2892 }
2893 }
2894
2895 struct C_ProxyWrite_Commit : public Context {
2896 PrimaryLogPGRef pg;
2897 hobject_t oid;
2898 epoch_t last_peering_reset;
2899 ceph_tid_t tid;
2900 PrimaryLogPG::ProxyWriteOpRef pwop;
2901 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2902 const PrimaryLogPG::ProxyWriteOpRef& pw)
2903 : pg(p), oid(o), last_peering_reset(lpr),
2904 tid(0), pwop(pw)
2905 {}
2906 void finish(int r) override {
2907 if (pwop->canceled)
2908 return;
2909 pg->lock();
2910 if (pwop->canceled) {
2911 pg->unlock();
2912 return;
2913 }
2914 if (last_peering_reset == pg->get_last_peering_reset()) {
2915 pg->finish_proxy_write(oid, tid, r);
2916 }
2917 pg->unlock();
2918 }
2919 };
2920
2921 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2922 {
2923 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2924 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2925 object_locator_t oloc;
2926 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2927 hobject_t soid;
2928 /* extensible tier */
2929 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2930 switch (obc->obs.oi.manifest.type) {
2931 case object_manifest_t::TYPE_REDIRECT:
2932 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2933 soid = obc->obs.oi.manifest.redirect_target;
2934 break;
2935 case object_manifest_t::TYPE_CHUNKED:
2936 default:
2937 assert(0 == "unrecognized manifest type");
2938 }
2939 } else {
2940 /* proxy */
2941 soid = m->get_hobj();
2942 oloc = object_locator_t(m->get_object_locator());
2943 oloc.pool = pool.info.tier_of;
2944 }
2945
2946 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2947 if (!(op->may_write() || op->may_cache())) {
2948 flags |= CEPH_OSD_FLAG_RWORDERED;
2949 }
2950 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2951
2952 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2953 pwop->ctx = new OpContext(op, m->get_reqid(), pwop->ops, this);
2954 pwop->mtime = m->get_mtime();
2955
2956 ObjectOperation obj_op;
2957 obj_op.dup(pwop->ops);
2958
2959 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2960 this, soid, get_last_peering_reset(), pwop);
2961 ceph_tid_t tid = osd->objecter->mutate(
2962 soid.oid, oloc, obj_op, snapc,
2963 ceph::real_clock::from_ceph_timespec(pwop->mtime),
2964 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2965 &pwop->user_version, pwop->reqid);
2966 fin->tid = tid;
2967 pwop->objecter_tid = tid;
2968 proxywrite_ops[tid] = pwop;
2969 in_progress_proxy_ops[soid].push_back(op);
2970 }
2971
2972 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
2973 {
2974 dout(10) << __func__ << " " << oid << " tid " << tid
2975 << " " << cpp_strerror(r) << dendl;
2976
2977 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
2978 if (p == proxywrite_ops.end()) {
2979 dout(10) << __func__ << " no proxywrite_op found" << dendl;
2980 return;
2981 }
2982 ProxyWriteOpRef pwop = p->second;
2983 assert(tid == pwop->objecter_tid);
2984 assert(oid == pwop->soid);
2985
2986 proxywrite_ops.erase(tid);
2987
2988 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
2989 if (q == in_progress_proxy_ops.end()) {
2990 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2991 delete pwop->ctx;
2992 pwop->ctx = NULL;
2993 return;
2994 }
2995 list<OpRequestRef>& in_progress_op = q->second;
2996 assert(in_progress_op.size());
2997 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
2998 in_progress_op.end(),
2999 pwop->op);
3000 assert(it != in_progress_op.end());
3001 in_progress_op.erase(it);
3002 if (in_progress_op.size() == 0) {
3003 in_progress_proxy_ops.erase(oid);
3004 }
3005
3006 osd->logger->inc(l_osd_tier_proxy_write);
3007
3008 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3009 assert(m != NULL);
3010
3011 if (!pwop->sent_reply) {
3012 // send commit.
3013 MOSDOpReply *reply = pwop->ctx->reply;
3014 if (reply)
3015 pwop->ctx->reply = NULL;
3016 else {
3017 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3018 reply->set_reply_versions(eversion_t(), pwop->user_version);
3019 }
3020 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3021 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3022 osd->send_message_osd_client(reply, m->get_connection());
3023 pwop->sent_reply = true;
3024 pwop->ctx->op->mark_commit_sent();
3025 }
3026
3027 delete pwop->ctx;
3028 pwop->ctx = NULL;
3029 }
3030
3031 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3032 {
3033 dout(10) << __func__ << " " << pwop->soid << dendl;
3034 pwop->canceled = true;
3035
3036 // cancel objecter op, if we can
3037 if (pwop->objecter_tid) {
3038 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3039 delete pwop->ctx;
3040 pwop->ctx = NULL;
3041 proxywrite_ops.erase(pwop->objecter_tid);
3042 pwop->objecter_tid = 0;
3043 }
3044 }
3045
3046 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3047 ObjectContextRef obc;
3048 PrimaryLogPG *pg;
3049 utime_t start;
3050 public:
3051 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3052 : obc(obc_),
3053 pg(pg_),
3054 start(ceph_clock_now()) {}
3055
3056 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3057 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3058 int r = results.get<0>();
3059 pg->finish_promote(r, results_data, obc);
3060 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3061 }
3062 };
3063
3064 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3065 const hobject_t& missing_oid,
3066 const object_locator_t& oloc,
3067 OpRequestRef op,
3068 ObjectContextRef *promote_obc)
3069 {
3070 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3071 assert(hoid != hobject_t());
3072 if (scrubber.write_blocked_by_scrub(hoid)) {
3073 dout(10) << __func__ << " " << hoid
3074 << " blocked by scrub" << dendl;
3075 if (op) {
3076 waiting_for_scrub.push_back(op);
3077 op->mark_delayed("waiting for scrub");
3078 dout(10) << __func__ << " " << hoid
3079 << " placing op in waiting_for_scrub" << dendl;
3080 } else {
3081 dout(10) << __func__ << " " << hoid
3082 << " no op, dropping on the floor" << dendl;
3083 }
3084 return;
3085 }
3086 if (!obc) { // we need to create an ObjectContext
3087 assert(missing_oid != hobject_t());
3088 obc = get_object_context(missing_oid, true);
3089 }
3090 if (promote_obc)
3091 *promote_obc = obc;
3092
3093 /*
3094 * If there are proxy reads in flight for this object while the promote
3095 * is still in progress, skip DONTNEED: the data is about to be read again.
3096 */
3097 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3098 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3099 if (q == in_progress_proxy_ops.end()) {
3100 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3101 }
3102
3103 PromoteCallback *cb = new PromoteCallback(obc, this);
3104 object_locator_t my_oloc = oloc;
3105 my_oloc.pool = pool.info.tier_of;
3106
3107 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3108 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3109 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3110 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3111 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3112 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3113 src_fadvise_flags, 0);
3114
3115 assert(obc->is_blocked());
3116
3117 if (op)
3118 wait_for_blocked_object(obc->obs.oi.soid, op);
3119 info.stats.stats.sum.num_promote++;
3120 }
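// Editorial note: promotion is an internal copy-from of the base-tier
// object into this pool. RWORDERED keeps the copy ordered against client
// writes, MAP_SNAP_CLONE lets a snap read resolve to the right clone on
// the source, and the obc stays blocked -- with any triggering op parked
// in wait_for_blocked_object() -- until PromoteCallback::finish() runs.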
3121
3122 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3123 {
3124 FUNCTRACE();
3125 dout(10) << __func__ << " " << ctx << dendl;
3126 ctx->reset_obs(ctx->obc);
3127 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3128 OpRequestRef op = ctx->op;
3129 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3130 ObjectContextRef obc = ctx->obc;
3131 const hobject_t& soid = obc->obs.oi.soid;
3132
3133 // this method must be idempotent since we may call it several times
3134 // before we finally apply the resulting transaction.
3135 ctx->op_t.reset(new PGTransaction);
3136
3137 if (op->may_write() || op->may_cache()) {
3138 // snap
3139 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3140 pool.info.is_pool_snaps_mode()) {
3141 // use pool's snapc
3142 ctx->snapc = pool.snapc;
3143 } else {
3144 // client specified snapc
3145 ctx->snapc.seq = m->get_snap_seq();
3146 ctx->snapc.snaps = m->get_snaps();
3147 filter_snapc(ctx->snapc.snaps);
3148 }
3149 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3150 ctx->snapc.seq < obc->ssc->snapset.seq) {
3151 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3152 << " < snapset seq " << obc->ssc->snapset.seq
3153 << " on " << obc->obs.oi.soid << dendl;
3154 reply_ctx(ctx, -EOLDSNAPC);
3155 return;
3156 }
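// Editorial note: ORDERSNAP turns a stale SnapContext into a hard error
// (-EOLDSNAPC) rather than silently cloning against old snap metadata;
// the client is expected to refresh its snap context and resend.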
3157
3158 // version
3159 ctx->at_version = get_next_version();
3160 ctx->mtime = m->get_mtime();
3161
3162 dout(10) << __func__ << " " << soid << " " << ctx->ops
3163 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3164 << " snapc " << ctx->snapc
3165 << " snapset " << obc->ssc->snapset
3166 << dendl;
3167 } else {
3168 dout(10) << __func__ << " " << soid << " " << ctx->ops
3169 << " ov " << obc->obs.oi.version
3170 << dendl;
3171 }
3172
3173 if (!ctx->user_at_version)
3174 ctx->user_at_version = obc->obs.oi.user_version;
3175 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3176
3177 if (op->may_read()) {
3178 dout(10) << " taking ondisk_read_lock" << dendl;
3179 obc->ondisk_read_lock();
3180 }
3181
3182 {
3183 #ifdef WITH_LTTNG
3184 osd_reqid_t reqid = ctx->op->get_reqid();
3185 #endif
3186 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3187 reqid.name._num, reqid.tid, reqid.inc);
3188 }
3189
3190 int result = prepare_transaction(ctx);
3191
3192 {
3193 #ifdef WITH_LTTNG
3194 osd_reqid_t reqid = ctx->op->get_reqid();
3195 #endif
3196 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3197 reqid.name._num, reqid.tid, reqid.inc);
3198 }
3199
3200 if (op->may_read()) {
3201 dout(10) << " dropping ondisk_read_lock" << dendl;
3202 obc->ondisk_read_unlock();
3203 }
3204
3205 if (result == -EINPROGRESS) {
3206 // come back later.
3207 return;
3208 }
3209
3210 if (result == -EAGAIN) {
3211 // clean up after the ctx
3212 close_op_ctx(ctx);
3213 return;
3214 }
3215
3216 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3217 // prepare the reply
3218 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3219 successful_write);
3220
3221 // Write operations aren't allowed to return a data payload because
3222 // we can't do so reliably. If the client has to resend the request
3223 // and it has already been applied, we will return 0 with no
3224 // payload. Non-deterministic behavior is no good. However, it is
3225 // possible to construct an operation that does a read, does a guard
3226 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3227 // with the write, or return a CMPXATTR and the read value.
3228 if (successful_write) {
3229 // write. normalize the result code.
3230 dout(20) << " zeroing write result code " << result << dendl;
3231 result = 0;
3232 }
3233 ctx->reply->set_result(result);
3234
3235 // read or error?
3236 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3237 // finish side-effects
3238 if (result >= 0)
3239 do_osd_op_effects(ctx, m->get_connection());
3240
3241 if (ctx->pending_async_reads.empty()) {
3242 complete_read_ctx(result, ctx);
3243 } else {
3244 in_progress_async_reads.push_back(make_pair(op, ctx));
3245 ctx->start_async_reads(this);
3246 }
3247
3248 return;
3249 }
3250
3251 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3252
3253 assert(op->may_write() || op->may_cache());
3254
3255 // trim log?
3256 calc_trim_to();
3257
3258 // verify that we are doing this in order?
3259 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3260 !pool.info.is_tier() && !pool.info.has_tiers()) {
3261 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3262 ceph_tid_t t = m->get_tid();
3263 client_t n = m->get_source().num();
3264 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3265 if (p == cm.end()) {
3266 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3267 cm[n] = t;
3268 } else {
3269 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3270 if (p->second > t) {
3271 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3272 assert(0 == "out of order op");
3273 }
3274 p->second = t;
3275 }
3276 }
3277
3278 if (ctx->update_log_only) {
3279 if (result >= 0)
3280 do_osd_op_effects(ctx, m->get_connection());
3281
3282 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3283 // save just what we need from ctx
3284 MOSDOpReply *reply = ctx->reply;
3285 ctx->reply = nullptr;
3286 reply->claim_op_out_data(ctx->ops);
3287 reply->get_header().data_off = ctx->data_off;
3288 close_op_ctx(ctx);
3289
3290 if (result == -ENOENT) {
3291 reply->set_enoent_reply_versions(info.last_update,
3292 info.last_user_version);
3293 }
3294 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3295 // append to pg log for dup detection - don't save buffers for now
3296 record_write_error(op, soid, reply, result);
3297 return;
3298 }
3299
3300 // no need to capture PG ref, repop cancel will handle that
3301 // Can capture the ctx by pointer, it's owned by the repop
3302 ctx->register_on_commit(
3303 [m, ctx, this](){
3304 if (ctx->op)
3305 log_op_stats(
3306 ctx);
3307
3308 if (m && !ctx->sent_reply) {
3309 MOSDOpReply *reply = ctx->reply;
3310 if (reply)
3311 ctx->reply = nullptr;
3312 else {
3313 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3314 reply->set_reply_versions(ctx->at_version,
3315 ctx->user_at_version);
3316 }
3317 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3318 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3319 osd->send_message_osd_client(reply, m->get_connection());
3320 ctx->sent_reply = true;
3321 ctx->op->mark_commit_sent();
3322 }
3323 });
3324 ctx->register_on_success(
3325 [ctx, this]() {
3326 do_osd_op_effects(
3327 ctx,
3328 ctx->op ? ctx->op->get_req()->get_connection() :
3329 ConnectionRef());
3330 });
3331 ctx->register_on_finish(
3332 [ctx, this]() {
3333 delete ctx;
3334 });
3335
3336 // issue replica writes
3337 ceph_tid_t rep_tid = osd->get_tid();
3338
3339 RepGather *repop = new_repop(ctx, obc, rep_tid);
3340
3341 issue_repop(repop, ctx);
3342 eval_repop(repop);
3343 repop->put();
3344 }
3345
3346 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3347 {
3348 if (ctx->op)
3349 osd->reply_op_error(ctx->op, r);
3350 close_op_ctx(ctx);
3351 }
3352
3353 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3354 {
3355 if (ctx->op)
3356 osd->reply_op_error(ctx->op, r, v, uv);
3357 close_op_ctx(ctx);
3358 }
3359
3360 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3361 {
3362 OpRequestRef op = ctx->op;
3363 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3364
3365 utime_t now = ceph_clock_now();
3366 utime_t latency = now;
3367 latency -= ctx->op->get_req()->get_recv_stamp();
3368 utime_t process_latency = now;
3369 process_latency -= ctx->op->get_dequeued_time();
3370
3371 uint64_t inb = ctx->bytes_written;
3372 uint64_t outb = ctx->bytes_read;
3373
3374 osd->logger->inc(l_osd_op);
3375
3376 osd->logger->inc(l_osd_op_outb, outb);
3377 osd->logger->inc(l_osd_op_inb, inb);
3378 osd->logger->tinc(l_osd_op_lat, latency);
3379 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3380
3381 if (op->may_read() && op->may_write()) {
3382 osd->logger->inc(l_osd_op_rw);
3383 osd->logger->inc(l_osd_op_rw_inb, inb);
3384 osd->logger->inc(l_osd_op_rw_outb, outb);
3385 osd->logger->tinc(l_osd_op_rw_lat, latency);
3386 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3387 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3388 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3389 } else if (op->may_read()) {
3390 osd->logger->inc(l_osd_op_r);
3391 osd->logger->inc(l_osd_op_r_outb, outb);
3392 osd->logger->tinc(l_osd_op_r_lat, latency);
3393 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3394 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3395 } else if (op->may_write() || op->may_cache()) {
3396 osd->logger->inc(l_osd_op_w);
3397 osd->logger->inc(l_osd_op_w_inb, inb);
3398 osd->logger->tinc(l_osd_op_w_lat, latency);
3399 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3400 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3401 } else
3402 ceph_abort();
3403
3404 dout(15) << "log_op_stats " << *m
3405 << " inb " << inb
3406 << " outb " << outb
3407 << " lat " << latency << dendl;
3408 }
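// Editorial worked example for the two latencies above: for an op
// received at t = 100.000s, dequeued at 100.002s, and completed at
// 100.010s,
//
//   latency         = 100.010 - 100.000 = 10 ms   (queue + execution)
//   process_latency = 100.010 - 100.002 =  8 ms   (execution only)
//
// so l_osd_op_lat always bounds l_osd_op_process_lat from above.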
3409
3410 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3411 {
3412 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3413 assert(have_same_or_newer_map(m->map_epoch));
3414 assert(m->get_type() == MSG_OSD_SUBOP);
3415 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3416
3417 if (!is_peered()) {
3418 waiting_for_peered.push_back(op);
3419 op->mark_delayed("waiting for active");
3420 return;
3421 }
3422
3423 const OSDOp *first = NULL;
3424 if (m->ops.size() >= 1) {
3425 first = &m->ops[0];
3426 }
3427
3428 if (first) {
3429 switch (first->op.op) {
3430 case CEPH_OSD_OP_DELETE:
3431 sub_op_remove(op);
3432 return;
3433 case CEPH_OSD_OP_SCRUB_RESERVE:
3434 handle_scrub_reserve_request(op);
3435 return;
3436 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3437 handle_scrub_reserve_release(op);
3438 return;
3439 case CEPH_OSD_OP_SCRUB_MAP:
3440 sub_op_scrub_map(op);
3441 return;
3442 }
3443 }
3444 }
3445
3446 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3447 {
3448 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3449 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3450 if (r->ops.size() >= 1) {
3451 const OSDOp& first = r->ops[0];
3452 switch (first.op.op) {
3453 case CEPH_OSD_OP_SCRUB_RESERVE:
3454 {
3455 pg_shard_t from = r->from;
3456 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3457 bool reserved;
3458 ::decode(reserved, p);
3459 if (reserved) {
3460 handle_scrub_reserve_grant(op, from);
3461 } else {
3462 handle_scrub_reserve_reject(op, from);
3463 }
3464 }
3465 return;
3466 }
3467 }
3468 }
3469
3470 void PrimaryLogPG::do_scan(
3471 OpRequestRef op,
3472 ThreadPool::TPHandle &handle)
3473 {
3474 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3475 assert(m->get_type() == MSG_OSD_PG_SCAN);
3476 dout(10) << "do_scan " << *m << dendl;
3477
3478 op->mark_started();
3479
3480 switch (m->op) {
3481 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3482 {
3483 ostringstream ss;
3484 if (osd->check_backfill_full(ss)) {
3485 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3486 queue_peering_event(
3487 CephPeeringEvtRef(
3488 std::make_shared<CephPeeringEvt>(
3489 get_osdmap()->get_epoch(),
3490 get_osdmap()->get_epoch(),
3491 BackfillTooFull())));
3492 return;
3493 }
3494
3495 BackfillInterval bi;
3496 bi.begin = m->begin;
3497 // No need to flush; there won't be any in-progress writes occurring
3498 // past m->begin
3499 scan_range(
3500 cct->_conf->osd_backfill_scan_min,
3501 cct->_conf->osd_backfill_scan_max,
3502 &bi,
3503 handle);
3504 MOSDPGScan *reply = new MOSDPGScan(
3505 MOSDPGScan::OP_SCAN_DIGEST,
3506 pg_whoami,
3507 get_osdmap()->get_epoch(), m->query_epoch,
3508 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3509 ::encode(bi.objects, reply->get_data());
3510 osd->send_message_osd_cluster(reply, m->get_connection());
3511 }
3512 break;
3513
3514 case MOSDPGScan::OP_SCAN_DIGEST:
3515 {
3516 pg_shard_t from = m->from;
3517
3518 // Check that from is in backfill_targets vector
3519 assert(is_backfill_targets(from));
3520
3521 BackfillInterval& bi = peer_backfill_info[from];
3522 bi.begin = m->begin;
3523 bi.end = m->end;
3524 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3525
3526 // take care to preserve ordering!
3527 bi.clear_objects();
3528 ::decode_noclear(bi.objects, p);
3529
3530 if (waiting_on_backfill.erase(from)) {
3531 if (waiting_on_backfill.empty()) {
3532 assert(peer_backfill_info.size() == backfill_targets.size());
3533 finish_recovery_op(hobject_t::get_max());
3534 }
3535 } else {
3536 // we canceled backfill for a while because a peer was too full, and
3537 // this is an extra response from a peer that was not too full
3538 }
3539 }
3540 break;
3541 }
3542 }
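// Editorial note: OP_SCAN_GET_DIGEST / OP_SCAN_DIGEST form the backfill
// scan handshake -- the primary asks a backfill target to enumerate a
// chunk of objects (osd_backfill_scan_min..max at a time) starting at
// m->begin, and the returned bi.objects map (object -> version) is the
// "digest" the primary diffs against its own interval to decide what to
// push or delete.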
3543
3544 void PrimaryLogPG::do_backfill(OpRequestRef op)
3545 {
3546 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3547 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3548 dout(10) << "do_backfill " << *m << dendl;
3549
3550 op->mark_started();
3551
3552 switch (m->op) {
3553 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3554 {
3555 assert(cct->_conf->osd_kill_backfill_at != 1);
3556
3557 MOSDPGBackfill *reply = new MOSDPGBackfill(
3558 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3559 get_osdmap()->get_epoch(),
3560 m->query_epoch,
3561 spg_t(info.pgid.pgid, get_primary().shard));
3562 reply->set_priority(get_recovery_op_priority());
3563 osd->send_message_osd_cluster(reply, m->get_connection());
3564 queue_peering_event(
3565 CephPeeringEvtRef(
3566 std::make_shared<CephPeeringEvt>(
3567 get_osdmap()->get_epoch(),
3568 get_osdmap()->get_epoch(),
3569 RecoveryDone())));
3570 }
3571 // fall-thru
3572
3573 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3574 {
3575 assert(cct->_conf->osd_kill_backfill_at != 2);
3576
3577 info.set_last_backfill(m->last_backfill);
3578 info.stats = m->stats;
3579
3580 ObjectStore::Transaction t;
3581 dirty_info = true;
3582 write_if_dirty(t);
3583 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3584 assert(tr == 0);
3585 }
3586 break;
3587
3588 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3589 {
3590 assert(is_primary());
3591 assert(cct->_conf->osd_kill_backfill_at != 3);
3592 finish_recovery_op(hobject_t::get_max());
3593 }
3594 break;
3595 }
3596 }
3597
3598 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3599 {
3600 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3601 op->get_req());
3602 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3603 dout(7) << __func__ << " " << m->ls << dendl;
3604
3605 op->mark_started();
3606
3607 ObjectStore::Transaction t;
3608 for (auto& p : m->ls) {
3609 remove_snap_mapped_object(t, p.first);
3610 }
3611 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3612 assert(r == 0);
3613 }
3614
3615 PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(
3616 bool first, const hobject_t &coid)
3617 {
3618 // load clone info
3619 bufferlist bl;
3620 ObjectContextRef obc = get_object_context(coid, false, NULL);
3621 if (!obc) {
3622 derr << __func__ << " could not find coid " << coid << dendl;
3623 ceph_abort();
3624 }
3625 assert(obc->ssc);
3626
3627 hobject_t snapoid(
3628 coid.oid, coid.get_key(),
3629 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3630 info.pgid.pool(), coid.get_namespace());
3631 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3632 assert(snapset_obc);
3633
3634 SnapSet& snapset = obc->ssc->snapset;
3635
3636 bool legacy = snapset.is_legacy() ||
3637 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3638
3639 object_info_t &coi = obc->obs.oi;
3640 set<snapid_t> old_snaps;
3641 if (legacy) {
3642 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3643 } else {
3644 auto p = snapset.clone_snaps.find(coid.snap);
3645 if (p == snapset.clone_snaps.end()) {
3646 osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset
3647 << " for " << coid << "\n";
3648 return NULL;
3649 }
3650 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3651 snapset.clone_snaps[coid.snap].end());
3652 }
3653 if (old_snaps.empty()) {
3654 osd->clog->error() << __func__ << " No object info snaps for " << coid;
3655 return NULL;
3656 }
3657
3658 dout(10) << coid << " old_snaps " << old_snaps
3659 << " old snapset " << snapset << dendl;
3660 if (snapset.seq == 0) {
3661 osd->clog->error() << __func__ << " No snapset.seq for " << coid;
3662 return NULL;
3663 }
3664
3665 set<snapid_t> new_snaps;
3666 for (set<snapid_t>::iterator i = old_snaps.begin();
3667 i != old_snaps.end();
3668 ++i) {
3669 if (!pool.info.is_removed_snap(*i))
3670 new_snaps.insert(*i);
3671 }
3672
3673 vector<snapid_t>::iterator p = snapset.clones.end();
3674
3675 if (new_snaps.empty()) {
3676 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3677 if (p == snapset.clones.end()) {
3678 osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones";
3679 return NULL;
3680 }
3681 }
3682
3683 OpContextUPtr ctx = simple_opc_create(obc);
3684 ctx->snapset_obc = snapset_obc;
3685
3686 if (!ctx->lock_manager.get_snaptrimmer_write(
3687 coid,
3688 obc,
3689 first)) {
3690 close_op_ctx(ctx.release());
3691 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3692 return NULL;
3693 }
3694
3695 if (!ctx->lock_manager.get_snaptrimmer_write(
3696 snapoid,
3697 snapset_obc,
3698 first)) {
3699 close_op_ctx(ctx.release());
3700 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3701 return NULL;
3702 }
3703
3704 ctx->at_version = get_next_version();
3705
3706 PGTransaction *t = ctx->op_t.get();
3707
3708 if (new_snaps.empty()) {
3709 // remove clone
3710 dout(10) << coid << " snaps " << old_snaps << " -> "
3711 << new_snaps << " ... deleting" << dendl;
3712
3713 // ...from snapset
3714 assert(p != snapset.clones.end());
3715
3716 snapid_t last = coid.snap;
3717 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3718
3719 if (p != snapset.clones.begin()) {
3720 // not the oldest... merge overlap into next older clone
3721 vector<snapid_t>::iterator n = p - 1;
3722 hobject_t prev_coid = coid;
3723 prev_coid.snap = *n;
3724 bool adjust_prev_bytes = is_present_clone(prev_coid);
3725
3726 if (adjust_prev_bytes)
3727 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3728
3729 snapset.clone_overlap[*n].intersection_of(
3730 snapset.clone_overlap[*p]);
3731
3732 if (adjust_prev_bytes)
3733 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3734 }
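// [editorial note] Worked example of the overlap merge above (hypothetical
// values): with clones [1, 4, 7] and coid.snap == 4, p points at 4 and n at
// 1.  If clone_overlap[1] = {0~100} (bytes clone 1 shares with clone 4) and
// clone_overlap[4] = {0~50} (bytes clone 4 shares with clone 7), the
// intersection leaves clone_overlap[1] = {0~50}: only ranges still shared
// with the next surviving clone remain, and the num_bytes delta above
// charges clone 1 for the bytes that just became unique to it.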
3735 ctx->delta_stats.num_objects--;
3736 if (coi.is_dirty())
3737 ctx->delta_stats.num_objects_dirty--;
3738 if (coi.is_omap())
3739 ctx->delta_stats.num_objects_omap--;
3740 if (coi.is_whiteout()) {
3741 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3742 ctx->delta_stats.num_whiteouts--;
3743 }
3744 ctx->delta_stats.num_object_clones--;
3745 if (coi.is_cache_pinned())
3746 ctx->delta_stats.num_objects_pinned--;
3747 obc->obs.exists = false;
3748
3749 snapset.clones.erase(p);
3750 snapset.clone_overlap.erase(last);
3751 snapset.clone_size.erase(last);
3752 snapset.clone_snaps.erase(last);
3753
3754 ctx->log.push_back(
3755 pg_log_entry_t(
3756 pg_log_entry_t::DELETE,
3757 coid,
3758 ctx->at_version,
3759 ctx->obs->oi.version,
3760 0,
3761 osd_reqid_t(),
3762 ctx->mtime,
3763 0)
3764 );
3765 t->remove(coid);
3766 t->update_snaps(
3767 coid,
3768 old_snaps,
3769 new_snaps);
3770
3771 coi = object_info_t(coid);
3772
3773 ctx->at_version.version++;
3774 } else {
3775 // save adjusted snaps for this object
3776 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3777 if (legacy) {
3778 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3779 } else {
3780 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3781 new_snaps.rend());
3782 // we still do a 'modify' event on this object just to trigger a
3783 // snapmapper.update ... :(
3784 }
3785
3786 coi.prior_version = coi.version;
3787 coi.version = ctx->at_version;
3788 bl.clear();
3789 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3790 t->setattr(coid, OI_ATTR, bl);
3791
3792 ctx->log.push_back(
3793 pg_log_entry_t(
3794 pg_log_entry_t::MODIFY,
3795 coid,
3796 coi.version,
3797 coi.prior_version,
3798 0,
3799 osd_reqid_t(),
3800 ctx->mtime,
3801 0)
3802 );
3803 ctx->at_version.version++;
3804
3805 t->update_snaps(
3806 coid,
3807 old_snaps,
3808 new_snaps);
3809 }
3810
3811 // save head snapset
3812 dout(10) << coid << " new snapset " << snapset << " on "
3813 << snapset_obc->obs.oi << dendl;
3814 if (snapset.clones.empty() &&
3815 (!snapset.head_exists ||
3816 (snapset_obc->obs.oi.is_whiteout() &&
3817 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3818 !snapset_obc->obs.oi.is_cache_pinned()))) {
3819 // NOTE: this arguably constitutes minor interference with the
3820 // tiering agent if this is a cache tier since a snap trim event
3821 // is effectively evicting a whiteout we might otherwise want to
3822 // keep around.
3823 dout(10) << coid << " removing " << snapoid << dendl;
3824 ctx->log.push_back(
3825 pg_log_entry_t(
3826 pg_log_entry_t::DELETE,
3827 snapoid,
3828 ctx->at_version,
3829 ctx->snapset_obc->obs.oi.version,
3830 0,
3831 osd_reqid_t(),
3832 ctx->mtime,
3833 0)
3834 );
3835 if (snapoid.is_head()) {
3836 derr << "removing snap head" << dendl;
3837 object_info_t& oi = ctx->snapset_obc->obs.oi;
3838 ctx->delta_stats.num_objects--;
3839 if (oi.is_dirty()) {
3840 ctx->delta_stats.num_objects_dirty--;
3841 }
3842 if (oi.is_omap())
3843 ctx->delta_stats.num_objects_omap--;
3844 if (oi.is_whiteout()) {
3845 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3846 ctx->delta_stats.num_whiteouts--;
3847 }
3848 if (oi.is_cache_pinned()) {
3849 ctx->delta_stats.num_objects_pinned--;
3850 }
3851 }
3852 ctx->snapset_obc->obs.exists = false;
3853 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3854 t->remove(snapoid);
3855 } else {
3856 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3857 snapset.filter(pool.info);
3858 dout(10) << coid << " writing updated snapset on " << snapoid
3859 << ", snapset is " << snapset << dendl;
3860 ctx->log.push_back(
3861 pg_log_entry_t(
3862 pg_log_entry_t::MODIFY,
3863 snapoid,
3864 ctx->at_version,
3865 ctx->snapset_obc->obs.oi.version,
3866 0,
3867 osd_reqid_t(),
3868 ctx->mtime,
3869 0)
3870 );
3871
3872 ctx->snapset_obc->obs.oi.prior_version =
3873 ctx->snapset_obc->obs.oi.version;
3874 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3875
3876 map <string, bufferlist> attrs;
3877 bl.clear();
3878 ::encode(snapset, bl);
3879 attrs[SS_ATTR].claim(bl);
3880
3881 bl.clear();
3882 ::encode(ctx->snapset_obc->obs.oi, bl,
3883 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3884 attrs[OI_ATTR].claim(bl);
3885 t->setattrs(snapoid, attrs);
3886 }
3887
3888 return ctx;
3889 }
3890
3891 void PrimaryLogPG::kick_snap_trim()
3892 {
3893 assert(is_active());
3894 assert(is_primary());
3895 if (is_clean() && !snap_trimq.empty()) {
3896 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3897 snap_trimmer_machine.process_event(KickTrim());
3898 }
3899 }
3900
3901 void PrimaryLogPG::snap_trimmer_scrub_complete()
3902 {
3903 if (is_primary() && is_active() && is_clean()) {
3904 assert(!snap_trimq.empty());
3905 snap_trimmer_machine.process_event(ScrubComplete());
3906 }
3907 }
3908
3909 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3910 {
3911 if (deleting || pg_has_reset_since(queued)) {
3912 return;
3913 }
3914
3915 assert(is_primary());
3916
3917 dout(10) << "snap_trimmer posting" << dendl;
3918 snap_trimmer_machine.process_event(DoSnapWork());
3919 dout(10) << "snap_trimmer complete" << dendl;
3920 return;
3921 }
3922
3923 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3924 {
3925 __u64 v2;
3926
3927 string v2s(xattr.c_str(), xattr.length());
3928 if (v2s.length())
3929 v2 = strtoull(v2s.c_str(), NULL, 10);
3930 else
3931 v2 = 0;
3932
3933 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3934
3935 switch (op) {
3936 case CEPH_OSD_CMPXATTR_OP_EQ:
3937 return (v1 == v2);
3938 case CEPH_OSD_CMPXATTR_OP_NE:
3939 return (v1 != v2);
3940 case CEPH_OSD_CMPXATTR_OP_GT:
3941 return (v1 > v2);
3942 case CEPH_OSD_CMPXATTR_OP_GTE:
3943 return (v1 >= v2);
3944 case CEPH_OSD_CMPXATTR_OP_LT:
3945 return (v1 < v2);
3946 case CEPH_OSD_CMPXATTR_OP_LTE:
3947 return (v1 <= v2);
3948 default:
3949 return -EINVAL;
3950 }
3951 }
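// [editorial note] A minimal usage sketch of the comparator above, assuming
// the stored xattr holds the ASCII-decimal string "42".  The snippet is
// illustrative only (the 'pg' pointer is hypothetical and not part of the
// original source), hence the #if 0 guard:
#if 0
bufferlist xattr;
xattr.append("42");
int r = pg->do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GT, 100, xattr); // 1: 100 > 42
r = pg->do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_LT, 100, xattr);     // 0: 100 < 42 is false
#endif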
3952
3953 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
3954 {
3955 string v2s(xattr.c_str(), xattr.length());
3956
3957 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
3958
3959 switch (op) {
3960 case CEPH_OSD_CMPXATTR_OP_EQ:
3961 return (v1s.compare(v2s) == 0);
3962 case CEPH_OSD_CMPXATTR_OP_NE:
3963 return (v1s.compare(v2s) != 0);
3964 case CEPH_OSD_CMPXATTR_OP_GT:
3965 return (v1s.compare(v2s) > 0);
3966 case CEPH_OSD_CMPXATTR_OP_GTE:
3967 return (v1s.compare(v2s) >= 0);
3968 case CEPH_OSD_CMPXATTR_OP_LT:
3969 return (v1s.compare(v2s) < 0);
3970 case CEPH_OSD_CMPXATTR_OP_LTE:
3971 return (v1s.compare(v2s) <= 0);
3972 default:
3973 return -EINVAL;
3974 }
3975 }
3976
3977 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
3978 {
3979 ceph_osd_op& op = osd_op.op;
3980 vector<OSDOp> read_ops(1);
3981 OSDOp& read_op = read_ops[0];
3982 int result = 0;
3983
3984 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
3985 read_op.op.extent.offset = op.extent.offset;
3986 read_op.op.extent.length = op.extent.length;
3987 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
3988 read_op.op.extent.truncate_size = op.extent.truncate_size;
3989
3990 result = do_osd_ops(ctx, read_ops);
3991 if (result < 0) {
3992 derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
3993 return result;
3994 }
3995
3996 if (read_op.outdata.length() != osd_op.indata.length())
3997 return -EINVAL;
3998
3999 for (uint64_t p = 0; p < osd_op.indata.length(); p++) {
4000 if (read_op.outdata[p] != osd_op.indata[p]) {
4001 return (-MAX_ERRNO - p);
4002 }
4003 }
4004
4005 return result;
4006 }
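// [editorial note] On mismatch, the return value above encodes the offset of
// the first differing byte: if the payloads first differ at byte 5,
// do_extent_cmp() returns -MAX_ERRNO - 5, and a caller can recover the
// offset as -(rc + MAX_ERRNO).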
4007
4008 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4009 {
4010 ceph_osd_op& op = osd_op.op;
4011 vector<OSDOp> write_ops(1);
4012 OSDOp& write_op = write_ops[0];
4013 uint64_t write_length = op.writesame.length;
4014 int result = 0;
4015
4016 if (!write_length)
4017 return 0;
4018
4019 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4020 return -EINVAL;
4021
4022 if (op.writesame.data_length != osd_op.indata.length()) {
4023 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4024 return -EINVAL;
4025 }
4026
4027 while (write_length) {
4028 write_op.indata.append(osd_op.indata);
4029 write_length -= op.writesame.data_length;
4030 }
4031
4032 write_op.op.op = CEPH_OSD_OP_WRITE;
4033 write_op.op.extent.offset = op.writesame.offset;
4034 write_op.op.extent.length = op.writesame.length;
4035 result = do_osd_ops(ctx, write_ops);
4036 if (result < 0)
4037 derr << "do_writesame do_osd_ops failed " << result << dendl;
4038
4039 return result;
4040 }
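// [editorial note] Worked example of the expansion above (hypothetical
// values): with writesame.offset = 0, writesame.length = 4096 and
// writesame.data_length = 512, the 512-byte pattern in indata is appended
// 8 times and submitted as a single 4096-byte CEPH_OSD_OP_WRITE at offset 0.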
4041
4042 // ========================================================================
4043 // low level osd ops
4044
4045 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4046 {
4047 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4048 bufferlist header, vals;
4049 int r = _get_tmap(ctx, &header, &vals);
4050 if (r < 0) {
4051 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4052 r = 0;
4053 return r;
4054 }
4055
4056 vector<OSDOp> ops(3);
4057
4058 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4059 ops[0].op.extent.offset = 0;
4060 ops[0].op.extent.length = 0;
4061
4062 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4063 ops[1].indata.claim(header);
4064
4065 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4066 ops[2].indata.claim(vals);
4067
4068 return do_osd_ops(ctx, ops);
4069 }
4070
4071 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4072 bufferlist& bl)
4073 {
4074 // decode
4075 bufferlist header;
4076 map<string, bufferlist> m;
4077 if (bl.length()) {
4078 bufferlist::iterator p = bl.begin();
4079 ::decode(header, p);
4080 ::decode(m, p);
4081 assert(p.end());
4082 }
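// [editorial note] Sketch of the TMAP payload layout implied by the
// ::decode calls above: the object data is simply
//
//   bufferlist header;          // opaque, application-defined
//   map<string, bufferlist> m;  // sorted key -> value entries
//
// encoded back-to-back; the same two values are re-encoded below before the
// object is rewritten with WRITEFULL.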
4083
4084 // do the update(s)
4085 while (!bp.end()) {
4086 __u8 op;
4087 string key;
4088 ::decode(op, bp);
4089
4090 switch (op) {
4091 case CEPH_OSD_TMAP_SET: // insert key
4092 {
4093 ::decode(key, bp);
4094 bufferlist data;
4095 ::decode(data, bp);
4096 m[key] = data;
4097 }
4098 break;
4099 case CEPH_OSD_TMAP_RM: // remove key
4100 ::decode(key, bp);
4101 if (!m.count(key)) {
4102 return -ENOENT;
4103 }
4104 m.erase(key);
4105 break;
4106 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4107 ::decode(key, bp);
4108 m.erase(key);
4109 break;
4110 case CEPH_OSD_TMAP_HDR: // update header
4111 {
4112 ::decode(header, bp);
4113 }
4114 break;
4115 default:
4116 return -EINVAL;
4117 }
4118 }
4119
4120 // reencode
4121 bufferlist obl;
4122 ::encode(header, obl);
4123 ::encode(m, obl);
4124
4125 // write it out
4126 vector<OSDOp> nops(1);
4127 OSDOp& newop = nops[0];
4128 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4129 newop.op.extent.offset = 0;
4130 newop.op.extent.length = obl.length();
4131 newop.indata = obl;
4132 do_osd_ops(ctx, nops);
4133 osd_op.outdata.claim(newop.outdata);
4134 return 0;
4135 }
4136
4137 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4138 {
4139 bufferlist::iterator orig_bp = bp;
4140 int result = 0;
4141 if (bp.end()) {
4142 dout(10) << "tmapup is a no-op" << dendl;
4143 } else {
4144 // read the whole object
4145 vector<OSDOp> nops(1);
4146 OSDOp& newop = nops[0];
4147 newop.op.op = CEPH_OSD_OP_READ;
4148 newop.op.extent.offset = 0;
4149 newop.op.extent.length = 0;
4150 result = do_osd_ops(ctx, nops);
4151
4152 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4153
4154 dout(30) << " starting is \n";
4155 newop.outdata.hexdump(*_dout);
4156 *_dout << dendl;
4157
4158 bufferlist::iterator ip = newop.outdata.begin();
4159 bufferlist obl;
4160
4161 dout(30) << "the update command is: \n";
4162 osd_op.indata.hexdump(*_dout);
4163 *_dout << dendl;
4164
4165 // header
4166 bufferlist header;
4167 __u32 nkeys = 0;
4168 if (newop.outdata.length()) {
4169 ::decode(header, ip);
4170 ::decode(nkeys, ip);
4171 }
4172 dout(10) << "tmapup header " << header.length() << dendl;
4173
4174 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4175 ++bp;
4176 ::decode(header, bp);
4177 dout(10) << "tmapup new header " << header.length() << dendl;
4178 }
4179
4180 ::encode(header, obl);
4181
4182 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4183
4184 // update keys
4185 bufferlist newkeydata;
4186 string nextkey, last_in_key;
4187 bufferlist nextval;
4188 bool have_next = false;
4189 if (!ip.end()) {
4190 have_next = true;
4191 ::decode(nextkey, ip);
4192 ::decode(nextval, ip);
4193 }
4194 while (!bp.end() && !result) {
4195 __u8 op;
4196 string key;
4197 try {
4198 ::decode(op, bp);
4199 ::decode(key, bp);
4200 }
4201 catch (buffer::error& e) {
4202 return -EINVAL;
4203 }
4204 if (key < last_in_key) {
4205 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4206 << "', falling back to an inefficient (unsorted) update" << dendl;
4207 bp = orig_bp;
4208 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4209 }
4210 last_in_key = key;
4211
4212 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4213
4214 // skip existing intervening keys
4215 bool key_exists = false;
4216 while (have_next && !key_exists) {
4217 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4218 if (nextkey > key)
4219 break;
4220 if (nextkey < key) {
4221 // copy untouched.
4222 ::encode(nextkey, newkeydata);
4223 ::encode(nextval, newkeydata);
4224 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4225 } else {
4226 // don't copy; discard the old value and stop.
4227 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4228 key_exists = true;
4229 nkeys--;
4230 }
4231 if (!ip.end()) {
4232 ::decode(nextkey, ip);
4233 ::decode(nextval, ip);
4234 } else {
4235 have_next = false;
4236 }
4237 }
4238
4239 if (op == CEPH_OSD_TMAP_SET) {
4240 bufferlist val;
4241 try {
4242 ::decode(val, bp);
4243 }
4244 catch (buffer::error& e) {
4245 return -EINVAL;
4246 }
4247 ::encode(key, newkeydata);
4248 ::encode(val, newkeydata);
4249 dout(20) << " set " << key << " " << val.length() << dendl;
4250 nkeys++;
4251 } else if (op == CEPH_OSD_TMAP_CREATE) {
4252 if (key_exists) {
4253 return -EEXIST;
4254 }
4255 bufferlist val;
4256 try {
4257 ::decode(val, bp);
4258 }
4259 catch (buffer::error& e) {
4260 return -EINVAL;
4261 }
4262 ::encode(key, newkeydata);
4263 ::encode(val, newkeydata);
4264 dout(20) << " create " << key << " " << val.length() << dendl;
4265 nkeys++;
4266 } else if (op == CEPH_OSD_TMAP_RM) {
4267 // do nothing.
4268 if (!key_exists) {
4269 return -ENOENT;
4270 }
4271 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4272 // do nothing
4273 } else {
4274 dout(10) << " invalid tmap op " << (int)op << dendl;
4275 return -EINVAL;
4276 }
4277 }
4278
4279 // copy remaining
4280 if (have_next) {
4281 ::encode(nextkey, newkeydata);
4282 ::encode(nextval, newkeydata);
4283 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4284 }
4285 if (!ip.end()) {
4286 bufferlist rest;
4287 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4288 dout(20) << " keep trailing " << rest.length()
4289 << " at " << newkeydata.length() << dendl;
4290 newkeydata.claim_append(rest);
4291 }
4292
4293 // encode final key count + key data
4294 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4295 ::encode(nkeys, obl);
4296 obl.claim_append(newkeydata);
4297
4298 if (0) {
4299 dout(30) << " final is \n";
4300 obl.hexdump(*_dout);
4301 *_dout << dendl;
4302
4303 // sanity check
4304 bufferlist::iterator tp = obl.begin();
4305 bufferlist h;
4306 ::decode(h, tp);
4307 map<string,bufferlist> d;
4308 ::decode(d, tp);
4309 assert(tp.end());
4310 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4311 }
4312
4313 // write it out
4314 if (!result) {
4315 dout(20) << "tmapput write " << obl.length() << dendl;
4316 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4317 newop.op.extent.offset = 0;
4318 newop.op.extent.length = obl.length();
4319 newop.indata = obl;
4320 do_osd_ops(ctx, nops);
4321 osd_op.outdata.claim(newop.outdata);
4322 }
4323 }
4324 return result;
4325 }
4326
4327 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4328 {
4329 if (offset >= max ||
4330 length > max ||
4331 offset + length > max)
4332 return -EFBIG;
4333
4334 return 0;
4335 }
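// [editorial note] Worked example (hypothetical values): with max = 100, an
// offset of 60 and a length of 50 pass the individual bounds checks but fail
// offset + length > max (110 > 100), so the helper returns -EFBIG and the
// write is rejected before any transaction is built.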
4336
4337 struct FillInVerifyExtent : public Context {
4338 ceph_le64 *r;
4339 int32_t *rval;
4340 bufferlist *outdatap;
4341 boost::optional<uint32_t> maybe_crc;
4342 uint64_t size;
4343 OSDService *osd;
4344 hobject_t soid;
4345 __le32 flags;
4346 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4347 boost::optional<uint32_t> mc, uint64_t size,
4348 OSDService *osd, hobject_t soid, __le32 flags) :
4349 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4350 size(size), osd(osd), soid(soid), flags(flags) {}
4351 void finish(int len) override {
4352 *rval = len;
4353 *r = len;
4354 if (len < 0)
4355 return;
4356 // whole object? can we verify the checksum?
4357 if (maybe_crc && *r == size) {
4358 uint32_t crc = outdatap->crc32c(-1);
4359 if (maybe_crc != crc) {
4360 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4361 << " != expected 0x" << *maybe_crc
4362 << std::dec << " on " << soid;
4363 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4364 *rval = -EIO;
4365 *r = 0;
4366 }
4367 }
4368 }
4369 }
4370 };
4371
4372 struct ToSparseReadResult : public Context {
4373 bufferlist& data_bl;
4374 uint64_t data_offset;
4375 ceph_le64& len;
4376 ToSparseReadResult(bufferlist& bl, uint64_t offset, ceph_le64& len):
4377 data_bl(bl), data_offset(offset), len(len) {}
4378 void finish(int r) override {
4379 if (r < 0) return;
4380 len = r;
4381 bufferlist outdata;
4382 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4383 ::encode(extents, outdata);
4384 ::encode_destructively(data_bl, outdata);
4385 data_bl.swap(outdata);
4386 }
4387 };
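// [editorial note] Sketch of the reply format produced above: a sparse-read
// result is an encoded map<uint64_t, uint64_t> of extents followed by the
// concatenated data.  Because this fallback issues one contiguous read, the
// map holds the single entry {data_offset, r}; e.g. a 512-byte read at
// offset 4096 yields extents {{4096, 512}} followed by 512 data bytes.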
4388
4389 template<typename V>
4390 static string list_keys(const map<string, V>& m) {
4391 string s;
4392 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4393 if (!s.empty()) {
4394 s.push_back(',');
4395 }
4396 s.append(itr->first);
4397 }
4398 return s;
4399 }
4400
4401 template<typename T>
4402 static string list_entries(const T& m) {
4403 string s;
4404 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4405 if (!s.empty()) {
4406 s.push_back(',');
4407 }
4408 s.append(*itr);
4409 }
4410 return s;
4411 }
4412
4413 void PrimaryLogPG::maybe_create_new_object(
4414 OpContext *ctx,
4415 bool ignore_transaction)
4416 {
4417 ObjectState& obs = ctx->new_obs;
4418 if (!obs.exists) {
4419 ctx->delta_stats.num_objects++;
4420 obs.exists = true;
4421 assert(!obs.oi.is_whiteout());
4422 obs.oi.new_object();
4423 if (!ignore_transaction)
4424 ctx->op_t->create(obs.oi.soid);
4425 } else if (obs.oi.is_whiteout()) {
4426 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4427 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4428 --ctx->delta_stats.num_whiteouts;
4429 }
4430 }
4431
4432 struct C_ChecksumRead : public Context {
4433 PrimaryLogPG *primary_log_pg;
4434 OSDOp &osd_op;
4435 Checksummer::CSumType csum_type;
4436 bufferlist init_value_bl;
4437 ceph_le64 read_length;
4438 bufferlist read_bl;
4439 Context *fill_extent_ctx;
4440
4441 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4442 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4443 boost::optional<uint32_t> maybe_crc, uint64_t size,
4444 OSDService *osd, hobject_t soid, __le32 flags)
4445 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4446 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4447 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4448 &read_bl, maybe_crc, size,
4449 osd, soid, flags)) {
4450 }
4451
4452 void finish(int r) override {
4453 fill_extent_ctx->complete(r);
4454
4455 if (osd_op.rval >= 0) {
4456 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4457 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4458 &init_value_bl_it,
4459 read_bl);
4460 }
4461 }
4462 };
4463
4464 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4465 bufferlist::iterator *bl_it, bool *async_read)
4466 {
4467 dout(20) << __func__ << dendl;
4468
4469 auto& op = osd_op.op;
4470 if (op.checksum.chunk_size > 0) {
4471 if (op.checksum.length == 0) {
4472 dout(10) << __func__ << ": length required when chunk size provided"
4473 << dendl;
4474 return -EINVAL;
4475 }
4476 if (op.checksum.length % op.checksum.chunk_size != 0) {
4477 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4478 return -EINVAL;
4479 }
4480 }
4481
4482 auto& oi = ctx->new_obs.oi;
4483 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4484 // zeroed offset+length implies checksum whole object
4485 op.checksum.length = oi.size;
4486 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4487 return -EOVERFLOW;
4488 }
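// [editorial note] e.g. for a 4 MiB object, {offset = 0, length = 0} is
// rewritten to {offset = 0, length = 4194304} (checksum the whole object),
// while {offset = 4194304, length = 1} fails the bounds check above with
// -EOVERFLOW.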
4489
4490 Checksummer::CSumType csum_type;
4491 switch (op.checksum.type) {
4492 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4493 csum_type = Checksummer::CSUM_XXHASH32;
4494 break;
4495 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4496 csum_type = Checksummer::CSUM_XXHASH64;
4497 break;
4498 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4499 csum_type = Checksummer::CSUM_CRC32C;
4500 break;
4501 default:
4502 dout(10) << __func__ << ": unknown crc type ("
4503 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4504 return -EINVAL;
4505 }
4506
4507 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4508 if (bl_it->get_remaining() < csum_init_value_size) {
4509 dout(10) << __func__ << ": init value not provided" << dendl;
4510 return -EINVAL;
4511 }
4512
4513 bufferlist init_value_bl;
4514 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4515 csum_init_value_size);
4516 bl_it->advance(csum_init_value_size);
4517
4518 if (pool.info.require_rollback() && op.checksum.length > 0) {
4519 // If there is a data digest and it is possible we are reading the
4520 // entire object, pass the digest.
4521 boost::optional<uint32_t> maybe_crc;
4522 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4523 op.checksum.length >= oi.size) {
4524 maybe_crc = oi.data_digest;
4525 }
4526
4527 // async read
4528 auto& soid = oi.soid;
4529 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4530 std::move(init_value_bl), maybe_crc,
4531 oi.size, osd, soid, op.flags);
4532 ctx->pending_async_reads.push_back({
4533 {op.checksum.offset, op.checksum.length, op.flags},
4534 {&checksum_ctx->read_bl, checksum_ctx}});
4535
4536 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4537 *async_read = true;
4538 return 0;
4539 }
4540
4541 // sync read
4542 *async_read = false;
4543 std::vector<OSDOp> read_ops(1);
4544 auto& read_op = read_ops[0];
4545 if (op.checksum.length > 0) {
4546 read_op.op.op = CEPH_OSD_OP_READ;
4547 read_op.op.flags = op.flags;
4548 read_op.op.extent.offset = op.checksum.offset;
4549 read_op.op.extent.length = op.checksum.length;
4550 read_op.op.extent.truncate_size = 0;
4551 read_op.op.extent.truncate_seq = 0;
4552
4553 int r = do_osd_ops(ctx, read_ops);
4554 if (r < 0) {
4555 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4556 return r;
4557 }
4558 }
4559
4560 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4561 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4562 read_op.outdata);
4563 }
4564
4565 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4566 Checksummer::CSumType csum_type,
4567 bufferlist::iterator *init_value_bl_it,
4568 const bufferlist &read_bl) {
4569 dout(20) << __func__ << dendl;
4570
4571 auto& op = osd_op.op;
4572
4573 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4574 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4575 << op.checksum.length << dendl;
4576 return -EINVAL;
4577 }
4578
4579 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4580 op.checksum.chunk_size : read_bl.length());
4581 uint32_t csum_count = (csum_chunk_size > 0 ?
4582 read_bl.length() / csum_chunk_size : 0);
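// [editorial note] Worked example (hypothetical values): for a 4096-byte
// read with chunk_size = 1024, csum_count = 4 and one checksum is emitted
// per 1024-byte chunk; with chunk_size = 0 the whole read is a single chunk
// and csum_count = 1 (or 0 for an empty read).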
4583
4584 bufferlist csum;
4585 bufferptr csum_data;
4586 if (csum_count > 0) {
4587 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4588 csum_data = buffer::create(csum_value_size * csum_count);
4589 csum_data.zero();
4590 csum.append(csum_data);
4591
4592 switch (csum_type) {
4593 case Checksummer::CSUM_XXHASH32:
4594 {
4595 Checksummer::xxhash32::init_value_t init_value;
4596 ::decode(init_value, *init_value_bl_it);
4597 Checksummer::calculate<Checksummer::xxhash32>(
4598 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4599 &csum_data);
4600 }
4601 break;
4602 case Checksummer::CSUM_XXHASH64:
4603 {
4604 Checksummer::xxhash64::init_value_t init_value;
4605 ::decode(init_value, *init_value_bl_it);
4606 Checksummer::calculate<Checksummer::xxhash64>(
4607 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4608 &csum_data);
4609 }
4610 break;
4611 case Checksummer::CSUM_CRC32C:
4612 {
4613 Checksummer::crc32c::init_value_t init_value;
4614 ::decode(init_value, *init_value_bl_it);
4615 Checksummer::calculate<Checksummer::crc32c>(
4616 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4617 &csum_data);
4618 }
4619 break;
4620 default:
4621 break;
4622 }
4623 }
4624
4625 ::encode(csum_count, osd_op.outdata);
4626 osd_op.outdata.claim_append(csum);
4627 return 0;
4628 }
4629
4630 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
4631 {
4632 int result = 0;
4633 SnapSetContext *ssc = ctx->obc->ssc;
4634 ObjectState& obs = ctx->new_obs;
4635 object_info_t& oi = obs.oi;
4636 const hobject_t& soid = oi.soid;
4637
4638 bool first_read = true;
4639
4640 PGTransaction* t = ctx->op_t.get();
4641
4642 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
4643
4644 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
4645 OSDOp& osd_op = *p;
4646 ceph_osd_op& op = osd_op.op;
4647
4648 // TODO: check endianness (__le32 vs uint32_t, etc.)
4649 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
4650 // but the code in this function seems to treat them as native-endian. What should the
4651 // tracepoints do?
4652 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
4653
4654 dout(10) << "do_osd_op " << osd_op << dendl;
4655
4656 bufferlist::iterator bp = osd_op.indata.begin();
4657
4658 // user-visible modification?
4659 switch (op.op) {
4660 // non user-visible modifications
4661 case CEPH_OSD_OP_WATCH:
4662 case CEPH_OSD_OP_CACHE_EVICT:
4663 case CEPH_OSD_OP_CACHE_FLUSH:
4664 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
4665 case CEPH_OSD_OP_UNDIRTY:
4666 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
4667 case CEPH_OSD_OP_CACHE_PIN:
4668 case CEPH_OSD_OP_CACHE_UNPIN:
4669 case CEPH_OSD_OP_SET_REDIRECT:
4670 break;
4671 default:
4672 if (op.op & CEPH_OSD_OP_MODE_WR)
4673 ctx->user_modify = true;
4674 }
4675
4676 // munge -1 truncate to 0 truncate
4677 if (ceph_osd_op_uses_extent(op.op) &&
4678 op.extent.truncate_seq == 1 &&
4679 op.extent.truncate_size == (-1ULL)) {
4680 op.extent.truncate_size = 0;
4681 op.extent.truncate_seq = 0;
4682 }
4683
4684 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
4685 if (op.op == CEPH_OSD_OP_ZERO &&
4686 obs.exists &&
4687 op.extent.offset < cct->_conf->osd_max_object_size &&
4688 op.extent.length >= 1 &&
4689 op.extent.length <= cct->_conf->osd_max_object_size &&
4690 op.extent.offset + op.extent.length >= oi.size) {
4691 if (op.extent.offset >= oi.size) {
4692 // no-op
4693 goto fail;
4694 }
4695 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
4696 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
4697 op.op = CEPH_OSD_OP_TRUNCATE;
4698 }
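// [editorial note] e.g. with oi.size = 100, ZERO 50~60 covers bytes
// [50, 110), i.e. everything from offset 50 through the end of the object,
// so it is munged to TRUNCATE(50); ZERO 120~10 starts past the end and is
// handled as a no-op above.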
4699
4700 switch (op.op) {
4701
4702 // --- READS ---
4703
4704 case CEPH_OSD_OP_CMPEXT:
4705 ++ctx->num_read;
4706 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4707 result = do_extent_cmp(ctx, osd_op);
4708 break;
4709
4710 case CEPH_OSD_OP_SYNC_READ:
4711 if (pool.info.require_rollback()) {
4712 result = -EOPNOTSUPP;
4713 break;
4714 }
4715 // fall through
4716 case CEPH_OSD_OP_READ:
4717 ++ctx->num_read;
4718 {
4719 __u32 seq = oi.truncate_seq;
4720 uint64_t size = oi.size;
4721 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), soid.snap.val, size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4722 bool trimmed_read = false;
4723 // are we beyond truncate_size?
4724 if ( (seq < op.extent.truncate_seq) &&
4725 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4726 size = op.extent.truncate_size;
4727
4728 if (op.extent.length == 0) // a length of zero means read the whole object
4729 op.extent.length = size;
4730
4731 if (op.extent.offset >= size) {
4732 op.extent.length = 0;
4733 trimmed_read = true;
4734 } else if (op.extent.offset + op.extent.length > size) {
4735 op.extent.length = size - op.extent.offset;
4736 trimmed_read = true;
4737 }
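// [editorial note] Worked example (hypothetical values): object size 100,
// read of 40 bytes at offset 80.  If the op carries a newer truncate_seq
// with truncate_size = 90, size is first clamped to 90, and the read is
// trimmed to 80~10 with trimmed_read = true; a read at offset 120 would be
// trimmed to length 0 instead.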
4738
4739 // read into a buffer
4740 bool async = false;
4741 if (trimmed_read && op.extent.length == 0) {
4742 // the read was trimmed to zero bytes, so we should do nothing; note
4743 // that an untrimmed read of 0 bytes means "read the whole object",
4744 // which is why the trimmed_read flag is needed to tell the cases apart
4745 } else if (pool.info.require_rollback()) {
4746 async = true;
4747 boost::optional<uint32_t> maybe_crc;
4748 // If there is a data digest and it is possible we are reading the
4749 // entire object, pass the digest; FillInVerifyExtent will check
4750 // against oi.size again.
4751 if (oi.is_data_digest() && op.extent.offset == 0 &&
4752 op.extent.length >= oi.size)
4753 maybe_crc = oi.data_digest;
4754 ctx->pending_async_reads.push_back(
4755 make_pair(
4756 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4757 make_pair(&osd_op.outdata,
4758 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4759 &osd_op.outdata, maybe_crc, oi.size, osd,
4760 soid, op.flags))));
4761 dout(10) << " async_read noted for " << soid << dendl;
4762 } else {
4763 int r = pgbackend->objects_read_sync(
4764 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4765 if (r >= 0)
4766 op.extent.length = r;
4767 else {
4768 result = r;
4769 op.extent.length = 0;
4770 }
4771 dout(10) << " read got " << r << " / " << op.extent.length
4772 << " bytes from obj " << soid << dendl;
4773
4774 // whole object? can we verify the checksum?
4775 if (op.extent.length == oi.size && oi.is_data_digest()) {
4776 uint32_t crc = osd_op.outdata.crc32c(-1);
4777 if (oi.data_digest != crc) {
4778 osd->clog->error() << info.pgid << std::hex
4779 << " full-object read crc 0x" << crc
4780 << " != expected 0x" << oi.data_digest
4781 << std::dec << " on " << soid;
4782 // FIXME fall back to replica or something?
4783 result = -EIO;
4784 }
4785 }
4786 }
4787 if (first_read) {
4788 first_read = false;
4789 ctx->data_off = op.extent.offset;
4790 }
4791 // XXX op.extent.length is the requested length for an async read;
4792 // on error it is reset to 0 after the error comes back.
4793 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4794 ctx->delta_stats.num_rd++;
4795
4796 // Skip checking the result and just proceed to the next operation
4797 if (async)
4798 continue;
4799
4800 }
4801 break;
4802
4803 case CEPH_OSD_OP_CHECKSUM:
4804 ++ctx->num_read;
4805 {
4806 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
4807 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
4808 op.checksum.offset, op.checksum.length,
4809 op.checksum.chunk_size);
4810
4811 bool async_read;
4812 result = do_checksum(ctx, osd_op, &bp, &async_read);
4813 if (result == 0 && async_read) {
4814 continue;
4815 }
4816 }
4817 break;
4818
4819 /* map extents */
4820 case CEPH_OSD_OP_MAPEXT:
4821 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
4822 if (pool.info.require_rollback()) {
4823 result = -EOPNOTSUPP;
4824 break;
4825 }
4826 ++ctx->num_read;
4827 {
4828 // read into a buffer
4829 bufferlist bl;
4830 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4831 info.pgid.shard),
4832 op.extent.offset, op.extent.length, bl);
4833 osd_op.outdata.claim(bl);
4834 if (r < 0)
4835 result = r;
4836 else
4837 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
4838 ctx->delta_stats.num_rd++;
4839 dout(10) << " map_extents done on object " << soid << dendl;
4840 }
4841 break;
4842
4843 /* sparse read */
4844 case CEPH_OSD_OP_SPARSE_READ:
4845 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4846 if (op.extent.truncate_seq) {
4847 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4848 result = -EINVAL;
4849 break;
4850 }
4851 ++ctx->num_read;
4852 if (pool.info.ec_pool()) {
4853 // translate sparse read to a normal one if not supported
4854 uint64_t offset = op.extent.offset;
4855 uint64_t length = op.extent.length;
4856 if (offset > oi.size) {
4857 length = 0;
4858 } else if (offset + length > oi.size) {
4859 length = oi.size - offset;
4860 }
4861 if (length > 0) {
4862 ctx->pending_async_reads.push_back(
4863 make_pair(
4864 boost::make_tuple(offset, length, op.flags),
4865 make_pair(
4866 &osd_op.outdata,
4867 new ToSparseReadResult(
4868 osd_op.outdata, offset,
4869 op.extent.length /* updated by the callback */))));
4870 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4871 } else {
4872 dout(10) << " sparse read ended up empty for " << soid << dendl;
4873 map<uint64_t, uint64_t> extents;
4874 ::encode(extents, osd_op.outdata);
4875 }
4876 } else {
4877 // read into a buffer
4878 map<uint64_t, uint64_t> m;
4879 uint32_t total_read = 0;
4880 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4881 info.pgid.shard),
4882 op.extent.offset, op.extent.length, m);
4883 if (r < 0) {
4884 result = r;
4885 break;
4886 }
4887 map<uint64_t, uint64_t>::iterator miter;
4888 bufferlist data_bl;
4889 uint64_t last = op.extent.offset;
4890 for (miter = m.begin(); miter != m.end(); ++miter) {
4891 // verify hole?
4892 if (cct->_conf->osd_verify_sparse_read_holes &&
4893 last < miter->first) {
4894 bufferlist t;
4895 uint64_t len = miter->first - last;
4896 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4897 if (r < 0) {
4898 osd->clog->error() << coll << " " << soid
4899 << " sparse-read failed to read: "
4900 << r;
4901 } else if (!t.is_zero()) {
4902 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4903 << last << "~" << len;
4904 }
4905 }
4906
4907 bufferlist tmpbl;
4908 r = pgbackend->objects_read_sync(soid, miter->first, miter->second, op.flags, &tmpbl);
4909 if (r < 0) {
4910 result = r;
4911 break;
4912 }
4913
4914 if (r < (int)miter->second) /* this usually happens when the returned extent exceeds the actual object size */
4915 miter->second = r;
4916 total_read += r;
4917 dout(10) << "sparse-read " << miter->first << "@" << miter->second << dendl;
4918 data_bl.claim_append(tmpbl);
4919 last = miter->first + r;
4920 }
4921
4922 if (r < 0) {
4923 result = r;
4924 break;
4925 }
4926
4927 // verify trailing hole?
4928 if (cct->_conf->osd_verify_sparse_read_holes) {
4929 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4930 if (last < end) {
4931 bufferlist t;
4932 uint64_t len = end - last;
4933 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4934 if (r < 0) {
4935 osd->clog->error() << coll << " " << soid
4936 << " sparse-read failed to read: "
4937 << r;
4938 } else if (!t.is_zero()) {
4939 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4940 << last << "~" << len;
4941 }
4942 }
4943 }
4944
4945 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read.
4946 // There may be few fully-written objects at first, but more appear with
4947 // continued use, so verifying the data digest on sparse reads makes sense.
4948 if (total_read == oi.size && oi.is_data_digest()) {
4949 uint32_t crc = data_bl.crc32c(-1);
4950 if (oi.data_digest != crc) {
4951 osd->clog->error() << info.pgid << std::hex
4952 << " full-object read crc 0x" << crc
4953 << " != expected 0x" << oi.data_digest
4954 << std::dec << " on " << soid;
4955 // FIXME fall back to replica or something?
4956 result = -EIO;
4957 break;
4958 }
4959 }
4960
4961 op.extent.length = total_read;
4962
4963 ::encode(m, osd_op.outdata); // re-encode since it might be modified
4964 ::encode_destructively(data_bl, osd_op.outdata);
4965
4966 dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl;
4967 }
4968 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4969 ctx->delta_stats.num_rd++;
4970 break;
4971
4972 case CEPH_OSD_OP_CALL:
4973 {
4974 string cname, mname;
4975 bufferlist indata;
4976 try {
4977 bp.copy(op.cls.class_len, cname);
4978 bp.copy(op.cls.method_len, mname);
4979 bp.copy(op.cls.indata_len, indata);
4980 } catch (buffer::error& e) {
4981 dout(10) << "call unable to decode class + method + indata" << dendl;
4982 dout(30) << "in dump: ";
4983 osd_op.indata.hexdump(*_dout);
4984 *_dout << dendl;
4985 result = -EINVAL;
4986 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
4987 break;
4988 }
4989 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
4990
4991 ClassHandler::ClassData *cls;
4992 result = osd->class_handler->open_class(cname, &cls);
4993 assert(result == 0); // init_op_flags() already verified this works.
4994
4995 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
4996 if (!method) {
4997 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
4998 result = -EOPNOTSUPP;
4999 break;
5000 }
5001
5002 int flags = method->get_flags();
5003 if (flags & CLS_METHOD_WR)
5004 ctx->user_modify = true;
5005
5006 bufferlist outdata;
5007 dout(10) << "call method " << cname << "." << mname << dendl;
5008 int prev_rd = ctx->num_read;
5009 int prev_wr = ctx->num_write;
5010 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5011
5012 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5013 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5014 result = -EIO;
5015 break;
5016 }
5017 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5018 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5019 result = -EIO;
5020 break;
5021 }
5022
5023 dout(10) << "method called response length=" << outdata.length() << dendl;
5024 op.extent.length = outdata.length();
5025 osd_op.outdata.claim_append(outdata);
5026 dout(30) << "out dump: ";
5027 osd_op.outdata.hexdump(*_dout);
5028 *_dout << dendl;
5029 }
5030 break;
5031
5032 case CEPH_OSD_OP_STAT:
5033 // note: stat does not require RD
5034 {
5035 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5036
5037 if (obs.exists && !oi.is_whiteout()) {
5038 ::encode(oi.size, osd_op.outdata);
5039 ::encode(oi.mtime, osd_op.outdata);
5040 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5041 } else {
5042 result = -ENOENT;
5043 dout(10) << "stat oi object does not exist" << dendl;
5044 }
5045
5046 ctx->delta_stats.num_rd++;
5047 }
5048 break;
5049
5050 case CEPH_OSD_OP_ISDIRTY:
5051 ++ctx->num_read;
5052 {
5053 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5054 bool is_dirty = obs.oi.is_dirty();
5055 ::encode(is_dirty, osd_op.outdata);
5056 ctx->delta_stats.num_rd++;
5057 result = 0;
5058 }
5059 break;
5060
5061 case CEPH_OSD_OP_UNDIRTY:
5062 ++ctx->num_write;
5063 {
5064 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5065 if (oi.is_dirty()) {
5066 ctx->undirty = true; // see make_writeable()
5067 ctx->modify = true;
5068 ctx->delta_stats.num_wr++;
5069 }
5070 result = 0;
5071 }
5072 break;
5073
5074 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5075 ++ctx->num_write;
5076 {
5077 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5078 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5079 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5080 result = -EINVAL;
5081 break;
5082 }
5083 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5084 result = -EINVAL;
5085 break;
5086 }
5087 if (!obs.exists) {
5088 result = 0;
5089 break;
5090 }
5091 if (oi.is_cache_pinned()) {
5092 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5093 result = -EPERM;
5094 break;
5095 }
5096 if (oi.is_dirty()) {
5097 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5098 if (result == -EINPROGRESS)
5099 result = -EAGAIN;
5100 } else {
5101 result = 0;
5102 }
5103 }
5104 break;
5105
5106 case CEPH_OSD_OP_CACHE_FLUSH:
5107 ++ctx->num_write;
5108 {
5109 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5110 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5111 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5112 result = -EINVAL;
5113 break;
5114 }
5115 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5116 result = -EINVAL;
5117 break;
5118 }
5119 if (!obs.exists) {
5120 result = 0;
5121 break;
5122 }
5123 if (oi.is_cache_pinned()) {
5124 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5125 result = -EPERM;
5126 break;
5127 }
5128 hobject_t missing;
5129 if (oi.is_dirty()) {
5130 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5131 if (result == -EINPROGRESS)
5132 result = -EAGAIN;
5133 } else {
5134 result = 0;
5135 }
5136 // Check for the special return value indicating that 'missing' was set
5137 if (result == -ENOENT) {
5138 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5139 assert(!missing.is_min());
5140 wait_for_unreadable_object(missing, ctx->op);
5141 // Return the error code used elsewhere when wait_for_unreadable_object() is invoked
5142 result = -EAGAIN;
5143 }
5144 }
5145 break;
5146
5147 case CEPH_OSD_OP_CACHE_EVICT:
5148 ++ctx->num_write;
5149 {
5150 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5151 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5152 result = -EINVAL;
5153 break;
5154 }
5155 if (!obs.exists) {
5156 result = 0;
5157 break;
5158 }
5159 if (oi.is_cache_pinned()) {
5160 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5161 result = -EPERM;
5162 break;
5163 }
5164 if (oi.is_dirty()) {
5165 result = -EBUSY;
5166 break;
5167 }
5168 if (!oi.watchers.empty()) {
5169 result = -EBUSY;
5170 break;
5171 }
5172 if (soid.snap == CEPH_NOSNAP) {
5173 result = _verify_no_head_clones(soid, ssc->snapset);
5174 if (result < 0)
5175 break;
5176 }
5177 result = _delete_oid(ctx, true, false);
5178 if (result >= 0) {
5179 // mark that this is a cache eviction to avoid triggering normal
5180 // make_writeable() clone or snapdir object creation in finish_ctx()
5181 ctx->cache_evict = true;
5182 }
5183 osd->logger->inc(l_osd_tier_evict);
5184 }
5185 break;
5186
5187 case CEPH_OSD_OP_GETXATTR:
5188 ++ctx->num_read;
5189 {
5190 string aname;
5191 bp.copy(op.xattr.name_len, aname);
5192 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5193 string name = "_" + aname;
5194 int r = getattr_maybe_cache(
5195 ctx->obc,
5196 name,
5197 &(osd_op.outdata));
5198 if (r >= 0) {
5199 op.xattr.value_len = osd_op.outdata.length();
5200 result = 0;
5201 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5202 } else
5203 result = r;
5204
5205 ctx->delta_stats.num_rd++;
5206 }
5207 break;
5208
5209 case CEPH_OSD_OP_GETXATTRS:
5210 ++ctx->num_read;
5211 {
5212 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5213 map<string, bufferlist> out;
5214 result = getattrs_maybe_cache(
5215 ctx->obc,
5216 &out,
5217 true);
5218
5219 bufferlist bl;
5220 ::encode(out, bl);
5221 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5222 ctx->delta_stats.num_rd++;
5223 osd_op.outdata.claim_append(bl);
5224 }
5225 break;
5226
5227 case CEPH_OSD_OP_CMPXATTR:
5228 ++ctx->num_read;
5229 {
5230 string aname;
5231 bp.copy(op.xattr.name_len, aname);
5232 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5233 string name = "_" + aname;
5234 name[op.xattr.name_len + 1] = 0;
5235
5236 bufferlist xattr;
5237 result = getattr_maybe_cache(
5238 ctx->obc,
5239 name,
5240 &xattr);
5241 if (result < 0 && result != -EEXIST && result != -ENODATA)
5242 break;
5243
5244 ctx->delta_stats.num_rd++;
5245 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5246
5247 switch (op.xattr.cmp_mode) {
5248 case CEPH_OSD_CMPXATTR_MODE_STRING:
5249 {
5250 string val;
5251 bp.copy(op.xattr.value_len, val);
5252 val[op.xattr.value_len] = 0;
5253 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5254 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5255 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5256 }
5257 break;
5258
5259 case CEPH_OSD_CMPXATTR_MODE_U64:
5260 {
5261 uint64_t u64val;
5262 try {
5263 ::decode(u64val, bp);
5264 }
5265 catch (buffer::error& e) {
5266 result = -EINVAL;
5267 goto fail;
5268 }
5269 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5270 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5271 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5272 }
5273 break;
5274
5275 default:
5276 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5277 result = -EINVAL;
5278 }
5279
5280 if (!result) {
5281 dout(10) << "comparison returned false" << dendl;
5282 result = -ECANCELED;
5283 break;
5284 }
5285 if (result < 0) {
5286 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5287 break;
5288 }
5289
5290 dout(10) << "comparison returned true" << dendl;
5291 }
5292 break;
5293
5294 case CEPH_OSD_OP_ASSERT_VER:
5295 ++ctx->num_read;
5296 {
5297 uint64_t ver = op.assert_ver.ver;
5298 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5299 if (!ver)
5300 result = -EINVAL;
5301 else if (ver < oi.user_version)
5302 result = -ERANGE;
5303 else if (ver > oi.user_version)
5304 result = -EOVERFLOW;
5305 }
5306 break;
5307
5308 case CEPH_OSD_OP_LIST_WATCHERS:
5309 ++ctx->num_read;
5310 {
5311 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5312 obj_list_watch_response_t resp;
5313
5314 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5315 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5316 ++oi_iter) {
5317 dout(20) << "key cookie=" << oi_iter->first.first
5318 << " entity=" << oi_iter->first.second << " "
5319 << oi_iter->second << dendl;
5320 assert(oi_iter->first.first == oi_iter->second.cookie);
5321 assert(oi_iter->first.second.is_client());
5322
5323 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5324 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5325 resp.entries.push_back(wi);
5326 }
5327
5328 resp.encode(osd_op.outdata, ctx->get_features());
5329 result = 0;
5330
5331 ctx->delta_stats.num_rd++;
5332 break;
5333 }
5334
5335 case CEPH_OSD_OP_LIST_SNAPS:
5336 ++ctx->num_read;
5337 {
5338 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5339 obj_list_snap_response_t resp;
5340
5341 if (!ssc) {
5342 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5343 }
5344 assert(ssc);
5345
5346 int clonecount = ssc->snapset.clones.size();
5347 if (ssc->snapset.head_exists)
5348 clonecount++;
5349 resp.clones.reserve(clonecount);
5350 for (auto clone_iter = ssc->snapset.clones.begin();
5351 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5352 clone_info ci;
5353 ci.cloneid = *clone_iter;
5354
5355 hobject_t clone_oid = soid;
5356 clone_oid.snap = *clone_iter;
5357
5358 if (!ssc->snapset.is_legacy()) {
5359 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5360 if (p == ssc->snapset.clone_snaps.end()) {
5361 osd->clog->error() << "osd." << osd->whoami
5362 << ": inconsistent clone_snaps found for oid "
5363 << soid << " clone " << *clone_iter
5364 << " snapset " << ssc->snapset;
5365 result = -EINVAL;
5366 break;
5367 }
5368 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5369 ci.snaps.push_back(*q);
5370 }
5371 } else {
5372 /* No need to take a lock here. We are only inspecting state cached
5373 * in the ObjectContext, so we aren't performing an actual read unless
5374 * the clone obc is not already loaded (in which case, it cannot have
5375 * an in progress write). We also do not risk exposing uncommitted
5376 * state since we do have a read lock on the head object or snapdir,
5377 * which we would have to write lock in order to make user visible
5378 * modifications to the snapshot state (snap trim related mutations
5379 * are not user visible).
5380 */
5381 if (is_missing_object(clone_oid)) {
5382 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5383 wait_for_unreadable_object(clone_oid, ctx->op);
5384 result = -EAGAIN;
5385 break;
5386 }
5387
5388 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5389 if (!clone_obc) {
5390 if (maybe_handle_cache(
5391 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5392 // promoting the clone
5393 result = -EAGAIN;
5394 } else {
5395 osd->clog->error() << "osd." << osd->whoami
5396 << ": missing clone " << clone_oid
5397 << " for oid "
5398 << soid;
5399 // should not happen
5400 result = -ENOENT;
5401 }
5402 break;
5403 }
5404 for (vector<snapid_t>::reverse_iterator p =
5405 clone_obc->obs.oi.legacy_snaps.rbegin();
5406 p != clone_obc->obs.oi.legacy_snaps.rend();
5407 ++p) {
5408 ci.snaps.push_back(*p);
5409 }
5410 }
5411
5412 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5413
5414 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5415 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5416 if (coi == ssc->snapset.clone_overlap.end()) {
5417 osd->clog->error() << "osd." << osd->whoami
5418 << ": inconsistent clone_overlap found for oid "
5419 << soid << " clone " << *clone_iter;
5420 result = -EINVAL;
5421 break;
5422 }
5423 const interval_set<uint64_t> &o = coi->second;
5424 ci.overlap.reserve(o.num_intervals());
5425 for (interval_set<uint64_t>::const_iterator r = o.begin();
5426 r != o.end(); ++r) {
5427 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5428 r.get_len()));
5429 }
5430
5431 map<snapid_t, uint64_t>::const_iterator si;
5432 si = ssc->snapset.clone_size.find(ci.cloneid);
5433 if (si == ssc->snapset.clone_size.end()) {
5434 osd->clog->error() << "osd." << osd->whoami
5435 << ": inconsistent clone_size found for oid "
5436 << soid << " clone " << *clone_iter;
5437 result = -EINVAL;
5438 break;
5439 }
5440 ci.size = si->second;
5441
5442 resp.clones.push_back(ci);
5443 }
5444 if (result < 0) {
5445 break;
5446 }
5447 if (ssc->snapset.head_exists &&
5448 !ctx->obc->obs.oi.is_whiteout()) {
5449 assert(obs.exists);
5450 clone_info ci;
5451 ci.cloneid = CEPH_NOSNAP;
5452
5453 // Size for HEAD is oi.size
5454 ci.size = oi.size;
5455
5456 resp.clones.push_back(ci);
5457 }
5458 resp.seq = ssc->snapset.seq;
5459
5460 resp.encode(osd_op.outdata);
5461 result = 0;
5462
5463 ctx->delta_stats.num_rd++;
5464 break;
5465 }
5466
5467 case CEPH_OSD_OP_NOTIFY:
5468 ++ctx->num_read;
5469 {
5470 uint32_t timeout;
5471 bufferlist bl;
5472
5473 try {
5474 uint32_t ver; // obsolete
5475 ::decode(ver, bp);
5476 ::decode(timeout, bp);
5477 ::decode(bl, bp);
5478 } catch (const buffer::error &e) {
5479 timeout = 0;
5480 }
5481 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5482 if (!timeout)
5483 timeout = cct->_conf->osd_default_notify_timeout;
5484
5485 notify_info_t n;
5486 n.timeout = timeout;
5487 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5488 n.cookie = op.watch.cookie;
5489 n.bl = bl;
5490 ctx->notifies.push_back(n);
5491
5492 // return our unique notify id to the client
5493 ::encode(n.notify_id, osd_op.outdata);
5494 }
5495 break;
5496
5497 case CEPH_OSD_OP_NOTIFY_ACK:
5498 ++ctx->num_read;
5499 {
5500 try {
5501 uint64_t notify_id = 0;
5502 uint64_t watch_cookie = 0;
5503 ::decode(notify_id, bp);
5504 ::decode(watch_cookie, bp);
5505 bufferlist reply_bl;
5506 if (!bp.end()) {
5507 ::decode(reply_bl, bp);
5508 }
5509 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5510 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5511 ctx->notify_acks.push_back(ack);
5512 } catch (const buffer::error &e) {
5513 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5514 OpContext::NotifyAck ack(
5515 // op.watch.cookie is actually the notify_id for historical reasons
5516 op.watch.cookie
5517 );
5518 ctx->notify_acks.push_back(ack);
5519 }
5520 }
5521 break;
5522
5523 case CEPH_OSD_OP_SETALLOCHINT:
5524 ++ctx->num_write;
5525 {
5526 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5527 maybe_create_new_object(ctx);
5528 oi.expected_object_size = op.alloc_hint.expected_object_size;
5529 oi.expected_write_size = op.alloc_hint.expected_write_size;
5530 oi.alloc_hint_flags = op.alloc_hint.flags;
5531 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5532 op.alloc_hint.expected_write_size,
5533 op.alloc_hint.flags);
5534 ctx->delta_stats.num_wr++;
5535 result = 0;
5536 }
5537 break;
5538
5539
5540 // --- WRITES ---
5541
5542 // -- object data --
5543
5544 case CEPH_OSD_OP_WRITE:
5545 ++ctx->num_write;
5546 { // write
5547 __u32 seq = oi.truncate_seq;
5548 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5549 if (op.extent.length != osd_op.indata.length()) {
5550 result = -EINVAL;
5551 break;
5552 }
5553
5554 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5555 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5556
5557 if (pool.info.requires_aligned_append() &&
5558 (op.extent.offset % pool.info.required_alignment() != 0)) {
5559 result = -EOPNOTSUPP;
5560 break;
5561 }
5562
5563 if (!obs.exists) {
5564 if (pool.info.requires_aligned_append() && op.extent.offset) {
5565 result = -EOPNOTSUPP;
5566 break;
5567 }
5568 } else if (op.extent.offset != oi.size &&
5569 pool.info.requires_aligned_append()) {
5570 result = -EOPNOTSUPP;
5571 break;
5572 }
5573
5574 if (seq && (seq > op.extent.truncate_seq) &&
5575 (op.extent.offset + op.extent.length > oi.size)) {
5576 // old write, arrived after trimtrunc
5577 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5578 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5579 << ", adjusting write length to " << op.extent.length << dendl;
5580 bufferlist t;
5581 t.substr_of(osd_op.indata, 0, op.extent.length);
5582 osd_op.indata.swap(t);
5583 }
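// Worked example of the clip above: oi.size=100, current truncate_seq=6;
// a straggling write (truncate_seq=5) of 50 bytes at offset 80 is clipped
// to 100-80=20 bytes so it cannot resurrect data past the trimtrunc
// point, and a write entirely beyond oi.size is clipped to length 0.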
5584 if (op.extent.truncate_seq > seq) {
5585 // write arrives before trimtrunc
5586 if (obs.exists && !oi.is_whiteout()) {
5587 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5588 << ", truncating to " << op.extent.truncate_size << dendl;
5589 t->truncate(soid, op.extent.truncate_size);
5590 oi.truncate_seq = op.extent.truncate_seq;
5591 oi.truncate_size = op.extent.truncate_size;
5592 if (op.extent.truncate_size != oi.size) {
5593 ctx->delta_stats.num_bytes -= oi.size;
5594 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5595 oi.size = op.extent.truncate_size;
5596 }
5597 } else {
5598 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5599 << ", but object is new" << dendl;
5600 oi.truncate_seq = op.extent.truncate_seq;
5601 oi.truncate_size = op.extent.truncate_size;
5602 }
5603 }
5604 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5605 if (result < 0)
5606 break;
5607
5608 maybe_create_new_object(ctx);
5609
5610 if (op.extent.length == 0) {
5611 if (op.extent.offset > oi.size) {
5612 t->truncate(
5613 soid, op.extent.offset);
5614 } else {
5615 t->nop(soid);
5616 }
5617 } else {
5618 t->write(
5619 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5620 }
5621
5622 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5623 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5624 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5625 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5626 else
5627 obs.oi.clear_data_digest();
5628 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5629 op.extent.offset, op.extent.length);
5630
5631 }
5632 break;
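// The digest bookkeeping above relies on crc32c being resumable: a full
// overwrite is seeded with -1, while a pure append is seeded with the old
// digest, which yields the digest of the concatenated bytes, e.g.
//
//   uint32_t d  = head_bl.crc32c(-1);  // digest of the existing data
//   uint32_t d2 = tail_bl.crc32c(d);   // == crc32c of head_bl + tail_bl
//
// Any other overlap pattern leaves the whole-object digest unknown, hence
// clear_data_digest().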
5633
5634 case CEPH_OSD_OP_WRITEFULL:
5635 ++ctx->num_write;
5636 { // write full object
5637 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5638
5639 if (op.extent.length != osd_op.indata.length()) {
5640 result = -EINVAL;
5641 break;
5642 }
5643 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5644 if (result < 0)
5645 break;
5646
5647 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5648 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5649
5650 maybe_create_new_object(ctx);
5651 if (pool.info.require_rollback()) {
5652 t->truncate(soid, 0);
5653 } else if (obs.exists && op.extent.length < oi.size) {
5654 t->truncate(soid, op.extent.length);
5655 }
5656 if (op.extent.length) {
5657 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5658 }
5659 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5660
5661 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5662 0, op.extent.length, true);
5663 }
5664 break;
5665
5666 case CEPH_OSD_OP_WRITESAME:
5667 ++ctx->num_write;
5668 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5669 result = do_writesame(ctx, osd_op);
5670 break;
5671
5672 case CEPH_OSD_OP_ROLLBACK :
5673 ++ctx->num_write;
5674 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5675 result = _rollback_to(ctx, op);
5676 break;
5677
5678 case CEPH_OSD_OP_ZERO:
5679 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5680 if (pool.info.requires_aligned_append()) {
5681 result = -EOPNOTSUPP;
5682 break;
5683 }
5684 ++ctx->num_write;
5685 { // zero
5686 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5687 if (result < 0)
5688 break;
5689 assert(op.extent.length);
5690 if (obs.exists && !oi.is_whiteout()) {
5691 t->zero(soid, op.extent.offset, op.extent.length);
5692 interval_set<uint64_t> ch;
5693 ch.insert(op.extent.offset, op.extent.length);
5694 ctx->modified_ranges.union_of(ch);
5695 ctx->delta_stats.num_wr++;
5696 oi.clear_data_digest();
5697 } else {
5698 // no-op
5699 }
5700 }
5701 break;
5702 case CEPH_OSD_OP_CREATE:
5703 ++ctx->num_write;
5704 {
5705 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5706 int flags = le32_to_cpu(op.flags);
5707 if (obs.exists && !oi.is_whiteout() &&
5708 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5709 result = -EEXIST; /* this is an exclusive create */
5710 } else {
5711 if (osd_op.indata.length()) {
5712 bufferlist::iterator p = osd_op.indata.begin();
5713 string category;
5714 try {
5715 ::decode(category, p);
5716 }
5717 catch (buffer::error& e) {
5718 result = -EINVAL;
5719 goto fail;
5720 }
5721 // category is no longer implemented.
5722 }
5723 if (result >= 0) {
5724 maybe_create_new_object(ctx);
5725 t->nop(soid);
5726 }
5727 }
5728 }
5729 break;
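// Client-side view of the EXCL handling above (a sketch; the object name
// is illustrative, assuming the public librados C++ API):
//
//   int r = ioctx.create("myobj", true /* exclusive */);
//   // r == -EEXIST if the head already exists and is not a whiteout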
5730
5731 case CEPH_OSD_OP_TRIMTRUNC:
5732 op.extent.offset = op.extent.truncate_size;
5733 // falling through
5734
5735 case CEPH_OSD_OP_TRUNCATE:
5736 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5737 if (pool.info.requires_aligned_append()) {
5738 result = -EOPNOTSUPP;
5739 break;
5740 }
5741 ++ctx->num_write;
5742 {
5743 // truncate
5744 if (!obs.exists || oi.is_whiteout()) {
5745 dout(10) << " object dne, truncate is a no-op" << dendl;
5746 break;
5747 }
5748
5749 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5750 result = -EFBIG;
5751 break;
5752 }
5753
5754 if (op.extent.truncate_seq) {
5755 assert(op.extent.offset == op.extent.truncate_size);
5756 if (op.extent.truncate_seq <= oi.truncate_seq) {
5757 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5758 << ", no-op" << dendl;
5759 break; // old
5760 }
5761 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5762 << ", truncating" << dendl;
5763 oi.truncate_seq = op.extent.truncate_seq;
5764 oi.truncate_size = op.extent.truncate_size;
5765 }
5766
5767 maybe_create_new_object(ctx);
5768 t->truncate(soid, op.extent.offset);
5769 if (oi.size > op.extent.offset) {
5770 interval_set<uint64_t> trim;
5771 trim.insert(op.extent.offset, oi.size-op.extent.offset);
5772 ctx->modified_ranges.union_of(trim);
5773 }
5774 if (op.extent.offset != oi.size) {
5775 ctx->delta_stats.num_bytes -= oi.size;
5776 ctx->delta_stats.num_bytes += op.extent.offset;
5777 oi.size = op.extent.offset;
5778 }
5779 ctx->delta_stats.num_wr++;
5780 // do not set exists, or we will break the DELETE -> TRUNCATE munging above.
5781
5782 oi.clear_data_digest();
5783 }
5784 break;
5785
5786 case CEPH_OSD_OP_DELETE:
5787 ++ctx->num_write;
5788 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5789 {
5790 result = _delete_oid(ctx, false, ctx->ignore_cache);
5791 }
5792 break;
5793
5794 case CEPH_OSD_OP_WATCH:
5795 ++ctx->num_write;
5796 {
5797 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
5798 op.watch.cookie, op.watch.op);
5799 if (!obs.exists) {
5800 result = -ENOENT;
5801 break;
5802 }
5803 uint64_t cookie = op.watch.cookie;
5804 entity_name_t entity = ctx->reqid.name;
5805 ObjectContextRef obc = ctx->obc;
5806
5807 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
5808 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
5809 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
5810 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
5811 dout(10) << "watch: peer_addr="
5812 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
5813
5814 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
5815 if (op.watch.timeout != 0) {
5816 timeout = op.watch.timeout;
5817 }
5818
5819 watch_info_t w(cookie, timeout,
5820 ctx->op->get_req()->get_connection()->get_peer_addr());
5821 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
5822 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
5823 if (oi.watchers.count(make_pair(cookie, entity))) {
5824 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5825 } else {
5826 dout(10) << " registered new watch " << w << " by " << entity << dendl;
5827 oi.watchers[make_pair(cookie, entity)] = w;
5828 t->nop(soid); // make sure we update the object_info on disk!
5829 }
5830 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
5831 ctx->watch_connects.push_back(make_pair(w, will_ping));
5832 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
5833 if (!oi.watchers.count(make_pair(cookie, entity))) {
5834 result = -ENOTCONN;
5835 break;
5836 }
5837 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5838 ctx->watch_connects.push_back(make_pair(w, true));
5839 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
5840 /* Note: WATCH with PING doesn't cause may_write() to return true,
5841 * so if there is nothing else in the transaction, this is going
5842 * to run do_osd_op_effects, but not write out a log entry */
5843 if (!oi.watchers.count(make_pair(cookie, entity))) {
5844 result = -ENOTCONN;
5845 break;
5846 }
5847 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
5848 obc->watchers.find(make_pair(cookie, entity));
5849 if (p == obc->watchers.end() ||
5850 !p->second->is_connected()) {
5851 // client needs to reconnect
5852 result = -ETIMEDOUT;
5853 break;
5854 }
5855 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5856 p->second->got_ping(ceph_clock_now());
5857 result = 0;
5858 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
5859 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
5860 oi.watchers.find(make_pair(cookie, entity));
5861 if (oi_iter != oi.watchers.end()) {
5862 dout(10) << " removed watch " << oi_iter->second << " by "
5863 << entity << dendl;
5864 oi.watchers.erase(oi_iter);
5865 t->nop(soid); // update oi on disk
5866 ctx->watch_disconnects.push_back(
5867 watch_disconnect_t(cookie, entity, false));
5868 } else {
5869 dout(10) << " can't remove: no watch by " << entity << dendl;
5870 }
5871 }
5872 }
5873 break;
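// Client-side lifecycle matching the WATCH/RECONNECT/PING/UNWATCH subops
// above (a sketch; names are illustrative, assuming the public librados
// C++ API):
//
//   uint64_t cookie;
//   ioctx.watch2("myobj", &cookie, &watch_ctx);  // WATCH, with pinging
//   ioctx.unwatch2(cookie);                      // UNWATCH
//
// librados pings the watch in the background; a ping that finds no
// connected server-side watch surfaces as -ENOTCONN or -ETIMEDOUT, and
// the client is expected to re-watch.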
5874
5875 case CEPH_OSD_OP_CACHE_PIN:
5876 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
5877 if ((!pool.info.is_tier() ||
5878 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5879 result = -EINVAL;
5880 dout(10) << " pin object is only allowed on the cache tier " << dendl;
5881 break;
5882 }
5883 ++ctx->num_write;
5884 {
5885 if (!obs.exists || oi.is_whiteout()) {
5886 result = -ENOENT;
5887 break;
5888 }
5889
5890 if (!oi.is_cache_pinned()) {
5891 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
5892 ctx->modify = true;
5893 ctx->delta_stats.num_objects_pinned++;
5894 ctx->delta_stats.num_wr++;
5895 }
5896 result = 0;
5897 }
5898 break;
5899
5900 case CEPH_OSD_OP_CACHE_UNPIN:
5901 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
5902 if ((!pool.info.is_tier() ||
5903 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5904 result = -EINVAL;
5905 dout(10) << " unpin object is only allowed on the cache tier " << dendl;
5906 break;
5907 }
5908 ++ctx->num_write;
5909 {
5910 if (!obs.exists || oi.is_whiteout()) {
5911 result = -ENOENT;
5912 break;
5913 }
5914
5915 if (oi.is_cache_pinned()) {
5916 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
5917 ctx->modify = true;
5918 ctx->delta_stats.num_objects_pinned--;
5919 ctx->delta_stats.num_wr++;
5920 }
5921 result = 0;
5922 }
5923 break;
5924
5925 case CEPH_OSD_OP_SET_REDIRECT:
5926 ++ctx->num_write;
5927 {
5928 if (pool.info.is_tier()) {
5929 result = -EINVAL;
5930 break;
5931 }
5932 if (!obs.exists) {
5933 result = -ENOENT;
5934 break;
5935 }
5936 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5937 result = -EOPNOTSUPP;
5938 break;
5939 }
5940
5941 object_t target_name;
5942 object_locator_t target_oloc;
5943 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
5944 version_t target_version = op.copy_from.src_version;
5945 try {
5946 ::decode(target_name, bp);
5947 ::decode(target_oloc, bp);
5948 }
5949 catch (buffer::error& e) {
5950 result = -EINVAL;
5951 goto fail;
5952 }
5953 pg_t raw_pg;
5954 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
5955 hobject_t target(target_name, target_oloc.key, target_snapid,
5956 raw_pg.ps(), raw_pg.pool(),
5957 target_oloc.nspace);
5958 if (target == soid) {
5959 dout(20) << " set-redirect self is invalid" << dendl;
5960 result = -EINVAL;
5961 break;
5962 }
5963 oi.set_flag(object_info_t::FLAG_MANIFEST);
5964 oi.manifest.redirect_target = target;
5965 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
5966 t->truncate(soid, 0);
5967 if (oi.is_omap() && pool.info.supports_omap()) {
5968 t->omap_clear(soid);
5969 obs.oi.clear_omap_digest();
5970 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
5971 }
5972 ctx->delta_stats.num_bytes -= oi.size;
5973 oi.size = 0;
5974 oi.new_object();
5975 oi.user_version = target_version;
5976 ctx->user_at_version = target_version;
5977 /* rm_attrs */
5978 map<string,bufferlist> rmattrs;
5979 result = getattrs_maybe_cache(ctx->obc,
5980 &rmattrs,
5981 true);
5982 if (result < 0) {
5983 return result;
5984 }
5985 map<string, bufferlist>::iterator iter;
5986 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
5987 const string& name = iter->first;
5988 t->rmattr(soid, name);
5989 }
5990 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
5991 }
5992
5993 break;
5994
5995 // -- object attrs --
5996
5997 case CEPH_OSD_OP_SETXATTR:
5998 ++ctx->num_write;
5999 {
6000 if (cct->_conf->osd_max_attr_size > 0 &&
6001 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6002 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6003 result = -EFBIG;
6004 break;
6005 }
6006 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6007 cct->_conf->osd_max_attr_name_len);
6008 if (op.xattr.name_len > max_name_len) {
6009 result = -ENAMETOOLONG;
6010 break;
6011 }
6012 maybe_create_new_object(ctx);
6013 string aname;
6014 bp.copy(op.xattr.name_len, aname);
6015 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6016 string name = "_" + aname;
6017 bufferlist bl;
6018 bp.copy(op.xattr.value_len, bl);
6019 t->setattr(soid, name, bl);
6020 ctx->delta_stats.num_wr++;
6021 }
6022 break;
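// Note the "_" prefix above: user xattrs share the object's attr
// namespace with internal metadata (e.g. OI_ATTR, SS_ATTR), so a user key
// "foo" is stored on disk as "_foo" and translated back on read.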
6023
6024 case CEPH_OSD_OP_RMXATTR:
6025 ++ctx->num_write;
6026 {
6027 string aname;
6028 bp.copy(op.xattr.name_len, aname);
6029 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6030 if (!obs.exists || oi.is_whiteout()) {
6031 result = -ENOENT;
6032 break;
6033 }
6034 string name = "_" + aname;
6035 t->rmattr(soid, name);
6036 ctx->delta_stats.num_wr++;
6037 }
6038 break;
6039
6040
6041 // -- fancy writers --
6042 case CEPH_OSD_OP_APPEND:
6043 {
6044 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6045 // just do it inline; this works because we are happy to execute
6046 // fancy ops on replicas as well.
6047 vector<OSDOp> nops(1);
6048 OSDOp& newop = nops[0];
6049 newop.op.op = CEPH_OSD_OP_WRITE;
6050 newop.op.extent.offset = oi.size;
6051 newop.op.extent.length = op.extent.length;
6052 newop.op.extent.truncate_seq = oi.truncate_seq;
6053 newop.indata = osd_op.indata;
6054 result = do_osd_ops(ctx, nops);
6055 osd_op.outdata.claim(newop.outdata);
6056 }
6057 break;
6058
6059 case CEPH_OSD_OP_STARTSYNC:
6060 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6061 t->nop(soid);
6062 break;
6063
6064
6065 // -- trivial map --
6066 case CEPH_OSD_OP_TMAPGET:
6067 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6068 if (pool.info.require_rollback()) {
6069 result = -EOPNOTSUPP;
6070 break;
6071 }
6072 {
6073 vector<OSDOp> nops(1);
6074 OSDOp& newop = nops[0];
6075 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6076 newop.op.extent.offset = 0;
6077 newop.op.extent.length = 0;
6078 do_osd_ops(ctx, nops);
6079 osd_op.outdata.claim(newop.outdata);
6080 }
6081 break;
6082
6083 case CEPH_OSD_OP_TMAPPUT:
6084 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6085 if (pool.info.require_rollback()) {
6086 result = -EOPNOTSUPP;
6087 break;
6088 }
6089 {
6090 //_dout_lock.Lock();
6091 //osd_op.data.hexdump(*_dout);
6092 //_dout_lock.Unlock();
6093
6094 // verify sort order
6095 bool unsorted = false;
6096 if (true) {
6097 bufferlist header;
6098 ::decode(header, bp);
6099 uint32_t n;
6100 ::decode(n, bp);
6101 string last_key;
6102 while (n--) {
6103 string key;
6104 ::decode(key, bp);
6105 dout(10) << "tmapput key " << key << dendl;
6106 bufferlist val;
6107 ::decode(val, bp);
6108 if (key < last_key) {
6109 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6110 unsorted = true;
6111 break;
6112 }
6113 last_key = key;
6114 }
6115 }
6116
6117 // write it
6118 vector<OSDOp> nops(1);
6119 OSDOp& newop = nops[0];
6120 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6121 newop.op.extent.offset = 0;
6122 newop.op.extent.length = osd_op.indata.length();
6123 newop.indata = osd_op.indata;
6124
6125 if (unsorted) {
6126 bp = osd_op.indata.begin();
6127 bufferlist header;
6128 map<string, bufferlist> m;
6129 ::decode(header, bp);
6130 ::decode(m, bp);
6131 assert(bp.end());
6132 bufferlist newbl;
6133 ::encode(header, newbl);
6134 ::encode(m, newbl);
6135 newop.indata = newbl;
6136 }
6137 result = do_osd_ops(ctx, nops);
6138 assert(result == 0);
6139 }
6140 break;
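// A tmap payload, as validated above, is simply an encoded header plus a
// key-sorted map; re-encoding through std::map restores the order:
//
//   bufferlist header, bl;
//   map<string, bufferlist> m;   // std::map iterates in key order
//   ::encode(header, bl);
//   ::encode(m, bl);             // same wire format the loop above checks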
6141
6142 case CEPH_OSD_OP_TMAPUP:
6143 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6144 if (pool.info.require_rollback()) {
6145 result = -EOPNOTSUPP;
6146 break;
6147 }
6148 ++ctx->num_write;
6149 result = do_tmapup(ctx, bp, osd_op);
6150 break;
6151
6152 case CEPH_OSD_OP_TMAP2OMAP:
6153 ++ctx->num_write;
6154 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6155 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6156 break;
6157
6158 // OMAP Read ops
6159 case CEPH_OSD_OP_OMAPGETKEYS:
6160 ++ctx->num_read;
6161 {
6162 string start_after;
6163 uint64_t max_return;
6164 try {
6165 ::decode(start_after, bp);
6166 ::decode(max_return, bp);
6167 }
6168 catch (buffer::error& e) {
6169 result = -EINVAL;
6170 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6171 goto fail;
6172 }
6173 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6174 max_return = cct->_conf->osd_max_omap_entries_per_request;
6175 }
6176 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6177
6178 bufferlist bl;
6179 uint32_t num = 0;
6180 bool truncated = false;
6181 if (oi.is_omap()) {
6182 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6183 coll, ghobject_t(soid)
6184 );
6185 assert(iter);
6186 iter->upper_bound(start_after);
6187 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6188 if (num >= max_return ||
6189 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6190 truncated = true;
6191 break;
6192 }
6193 ::encode(iter->key(), bl);
6194 }
6195 } // else return empty out_set
6196 ::encode(num, osd_op.outdata);
6197 osd_op.outdata.claim_append(bl);
6198 ::encode(truncated, osd_op.outdata);
6199 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6200 ctx->delta_stats.num_rd++;
6201 }
6202 break;
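// The (start_after, max_return, truncated) triple above lets clients page
// through large omaps; a sketch of the client loop (illustrative names,
// assuming the librados C++ convenience wrapper):
//
//   std::set<std::string> keys;
//   std::string cursor;          // last key of the previous page
//   do {
//     keys.clear();
//     ioctx.omap_get_keys("myobj", cursor, 500, &keys);
//     if (!keys.empty()) cursor = *keys.rbegin();
//   } while (!keys.empty());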
6203
6204 case CEPH_OSD_OP_OMAPGETVALS:
6205 ++ctx->num_read;
6206 {
6207 string start_after;
6208 uint64_t max_return;
6209 string filter_prefix;
6210 try {
6211 ::decode(start_after, bp);
6212 ::decode(max_return, bp);
6213 ::decode(filter_prefix, bp);
6214 }
6215 catch (buffer::error& e) {
6216 result = -EINVAL;
6217 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6218 goto fail;
6219 }
6220 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6221 max_return = cct->_conf->osd_max_omap_entries_per_request;
6222 }
6223 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6224
6225 uint32_t num = 0;
6226 bool truncated = false;
6227 bufferlist bl;
6228 if (oi.is_omap()) {
6229 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6230 coll, ghobject_t(soid)
6231 );
6232 if (!iter) {
6233 result = -ENOENT;
6234 goto fail;
6235 }
6236 iter->upper_bound(start_after);
6237 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6238 for (num = 0;
6239 iter->valid() &&
6240 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6241 ++num, iter->next(false)) {
6242 dout(20) << "Found key " << iter->key() << dendl;
6243 if (num >= max_return ||
6244 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6245 truncated = true;
6246 break;
6247 }
6248 ::encode(iter->key(), bl);
6249 ::encode(iter->value(), bl);
6250 }
6251 } // else return empty out_set
6252 ::encode(num, osd_op.outdata);
6253 osd_op.outdata.claim_append(bl);
6254 ::encode(truncated, osd_op.outdata);
6255 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6256 ctx->delta_stats.num_rd++;
6257 }
6258 break;
6259
6260 case CEPH_OSD_OP_OMAPGETHEADER:
6261 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6262 if (!oi.is_omap()) {
6263 // return empty header
6264 break;
6265 }
6266 ++ctx->num_read;
6267 {
6268 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6269 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6270 ctx->delta_stats.num_rd++;
6271 }
6272 break;
6273
6274 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6275 ++ctx->num_read;
6276 {
6277 set<string> keys_to_get;
6278 try {
6279 ::decode(keys_to_get, bp);
6280 }
6281 catch (buffer::error& e) {
6282 result = -EINVAL;
6283 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6284 goto fail;
6285 }
6286 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6287 map<string, bufferlist> out;
6288 if (oi.is_omap()) {
6289 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6290 } // else return empty omap entries
6291 ::encode(out, osd_op.outdata);
6292 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6293 ctx->delta_stats.num_rd++;
6294 }
6295 break;
6296
6297 case CEPH_OSD_OP_OMAP_CMP:
6298 ++ctx->num_read;
6299 {
6300 if (!obs.exists || oi.is_whiteout()) {
6301 result = -ENOENT;
6302 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6303 break;
6304 }
6305 map<string, pair<bufferlist, int> > assertions;
6306 try {
6307 ::decode(assertions, bp);
6308 }
6309 catch (buffer::error& e) {
6310 result = -EINVAL;
6311 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6312 goto fail;
6313 }
6314 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6315
6316 map<string, bufferlist> out;
6317
6318 if (oi.is_omap()) {
6319 set<string> to_get;
6320 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6321 i != assertions.end();
6322 ++i)
6323 to_get.insert(i->first);
6324 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6325 to_get, &out);
6326 if (r < 0) {
6327 result = r;
6328 break;
6329 }
6330 } // else leave out empty
6331
6332 // Should set num_rd_kb based on encode length of map
6333 ctx->delta_stats.num_rd++;
6334
6335 int r = 0;
6336 bufferlist empty;
6337 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6338 i != assertions.end();
6339 ++i) {
6340 auto out_entry = out.find(i->first);
6341 bufferlist &bl = (out_entry != out.end()) ?
6342 out_entry->second : empty;
6343 switch (i->second.second) {
6344 case CEPH_OSD_CMPXATTR_OP_EQ:
6345 if (!(bl == i->second.first)) {
6346 r = -ECANCELED;
6347 }
6348 break;
6349 case CEPH_OSD_CMPXATTR_OP_LT:
6350 if (!(bl < i->second.first)) {
6351 r = -ECANCELED;
6352 }
6353 break;
6354 case CEPH_OSD_CMPXATTR_OP_GT:
6355 if (!(bl > i->second.first)) {
6356 r = -ECANCELED;
6357 }
6358 break;
6359 default:
6360 r = -EINVAL;
6361 break;
6362 }
6363 if (r < 0)
6364 break;
6365 }
6366 if (r < 0) {
6367 result = r;
6368 }
6369 }
6370 break;
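// Semantics of the loop above: each assertion maps a key to a (value, op)
// pair with op one of CEPH_OSD_CMPXATTR_OP_{EQ,LT,GT}; a missing key
// compares as an empty bufferlist, and the first failing assertion fails
// the whole op with -ECANCELED.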
6371
6372 // OMAP Write ops
6373 case CEPH_OSD_OP_OMAPSETVALS:
6374 if (!pool.info.supports_omap()) {
6375 result = -EOPNOTSUPP;
6376 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6377 break;
6378 }
6379 ++ctx->num_write;
6380 {
6381 maybe_create_new_object(ctx);
6382 bufferlist to_set_bl;
6383 try {
6384 decode_str_str_map_to_bl(bp, &to_set_bl);
6385 }
6386 catch (buffer::error& e) {
6387 result = -EINVAL;
6388 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6389 goto fail;
6390 }
6391 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6392 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6393 dout(20) << "setting vals: " << dendl;
6394 map<string,bufferlist> to_set;
6395 bufferlist::iterator pt = to_set_bl.begin();
6396 ::decode(to_set, pt);
6397 for (map<string, bufferlist>::iterator i = to_set.begin();
6398 i != to_set.end();
6399 ++i) {
6400 dout(20) << "\t" << i->first << dendl;
6401 }
6402 }
6403 t->omap_setkeys(soid, to_set_bl);
6404 ctx->delta_stats.num_wr++;
6405 }
6406 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6407 obs.oi.clear_omap_digest();
6408 break;
6409
6410 case CEPH_OSD_OP_OMAPSETHEADER:
6411 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6412 if (!pool.info.supports_omap()) {
6413 result = -EOPNOTSUPP;
6414 break;
6415 }
6416 ++ctx->num_write;
6417 {
6418 maybe_create_new_object(ctx);
6419 t->omap_setheader(soid, osd_op.indata);
6420 ctx->delta_stats.num_wr++;
6421 }
6422 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6423 obs.oi.clear_omap_digest();
6424 break;
6425
6426 case CEPH_OSD_OP_OMAPCLEAR:
6427 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6428 if (!pool.info.supports_omap()) {
6429 result = -EOPNOTSUPP;
6430 break;
6431 }
6432 ++ctx->num_write;
6433 {
6434 if (!obs.exists || oi.is_whiteout()) {
6435 result = -ENOENT;
6436 break;
6437 }
6438 if (oi.is_omap()) {
6439 t->omap_clear(soid);
6440 ctx->delta_stats.num_wr++;
6441 obs.oi.clear_omap_digest();
6442 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6443 }
6444 }
6445 break;
6446
6447 case CEPH_OSD_OP_OMAPRMKEYS:
6448 if (!pool.info.supports_omap()) {
6449 result = -EOPNOTSUPP;
6450 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6451 break;
6452 }
6453 ++ctx->num_write;
6454 {
6455 if (!obs.exists || oi.is_whiteout()) {
6456 result = -ENOENT;
6457 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6458 break;
6459 }
6460 bufferlist to_rm_bl;
6461 try {
6462 decode_str_set_to_bl(bp, &to_rm_bl);
6463 }
6464 catch (buffer::error& e) {
6465 result = -EINVAL;
6466 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6467 goto fail;
6468 }
6469 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6470 t->omap_rmkeys(soid, to_rm_bl);
6471 ctx->delta_stats.num_wr++;
6472 }
6473 obs.oi.clear_omap_digest();
6474 break;
6475
6476 case CEPH_OSD_OP_COPY_GET:
6477 ++ctx->num_read;
6478 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), soid.snap.val);
6479 result = fill_in_copy_get(ctx, bp, osd_op, ctx->obc);
6480 break;
6481
6482 case CEPH_OSD_OP_COPY_FROM:
6483 ++ctx->num_write;
6484 {
6485 object_t src_name;
6486 object_locator_t src_oloc;
6487 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6488 version_t src_version = op.copy_from.src_version;
6489 try {
6490 ::decode(src_name, bp);
6491 ::decode(src_oloc, bp);
6492 }
6493 catch (buffer::error& e) {
6494 result = -EINVAL;
6495 tracepoint(osd,
6496 do_osd_op_pre_copy_from,
6497 soid.oid.name.c_str(),
6498 soid.snap.val,
6499 "???",
6500 0,
6501 "???",
6502 "???",
6503 0,
6504 src_snapid,
6505 src_version);
6506 goto fail;
6507 }
6508 tracepoint(osd,
6509 do_osd_op_pre_copy_from,
6510 soid.oid.name.c_str(),
6511 soid.snap.val,
6512 src_name.name.c_str(),
6513 src_oloc.pool,
6514 src_oloc.key.c_str(),
6515 src_oloc.nspace.c_str(),
6516 src_oloc.hash,
6517 src_snapid,
6518 src_version);
6519 if (!ctx->copy_cb) {
6520 // start
6521 pg_t raw_pg;
6522 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6523 hobject_t src(src_name, src_oloc.key, src_snapid,
6524 raw_pg.ps(), raw_pg.pool(),
6525 src_oloc.nspace);
6526 if (src == soid) {
6527 dout(20) << " copy from self is invalid" << dendl;
6528 result = -EINVAL;
6529 break;
6530 }
6531 CopyFromCallback *cb = new CopyFromCallback(ctx);
6532 ctx->copy_cb = cb;
6533 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6534 op.copy_from.flags,
6535 false,
6536 op.copy_from.src_fadvise_flags,
6537 op.flags);
6538 result = -EINPROGRESS;
6539 } else {
6540 // finish
6541 assert(ctx->copy_cb->get_result() >= 0);
6542 finish_copyfrom(ctx);
6543 result = 0;
6544 }
6545 }
6546 break;
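// COPY_FROM executes in two passes: the first pass starts the async copy
// and returns -EINPROGRESS, parking the op; once the copy completes the
// op is requeued, do_osd_ops() runs again, and this time ctx->copy_cb is
// set so we take the finish branch above.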
6547
6548 default:
6549 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6550 dout(1) << "unrecognized osd op " << op.op
6551 << " " << ceph_osd_op_name(op.op)
6552 << dendl;
6553 result = -EOPNOTSUPP;
6554 }
6555
6556 fail:
6557 osd_op.rval = result;
6558 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6559 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6560 result = 0;
6561
6562 if (result < 0)
6563 break;
6564 }
6565 return result;
6566 }
6567
6568 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6569 {
6570 if (ctx->new_obs.oi.size == 0) {
6571 dout(20) << "unable to get tmap for zero-sized " << ctx->new_obs.oi.soid << dendl;
6572 return -ENODATA;
6573 }
6574 vector<OSDOp> nops(1);
6575 OSDOp &newop = nops[0];
6576 newop.op.op = CEPH_OSD_OP_TMAPGET;
6577 do_osd_ops(ctx, nops);
6578 try {
6579 bufferlist::iterator i = newop.outdata.begin();
6580 ::decode(*header, i);
6581 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6582 } catch (...) {
6583 dout(20) << "failed to decode tmap for " << ctx->new_obs.oi.soid
6584 << dendl;
6585 return -EINVAL;
6586 }
6587 dout(20) << "successfully decoded tmap for " << ctx->new_obs.oi.soid
6588 << dendl;
6589 return 0;
6590 }
6591
6592 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6593 const SnapSet& ss)
6594 {
6595 // verify that all clones have been evicted
6596 dout(20) << __func__ << " verifying clones are absent "
6597 << ss << dendl;
6598 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6599 p != ss.clones.end();
6600 ++p) {
6601 hobject_t clone_oid = soid;
6602 clone_oid.snap = *p;
6603 if (is_missing_object(clone_oid))
6604 return -EBUSY;
6605 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6606 if (clone_obc && clone_obc->obs.exists) {
6607 dout(10) << __func__ << " cannot evict head before clone "
6608 << clone_oid << dendl;
6609 return -EBUSY;
6610 }
6611 if (copy_ops.count(clone_oid)) {
6612 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6613 << clone_oid << dendl;
6614 return -EBUSY;
6615 }
6616 }
6617 return 0;
6618 }
6619
6620 inline int PrimaryLogPG::_delete_oid(
6621 OpContext *ctx,
6622 bool no_whiteout, // no whiteouts, no matter what.
6623 bool try_no_whiteout) // try not to whiteout
6624 {
6625 SnapSet& snapset = ctx->new_snapset;
6626 ObjectState& obs = ctx->new_obs;
6627 object_info_t& oi = obs.oi;
6628 const hobject_t& soid = oi.soid;
6629 PGTransaction* t = ctx->op_t.get();
6630
6631 // cache: set whiteout on delete?
6632 bool whiteout = false;
6633 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6634 && !no_whiteout
6635 && !try_no_whiteout) {
6636 whiteout = true;
6637 }
6638 bool legacy;
6639 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6640 legacy = false;
6641 // in luminous or later, we can't delete the head if there are
6642 // clones. we trust the caller passing no_whiteout has already
6643 // verified they don't exist.
6644 if (!snapset.clones.empty() ||
6645 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6646 if (no_whiteout) {
6647 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6648 << dendl;
6649 } else {
6650 dout(20) << __func__ << " has or will have clones; will whiteout"
6651 << dendl;
6652 whiteout = true;
6653 }
6654 }
6655 } else {
6656 legacy = true;
6657 }
6658 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6659 << " no_whiteout=" << (int)no_whiteout
6660 << " try_no_whiteout=" << (int)try_no_whiteout
6661 << dendl;
6662 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6663 return -ENOENT;
6664
6665 t->remove(soid);
6666
6667 if (oi.size > 0) {
6668 interval_set<uint64_t> ch;
6669 ch.insert(0, oi.size);
6670 ctx->modified_ranges.union_of(ch);
6671 }
6672
6673 ctx->delta_stats.num_wr++;
6674 if (soid.is_snap()) {
6675 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6676 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6677 } else {
6678 ctx->delta_stats.num_bytes -= oi.size;
6679 }
6680 oi.size = 0;
6681 oi.new_object();
6682
6683 // disconnect all watchers
6684 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6685 oi.watchers.begin();
6686 p != oi.watchers.end();
6687 ++p) {
6688 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6689 ctx->watch_disconnects.push_back(
6690 watch_disconnect_t(p->first.first, p->first.second, true));
6691 }
6692 oi.watchers.clear();
6693
6694 if (whiteout) {
6695 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6696 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6697 ctx->delta_stats.num_whiteouts++;
6698 t->create(soid);
6699 osd->logger->inc(l_osd_tier_whiteout);
6700 return 0;
6701 }
6702
6703 // delete the head
6704 ctx->delta_stats.num_objects--;
6705 if (soid.is_snap())
6706 ctx->delta_stats.num_object_clones--;
6707 if (oi.is_whiteout()) {
6708 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6709 ctx->delta_stats.num_whiteouts--;
6710 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6711 }
6712 if (oi.is_cache_pinned()) {
6713 ctx->delta_stats.num_objects_pinned--;
6714 }
6715 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6716 snapset.head_exists = false;
6717 }
6718 obs.exists = false;
6719 return 0;
6720 }
6721
6722 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6723 {
6724 SnapSet& snapset = ctx->new_snapset;
6725 ObjectState& obs = ctx->new_obs;
6726 object_info_t& oi = obs.oi;
6727 const hobject_t& soid = oi.soid;
6728 PGTransaction* t = ctx->op_t.get();
6729 snapid_t snapid = (uint64_t)op.snap.snapid;
6730 hobject_t missing_oid;
6731
6732 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6733
6734 ObjectContextRef rollback_to;
6735 int ret = find_object_context(
6736 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6737 soid.get_namespace()),
6738 &rollback_to, false, false, &missing_oid);
6739 if (ret == -EAGAIN) {
6740 /* clone must be missing */
6741 assert(is_missing_object(missing_oid));
6742 dout(20) << "_rollback_to attempted to roll back to a missing object "
6743 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6744 block_write_on_degraded_snap(missing_oid, ctx->op);
6745 return ret;
6746 }
6747 {
6748 ObjectContextRef promote_obc;
6749 cache_result_t tier_mode_result;
6750 if (obs.exists && obs.oi.has_manifest()) {
6751 tier_mode_result =
6752 maybe_handle_manifest_detail(
6753 ctx->op,
6754 true,
6755 rollback_to);
6756 } else {
6757 tier_mode_result =
6758 maybe_handle_cache_detail(
6759 ctx->op,
6760 true,
6761 rollback_to,
6762 ret,
6763 missing_oid,
6764 true,
6765 false,
6766 &promote_obc);
6767 }
6768 switch (tier_mode_result) {
6769 case cache_result_t::NOOP:
6770 break;
6771 case cache_result_t::BLOCKED_PROMOTE:
6772 assert(promote_obc);
6773 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6774 return -EAGAIN;
6775 case cache_result_t::BLOCKED_FULL:
6776 block_write_on_full_cache(soid, ctx->op);
6777 return -EAGAIN;
6778 default:
6779 assert(0 == "must promote was set, other values are not valid");
6780 return -EAGAIN;
6781 }
6782 }
6783
6784 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6785 // there's no snapshot here, or there's no object.
6786 // if there's no snapshot, we delete the object; otherwise, do nothing.
6787 dout(20) << "_rollback_to deleting head on " << soid.oid
6788 << " because got ENOENT|whiteout on find_object_context" << dendl;
6789 if (ctx->obc->obs.oi.watchers.size()) {
6790 // Cannot delete an object with watchers
6791 ret = -EBUSY;
6792 } else {
6793 _delete_oid(ctx, false, false);
6794 ret = 0;
6795 }
6796 } else if (ret) {
6797 // ummm....huh? It *can't* return anything else at time of writing.
6798 assert(0 == "unexpected error code in _rollback_to");
6799 } else { //we got our context, let's use it to do the rollback!
6800 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
6801 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
6802 dout(20) << "_rollback_to attempted to roll back to a degraded object "
6803 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
6804 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
6805 ret = -EAGAIN;
6806 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
6807 // rolling back to the head; we just need to clone it.
6808 ctx->modify = true;
6809 } else {
6810 /* 1) Delete current head
6811 * 2) Clone correct snapshot into head
6812 * 3) Calculate clone_overlaps by following overlaps
6813 * forward from rollback snapshot */
6814 dout(10) << "_rollback_to deleting " << soid.oid
6815 << " and rolling back to old snap" << dendl;
6816
6817 if (obs.exists) {
6818 t->remove(soid);
6819 }
6820 t->clone(soid, rollback_to_sobject);
6821 snapset.head_exists = true;
6822 t->add_obc(rollback_to);
6823
6824 map<snapid_t, interval_set<uint64_t> >::iterator iter =
6825 snapset.clone_overlap.lower_bound(snapid);
6826 assert(iter != snapset.clone_overlap.end());
6827 interval_set<uint64_t> overlaps = iter->second;
6828 for ( ;
6829 iter != snapset.clone_overlap.end();
6830 ++iter)
6831 overlaps.intersection_of(iter->second);
6832
6833 if (obs.oi.size > 0) {
6834 interval_set<uint64_t> modified;
6835 modified.insert(0, obs.oi.size);
6836 overlaps.intersection_of(modified);
6837 modified.subtract(overlaps);
6838 ctx->modified_ranges.union_of(modified);
6839 }
6840
6841 // Adjust the cached objectcontext
6842 maybe_create_new_object(ctx, true);
6843 ctx->delta_stats.num_bytes -= obs.oi.size;
6844 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
6845 obs.oi.size = rollback_to->obs.oi.size;
6846 if (rollback_to->obs.oi.is_data_digest())
6847 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
6848 else
6849 obs.oi.clear_data_digest();
6850 if (rollback_to->obs.oi.is_omap_digest())
6851 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
6852 else
6853 obs.oi.clear_omap_digest();
6854
6855 if (rollback_to->obs.oi.is_omap()) {
6856 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
6857 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6858 } else {
6859 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
6860 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6861 }
6862
6863 snapset.head_exists = true;
6864 }
6865 }
6866 return ret;
6867 }
6868
6869 void PrimaryLogPG::_make_clone(
6870 OpContext *ctx,
6871 PGTransaction* t,
6872 ObjectContextRef obc,
6873 const hobject_t& head, const hobject_t& coid,
6874 object_info_t *poi)
6875 {
6876 bufferlist bv;
6877 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
6878
6879 t->clone(coid, head);
6880 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
6881 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
6882 }
6883
6884 void PrimaryLogPG::make_writeable(OpContext *ctx)
6885 {
6886 const hobject_t& soid = ctx->obs->oi.soid;
6887 SnapContext& snapc = ctx->snapc;
6888
6889 // clone?
6890 assert(soid.snap == CEPH_NOSNAP);
6891 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
6892 << " snapc=" << snapc << dendl;
6893
6894 bool was_dirty = ctx->obc->obs.oi.is_dirty();
6895 if (ctx->new_obs.exists) {
6896 // we will mark the object dirty
6897 if (ctx->undirty && was_dirty) {
6898 dout(20) << " clearing DIRTY flag" << dendl;
6899 assert(ctx->new_obs.oi.is_dirty());
6900 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6901 --ctx->delta_stats.num_objects_dirty;
6902 osd->logger->inc(l_osd_tier_clean);
6903 } else if (!was_dirty && !ctx->undirty) {
6904 dout(20) << " setting DIRTY flag" << dendl;
6905 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
6906 ++ctx->delta_stats.num_objects_dirty;
6907 osd->logger->inc(l_osd_tier_dirty);
6908 }
6909 } else {
6910 if (was_dirty) {
6911 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
6912 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6913 --ctx->delta_stats.num_objects_dirty;
6914 }
6915 }
6916
6917 if ((ctx->new_obs.exists &&
6918 ctx->new_obs.oi.is_omap()) &&
6919 (!ctx->obc->obs.exists ||
6920 !ctx->obc->obs.oi.is_omap())) {
6921 ++ctx->delta_stats.num_objects_omap;
6922 }
6923 if ((!ctx->new_obs.exists ||
6924 !ctx->new_obs.oi.is_omap()) &&
6925 (ctx->obc->obs.exists &&
6926 ctx->obc->obs.oi.is_omap())) {
6927 --ctx->delta_stats.num_objects_omap;
6928 }
6929
6930 // use newer snapc?
6931 if (ctx->new_snapset.seq > snapc.seq) {
6932 snapc.seq = ctx->new_snapset.seq;
6933 snapc.snaps = ctx->new_snapset.snaps;
6934 filter_snapc(snapc.snaps);
6935 dout(10) << " using newer snapc " << snapc << dendl;
6936 }
6937
6938 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
6939 snapc.snaps.size() && // there are snaps
6940 !ctx->cache_evict &&
6941 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
6942 // clone
6943 hobject_t coid = soid;
6944 coid.snap = snapc.seq;
6945
6946 unsigned l;
6947 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
6948
6949 vector<snapid_t> snaps(l);
6950 for (unsigned i=0; i<l; i++)
6951 snaps[i] = snapc.snaps[i];
6952
6953 // prepare clone
6954 object_info_t static_snap_oi(coid);
6955 object_info_t *snap_oi;
6956 if (is_primary()) {
6957 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
6958 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
6959 ctx->clone_obc->obs.oi = static_snap_oi;
6960 ctx->clone_obc->obs.exists = true;
6961 ctx->clone_obc->ssc = ctx->obc->ssc;
6962 ctx->clone_obc->ssc->ref++;
6963 if (pool.info.require_rollback())
6964 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
6965 snap_oi = &ctx->clone_obc->obs.oi;
6966 bool got = ctx->lock_manager.get_write_greedy(
6967 coid,
6968 ctx->clone_obc,
6969 ctx->op);
6970 assert(got);
6971 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
6972 } else {
6973 snap_oi = &static_snap_oi;
6974 }
6975 snap_oi->version = ctx->at_version;
6976 snap_oi->prior_version = ctx->obs->oi.version;
6977 snap_oi->copy_user_bits(ctx->obs->oi);
6978
6979 bool legacy = ctx->new_snapset.is_legacy() ||
6980 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
6981 if (legacy) {
6982 snap_oi->legacy_snaps = snaps;
6983 }
6984
6985 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
6986
6987 ctx->delta_stats.num_objects++;
6988 if (snap_oi->is_dirty()) {
6989 ctx->delta_stats.num_objects_dirty++;
6990 osd->logger->inc(l_osd_tier_dirty);
6991 }
6992 if (snap_oi->is_omap())
6993 ctx->delta_stats.num_objects_omap++;
6994 if (snap_oi->is_cache_pinned())
6995 ctx->delta_stats.num_objects_pinned++;
6996 ctx->delta_stats.num_object_clones++;
6997 ctx->new_snapset.clones.push_back(coid.snap);
6998 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
6999 if (!legacy) {
7000 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7001 }
7002
7003 // clone_overlap should contain an entry for each clone
7004 // (an empty interval_set if there is no overlap)
7005 ctx->new_snapset.clone_overlap[coid.snap];
7006 if (ctx->obs->oi.size)
7007 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7008
7009 // log clone
7010 dout(10) << " cloning v " << ctx->obs->oi.version
7011 << " to " << coid << " v " << ctx->at_version
7012 << " snaps=" << snaps
7013 << " snapset=" << ctx->new_snapset << dendl;
7014 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7015 ctx->obs->oi.version,
7016 ctx->obs->oi.user_version,
7017 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7018 ::encode(snaps, ctx->log.back().snaps);
7019
7020 ctx->at_version.version++;
7021 }
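// Worked example of the clone decision above: with snapset.seq=3 and an
// incoming snapc {seq=5, snaps=[5,4,2]}, snaps 5 and 4 are newer than the
// object's last clone point, so head is cloned to coid.snap=5 with
// snaps=[5,4]; snap 2 (<= seq) is already covered by an earlier clone.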
7022
7023 // update most recent clone_overlap and usage stats
7024 if (ctx->new_snapset.clones.size() > 0) {
7025 /* we need to check whether the most recent clone exists; if it has been
7026 * evicted, it is not included in the stats */
7027 hobject_t last_clone_oid = soid;
7028 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7029 if (is_present_clone(last_clone_oid)) {
7030 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7031 ctx->modified_ranges.intersection_of(newest_overlap);
7032 // modified_ranges is still in use by the clone
7033 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7034 newest_overlap.subtract(ctx->modified_ranges);
7035 }
7036 }
7037
7038 // update snapset with latest snap context
7039 ctx->new_snapset.seq = snapc.seq;
7040 ctx->new_snapset.snaps = snapc.snaps;
7041 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7042 // pessimistic assumption that this is a net-new legacy SnapSet
7043 ctx->delta_stats.num_legacy_snapsets++;
7044 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7045 } else if (ctx->new_snapset.is_legacy()) {
7046 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7047 }
7048 dout(20) << "make_writeable " << soid
7049 << " done, snapset=" << ctx->new_snapset << dendl;
7050 }
7051
7052
7053 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7054 interval_set<uint64_t>& modified, uint64_t offset,
7055 uint64_t length, bool write_full)
7056 {
7057 interval_set<uint64_t> ch;
7058 if (write_full) {
7059 if (oi.size)
7060 ch.insert(0, oi.size);
7061 } else if (length)
7062 ch.insert(offset, length);
7063 modified.union_of(ch);
7064 if (write_full || offset + length > oi.size) {
7065 uint64_t new_size = offset + length;
7066 delta_stats.num_bytes -= oi.size;
7067 delta_stats.num_bytes += new_size;
7068 oi.size = new_size;
7069 }
7070 delta_stats.num_wr++;
7071 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7072 }
7073
7074 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7075 {
7076 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7077 delta_stats.num_bytes += p.get_len();
7078 }
7079 }
7080
7081 void PrimaryLogPG::complete_disconnect_watches(
7082 ObjectContextRef obc,
7083 const list<watch_disconnect_t> &to_disconnect)
7084 {
7085 for (list<watch_disconnect_t>::const_iterator i =
7086 to_disconnect.begin();
7087 i != to_disconnect.end();
7088 ++i) {
7089 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7090 auto watchers_entry = obc->watchers.find(watcher);
7091 if (watchers_entry != obc->watchers.end()) {
7092 WatchRef watch = watchers_entry->second;
7093 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7094 obc->watchers.erase(watcher);
7095 watch->remove(i->send_disconnect);
7096 } else {
7097 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7098 << watcher << dendl;
7099 }
7100 }
7101 }
7102
7103 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7104 {
7105 entity_name_t entity = ctx->reqid.name;
7106 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7107
7108 // disconnects first
7109 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7110
7111 assert(conn);
7112
7113 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7114 if (!session.get())
7115 return;
7116 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7117
7118 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7119 i != ctx->watch_connects.end();
7120 ++i) {
7121 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7122 dout(15) << "do_osd_op_effects applying watch connect on session "
7123 << session.get() << " watcher " << watcher << dendl;
7124 WatchRef watch;
7125 if (ctx->obc->watchers.count(watcher)) {
7126 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7127 << dendl;
7128 watch = ctx->obc->watchers[watcher];
7129 } else {
7130 dout(15) << "do_osd_op_effects new watcher " << watcher
7131 << dendl;
7132 watch = Watch::makeWatchRef(
7133 this, osd, ctx->obc, i->first.timeout_seconds,
7134 i->first.cookie, entity, conn->get_peer_addr());
7135 ctx->obc->watchers.insert(
7136 make_pair(
7137 watcher,
7138 watch));
7139 }
7140 watch->connect(conn, i->second);
7141 }
7142
7143 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7144 p != ctx->notifies.end();
7145 ++p) {
7146 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7147 ConnectionRef conn(ctx->op->get_req()->get_connection());
7148 NotifyRef notif(
7149 Notify::makeNotifyRef(
7150 conn,
7151 ctx->reqid.name.num(),
7152 p->bl,
7153 p->timeout,
7154 p->cookie,
7155 p->notify_id,
7156 ctx->obc->obs.oi.user_version,
7157 osd));
7158 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7159 ctx->obc->watchers.begin();
7160 i != ctx->obc->watchers.end();
7161 ++i) {
7162 dout(10) << "starting notify on watch " << i->first << dendl;
7163 i->second->start_notify(notif);
7164 }
7165 notif->init();
7166 }
7167
7168 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7169 p != ctx->notify_acks.end();
7170 ++p) {
7171 if (p->watch_cookie)
7172 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7173 else
7174 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7175 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7176 ctx->obc->watchers.begin();
7177 i != ctx->obc->watchers.end();
7178 ++i) {
7179 if (i->first.second != entity) continue;
7180 if (p->watch_cookie &&
7181 p->watch_cookie.get() != i->first.first) continue;
7182 dout(10) << "acking notify on watch " << i->first << dendl;
7183 i->second->notify_ack(p->notify_id, p->reply_bl);
7184 }
7185 }
7186 }
7187
7188 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7189 {
7190 ostringstream ss;
7191 ss << "temp_" << info.pgid << "_" << get_role()
7192 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7193 hobject_t hoid = target.make_temp_hobject(ss.str());
7194 dout(20) << __func__ << " " << hoid << dendl;
7195 return hoid;
7196 }
7197
7198 hobject_t PrimaryLogPG::get_temp_recovery_object(
7199 const hobject_t& target,
7200 eversion_t version)
7201 {
7202 ostringstream ss;
7203 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7204 << "_" << version
7205 << "_" << info.history.same_interval_since
7206 << "_" << target.snap;
7207 // pgid + version + interval + snapid is unique, and short
7208 hobject_t hoid = target.make_temp_hobject(ss.str());
7209 dout(20) << __func__ << " " << hoid << dendl;
7210 return hoid;
7211 }
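// The resulting name looks like, e.g., "temp_recovering_2.3s1_184'7_42_head"
// (illustrative values: pgid+shard 2.3s1, version 184'7, interval 42,
// snapid "head"); uniqueness follows from that tuple.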
7212
7213 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7214 {
7215 assert(!ctx->ops.empty());
7216
7217 const hobject_t& soid = ctx->obs->oi.soid;
7218
7219 // valid snap context?
7220 if (!ctx->snapc.is_valid()) {
7221 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7222 return -EINVAL;
7223 }
7224
7225 // prepare the actual mutation
7226 int result = do_osd_ops(ctx, ctx->ops);
7227 if (result < 0) {
7228 if (ctx->op->may_write() &&
7229 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7230 // need to save the error code in the pg log, to detect dup ops,
7231 // but do nothing else
7232 ctx->update_log_only = true;
7233 }
7234 return result;
7235 }
7236
7237 // read-op? write-op noop? done?
7238 if (ctx->op_t->empty() && !ctx->modify) {
7239 unstable_stats.add(ctx->delta_stats);
7240 if (ctx->op->may_write() &&
7241 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7242 ctx->update_log_only = true;
7243 }
7244 return result;
7245 }
7246
7247 // check for full
7248 if ((ctx->delta_stats.num_bytes > 0 ||
7249 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7250 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7251 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7252 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7253 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7254 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7255 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7256 << dendl;
7257 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7258 // they tried, they failed.
7259 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7260 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7261 } else {
7262 // drop request
7263 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7264 return -EAGAIN;
7265 }
7266 }
7267
7268 // clone, if necessary
7269 if (soid.snap == CEPH_NOSNAP)
7270 make_writeable(ctx);
7271
7272 finish_ctx(ctx,
7273 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7274 pg_log_entry_t::DELETE);
7275
7276 return result;
7277 }
7278
7279 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7280 {
7281 const hobject_t& soid = ctx->obs->oi.soid;
7282 dout(20) << __func__ << " " << soid << " " << ctx
7283 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7284 << dendl;
7285 utime_t now = ceph_clock_now();
7286
7287 // snapset
7288 bufferlist bss;
7289
7290 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7291 ::encode(ctx->new_snapset, bss);
7292 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7293 !ctx->new_snapset.is_legacy());
7294
7295 if (ctx->new_obs.exists) {
7296 if (!ctx->obs->exists) {
7297 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7298 hobject_t snapoid = soid.get_snapdir();
7299 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7300 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7301 ctx->at_version,
7302 ctx->snapset_obc->obs.oi.version,
7303 0, osd_reqid_t(), ctx->mtime, 0));
7304 ctx->op_t->remove(snapoid);
7305
7306 ctx->at_version.version++;
7307
7308 ctx->snapset_obc->obs.exists = false;
7309 }
7310 }
7311 } else if (!ctx->new_snapset.clones.empty() &&
7312 !ctx->cache_evict &&
7313 !ctx->new_snapset.head_exists &&
7314 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7315 // save snapset on _snap
7316 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7317 info.pgid.pool(), soid.get_namespace());
7318 dout(10) << " final snapset " << ctx->new_snapset
7319 << " in " << snapoid << dendl;
7320 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7321 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7322 ctx->at_version,
7323 eversion_t(),
7324 0, osd_reqid_t(), ctx->mtime, 0));
7325
7326 if (!ctx->snapset_obc)
7327 ctx->snapset_obc = get_object_context(snapoid, true);
7328 bool got = false;
7329 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7330 got = ctx->lock_manager.get_write_greedy(
7331 snapoid,
7332 ctx->snapset_obc,
7333 ctx->op);
7334 } else {
7335 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7336 got = ctx->lock_manager.get_lock_type(
7337 ObjectContext::RWState::RWEXCL,
7338 snapoid,
7339 ctx->snapset_obc,
7340 ctx->op);
7341 }
7342 assert(got);
7343 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7344 ctx->snapset_obc->obs.exists = true;
7345 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7346 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7347 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7348 ctx->snapset_obc->obs.oi.local_mtime = now;
7349
7350 map<string, bufferlist> attrs;
7351 bufferlist bv(sizeof(ctx->new_obs.oi));
7352 ::encode(ctx->snapset_obc->obs.oi, bv,
7353 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7354 ctx->op_t->create(snapoid);
7355 attrs[OI_ATTR].claim(bv);
7356 attrs[SS_ATTR].claim(bss);
7357 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7358 ctx->at_version.version++;
7359 }
7360 }
7361
7362 // finish and log the op.
7363 if (ctx->user_modify) {
7364 // update the user_version for any modify ops, except for the watch op
7365 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7366 /* In order for new clients and old clients to interoperate properly
7367 * when exchanging versions, we need to lower bound the user_version
7368 * (which our new clients pay proper attention to)
7369 * by the at_version (which is all the old clients can ever see). */
7370 if (ctx->at_version.version > ctx->user_at_version)
7371 ctx->user_at_version = ctx->at_version.version;
7372 ctx->new_obs.oi.user_version = ctx->user_at_version;
7373 }
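// Worked example of the lower-bounding above: with info.last_user_version=41
// and oi.user_version=40, MAX(41,40)+1 yields 42; if at_version.version is 45,
// user_at_version is raised to 45 so that old clients (which only ever see
// at_version) and new clients (which track user_version) agree on ordering.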
7374 ctx->bytes_written = ctx->op_t->get_bytes_written();
7375
7376 if (ctx->new_obs.exists) {
7377 // on the head object
7378 ctx->new_obs.oi.version = ctx->at_version;
7379 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7380 ctx->new_obs.oi.last_reqid = ctx->reqid;
7381 if (ctx->mtime != utime_t()) {
7382 ctx->new_obs.oi.mtime = ctx->mtime;
7383 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7384 ctx->new_obs.oi.local_mtime = now;
7385 } else {
7386 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7387 }
7388
7389 map <string, bufferlist> attrs;
7390 bufferlist bv(sizeof(ctx->new_obs.oi));
7391 ::encode(ctx->new_obs.oi, bv,
7392 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7393 attrs[OI_ATTR].claim(bv);
7394
7395 if (soid.snap == CEPH_NOSNAP) {
7396 dout(10) << " final snapset " << ctx->new_snapset
7397 << " in " << soid << dendl;
7398 attrs[SS_ATTR].claim(bss);
7399 } else {
7400 dout(10) << " no snapset (this is a clone)" << dendl;
7401 }
7402 ctx->op_t->setattrs(soid, attrs);
7403 } else {
7404 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7405 }
7406
7407 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7408 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7409
7410 // append to log
7411 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7412 ctx->obs->oi.version,
7413 ctx->user_at_version, ctx->reqid,
7414 ctx->mtime, 0));
7415 if (soid.snap < CEPH_NOSNAP) {
7416 switch (log_op_type) {
7417 case pg_log_entry_t::MODIFY:
7418 case pg_log_entry_t::PROMOTE:
7419 case pg_log_entry_t::CLEAN:
7420 if (legacy_snapset) {
7421 dout(20) << __func__ << " encoding legacy_snaps "
7422 << ctx->new_obs.oi.legacy_snaps
7423 << dendl;
7424 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7425 } else {
7426 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7427 << dendl;
7428 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7429 }
7430 break;
7431 default:
7432 break;
7433 }
7434 }
7435
7436 if (!ctx->extra_reqids.empty()) {
7437 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7438 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7439 }
7440
7441 // apply new object state.
7442 ctx->obc->obs = ctx->new_obs;
7443
7444 if (soid.is_head() && !ctx->obc->obs.exists &&
7445 (!maintain_ssc || ctx->cache_evict)) {
7446 ctx->obc->ssc->exists = false;
7447 ctx->obc->ssc->snapset = SnapSet();
7448 } else {
7449 ctx->obc->ssc->exists = true;
7450 ctx->obc->ssc->snapset = ctx->new_snapset;
7451 }
7452 }
7453
7454 void PrimaryLogPG::apply_stats(
7455 const hobject_t &soid,
7456 const object_stat_sum_t &delta_stats) {
7457
7458 info.stats.stats.add(delta_stats);
7459
7460 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7461 i != backfill_targets.end();
7462 ++i) {
7463 pg_shard_t bt = *i;
7464 pg_info_t& pinfo = peer_info[bt];
7465 if (soid <= pinfo.last_backfill)
7466 pinfo.stats.stats.add(delta_stats);
7467 else if (soid <= last_backfill_started)
7468 pending_backfill_updates[soid].stats.add(delta_stats);
7469 }
7470
7471 if (is_primary() && scrubber.active) {
7472 if (soid < scrubber.start) {
7473 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7474 << "," << scrubber.end << ")" << dendl;
7475 scrub_cstat.add(delta_stats);
7476 } else {
7477 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7478 << "," << scrubber.end << ")" << dendl;
7479 }
7480 }
7481 }
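// Stat-routing sketch for the loop above: deltas for objects at or below a
// backfill peer's last_backfill (already copied to the peer) are mirrored
// into that peer's stats immediately; deltas for objects between
// last_backfill and last_backfill_started are parked in
// pending_backfill_updates until the in-flight backfill completes; anything
// beyond last_backfill_started will be picked up when the object itself is
// backfilled, so no delta is recorded for the peer at all.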
7482
7483 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7484 {
7485 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7486 assert(ctx->async_reads_complete());
7487
7488 for (vector<OSDOp>::iterator p = ctx->ops.begin();
7489 p != ctx->ops.end() && result >= 0; ++p) {
7490 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7491 result = p->rval;
7492 break;
7493 }
7494 ctx->bytes_read += p->outdata.length();
7495 }
7496 ctx->reply->claim_op_out_data(ctx->ops);
7497 ctx->reply->get_header().data_off = ctx->data_off;
7498
7499 MOSDOpReply *reply = ctx->reply;
7500 ctx->reply = nullptr;
7501
7502 if (result >= 0) {
7503 if (!ctx->ignore_log_op_stats) {
7504 log_op_stats(ctx);
7505 publish_stats_to_osd();
7506 }
7507
7508 // on read, return the current object version
7509 if (ctx->obs) {
7510 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7511 } else {
7512 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7513 }
7514 } else if (result == -ENOENT) {
7515 // on ENOENT, set a floor for what the next user version will be.
7516 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7517 }
7518
7519 reply->set_result(result);
7520 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7521 osd->send_message_osd_client(reply, m->get_connection());
7522 close_op_ctx(ctx);
7523 }
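// Per-op result folding, as a standalone sketch (hypothetical mini-types;
// the loop above is the authoritative version): the reply result is the
// first negative per-op rval whose op did not set CEPH_OSD_OP_FLAG_FAILOK.
//
//   #include <vector>
//   struct MiniOp { int rval; bool failok; };
//   int fold_result(const std::vector<MiniOp>& ops) {
//     for (const auto& op : ops)
//       if (op.rval < 0 && !op.failok)
//         return op.rval;   // first hard failure wins
//     return 0;             // all ops succeeded or were marked FAILOK
//   }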
7524
7525 // ========================================================================
7526 // copyfrom
7527
7528 struct C_Copyfrom : public Context {
7529 PrimaryLogPGRef pg;
7530 hobject_t oid;
7531 epoch_t last_peering_reset;
7532 ceph_tid_t tid;
7533 PrimaryLogPG::CopyOpRef cop;
7534 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7535 const PrimaryLogPG::CopyOpRef& c)
7536 : pg(p), oid(o), last_peering_reset(lpr),
7537 tid(0), cop(c)
7538 {}
7539 void finish(int r) override {
7540 if (r == -ECANCELED)
7541 return;
7542 pg->lock();
7543 if (last_peering_reset == pg->get_last_peering_reset()) {
7544 pg->process_copy_chunk(oid, tid, r);
7545 }
7546 pg->unlock();
7547 }
7548 };
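// Note the guard idiom used here (and by C_Flush below): the completion
// captures last_peering_reset at submission time and, after retaking the PG
// lock, proceeds only if no peering reset happened in the interim -- a stale
// epoch means the copy state was already torn down. A minimal sketch of the
// idiom (the wrapper name and std::function body are hypothetical):
//
//   void finish_guarded(PrimaryLogPG *pg, epoch_t submitted_lpr,
//                       std::function<void()> body) {
//     pg->lock();
//     if (submitted_lpr == pg->get_last_peering_reset())
//       body();     // PG state is still the one we submitted against
//     pg->unlock(); // otherwise silently drop the stale completion
//   }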
7549
7550 struct C_CopyFrom_AsyncReadCb : public Context {
7551 OSDOp *osd_op;
7552 object_copy_data_t reply_obj;
7553 uint64_t features;
7554 size_t len;
7555 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7556 osd_op(osd_op), features(features), len(0) {}
7557 void finish(int r) override {
7558 assert(len > 0);
7559 assert(len <= reply_obj.data.length());
7560 bufferlist bl;
7561 bl.substr_of(reply_obj.data, 0, len);
7562 reply_obj.data.swap(bl);
7563 ::encode(reply_obj, osd_op->outdata, features);
7564 }
7565 };
7566
7567 int PrimaryLogPG::fill_in_copy_get(
7568 OpContext *ctx,
7569 bufferlist::iterator& bp,
7570 OSDOp& osd_op,
7571 ObjectContextRef &obc)
7572 {
7573 object_info_t& oi = obc->obs.oi;
7574 hobject_t& soid = oi.soid;
7575 int result = 0;
7576 object_copy_cursor_t cursor;
7577 uint64_t out_max;
7578 try {
7579 ::decode(cursor, bp);
7580 ::decode(out_max, bp);
7581 }
7582 catch (buffer::error& e) {
7583 result = -EINVAL;
7584 return result;
7585 }
7586
7587 const MOSDOp *op = static_cast<const MOSDOp*>(ctx->op->get_req());
7588 uint64_t features = op->get_features();
7589
7590 bool async_read_started = false;
7591 object_copy_data_t _reply_obj;
7592 C_CopyFrom_AsyncReadCb *cb = NULL;
7593 if (pool.info.require_rollback()) {
7594 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7595 }
7596 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7597 // size, mtime
7598 reply_obj.size = oi.size;
7599 reply_obj.mtime = oi.mtime;
7600 assert(obc->ssc);
7601 if (soid.snap < CEPH_NOSNAP) {
7602 if (obc->ssc->snapset.is_legacy()) {
7603 reply_obj.snaps = oi.legacy_snaps;
7604 } else {
7605 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7606 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7607 reply_obj.snaps = p->second;
7608 }
7609 } else {
7610 reply_obj.snap_seq = obc->ssc->snapset.seq;
7611 }
7612 if (oi.is_data_digest()) {
7613 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7614 reply_obj.data_digest = oi.data_digest;
7615 }
7616 if (oi.is_omap_digest()) {
7617 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7618 reply_obj.omap_digest = oi.omap_digest;
7619 }
7620 reply_obj.truncate_seq = oi.truncate_seq;
7621 reply_obj.truncate_size = oi.truncate_size;
7622
7623 // attrs
7624 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7625 if (!cursor.attr_complete) {
7626 result = getattrs_maybe_cache(
7627 ctx->obc,
7628 &out_attrs,
7629 true);
7630 if (result < 0) {
7631 if (cb) {
7632 delete cb;
7633 }
7634 return result;
7635 }
7636 cursor.attr_complete = true;
7637 dout(20) << " got attrs" << dendl;
7638 }
7639
7640 int64_t left = out_max - osd_op.outdata.length();
7641
7642 // data
7643 bufferlist& bl = reply_obj.data;
7644 if (left > 0 && !cursor.data_complete) {
7645 if (cursor.data_offset < oi.size) {
7646 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7647 if (cb) {
7648 async_read_started = true;
7649 ctx->pending_async_reads.push_back(
7650 make_pair(
7651 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7652 make_pair(&bl, cb)));
7653 result = max_read;
7654 cb->len = result;
7655 } else {
7656 result = pgbackend->objects_read_sync(
7657 oi.soid, cursor.data_offset, left, osd_op.op.flags, &bl);
7658 if (result < 0)
7659 return result;
7660 }
7661 assert(result <= left);
7662 left -= result;
7663 cursor.data_offset += result;
7664 }
7665 if (cursor.data_offset == oi.size) {
7666 cursor.data_complete = true;
7667 dout(20) << " got data" << dendl;
7668 }
7669 assert(cursor.data_offset <= oi.size);
7670 }
7671
7672 // omap
7673 uint32_t omap_keys = 0;
7674 if (!pool.info.supports_omap() || !oi.is_omap()) {
7675 cursor.omap_complete = true;
7676 } else {
7677 if (left > 0 && !cursor.omap_complete) {
7678 assert(cursor.data_complete);
7679 if (cursor.omap_offset.empty()) {
7680 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7681 &reply_obj.omap_header);
7682 }
7683 bufferlist omap_data;
7684 ObjectMap::ObjectMapIterator iter =
7685 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7686 assert(iter);
7687 iter->upper_bound(cursor.omap_offset);
7688 for (; iter->valid(); iter->next(false)) {
7689 ++omap_keys;
7690 ::encode(iter->key(), omap_data);
7691 ::encode(iter->value(), omap_data);
7692 left -= iter->key().length() + 4 + iter->value().length() + 4;
7693 if (left <= 0)
7694 break;
7695 }
7696 if (omap_keys) {
7697 ::encode(omap_keys, reply_obj.omap_data);
7698 reply_obj.omap_data.claim_append(omap_data);
7699 }
7700 if (iter->valid()) {
7701 cursor.omap_offset = iter->key();
7702 } else {
7703 cursor.omap_complete = true;
7704 dout(20) << " got omap" << dendl;
7705 }
7706 }
7707 }
7708
7709 if (cursor.is_complete()) {
7710 // include reqids only in the final step. this is a bit fragile
7711 // but it works...
7712 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7713 dout(20) << " got reqids" << dendl;
7714 }
7715
7716 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7717 << " " << out_attrs.size() << " attrs"
7718 << " " << bl.length() << " bytes"
7719 << " " << reply_obj.omap_header.length() << " omap header bytes"
7720 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7721 << omap_keys << " keys"
7722 << " " << reply_obj.reqids.size() << " reqids"
7723 << dendl;
7724 reply_obj.cursor = cursor;
7725 if (!async_read_started) {
7726 ::encode(reply_obj, osd_op.outdata, features);
7727 }
7728 if (cb && !async_read_started) {
7729 delete cb;
7730 }
7731 result = 0;
7732 return result;
7733 }
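// Wire-format note for the omap chunk assembled above: omap_data carries a
// u32 key count followed by alternating encoded key (string) and value
// (bufferlist) pairs -- byte-for-byte the encoding of a
// map<string,bufferlist>, which is why _write_copy_chunk below can simply
// ::decode() it into a map. The "+ 4"s in the size accounting are the u32
// length headers each string/bufferlist encoding carries:
//
//   // [u32 count][u32 klen][klen bytes][u32 vlen][vlen bytes]...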
7734
7735 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7736 OSDOp& osd_op)
7737 {
7738 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7739 // be careful not to modify anything else that will upset a racing
7740 // operator<<
7741 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7742 uint64_t features = m->get_features();
7743 object_copy_data_t reply_obj;
7744
7745 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7746 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7747 ::encode(reply_obj, osd_op.outdata, features);
7748 osd_op.rval = -ENOENT;
7749 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7750 reply->claim_op_out_data(m->ops);
7751 reply->set_result(-ENOENT);
7752 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7753 osd->send_message_osd_client(reply, m->get_connection());
7754 }
7755
7756 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7757 hobject_t src, object_locator_t oloc,
7758 version_t version, unsigned flags,
7759 bool mirror_snapset,
7760 unsigned src_obj_fadvise_flags,
7761 unsigned dest_obj_fadvise_flags)
7762 {
7763 const hobject_t& dest = obc->obs.oi.soid;
7764 dout(10) << __func__ << " " << dest
7765 << " from " << src << " " << oloc << " v" << version
7766 << " flags " << flags
7767 << (mirror_snapset ? " mirror_snapset" : "")
7768 << dendl;
7769
7770 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7771 src.snap == CEPH_SNAPDIR));
7772
7773 // cancel a previous in-progress copy?
7774 if (copy_ops.count(dest)) {
7775 // FIXME: if the src etc match, we could avoid restarting from the
7776 // beginning.
7777 CopyOpRef cop = copy_ops[dest];
7778 cancel_copy(cop, false);
7779 }
7780
7781 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
7782 mirror_snapset, src_obj_fadvise_flags,
7783 dest_obj_fadvise_flags));
7784 copy_ops[dest] = cop;
7785 obc->start_block();
7786
7787 _copy_some(obc, cop);
7788 }
7789
7790 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
7791 {
7792 dout(10) << __func__ << " " << obc << " " << cop << dendl;
7793
7794 unsigned flags = 0;
7795 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
7796 flags |= CEPH_OSD_FLAG_FLUSH;
7797 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
7798 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
7799 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
7800 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
7801 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
7802 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
7803 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
7804 flags |= CEPH_OSD_FLAG_RWORDERED;
7805
7806 C_GatherBuilder gather(cct);
7807
7808 if (cop->cursor.is_initial() && cop->mirror_snapset) {
7809 // list snaps too.
7810 assert(cop->src.snap == CEPH_NOSNAP);
7811 ObjectOperation op;
7812 op.list_snaps(&cop->results.snapset, NULL);
7813 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7814 CEPH_SNAPDIR, NULL,
7815 flags, gather.new_sub(), NULL);
7816 cop->objecter_tid2 = tid;
7817 }
7818
7819 ObjectOperation op;
7820 if (cop->results.user_version) {
7821 op.assert_version(cop->results.user_version);
7822 } else {
7823 // we should learn the version after the first chunk, if we didn't know
7824 // it already!
7825 assert(cop->cursor.is_initial());
7826 }
7827 op.copy_get(&cop->cursor, get_copy_chunk_size(),
7828 &cop->results.object_size, &cop->results.mtime,
7829 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
7830 &cop->results.snaps, &cop->results.snap_seq,
7831 &cop->results.flags,
7832 &cop->results.source_data_digest,
7833 &cop->results.source_omap_digest,
7834 &cop->results.reqids,
7835 &cop->results.truncate_seq,
7836 &cop->results.truncate_size,
7837 &cop->rval);
7838 op.set_last_op_flags(cop->src_obj_fadvise_flags);
7839
7840 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
7841 get_last_peering_reset(), cop);
7842 gather.set_finisher(new C_OnFinisher(fin,
7843 &osd->objecter_finisher));
7844
7845 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7846 cop->src.snap, NULL,
7847 flags,
7848 gather.new_sub(),
7849 // discover the object version if we don't know it yet
7850 cop->results.user_version ? NULL : &cop->results.user_version);
7851 fin->tid = tid;
7852 cop->objecter_tid = tid;
7853 gather.activate();
7854 }
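// The gather above is the standard C_GatherBuilder idiom: one sub-Context
// per outstanding objecter read (the optional list_snaps plus the copy_get),
// with the finisher fired only once every sub has completed. A minimal
// sketch of the idiom (issue_async_op_* and C_DoneAll are hypothetical):
//
//   C_GatherBuilder gather(cct);
//   issue_async_op_a(gather.new_sub());   // each new_sub() is one completion
//   issue_async_op_b(gather.new_sub());
//   gather.set_finisher(new C_DoneAll);   // runs after all subs finish
//   gather.activate();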
7855
7856 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
7857 {
7858 dout(10) << __func__ << " " << oid << " tid " << tid
7859 << " " << cpp_strerror(r) << dendl;
7860 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
7861 if (p == copy_ops.end()) {
7862 dout(10) << __func__ << " no copy_op found" << dendl;
7863 return;
7864 }
7865 CopyOpRef cop = p->second;
7866 if (tid != cop->objecter_tid) {
7867 dout(10) << __func__ << " tid " << tid << " != cop " << cop
7868 << " tid " << cop->objecter_tid << dendl;
7869 return;
7870 }
7871
7872 if (cop->omap_data.length() || cop->omap_header.length())
7873 cop->results.has_omap = true;
7874
7875 if (r >= 0 && !pool.info.supports_omap() &&
7876 (cop->omap_data.length() || cop->omap_header.length())) {
7877 r = -EOPNOTSUPP;
7878 }
7879 cop->objecter_tid = 0;
7880 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
7881 ObjectContextRef& cobc = cop->obc;
7882
7883 if (r < 0)
7884 goto out;
7885
7886 assert(cop->rval >= 0);
7887
7888 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
7889 // verify snap hasn't been deleted
7890 vector<snapid_t>::iterator p = cop->results.snaps.begin();
7891 while (p != cop->results.snaps.end()) {
7892 if (pool.info.is_removed_snap(*p)) {
7893 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
7894 << dendl;
7895 for (vector<snapid_t>::iterator q = p + 1;
7896 q != cop->results.snaps.end();
7897 ++q)
7898 *(q - 1) = *q;
7899 cop->results.snaps.resize(cop->results.snaps.size() - 1);
7900 } else {
7901 ++p;
7902 }
7903 }
7904 if (cop->results.snaps.empty()) {
7905 dout(10) << __func__ << " no more snaps for " << oid << dendl;
7906 r = -ENOENT;
7907 goto out;
7908 }
7909 }
7910
7911 assert(cop->rval >= 0);
7912
7913 if (!cop->temp_cursor.data_complete) {
7914 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
7915 }
7916 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
7917 if (cop->omap_header.length()) {
7918 cop->results.omap_digest =
7919 cop->omap_header.crc32c(cop->results.omap_digest);
7920 }
7921 if (cop->omap_data.length()) {
7922 bufferlist keys;
7923 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
7924 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
7925 }
7926 }
7927
7928 if (!cop->temp_cursor.attr_complete) {
7929 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
7930 p != cop->attrs.end();
7931 ++p) {
7932 cop->results.attrs[string("_") + p->first] = p->second;
7933 }
7934 cop->attrs.clear();
7935 }
7936
7937 if (!cop->cursor.is_complete()) {
7938 // write out what we have so far
7939 if (cop->temp_cursor.is_initial()) {
7940 assert(!cop->results.started_temp_obj);
7941 cop->results.started_temp_obj = true;
7942 cop->results.temp_oid = generate_temp_object(oid);
7943 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
7944 }
7945 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
7946 OpContextUPtr ctx = simple_opc_create(tempobc);
7947 if (cop->temp_cursor.is_initial()) {
7948 ctx->new_temp_oid = cop->results.temp_oid;
7949 }
7950 _write_copy_chunk(cop, ctx->op_t.get());
7951 simple_opc_submit(std::move(ctx));
7952 dout(10) << __func__ << " fetching more" << dendl;
7953 _copy_some(cobc, cop);
7954 return;
7955 }
7956
7957 // verify digests?
7958 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
7959 dout(20) << __func__ << std::hex
7960 << " got digest: rx data 0x" << cop->results.data_digest
7961 << " omap 0x" << cop->results.omap_digest
7962 << ", source: data 0x" << cop->results.source_data_digest
7963 << " omap 0x" << cop->results.source_omap_digest
7964 << std::dec
7965 << " flags " << cop->results.flags
7966 << dendl;
7967 }
7968 if (cop->results.is_data_digest() &&
7969 cop->results.data_digest != cop->results.source_data_digest) {
7970 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
7971 << " != source 0x" << cop->results.source_data_digest << std::dec
7972 << dendl;
7973 osd->clog->error() << info.pgid << " copy from " << cop->src
7974 << " to " << cop->obc->obs.oi.soid << std::hex
7975 << " data digest 0x" << cop->results.data_digest
7976 << " != source 0x" << cop->results.source_data_digest
7977 << std::dec;
7978 r = -EIO;
7979 goto out;
7980 }
7981 if (cop->results.is_omap_digest() &&
7982 cop->results.omap_digest != cop->results.source_omap_digest) {
7983 derr << __func__ << std::hex
7984 << " omap digest 0x" << cop->results.omap_digest
7985 << " != source 0x" << cop->results.source_omap_digest
7986 << std::dec << dendl;
7987 osd->clog->error() << info.pgid << " copy from " << cop->src
7988 << " to " << cop->obc->obs.oi.soid << std::hex
7989 << " omap digest 0x" << cop->results.omap_digest
7990 << " != source 0x" << cop->results.source_omap_digest
7991 << std::dec;
7992 r = -EIO;
7993 goto out;
7994 }
7995 if (cct->_conf->osd_debug_inject_copyfrom_error) {
7996 derr << __func__ << " injecting copyfrom failure" << dendl;
7997 r = -EIO;
7998 goto out;
7999 }
8000
8001 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8002 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8003 ObjectState& obs = cop->obc->obs;
8004 if (cop->temp_cursor.is_initial()) {
8005 dout(20) << "fill_in_final_tx: writing "
8006 << "directly to final object" << dendl;
8007 // write directly to final object
8008 cop->results.temp_oid = obs.oi.soid;
8009 _write_copy_chunk(cop, t);
8010 } else {
8011 // finish writing to temp object, then move into place
8012 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8013 _write_copy_chunk(cop, t);
8014 t->rename(obs.oi.soid, cop->results.temp_oid);
8015 }
8016 t->setattrs(obs.oi.soid, cop->results.attrs);
8017 });
8018
8019 dout(20) << __func__ << " success; committing" << dendl;
8020
8021 out:
8022 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8023 CopyCallbackResults results(r, &cop->results);
8024 cop->cb->complete(results);
8025
8026 copy_ops.erase(cobc->obs.oi.soid);
8027 cobc->stop_block();
8028
8029 if (r < 0 && cop->results.started_temp_obj) {
8030 dout(10) << __func__ << " deleting partial temp object "
8031 << cop->results.temp_oid << dendl;
8032 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8033 OpContextUPtr ctx = simple_opc_create(tempobc);
8034 ctx->op_t->remove(cop->results.temp_oid);
8035 ctx->discard_temp_oid = cop->results.temp_oid;
8036 simple_opc_submit(std::move(ctx));
8037 }
8038
8039 // cancel and requeue proxy ops on this object
8040 if (!r) {
8041 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8042 it != proxyread_ops.end();) {
8043 if (it->second->soid == cobc->obs.oi.soid) {
8044 cancel_proxy_read((it++)->second);
8045 } else {
8046 ++it;
8047 }
8048 }
8049 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8050 it != proxywrite_ops.end();) {
8051 if (it->second->soid == cobc->obs.oi.soid) {
8052 cancel_proxy_write((it++)->second);
8053 } else {
8054 ++it;
8055 }
8056 }
8057 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8058 }
8059
8060 kick_object_context_blocked(cobc);
8061 }
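// Digest note: the per-chunk crc32c calls above chain -- feeding the running
// digest back in as the seed makes the accumulated value equal the crc of
// the concatenated data, so a multi-chunk copy and a whole-object read yield
// the same data_digest. Sketch of the chaining (bufferlist API as used
// above; -1 is this codebase's conventional initial seed):
//
//   uint32_t digest = -1;
//   digest = chunk1.crc32c(digest);   // bufferlist::crc32c(seed)
//   digest = chunk2.crc32c(digest);   // == concat(chunk1,chunk2).crc32c(-1)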
8062
8063 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8064 {
8065 dout(20) << __func__ << " " << cop
8066 << " " << cop->attrs.size() << " attrs"
8067 << " " << cop->data.length() << " bytes"
8068 << " " << cop->omap_header.length() << " omap header bytes"
8069 << " " << cop->omap_data.length() << " omap data bytes"
8070 << dendl;
8071 if (!cop->temp_cursor.attr_complete) {
8072 t->create(cop->results.temp_oid);
8073 }
8074 if (!cop->temp_cursor.data_complete) {
8075 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8076 cop->cursor.data_offset);
8077 if (pool.info.requires_aligned_append() &&
8078 !cop->cursor.data_complete) {
8079 /**
8080 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8081 * to pick it up on the next pass.
8082 */
8083 assert(cop->temp_cursor.data_offset %
8084 pool.info.required_alignment() == 0);
8085 if (cop->data.length() % pool.info.required_alignment() != 0) {
8086 uint64_t to_trim =
8087 cop->data.length() % pool.info.required_alignment();
8088 bufferlist bl;
8089 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8090 cop->data.swap(bl);
8091 cop->cursor.data_offset -= to_trim;
8092 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8093 cop->cursor.data_offset);
8094 }
8095 }
8096 if (cop->data.length()) {
8097 t->write(
8098 cop->results.temp_oid,
8099 cop->temp_cursor.data_offset,
8100 cop->data.length(),
8101 cop->data,
8102 cop->dest_obj_fadvise_flags);
8103 }
8104 cop->data.clear();
8105 }
8106 if (pool.info.supports_omap()) {
8107 if (!cop->temp_cursor.omap_complete) {
8108 if (cop->omap_header.length()) {
8109 t->omap_setheader(
8110 cop->results.temp_oid,
8111 cop->omap_header);
8112 cop->omap_header.clear();
8113 }
8114 if (cop->omap_data.length()) {
8115 map<string,bufferlist> omap;
8116 bufferlist::iterator p = cop->omap_data.begin();
8117 ::decode(omap, p);
8118 t->omap_setkeys(cop->results.temp_oid, omap);
8119 cop->omap_data.clear();
8120 }
8121 }
8122 } else {
8123 assert(cop->omap_header.length() == 0);
8124 assert(cop->omap_data.length() == 0);
8125 }
8126 cop->temp_cursor = cop->cursor;
8127 }
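// Worked example of the unaligned-tail trim above: with
// required_alignment()=4096 and a 10000-byte chunk landing at an aligned
// temp_cursor.data_offset, to_trim = 10000 % 4096 = 1808, so 8192 bytes are
// written now and cursor.data_offset is pulled back by 1808; the next
// copy-get re-fetches the trimmed tail, keeping every append aligned.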
8128
8129 void PrimaryLogPG::finish_copyfrom(OpContext *ctx)
8130 {
8131 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8132 ObjectState& obs = ctx->new_obs;
8133 CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
8134
8135 if (obs.exists) {
8136 dout(20) << __func__ << ": exists, removing" << dendl;
8137 ctx->op_t->remove(obs.oi.soid);
8138 } else {
8139 ctx->delta_stats.num_objects++;
8140 obs.exists = true;
8141 }
8142 if (cb->is_temp_obj_used()) {
8143 ctx->discard_temp_oid = cb->results->temp_oid;
8144 }
8145 cb->results->fill_in_final_tx(ctx->op_t.get());
8146
8147 // CopyFromCallback fills this in for us
8148 obs.oi.user_version = ctx->user_at_version;
8149
8150 obs.oi.set_data_digest(cb->results->data_digest);
8151 obs.oi.set_omap_digest(cb->results->omap_digest);
8152
8153 obs.oi.truncate_seq = cb->results->truncate_seq;
8154 obs.oi.truncate_size = cb->results->truncate_size;
8155
8156 ctx->extra_reqids = cb->results->reqids;
8157
8158 // cache: clear whiteout?
8159 if (obs.oi.is_whiteout()) {
8160 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8161 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8162 --ctx->delta_stats.num_whiteouts;
8163 }
8164
8165 if (cb->results->has_omap) {
8166 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8167 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8168 } else {
8169 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8170 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8171 }
8172
8173 interval_set<uint64_t> ch;
8174 if (obs.oi.size > 0)
8175 ch.insert(0, obs.oi.size);
8176 ctx->modified_ranges.union_of(ch);
8177
8178 if (cb->get_data_size() != obs.oi.size) {
8179 ctx->delta_stats.num_bytes -= obs.oi.size;
8180 obs.oi.size = cb->get_data_size();
8181 ctx->delta_stats.num_bytes += obs.oi.size;
8182 }
8183 ctx->delta_stats.num_wr++;
8184 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8185
8186 osd->logger->inc(l_osd_copyfrom);
8187 }
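// Stats note: SHIFT_ROUND_UP(x, 10) is a power-of-two ceiling division,
// i.e. the size in KiB rounded up -- (x + (1<<10) - 1) >> 10 assuming the
// usual definition of the macro -- so a 1536-byte object accounts for
// 2 in num_wr_kb.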
8188
8189 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8190 ObjectContextRef obc)
8191 {
8192 const hobject_t& soid = obc->obs.oi.soid;
8193 dout(10) << __func__ << " " << soid << " r=" << r
8194 << " uv" << results->user_version << dendl;
8195
8196 if (r == -ECANCELED) {
8197 return;
8198 }
8199
8200 if (r != -ENOENT && soid.is_snap()) {
8201 if (results->snaps.empty()) {
8202 // we must have read "snap" content from the head object in
8203 // the base pool. use snap_seq to construct what snaps should
8204 // be for this clone (what it was before we evicted the clean
8205 // clone from this pool, and what it will be when we flush and
8206 // the clone eventually happens in the base pool).
8207 SnapSet& snapset = obc->ssc->snapset;
8208 vector<snapid_t>::iterator p = snapset.snaps.begin();
8209 while (p != snapset.snaps.end() && *p > soid.snap)
8210 ++p;
8211 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8212 results->snaps.push_back(*p);
8213 ++p;
8214 }
8215 }
8216
8217 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8218 filter_snapc(results->snaps);
8219
8220 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8221 if (results->snaps.empty()) {
8222 dout(20) << __func__
8223 << " snaps are empty, clone is invalid,"
8224 << " setting r to ENOENT" << dendl;
8225 r = -ENOENT;
8226 }
8227 }
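// Worked example of the reconstruction and filter above: with snapset.snaps
// = [10,8,6,4] (newest first), soid.snap = 8 and results->snap_seq = 5, the
// first loop skips 10 (> 8) and the second collects 8 and 6 (both >
// snap_seq), giving snaps [8,6] for the promoted clone; filter_snapc then
// drops any of those that were deleted while the clone lived only in the
// base pool, and an empty result marks the clone invalid (-ENOENT).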
8228
8229 if (r < 0 && results->started_temp_obj) {
8230 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8231 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8232 assert(tempobc);
8233 OpContextUPtr ctx = simple_opc_create(tempobc);
8234 ctx->op_t->remove(results->temp_oid);
8235 simple_opc_submit(std::move(ctx));
8236 results->started_temp_obj = false;
8237 }
8238
8239 if (r == -ENOENT && soid.is_snap()) {
8240 dout(10) << __func__
8241 << ": enoent while trying to promote clone, " << soid
8242 << " must have been trimmed, removing from snapset"
8243 << dendl;
8244 hobject_t head(soid.get_head());
8245 ObjectContextRef obc = get_object_context(head, false);
8246 assert(obc);
8247
8248 OpContextUPtr tctx = simple_opc_create(obc);
8249 tctx->at_version = get_next_version();
8250 filter_snapc(tctx->new_snapset.snaps);
8251 vector<snapid_t> new_clones;
8252 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8253 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8254 i != tctx->new_snapset.clones.end();
8255 ++i) {
8256 if (*i != soid.snap) {
8257 new_clones.push_back(*i);
8258 auto p = tctx->new_snapset.clone_snaps.find(*i);
8259 if (p != tctx->new_snapset.clone_snaps.end()) {
8260 new_clone_snaps[*i] = p->second;
8261 }
8262 }
8263 }
8264 tctx->new_snapset.clones.swap(new_clones);
8265 tctx->new_snapset.clone_overlap.erase(soid.snap);
8266 tctx->new_snapset.clone_size.erase(soid.snap);
8267 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8268
8269 // take RWWRITE lock for duration of our local write. ignore starvation.
8270 if (!tctx->lock_manager.take_write_lock(
8271 head,
8272 obc)) {
8273 assert(0 == "problem!");
8274 }
8275 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8276
8277 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8278
8279 simple_opc_submit(std::move(tctx));
8280 return;
8281 }
8282
8283 bool whiteout = false;
8284 if (r == -ENOENT) {
8285 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8286 dout(10) << __func__ << " whiteout " << soid << dendl;
8287 whiteout = true;
8288 }
8289
8290 if (r < 0 && !whiteout) {
8291 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8292 // pass error to everyone blocked on this object
8293 // FIXME: this is pretty sloppy, but at this point we got
8294 // something unexpected and don't have many other options.
8295 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8296 waiting_for_blocked_object.find(soid);
8297 if (blocked_iter != waiting_for_blocked_object.end()) {
8298 while (!blocked_iter->second.empty()) {
8299 osd->reply_op_error(blocked_iter->second.front(), r);
8300 blocked_iter->second.pop_front();
8301 }
8302 waiting_for_blocked_object.erase(blocked_iter);
8303 }
8304 return;
8305 }
8306
8307 osd->promote_finish(results->object_size);
8308
8309 OpContextUPtr tctx = simple_opc_create(obc);
8310 tctx->at_version = get_next_version();
8311
8312 ++tctx->delta_stats.num_objects;
8313 if (soid.snap < CEPH_NOSNAP)
8314 ++tctx->delta_stats.num_object_clones;
8315 tctx->new_obs.exists = true;
8316
8317 tctx->extra_reqids = results->reqids;
8318
8319 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8320 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8321
8322 if (whiteout) {
8323 // create a whiteout
8324 tctx->op_t->create(soid);
8325 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8326 ++tctx->delta_stats.num_whiteouts;
8327 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8328 osd->logger->inc(l_osd_tier_whiteout);
8329 } else {
8330 if (results->has_omap) {
8331 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8332 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8333 ++tctx->delta_stats.num_objects_omap;
8334 }
8335
8336 results->fill_in_final_tx(tctx->op_t.get());
8337 if (results->started_temp_obj) {
8338 tctx->discard_temp_oid = results->temp_oid;
8339 }
8340 tctx->new_obs.oi.size = results->object_size;
8341 tctx->new_obs.oi.user_version = results->user_version;
8342 // It doesn't matter whether the src object has data or omap digests
8343 if (results->object_size)
8344 tctx->new_obs.oi.set_data_digest(results->data_digest);
8345 if (results->has_omap)
8346 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8347 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8348 tctx->new_obs.oi.truncate_size = results->truncate_size;
8349
8350 if (soid.snap != CEPH_NOSNAP) {
8351 if (legacy_snapset) {
8352 tctx->new_obs.oi.legacy_snaps = results->snaps;
8353 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8354 } else {
8355 // it's already in the snapset
8356 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8357 }
8358 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8359 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8360 results->object_size);
8361 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8362
8363 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8364 } else {
8365 tctx->delta_stats.num_bytes += results->object_size;
8366 }
8367 }
8368
8369 if (results->mirror_snapset) {
8370 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8371 tctx->new_snapset.from_snap_set(
8372 results->snapset,
8373 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8374 }
8375 tctx->new_snapset.head_exists = true;
8376 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8377
8378 // take RWWRITE lock for duration of our local write. ignore starvation.
8379 if (!tctx->lock_manager.take_write_lock(
8380 obc->obs.oi.soid,
8381 obc)) {
8382 assert(0 == "problem!");
8383 }
8384 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8385
8386 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8387
8388 simple_opc_submit(std::move(tctx));
8389
8390 osd->logger->inc(l_osd_tier_promote);
8391
8392 if (agent_state &&
8393 agent_state->is_idle())
8394 agent_choose_mode();
8395 }
8396
8397 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8398 {
8399 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8400 << " from " << cop->src << " " << cop->oloc
8401 << " v" << cop->results.user_version << dendl;
8402
8403 // cancel objecter op, if we can
8404 if (cop->objecter_tid) {
8405 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8406 cop->objecter_tid = 0;
8407 if (cop->objecter_tid2) {
8408 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8409 cop->objecter_tid2 = 0;
8410 }
8411 }
8412
8413 copy_ops.erase(cop->obc->obs.oi.soid);
8414 cop->obc->stop_block();
8415
8416 kick_object_context_blocked(cop->obc);
8417 cop->results.should_requeue = requeue;
8418 CopyCallbackResults result(-ECANCELED, &cop->results);
8419 cop->cb->complete(result);
8420
8421 // There may still be an objecter callback referencing this copy op.
8422 // That callback will not need the obc since it's been canceled, and
8423 // we need the obc reference to go away prior to flush.
8424 cop->obc = ObjectContextRef();
8425 }
8426
8427 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8428 {
8429 dout(10) << __func__ << dendl;
8430 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8431 while (p != copy_ops.end()) {
8432 // requeue this op? can I queue up all of them?
8433 cancel_copy((p++)->second, requeue);
8434 }
8435 }
8436
8437
8438 // ========================================================================
8439 // flush
8440 //
8441 // Flush a dirty object in the cache tier by writing it back to the
8442 // base tier. The sequence looks like:
8443 //
8444 // * send a copy-from operation to the base tier to copy the current
8445 // version of the object
8446 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8447 // * on completion, we check if the object has been modified. if so,
8448 // just reply with -EAGAIN.
8449 // * try to take a write lock so we can clear the dirty flag. if this
8450 // fails, wait and retry
8451 // * start a repop that clears the bit.
8452 //
8453 // If we have to wait, we will retry by coming back through the
8454 // start_flush method. We check if a flush is already in progress
8455 // and, if so, try to finish it by rechecking the version and trying
8456 // to clear the dirty bit.
8457 //
8458 // In order for the cache-flush (a write op) to not block the copy-get
8459 // from reading the object, the client *must* set the SKIPRWLOCKS
8460 // flag.
8461 //
8462 // NOTE: normally writes are strictly ordered for the client, but
8463 // flushes are special in that they can be reordered with respect to
8464 // other writes. In particular, we can't have a flush request block
8465 // an update to the cache pool object!
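// What the client side of this protocol looks like, as a hedged librados
// sketch (API names per librados.hpp of this era; the object name and the
// surrounding setup are made up for illustration):
//
//   #include "include/rados/librados.hpp"
//   void flush_one(librados::Rados& cluster, librados::IoCtx& cache_ioctx,
//                  const std::string& oid) {
//     librados::ObjectReadOperation op;
//     op.cache_try_flush();  // non-blocking variant; cache_flush() blocks
//     librados::AioCompletion *c = cluster.aio_create_completion();
//     cache_ioctx.aio_operate(oid, c, &op,
//                             librados::OPERATION_IGNORE_CACHE |
//                             librados::OPERATION_IGNORE_OVERLAY |
//                             librados::OPERATION_SKIPRWLOCKS, // see NOTE above
//                             nullptr);
//     c->wait_for_complete();
//     c->release();
//   }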
8466
8467 struct C_Flush : public Context {
8468 PrimaryLogPGRef pg;
8469 hobject_t oid;
8470 epoch_t last_peering_reset;
8471 ceph_tid_t tid;
8472 utime_t start;
8473 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8474 : pg(p), oid(o), last_peering_reset(lpr),
8475 tid(0), start(ceph_clock_now())
8476 {}
8477 void finish(int r) override {
8478 if (r == -ECANCELED)
8479 return;
8480 pg->lock();
8481 if (last_peering_reset == pg->get_last_peering_reset()) {
8482 pg->finish_flush(oid, tid, r);
8483 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8484 }
8485 pg->unlock();
8486 }
8487 };
8488
8489 int PrimaryLogPG::start_flush(
8490 OpRequestRef op, ObjectContextRef obc,
8491 bool blocking, hobject_t *pmissing,
8492 boost::optional<std::function<void()>> &&on_flush)
8493 {
8494 const object_info_t& oi = obc->obs.oi;
8495 const hobject_t& soid = oi.soid;
8496 dout(10) << __func__ << " " << soid
8497 << " v" << oi.version
8498 << " uv" << oi.user_version
8499 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8500 << dendl;
8501
8502 // get a filtered snapset; snaps that have since been deleted must be excluded
8503 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8504
8505 // check that no older clone is still dirty; if one is, it must be flushed first
8506 {
8507 dout(20) << " snapset " << snapset << dendl;
8508 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8509 while (p != snapset.clones.rend() && *p >= soid.snap)
8510 ++p;
8511 if (p != snapset.clones.rend()) {
8512 hobject_t next = soid;
8513 next.snap = *p;
8514 assert(next.snap < soid.snap);
8515 if (pg_log.get_missing().is_missing(next)) {
8516 dout(10) << __func__ << " missing clone is " << next << dendl;
8517 if (pmissing)
8518 *pmissing = next;
8519 return -ENOENT;
8520 }
8521 ObjectContextRef older_obc = get_object_context(next, false);
8522 if (older_obc) {
8523 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8524 << dendl;
8525 if (older_obc->obs.oi.is_dirty()) {
8526 dout(10) << __func__ << " next oldest clone is dirty: "
8527 << older_obc->obs.oi << dendl;
8528 return -EBUSY;
8529 }
8530 } else {
8531 dout(20) << __func__ << " next oldest clone " << next
8532 << " is not present; implicitly clean" << dendl;
8533 }
8534 } else {
8535 dout(20) << __func__ << " no older clones" << dendl;
8536 }
8537 }
8538
8539 if (blocking)
8540 obc->start_block();
8541
8542 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8543 if (p != flush_ops.end()) {
8544 FlushOpRef fop = p->second;
8545 if (fop->op == op) {
8546 // we couldn't take the write lock on a cache-try-flush before;
8547 // now we are trying again for the lock.
8548 return try_flush_mark_clean(fop);
8549 }
8550 if (fop->flushed_version == obc->obs.oi.user_version &&
8551 (fop->blocking || !blocking)) {
8552 // nonblocking can join anything
8553 // blocking can only join a blocking flush
8554 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8555 if (op)
8556 fop->dup_ops.push_back(op);
8557 return -EAGAIN; // clean up this ctx; op will retry later
8558 }
8559
8560 // cancel current flush since it will fail anyway, or because we
8561 // are blocking and the existing flush is nonblocking.
8562 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8563 if (fop->op)
8564 osd->reply_op_error(fop->op, -EBUSY);
8565 while (!fop->dup_ops.empty()) {
8566 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8567 fop->dup_ops.pop_front();
8568 }
8569 cancel_flush(fop, false);
8570 }
8571
8572 /**
8573 * In general, we need to send a delete and a copyfrom.
8574 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8575 * where 4 is marked as clean. To flush 10, we have to:
8576 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8577 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8578 *
8579 * There is a complicating case. Suppose there had been a clone 7
8580 * for snaps [7, 6] which has been trimmed since they no longer exist.
8581 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8582 * the delete, the snap will be promoted to 5, and the head will become
8583 * a snapdir. When the copy-from goes through, we'll end up with
8584 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8585 *
8586 * Another complication is the case where there is an interval change
8587 * after doing the delete and the flush but before marking the object
8588 * clean. We'll happily delete head and then recreate it at the same
8589 * sequence number, which works out ok.
8590 */
8591
8592 SnapContext snapc, dsnapc;
8593 if (snapset.seq != 0) {
8594 if (soid.snap == CEPH_NOSNAP) {
8595 snapc.seq = snapset.seq;
8596 snapc.snaps = snapset.snaps;
8597 } else {
8598 snapid_t min_included_snap;
8599 if (snapset.is_legacy()) {
8600 min_included_snap = oi.legacy_snaps.back();
8601 } else {
8602 auto p = snapset.clone_snaps.find(soid.snap);
8603 assert(p != snapset.clone_snaps.end());
8604 min_included_snap = p->second.back();
8605 }
8606 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8607 }
8608
8609 snapid_t prev_snapc = 0;
8610 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8611 citer != snapset.clones.rend();
8612 ++citer) {
8613 if (*citer < soid.snap) {
8614 prev_snapc = *citer;
8615 break;
8616 }
8617 }
8618
8619 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8620 }
8621
8622 object_locator_t base_oloc(soid);
8623 base_oloc.pool = pool.info.tier_of;
8624
8625 if (dsnapc.seq < snapc.seq) {
8626 ObjectOperation o;
8627 o.remove();
8628 osd->objecter->mutate(
8629 soid.oid,
8630 base_oloc,
8631 o,
8632 dsnapc,
8633 ceph::real_clock::from_ceph_timespec(oi.mtime),
8634 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8635 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8636 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8637 }
8638
8639 FlushOpRef fop(std::make_shared<FlushOp>());
8640 fop->obc = obc;
8641 fop->flushed_version = oi.user_version;
8642 fop->blocking = blocking;
8643 fop->on_flush = std::move(on_flush);
8644 fop->op = op;
8645
8646 ObjectOperation o;
8647 if (oi.is_whiteout()) {
8648 fop->removal = true;
8649 o.remove();
8650 } else {
8651 object_locator_t oloc(soid);
8652 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8653 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8654 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8655 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8656 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8657 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8658
8659 // hint that the base tier need not cache this data after the flush
8660 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8661 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8662 }
8663 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8664
8665 ceph_tid_t tid = osd->objecter->mutate(
8666 soid.oid, base_oloc, o, snapc,
8667 ceph::real_clock::from_ceph_timespec(oi.mtime),
8668 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8669 new C_OnFinisher(fin,
8670 &osd->objecter_finisher));
8671 /* we're under the pg lock and fin->finish() is grabbing that */
8672 fin->tid = tid;
8673 fop->objecter_tid = tid;
8674
8675 flush_ops[soid] = fop;
8676 info.stats.stats.sum.num_flush++;
8677 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8678 return -EINPROGRESS;
8679 }
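// Worked trace of the snapc/dsnapc selection above, using the example from
// the comment block: snapset 10:[10,9,8,4,3,2]:[10(10,9),4(4,3,2)], flushing
// clone 10 whose snaps are [10,9]. min_included_snap = 9, so snapc =
// get_ssc_as_of(8) = 8:[8,4,3,2]; the newest clone below 10 is 4, so dsnapc =
// get_ssc_as_of(4) = 4:[4,3,2]. Since dsnapc.seq(4) < snapc.seq(8), we first
// send the delete with 4:[4,3,2] and then the copy-from with 8:[8,4,3,2] --
// exactly steps 1) and 2) of the comment above.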
8680
8681 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8682 {
8683 dout(10) << __func__ << " " << oid << " tid " << tid
8684 << " " << cpp_strerror(r) << dendl;
8685 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8686 if (p == flush_ops.end()) {
8687 dout(10) << __func__ << " no flush_op found" << dendl;
8688 return;
8689 }
8690 FlushOpRef fop = p->second;
8691 if (tid != fop->objecter_tid) {
8692 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8693 << " tid " << fop->objecter_tid << dendl;
8694 return;
8695 }
8696 ObjectContextRef obc = fop->obc;
8697 fop->objecter_tid = 0;
8698
8699 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8700 if (fop->op)
8701 osd->reply_op_error(fop->op, -EBUSY);
8702 if (fop->blocking) {
8703 obc->stop_block();
8704 kick_object_context_blocked(obc);
8705 }
8706
8707 if (!fop->dup_ops.empty()) {
8708 dout(20) << __func__ << " requeueing dups" << dendl;
8709 requeue_ops(fop->dup_ops);
8710 }
8711 if (fop->on_flush) {
8712 (*(fop->on_flush))();
8713 fop->on_flush = boost::none;
8714 }
8715 flush_ops.erase(oid);
8716 return;
8717 }
8718
8719 r = try_flush_mark_clean(fop);
8720 if (r == -EBUSY && fop->op) {
8721 osd->reply_op_error(fop->op, r);
8722 }
8723 }
8724
8725 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8726 {
8727 ObjectContextRef obc = fop->obc;
8728 const hobject_t& oid = obc->obs.oi.soid;
8729
8730 if (fop->blocking) {
8731 obc->stop_block();
8732 kick_object_context_blocked(obc);
8733 }
8734
8735 if (fop->flushed_version != obc->obs.oi.user_version ||
8736 !obc->obs.exists) {
8737 if (obc->obs.exists)
8738 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8739 << " != current " << obc->obs.oi.user_version
8740 << dendl;
8741 else
8742 dout(10) << __func__ << " object no longer exists" << dendl;
8743
8744 if (!fop->dup_ops.empty()) {
8745 dout(20) << __func__ << " requeueing dups" << dendl;
8746 requeue_ops(fop->dup_ops);
8747 }
8748 if (fop->on_flush) {
8749 (*(fop->on_flush))();
8750 fop->on_flush = boost::none;
8751 }
8752 flush_ops.erase(oid);
8753 if (fop->blocking)
8754 osd->logger->inc(l_osd_tier_flush_fail);
8755 else
8756 osd->logger->inc(l_osd_tier_try_flush_fail);
8757 return -EBUSY;
8758 }
8759
8760 if (!fop->blocking &&
8761 scrubber.write_blocked_by_scrub(oid)) {
8762 if (fop->op) {
8763 dout(10) << __func__ << " blocked by scrub" << dendl;
8764 requeue_op(fop->op);
8765 requeue_ops(fop->dup_ops);
8766 return -EAGAIN; // will retry
8767 } else {
8768 osd->logger->inc(l_osd_tier_try_flush_fail);
8769 cancel_flush(fop, false);
8770 return -ECANCELED;
8771 }
8772 }
8773
8774 // successfully flushed, can we evict this object?
8775 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8776 agent_maybe_evict(obc, true)) {
8777 osd->logger->inc(l_osd_tier_clean);
8778 if (fop->on_flush) {
8779 (*(fop->on_flush))();
8780 fop->on_flush = boost::none;
8781 }
8782 flush_ops.erase(oid);
8783 return 0;
8784 }
8785
8786 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
8787 OpContextUPtr ctx = simple_opc_create(fop->obc);
8788
8789 // successfully flushed; can we clear the dirty bit?
8790 // try to take the lock manually, since we don't
8791 // have a ctx yet.
8792 if (ctx->lock_manager.get_lock_type(
8793 ObjectContext::RWState::RWWRITE,
8794 oid,
8795 obc,
8796 fop->op)) {
8797 dout(20) << __func__ << " took write lock" << dendl;
8798 } else if (fop->op) {
8799 dout(10) << __func__ << " waiting on write lock" << dendl;
8800 close_op_ctx(ctx.release());
8801 requeue_op(fop->op);
8802 requeue_ops(fop->dup_ops);
8803 return -EAGAIN; // will retry
8804 } else {
8805 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
8806 close_op_ctx(ctx.release());
8807 osd->logger->inc(l_osd_tier_try_flush_fail);
8808 cancel_flush(fop, false);
8809 return -ECANCELED;
8810 }
8811
8812 if (fop->on_flush) {
8813 ctx->register_on_finish(*(fop->on_flush));
8814 fop->on_flush = boost::none;
8815 }
8816
8817 ctx->at_version = get_next_version();
8818
8819 ctx->new_obs = obc->obs;
8820 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8821 --ctx->delta_stats.num_objects_dirty;
8822
8823 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
8824
8825 osd->logger->inc(l_osd_tier_clean);
8826
8827 if (!fop->dup_ops.empty() || fop->op) {
8828 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
8829 list<OpRequestRef> ls;
8830 if (fop->op)
8831 ls.push_back(fop->op);
8832 ls.splice(ls.end(), fop->dup_ops);
8833 requeue_ops(ls);
8834 }
8835
8836 simple_opc_submit(std::move(ctx));
8837
8838 flush_ops.erase(oid);
8839
8840 if (fop->blocking)
8841 osd->logger->inc(l_osd_tier_flush);
8842 else
8843 osd->logger->inc(l_osd_tier_try_flush);
8844
8845 return -EINPROGRESS;
8846 }
8847
8848 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
8849 {
8850 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
8851 << fop->objecter_tid << dendl;
8852 if (fop->objecter_tid) {
8853 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
8854 fop->objecter_tid = 0;
8855 }
8856 if (fop->blocking) {
8857 fop->obc->stop_block();
8858 kick_object_context_blocked(fop->obc);
8859 }
8860 if (requeue) {
8861 if (fop->op)
8862 requeue_op(fop->op);
8863 requeue_ops(fop->dup_ops);
8864 }
8865 if (fop->on_flush) {
8866 (*(fop->on_flush))();
8867 fop->on_flush = boost::none;
8868 }
8869 flush_ops.erase(fop->obc->obs.oi.soid);
8870 }
8871
8872 void PrimaryLogPG::cancel_flush_ops(bool requeue)
8873 {
8874 dout(10) << __func__ << dendl;
8875 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
8876 while (p != flush_ops.end()) {
8877 cancel_flush((p++)->second, requeue);
8878 }
8879 }
8880
8881 bool PrimaryLogPG::is_present_clone(hobject_t coid)
8882 {
8883 if (!pool.info.allow_incomplete_clones())
8884 return true;
8885 if (is_missing_object(coid))
8886 return true;
8887 ObjectContextRef obc = get_object_context(coid, false);
8888 return obc && obc->obs.exists;
8889 }
8890
8891 // ========================================================================
8892 // rep op gather
8893
8894 class C_OSD_RepopApplied : public Context {
8895 PrimaryLogPGRef pg;
8896 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8897 public:
8898 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8899 : pg(pg), repop(repop) {}
8900 void finish(int) override {
8901 pg->repop_all_applied(repop.get());
8902 }
8903 };
8904
8905
8906 void PrimaryLogPG::repop_all_applied(RepGather *repop)
8907 {
8908 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
8909 << dendl;
8910 assert(!repop->applies_with_commit);
8911 repop->all_applied = true;
8912 if (!repop->rep_aborted) {
8913 eval_repop(repop);
8914 }
8915 }
8916
8917 class C_OSD_RepopCommit : public Context {
8918 PrimaryLogPGRef pg;
8919 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8920 public:
8921 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8922 : pg(pg), repop(repop) {}
8923 void finish(int) override {
8924 pg->repop_all_committed(repop.get());
8925 }
8926 };
8927
8928 void PrimaryLogPG::repop_all_committed(RepGather *repop)
8929 {
8930 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
8931 << dendl;
8932 repop->all_committed = true;
8933 if (repop->applies_with_commit) {
8934 assert(!repop->all_applied);
8935 repop->all_applied = true;
8936 }
8937
8938 if (!repop->rep_aborted) {
8939 if (repop->v != eversion_t()) {
8940 last_update_ondisk = repop->v;
8941 last_complete_ondisk = repop->pg_local_last_complete;
8942 }
8943 eval_repop(repop);
8944 }
8945 }
8946
8947 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
8948 {
8949 dout(10) << "op_applied version " << applied_version << dendl;
8950 if (applied_version == eversion_t())
8951 return;
8952 assert(applied_version > last_update_applied);
8953 assert(applied_version <= info.last_update);
8954 last_update_applied = applied_version;
8955 if (is_primary()) {
8956 if (scrubber.active) {
8957 if (last_update_applied == scrubber.subset_last_update) {
8958 if (ops_blocked_by_scrub()) {
8959 requeue_scrub(true);
8960 } else {
8961 requeue_scrub(false);
8962 }
8963
8964 }
8965 } else {
8966 assert(scrubber.start == scrubber.end);
8967 }
8968 } else {
8969 if (scrubber.active_rep_scrub) {
8970 if (last_update_applied == static_cast<const MOSDRepScrub*>(
8971 scrubber.active_rep_scrub->get_req())->scrub_to) {
8972 osd->enqueue_back(
8973 info.pgid,
8974 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
8975 scrubber.active_rep_scrub = OpRequestRef();
8976 }
8977 }
8978 }
8979 }
8980
8981 void PrimaryLogPG::eval_repop(RepGather *repop)
8982 {
8983 const MOSDOp *m = NULL;
8984 if (repop->op)
8985 m = static_cast<const MOSDOp *>(repop->op->get_req());
8986
8987 if (m)
8988 dout(10) << "eval_repop " << *repop
8989 << (repop->rep_done ? " DONE" : "")
8990 << dendl;
8991 else
8992 dout(10) << "eval_repop " << *repop << " (no op)"
8993 << (repop->rep_done ? " DONE" : "")
8994 << dendl;
8995
8996 if (repop->rep_done)
8997 return;
8998
8999 // ondisk?
9000 if (repop->all_committed) {
9001 dout(10) << " commit: " << *repop << dendl;
9002 for (auto p = repop->on_committed.begin();
9003 p != repop->on_committed.end();
9004 repop->on_committed.erase(p++)) {
9005 (*p)();
9006 }
9007 // send dup commits, in order
9008 if (waiting_for_ondisk.count(repop->v)) {
9009 assert(waiting_for_ondisk.begin()->first == repop->v);
9010 for (list<pair<OpRequestRef, version_t> >::iterator i =
9011 waiting_for_ondisk[repop->v].begin();
9012 i != waiting_for_ondisk[repop->v].end();
9013 ++i) {
9014 osd->reply_op_error(i->first, repop->r, repop->v,
9015 i->second);
9016 }
9017 waiting_for_ondisk.erase(repop->v);
9018 }
9019 }
9020
9021 // applied?
9022 if (repop->all_applied) {
9023 if (repop->applies_with_commit) {
9024 assert(repop->on_applied.empty());
9025 }
9026 dout(10) << " applied: " << *repop << " " << dendl;
9027 for (auto p = repop->on_applied.begin();
9028 p != repop->on_applied.end();
9029 repop->on_applied.erase(p++)) {
9030 (*p)();
9031 }
9032 }
9033
9034 // done.
9035 if (repop->all_applied && repop->all_committed) {
9036 repop->rep_done = true;
9037
9038 publish_stats_to_osd();
9039 calc_min_last_complete_ondisk();
9040
9041 dout(10) << " removing " << *repop << dendl;
9042 assert(!repop_queue.empty());
9043 dout(20) << " q front is " << *repop_queue.front() << dendl;
9044 if (repop_queue.front() != repop) {
9045 if (!repop->applies_with_commit) {
9046 dout(0) << " removing " << *repop << dendl;
9047 dout(0) << " q front is " << *repop_queue.front() << dendl;
9048 assert(repop_queue.front() == repop);
9049 }
9050 } else {
9051 RepGather *to_remove = nullptr;
9052 while (!repop_queue.empty() &&
9053 (to_remove = repop_queue.front())->rep_done) {
9054 repop_queue.pop_front();
9055 for (auto p = to_remove->on_success.begin();
9056 p != to_remove->on_success.end();
9057 to_remove->on_success.erase(p++)) {
9058 (*p)();
9059 }
9060 remove_repop(to_remove);
9061 }
9062 }
9063 }
9064 }
9065
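// Hand the prepared transaction to the backend.  Peer last_update /
// last_complete bookkeeping and projected_log are advanced here;
// at_version is asserted not to regress past projected_last_update.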
9066 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9067 {
9068 FUNCTRACE();
9069 const hobject_t& soid = ctx->obs->oi.soid;
9070 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9071 << " o " << soid
9072 << dendl;
9073
9074 repop->v = ctx->at_version;
9075 if (ctx->at_version > eversion_t()) {
9076 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9077 i != actingbackfill.end();
9078 ++i) {
9079 if (*i == get_primary()) continue;
9080 pg_info_t &pinfo = peer_info[*i];
9081 // keep peer_info up to date
9082 if (pinfo.last_complete == pinfo.last_update)
9083 pinfo.last_complete = ctx->at_version;
9084 pinfo.last_update = ctx->at_version;
9085 }
9086 }
9087
9088 ctx->obc->ondisk_write_lock();
9089
9090 bool unlock_snapset_obc = false;
9091 ctx->op_t->add_obc(ctx->obc);
9092 if (ctx->clone_obc) {
9093 ctx->clone_obc->ondisk_write_lock();
9094 ctx->op_t->add_obc(ctx->clone_obc);
9095 }
9096 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9097 ctx->obc->obs.oi.soid) {
9098 ctx->snapset_obc->ondisk_write_lock();
9099 unlock_snapset_obc = true;
9100 ctx->op_t->add_obc(ctx->snapset_obc);
9101 }
9102
9103 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9104 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9105 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9106 ctx->obc,
9107 ctx->clone_obc,
9108 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9109 if (!(ctx->log.empty())) {
9110 assert(ctx->at_version >= projected_last_update);
9111 projected_last_update = ctx->at_version;
9112 }
9113 for (auto &&entry: ctx->log) {
9114 projected_log.add(entry);
9115 }
9116 pgbackend->submit_transaction(
9117 soid,
9118 ctx->delta_stats,
9119 ctx->at_version,
9120 std::move(ctx->op_t),
9121 pg_trim_to,
9122 min_last_complete_ondisk,
9123 ctx->log,
9124 ctx->updated_hset_history,
9125 onapplied_sync,
9126 on_all_applied,
9127 on_all_commit,
9128 repop->rep_tid,
9129 ctx->reqid,
9130 ctx->op);
9131 }
9132
9133 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9134 OpContext *ctx, ObjectContextRef obc,
9135 ceph_tid_t rep_tid)
9136 {
9137 if (ctx->op)
9138 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9139 else
9140 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9141
9142 RepGather *repop = new RepGather(
9143 ctx, rep_tid, info.last_complete, false);
9144
9145 repop->start = ceph_clock_now();
9146
9147 repop_queue.push_back(&repop->queue_item);
9148 repop->get();
9149
9150 osd->logger->inc(l_osd_op_wip);
9151
9152 dout(10) << __func__ << ": " << *repop << dendl;
9153 return repop;
9154 }
9155
9156 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9157 eversion_t version,
9158 int r,
9159 ObcLockManager &&manager,
9160 OpRequestRef &&op,
9161 boost::optional<std::function<void(void)> > &&on_complete)
9162 {
9163 RepGather *repop = new RepGather(
9164 std::move(manager),
9165 std::move(op),
9166 std::move(on_complete),
9167 osd->get_tid(),
9168 info.last_complete,
9169 true,
9170 r);
9171 repop->v = version;
9172
9173 repop->start = ceph_clock_now();
9174
9175 repop_queue.push_back(&repop->queue_item);
9176
9177 osd->logger->inc(l_osd_op_wip);
9178
9179 dout(10) << __func__ << ": " << *repop << dendl;
9180 return boost::intrusive_ptr<RepGather>(repop);
9181 }
9182
9183 void PrimaryLogPG::remove_repop(RepGather *repop)
9184 {
9185 dout(20) << __func__ << " " << *repop << dendl;
9186
9187 for (auto p = repop->on_finish.begin();
9188 p != repop->on_finish.end();
9189 repop->on_finish.erase(p++)) {
9190 (*p)();
9191 }
9192
9193 release_object_locks(
9194 repop->lock_manager);
9195 repop->put();
9196
9197 osd->logger->dec(l_osd_op_wip);
9198 }
9199
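// simple_opc_create()/simple_opc_submit() are a convenience pair for
// internally generated mutations that have no client MOSDOp.  A
// minimal usage sketch (mirroring handle_watch_timeout() below):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   // ... mutate ctx->new_obs, append to ctx->log, fill in ctx->op_t ...
//   simple_opc_submit(std::move(ctx));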
9200 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9201 {
9202 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9203 vector<OSDOp> ops;
9204 ceph_tid_t rep_tid = osd->get_tid();
9205 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9206 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, ops, obc, this));
9207 ctx->op_t.reset(new PGTransaction());
9208 ctx->mtime = ceph_clock_now();
9209 return ctx;
9210 }
9211
9212 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9213 {
9214 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9215 dout(20) << __func__ << " " << repop << dendl;
9216 issue_repop(repop, ctx.get());
9217 eval_repop(repop);
9218 repop->put();
9219 }
9220
9221
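// Replicate a batch of log entries (no object data) to all acting
// and backfill shards.  On clusters requiring >= jewel this goes out
// as MOSDPGUpdateLogMissing and acks are tracked in
// log_entry_update_waiting_on; older peers instead receive the
// entries as an MOSDPGLog tail, and completion is driven locally.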
9222 void PrimaryLogPG::submit_log_entries(
9223 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9224 ObcLockManager &&manager,
9225 boost::optional<std::function<void(void)> > &&_on_complete,
9226 OpRequestRef op,
9227 int r)
9228 {
9229 dout(10) << __func__ << " " << entries << dendl;
9230 assert(is_primary());
9231
9232 eversion_t version;
9233 if (!entries.empty()) {
9234 assert(entries.rbegin()->version >= projected_last_update);
9235 version = projected_last_update = entries.rbegin()->version;
9236 }
9237
9238 boost::intrusive_ptr<RepGather> repop;
9239 boost::optional<std::function<void(void)> > on_complete;
9240 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9241 repop = new_repop(
9242 version,
9243 r,
9244 std::move(manager),
9245 std::move(op),
9246 std::move(_on_complete));
9247 } else {
9248 on_complete = std::move(_on_complete);
9249 }
9250
9251 pgbackend->call_write_ordered(
9252 [this, entries, repop, on_complete]() {
9253 ObjectStore::Transaction t;
9254 eversion_t old_last_update = info.last_update;
9255 merge_new_log_entries(entries, t);
9256
9257
9258 set<pg_shard_t> waiting_on;
9259 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9260 i != actingbackfill.end();
9261 ++i) {
9262 pg_shard_t peer(*i);
9263 if (peer == pg_whoami) continue;
9264 assert(peer_missing.count(peer));
9265 assert(peer_info.count(peer));
9266 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9267 assert(repop);
9268 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9269 entries,
9270 spg_t(info.pgid.pgid, i->shard),
9271 pg_whoami.shard,
9272 get_osdmap()->get_epoch(),
9273 last_peering_reset,
9274 repop->rep_tid);
9275 osd->send_message_osd_cluster(
9276 peer.osd, m, get_osdmap()->get_epoch());
9277 waiting_on.insert(peer);
9278 } else {
9279 MOSDPGLog *m = new MOSDPGLog(
9280 peer.shard, pg_whoami.shard,
9281 info.last_update.epoch,
9282 info);
9283 m->log.log = entries;
9284 m->log.tail = old_last_update;
9285 m->log.head = info.last_update;
9286 osd->send_message_osd_cluster(
9287 peer.osd, m, get_osdmap()->get_epoch());
9288 }
9289 }
9290 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9291 ceph_tid_t rep_tid = repop->rep_tid;
9292 waiting_on.insert(pg_whoami);
9293 log_entry_update_waiting_on.insert(
9294 make_pair(
9295 rep_tid,
9296 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9297 ));
9298 struct OnComplete : public Context {
9299 PrimaryLogPGRef pg;
9300 ceph_tid_t rep_tid;
9301 epoch_t epoch;
9302 OnComplete(
9303 PrimaryLogPGRef pg,
9304 ceph_tid_t rep_tid,
9305 epoch_t epoch)
9306 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9307 void finish(int) override {
9308 pg->lock();
9309 if (!pg->pg_has_reset_since(epoch)) {
9310 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9311 assert(it != pg->log_entry_update_waiting_on.end());
9312 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9313 assert(it2 != it->second.waiting_on.end());
9314 it->second.waiting_on.erase(it2);
9315 if (it->second.waiting_on.empty()) {
9316 pg->repop_all_committed(it->second.repop.get());
9317 pg->log_entry_update_waiting_on.erase(it);
9318 }
9319 }
9320 pg->unlock();
9321 }
9322 };
9323 t.register_on_commit(
9324 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9325 } else {
9326 if (on_complete) {
9327 struct OnComplete : public Context {
9328 PrimaryLogPGRef pg;
9329 std::function<void(void)> on_complete;
9330 epoch_t epoch;
9331 OnComplete(
9332 PrimaryLogPGRef pg,
9333             std::function<void(void)> on_complete,
9334 epoch_t epoch)
9335 : pg(pg),
9336 on_complete(std::move(on_complete)),
9337 epoch(epoch) {}
9338 void finish(int) override {
9339 pg->lock();
9340 if (!pg->pg_has_reset_since(epoch))
9341 on_complete();
9342 pg->unlock();
9343 }
9344 };
9345 t.register_on_complete(
9346 new OnComplete{
9347 this, *on_complete, get_osdmap()->get_epoch()
9348 });
9349 }
9350 }
9351 t.register_on_applied(
9352 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9353 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9354 assert(r == 0);
9355 });
9356 }
9357
9358 void PrimaryLogPG::cancel_log_updates()
9359 {
9360 // get rid of all the LogUpdateCtx so their references to repops are
9361 // dropped
9362 log_entry_update_waiting_on.clear();
9363 }
9364
9365 // -------------------------------------------------------
9366
9367 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9368 {
9369 pair<hobject_t, ObjectContextRef> i;
9370 while (object_contexts.get_next(i.first, &i)) {
9371 ObjectContextRef obc(i.second);
9372 get_obc_watchers(obc, pg_watchers);
9373 }
9374 }
9375
9376 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9377 {
9378 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9379 obc->watchers.begin();
9380 j != obc->watchers.end();
9381 ++j) {
9382 obj_watch_item_t owi;
9383
9384 owi.obj = obc->obs.oi.soid;
9385 owi.wi.addr = j->second->get_peer_addr();
9386 owi.wi.name = j->second->get_entity();
9387 owi.wi.cookie = j->second->get_cookie();
9388 owi.wi.timeout_seconds = j->second->get_timeout();
9389
9390 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9391 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9392
9393 pg_watchers.push_back(owi);
9394 }
9395 }
9396
9397 void PrimaryLogPG::check_blacklisted_watchers()
9398 {
9399 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9400 pair<hobject_t, ObjectContextRef> i;
9401 while (object_contexts.get_next(i.first, &i))
9402 check_blacklisted_obc_watchers(i.second);
9403 }
9404
9405 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9406 {
9407 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9408 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9409 obc->watchers.begin();
9410 k != obc->watchers.end();
9411 ) {
9412     // Advance iterator now so handle_watch_timeout() can erase the element
9413 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9414 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9415 entity_addr_t ea = j->second->get_peer_addr();
9416 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9417 if (get_osdmap()->is_blacklisted(ea)) {
9418 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9419 assert(j->second->get_pg() == this);
9420 j->second->unregister_cb();
9421 handle_watch_timeout(j->second);
9422 }
9423 }
9424 }
9425
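// Rebuild Watch objects for the watchers recorded in a freshly
// loaded object_info.  They start disconnected and, if the client
// never reconnects, expire relative to last_became_active; watchers
// from blacklisted clients are dropped immediately.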
9426 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9427 {
9428 assert(is_active());
9429 assert((recovering.count(obc->obs.oi.soid) ||
9430 !is_missing_object(obc->obs.oi.soid)) ||
9431 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9432 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9433 pg_log_entry_t::LOST_REVERT &&
9434 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9435 obc->obs.oi.version));
9436
9437 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9438 assert(obc->watchers.empty());
9439 // populate unconnected_watchers
9440 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9441 obc->obs.oi.watchers.begin();
9442 p != obc->obs.oi.watchers.end();
9443 ++p) {
9444 utime_t expire = info.stats.last_became_active;
9445 expire += p->second.timeout_seconds;
9446 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9447 WatchRef watch(
9448 Watch::makeWatchRef(
9449 this, osd, obc, p->second.timeout_seconds, p->first.first,
9450 p->first.second, p->second.addr));
9451 watch->disconnect();
9452 obc->watchers.insert(
9453 make_pair(
9454 make_pair(p->first.first, p->first.second),
9455 watch));
9456 }
9457   // Look for watchers from blacklisted clients and drop them
9458 check_blacklisted_obc_watchers(obc);
9459 }
9460
9461 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9462 {
9463 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9464 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9465
9466 if (!is_active()) {
9467 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9468 return;
9469 }
9470 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9471 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9472 watch->get_delayed_cb()
9473 );
9474 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9475 << obc->obs.oi.soid
9476 << dendl;
9477 return;
9478 }
9479
9480 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9481 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9482 << obc->obs.oi.soid
9483 << dendl;
9484 scrubber.add_callback(
9485       watch->get_delayed_cb() // re-run handle_watch_timeout() after scrub
9486 );
9487 return;
9488 }
9489
9490 OpContextUPtr ctx = simple_opc_create(obc);
9491 ctx->at_version = get_next_version();
9492
9493 object_info_t& oi = ctx->new_obs.oi;
9494 oi.watchers.erase(make_pair(watch->get_cookie(),
9495 watch->get_entity()));
9496
9497 list<watch_disconnect_t> watch_disconnects = {
9498 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9499 };
9500 ctx->register_on_success(
9501 [this, obc, watch_disconnects]() {
9502 complete_disconnect_watches(obc, watch_disconnects);
9503 });
9504
9505
9506 PGTransaction *t = ctx->op_t.get();
9507 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9508 ctx->at_version,
9509 oi.version,
9510 0,
9511 osd_reqid_t(), ctx->mtime, 0));
9512
9513 oi.prior_version = obc->obs.oi.version;
9514 oi.version = ctx->at_version;
9515 bufferlist bl;
9516 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9517 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9518
9519 // apply new object state.
9520 ctx->obc->obs = ctx->new_obs;
9521
9522 // no ctx->delta_stats
9523 simple_opc_submit(std::move(ctx));
9524 }
9525
9526 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9527 SnapSetContext *ssc)
9528 {
9529 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9530 assert(obc->destructor_callback == NULL);
9531 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9532 obc->obs.oi = oi;
9533 obc->obs.exists = false;
9534 obc->ssc = ssc;
9535 if (ssc)
9536 register_snapset_context(ssc);
9537 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9538 if (is_active())
9539 populate_obc_watchers(obc);
9540 return obc;
9541 }
9542
9543 ObjectContextRef PrimaryLogPG::get_object_context(
9544 const hobject_t& soid,
9545 bool can_create,
9546 const map<string, bufferlist> *attrs)
9547 {
9548 assert(
9549 attrs || !pg_log.get_missing().is_missing(soid) ||
9550 // or this is a revert... see recover_primary()
9551 (pg_log.get_log().objects.count(soid) &&
9552 pg_log.get_log().objects.find(soid)->second->op ==
9553 pg_log_entry_t::LOST_REVERT));
9554 ObjectContextRef obc = object_contexts.lookup(soid);
9555 osd->logger->inc(l_osd_object_ctx_cache_total);
9556 if (obc) {
9557 osd->logger->inc(l_osd_object_ctx_cache_hit);
9558 dout(10) << __func__ << ": found obc in cache: " << obc
9559 << dendl;
9560 } else {
9561 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9562 // check disk
9563 bufferlist bv;
9564 if (attrs) {
9565 assert(attrs->count(OI_ATTR));
9566 bv = attrs->find(OI_ATTR)->second;
9567 } else {
9568 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9569 if (r < 0) {
9570 if (!can_create) {
9571 dout(10) << __func__ << ": no obc for soid "
9572 << soid << " and !can_create"
9573 << dendl;
9574 return ObjectContextRef(); // -ENOENT!
9575 }
9576
9577 dout(10) << __func__ << ": no obc for soid "
9578 << soid << " but can_create"
9579 << dendl;
9580 // new object.
9581 object_info_t oi(soid);
9582 SnapSetContext *ssc = get_snapset_context(
9583 soid, true, 0, false);
9584 obc = create_object_context(oi, ssc);
9585 dout(10) << __func__ << ": " << obc << " " << soid
9586 << " " << obc->rwstate
9587 << " oi: " << obc->obs.oi
9588 << " ssc: " << obc->ssc
9589 << " snapset: " << obc->ssc->snapset << dendl;
9590 return obc;
9591 }
9592 }
9593
9594 object_info_t oi;
9595 try {
9596 bufferlist::iterator bliter = bv.begin();
9597 ::decode(oi, bliter);
9598 } catch (...) {
9599 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9600 return ObjectContextRef(); // -ENOENT!
9601 }
9602
9603 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9604
9605 obc = object_contexts.lookup_or_create(oi.soid);
9606 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9607 obc->obs.oi = oi;
9608 obc->obs.exists = true;
9609
9610 obc->ssc = get_snapset_context(
9611 soid, true,
9612 soid.has_snapset() ? attrs : 0);
9613
9614 if (is_active())
9615 populate_obc_watchers(obc);
9616
9617 if (pool.info.require_rollback()) {
9618 if (attrs) {
9619 obc->attr_cache = *attrs;
9620 } else {
9621 int r = pgbackend->objects_get_attrs(
9622 soid,
9623 &obc->attr_cache);
9624 assert(r == 0);
9625 }
9626 }
9627
9628 dout(10) << __func__ << ": creating obc from disk: " << obc
9629 << dendl;
9630 }
9631 assert(obc->ssc);
9632 dout(10) << __func__ << ": " << obc << " " << soid
9633 << " " << obc->rwstate
9634 << " oi: " << obc->obs.oi
9635 << " exists: " << (int)obc->obs.exists
9636 << " ssc: " << obc->ssc
9637 << " snapset: " << obc->ssc->snapset << dendl;
9638 return obc;
9639 }
9640
9641 void PrimaryLogPG::context_registry_on_change()
9642 {
9643 pair<hobject_t, ObjectContextRef> i;
9644 while (object_contexts.get_next(i.first, &i)) {
9645 ObjectContextRef obc(i.second);
9646 if (obc) {
9647 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9648 obc->watchers.begin();
9649 j != obc->watchers.end();
9650 obc->watchers.erase(j++)) {
9651 j->second->discard();
9652 }
9653 }
9654 }
9655 }
9656
9657
9658 /*
9659 * If we return an error, and set *pmissing, then promoting that
9660 * object may help.
9661 *
9662 * If we return -EAGAIN, we will always set *pmissing to the missing
9663 * object to wait for.
9664 *
9665 * If we return an error but do not set *pmissing, then we know the
9666 * object does not exist.
9667 */
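// An illustrative caller pattern (a sketch, not a verbatim call
// site; see do_op() for the real one):
//
//   ObjectContextRef obc;
//   hobject_t wait_oid;
//   int r = find_object_context(oid, &obc, can_create,
//                               false /* map_snapid_to_clone */,
//                               &wait_oid);
//   if (r == -EAGAIN) {
//     // queue the op until wait_oid has been recovered, then retry
//   }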
9668 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9669 ObjectContextRef *pobc,
9670 bool can_create,
9671 bool map_snapid_to_clone,
9672 hobject_t *pmissing)
9673 {
9674 FUNCTRACE();
9675 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9676 // want the head?
9677 if (oid.snap == CEPH_NOSNAP) {
9678 ObjectContextRef obc = get_object_context(oid, can_create);
9679 if (!obc) {
9680 if (pmissing)
9681 *pmissing = oid;
9682 return -ENOENT;
9683 }
9684 dout(10) << "find_object_context " << oid
9685 << " @" << oid.snap
9686 << " oi=" << obc->obs.oi
9687 << dendl;
9688 *pobc = obc;
9689
9690 return 0;
9691 }
9692
9693 hobject_t head = oid.get_head();
9694
9695 // want the snapdir?
9696 if (oid.snap == CEPH_SNAPDIR) {
9697 // return head or snapdir, whichever exists.
9698 ObjectContextRef headobc = get_object_context(head, can_create);
9699 ObjectContextRef obc = headobc;
9700 if (!obc || !obc->obs.exists)
9701 obc = get_object_context(oid, can_create);
9702 if (!obc || !obc->obs.exists) {
9703 // if we have neither, we would want to promote the head.
9704 if (pmissing)
9705 *pmissing = head;
9706 if (pobc)
9707 *pobc = headobc; // may be null
9708 return -ENOENT;
9709 }
9710 dout(10) << "find_object_context " << oid
9711 << " @" << oid.snap
9712 << " oi=" << obc->obs.oi
9713 << dendl;
9714 *pobc = obc;
9715
9716 // always populate ssc for SNAPDIR...
9717 if (!obc->ssc)
9718 obc->ssc = get_snapset_context(
9719 oid, true);
9720 return 0;
9721 }
9722
9723 // we want a snap
9724 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9725 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9726 return -ENOENT;
9727 }
9728
9729 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9730 if (!ssc || !(ssc->exists || can_create)) {
9731 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9732 if (pmissing)
9733 *pmissing = head; // start by getting the head
9734 if (ssc)
9735 put_snapset_context(ssc);
9736 return -ENOENT;
9737 }
9738
9739 if (map_snapid_to_clone) {
9740 dout(10) << "find_object_context " << oid << " @" << oid.snap
9741 << " snapset " << ssc->snapset
9742 << " map_snapid_to_clone=true" << dendl;
9743 if (oid.snap > ssc->snapset.seq) {
9744 // already must be readable
9745 ObjectContextRef obc = get_object_context(head, false);
9746 dout(10) << "find_object_context " << oid << " @" << oid.snap
9747 << " snapset " << ssc->snapset
9748 << " maps to head" << dendl;
9749 *pobc = obc;
9750 put_snapset_context(ssc);
9751 return (obc && obc->obs.exists) ? 0 : -ENOENT;
9752 } else {
9753 vector<snapid_t>::const_iterator citer = std::find(
9754 ssc->snapset.clones.begin(),
9755 ssc->snapset.clones.end(),
9756 oid.snap);
9757 if (citer == ssc->snapset.clones.end()) {
9758 dout(10) << "find_object_context " << oid << " @" << oid.snap
9759 << " snapset " << ssc->snapset
9760 << " maps to nothing" << dendl;
9761 put_snapset_context(ssc);
9762 return -ENOENT;
9763 }
9764
9765 dout(10) << "find_object_context " << oid << " @" << oid.snap
9766 << " snapset " << ssc->snapset
9767 << " maps to " << oid << dendl;
9768
9769 if (pg_log.get_missing().is_missing(oid)) {
9770 dout(10) << "find_object_context " << oid << " @" << oid.snap
9771 << " snapset " << ssc->snapset
9772 << " " << oid << " is missing" << dendl;
9773 if (pmissing)
9774 *pmissing = oid;
9775 put_snapset_context(ssc);
9776 return -EAGAIN;
9777 }
9778
9779 ObjectContextRef obc = get_object_context(oid, false);
9780 if (!obc || !obc->obs.exists) {
9781 dout(10) << "find_object_context " << oid << " @" << oid.snap
9782 << " snapset " << ssc->snapset
9783 << " " << oid << " is not present" << dendl;
9784 if (pmissing)
9785 *pmissing = oid;
9786 put_snapset_context(ssc);
9787 return -ENOENT;
9788 }
9789 dout(10) << "find_object_context " << oid << " @" << oid.snap
9790 << " snapset " << ssc->snapset
9791 << " " << oid << " HIT" << dendl;
9792 *pobc = obc;
9793 put_snapset_context(ssc);
9794 return 0;
9795 }
9796 ceph_abort(); //unreachable
9797 }
9798
9799 dout(10) << "find_object_context " << oid << " @" << oid.snap
9800 << " snapset " << ssc->snapset << dendl;
9801
9802 // head?
9803 if (oid.snap > ssc->snapset.seq) {
9804 if (ssc->snapset.head_exists) {
9805 ObjectContextRef obc = get_object_context(head, false);
9806 dout(10) << "find_object_context " << head
9807 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9808 << " -- HIT " << obc->obs
9809 << dendl;
9810 if (!obc->ssc)
9811 obc->ssc = ssc;
9812 else {
9813 assert(ssc == obc->ssc);
9814 put_snapset_context(ssc);
9815 }
9816 *pobc = obc;
9817 return 0;
9818 }
9819 dout(10) << "find_object_context " << head
9820 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9821 << " but head dne -- DNE"
9822 << dendl;
9823 put_snapset_context(ssc);
9824 return -ENOENT;
9825 }
9826
9827 // which clone would it be?
9828 unsigned k = 0;
9829 while (k < ssc->snapset.clones.size() &&
9830 ssc->snapset.clones[k] < oid.snap)
9831 k++;
9832 if (k == ssc->snapset.clones.size()) {
9833 dout(10) << "find_object_context no clones with last >= oid.snap "
9834 << oid.snap << " -- DNE" << dendl;
9835 put_snapset_context(ssc);
9836 return -ENOENT;
9837 }
9838 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
9839 info.pgid.pool(), oid.get_namespace());
9840
9841 if (pg_log.get_missing().is_missing(soid)) {
9842 dout(20) << "find_object_context " << soid << " missing, try again later"
9843 << dendl;
9844 if (pmissing)
9845 *pmissing = soid;
9846 put_snapset_context(ssc);
9847 return -EAGAIN;
9848 }
9849
9850 ObjectContextRef obc = get_object_context(soid, false);
9851 if (!obc || !obc->obs.exists) {
9852 dout(20) << __func__ << " missing clone " << soid << dendl;
9853 if (pmissing)
9854 *pmissing = soid;
9855 put_snapset_context(ssc);
9856 return -ENOENT;
9857 }
9858
9859 if (!obc->ssc) {
9860 obc->ssc = ssc;
9861 } else {
9862 assert(obc->ssc == ssc);
9863 put_snapset_context(ssc);
9864 }
9865 ssc = 0;
9866
9867 // clone
9868 dout(20) << "find_object_context " << soid
9869 << " snapset " << obc->ssc->snapset
9870 << " legacy_snaps " << obc->obs.oi.legacy_snaps
9871 << dendl;
9872 snapid_t first, last;
9873 if (obc->ssc->snapset.is_legacy()) {
9874 first = obc->obs.oi.legacy_snaps.back();
9875 last = obc->obs.oi.legacy_snaps.front();
9876 } else {
9877 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9878 assert(p != obc->ssc->snapset.clone_snaps.end());
9879 first = p->second.back();
9880 last = p->second.front();
9881 }
9882 if (first <= oid.snap) {
9883 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9884 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
9885 *pobc = obc;
9886 return 0;
9887 } else {
9888 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9889 << "] does not contain " << oid.snap << " -- DNE" << dendl;
9890 return -ENOENT;
9891 }
9892 }
9893
9894 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
9895 {
9896 if (obc->ssc)
9897 put_snapset_context(obc->ssc);
9898 }
9899
9900 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
9901 {
9902 object_info_t& oi = obc->obs.oi;
9903
9904 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
9905 object_stat_sum_t stat;
9906
9907 stat.num_bytes += oi.size;
9908
9909 if (oi.soid.snap != CEPH_SNAPDIR)
9910 stat.num_objects++;
9911 if (oi.is_dirty())
9912 stat.num_objects_dirty++;
9913 if (oi.is_whiteout())
9914 stat.num_whiteouts++;
9915 if (oi.is_omap())
9916 stat.num_objects_omap++;
9917 if (oi.is_cache_pinned())
9918 stat.num_objects_pinned++;
9919
9920 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
9921 stat.num_object_clones++;
9922
9923 if (!obc->ssc)
9924 obc->ssc = get_snapset_context(oi.soid, false);
9925 assert(obc->ssc);
9926
9927 // subtract off clone overlap
9928 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
9929 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
9930 for (interval_set<uint64_t>::const_iterator r = o.begin();
9931 r != o.end();
9932 ++r) {
9933 stat.num_bytes -= r.get_len();
9934 }
9935 }
9936 }
9937
9938 // add it in
9939 pgstat->stats.sum.add(stat);
9940 }
9941
9942 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
9943 {
9944 const hobject_t& soid = obc->obs.oi.soid;
9945 if (obc->is_blocked()) {
9946 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
9947 return;
9948 }
9949
9950 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
9951 if (p != waiting_for_blocked_object.end()) {
9952 list<OpRequestRef>& ls = p->second;
9953 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
9954 requeue_ops(ls);
9955 waiting_for_blocked_object.erase(p);
9956 }
9957
9958 map<hobject_t, ObjectContextRef>::iterator i =
9959 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
9960 if (i != objects_blocked_on_snap_promotion.end()) {
9961 assert(i->second == obc);
9962 objects_blocked_on_snap_promotion.erase(i);
9963 }
9964
9965 if (obc->requeue_scrub_on_unblock) {
9966 obc->requeue_scrub_on_unblock = false;
9967 requeue_scrub();
9968 }
9969 }
9970
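// SnapSetContexts are refcounted under snapset_contexts_lock: each
// successful get_snapset_context() must be balanced by a
// put_snapset_context(), or by handing the reference to an obc
// (whose destructor callback puts it).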
9971 SnapSetContext *PrimaryLogPG::get_snapset_context(
9972 const hobject_t& oid,
9973 bool can_create,
9974 const map<string, bufferlist> *attrs,
9975 bool oid_existed)
9976 {
9977 Mutex::Locker l(snapset_contexts_lock);
9978 SnapSetContext *ssc;
9979 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
9980 oid.get_snapdir());
9981 if (p != snapset_contexts.end()) {
9982 if (can_create || p->second->exists) {
9983 ssc = p->second;
9984 } else {
9985 return NULL;
9986 }
9987 } else {
9988 bufferlist bv;
9989 if (!attrs) {
9990 int r = -ENOENT;
9991 if (!(oid.is_head() && !oid_existed))
9992 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
9993 if (r < 0) {
9994 // try _snapset
9995 if (!(oid.is_snapdir() && !oid_existed))
9996 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
9997 if (r < 0 && !can_create)
9998 return NULL;
9999 }
10000 } else {
10001 assert(attrs->count(SS_ATTR));
10002 bv = attrs->find(SS_ATTR)->second;
10003 }
10004 ssc = new SnapSetContext(oid.get_snapdir());
10005 _register_snapset_context(ssc);
10006 if (bv.length()) {
10007 bufferlist::iterator bvp = bv.begin();
10008 ssc->snapset.decode(bvp);
10009 ssc->exists = true;
10010 } else {
10011 ssc->exists = false;
10012 }
10013 }
10014 assert(ssc);
10015 ssc->ref++;
10016 return ssc;
10017 }
10018
10019 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10020 {
10021 Mutex::Locker l(snapset_contexts_lock);
10022 --ssc->ref;
10023 if (ssc->ref == 0) {
10024 if (ssc->registered)
10025 snapset_contexts.erase(ssc->oid);
10026 delete ssc;
10027 }
10028 }
10029
10030 /** pull - request object from a peer
10031 */
10032
10033 /*
10034 * Return values:
10035 * NONE - didn't pull anything
10036 * YES - pulled what the caller wanted
10037 * OTHER - needed to pull something else first (_head or _snapdir)
10038 */
10039 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10040
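// Recover a single missing object.  For a clone we first make sure
// the head (or snapdir) is recovered, since that is where the
// authoritative snapset lives; in that case PULL_OTHER is returned
// and the caller should come back for the clone later.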
10041 int PrimaryLogPG::recover_missing(
10042 const hobject_t &soid, eversion_t v,
10043 int priority,
10044 PGBackend::RecoveryHandle *h)
10045 {
10046 if (missing_loc.is_unfound(soid)) {
10047 dout(7) << "pull " << soid
10048 << " v " << v
10049 << " but it is unfound" << dendl;
10050 return PULL_NONE;
10051 }
10052
10053   // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10054 ObjectContextRef obc;
10055 ObjectContextRef head_obc;
10056 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10057 // do we have the head and/or snapdir?
10058 hobject_t head = soid.get_head();
10059 if (pg_log.get_missing().is_missing(head)) {
10060 if (recovering.count(head)) {
10061 dout(10) << " missing but already recovering head " << head << dendl;
10062 return PULL_NONE;
10063 } else {
10064 int r = recover_missing(
10065 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10066 h);
10067 if (r != PULL_NONE)
10068 return PULL_OTHER;
10069 return PULL_NONE;
10070 }
10071 }
10072 head = soid.get_snapdir();
10073 if (pg_log.get_missing().is_missing(head)) {
10074 if (recovering.count(head)) {
10075 dout(10) << " missing but already recovering snapdir " << head << dendl;
10076 return PULL_NONE;
10077 } else {
10078 int r = recover_missing(
10079 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10080 h);
10081 if (r != PULL_NONE)
10082 return PULL_OTHER;
10083 return PULL_NONE;
10084 }
10085 }
10086
10087 // we must have one or the other
10088 head_obc = get_object_context(
10089 soid.get_head(),
10090 false,
10091 0);
10092 if (!head_obc)
10093 head_obc = get_object_context(
10094 soid.get_snapdir(),
10095 false,
10096 0);
10097 assert(head_obc);
10098 }
10099 start_recovery_op(soid);
10100 assert(!recovering.count(soid));
10101 recovering.insert(make_pair(soid, obc));
10102 pgbackend->recover_object(
10103 soid,
10104 v,
10105 head_obc,
10106 obc,
10107 h);
10108 return PULL_YES;
10109 }
10110
10111 void PrimaryLogPG::send_remove_op(
10112 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10113 {
10114 ceph_tid_t tid = osd->get_tid();
10115 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10116
10117 dout(10) << "send_remove_op " << oid << " from osd." << peer
10118 << " tid " << tid << dendl;
10119
10120 MOSDSubOp *subop = new MOSDSubOp(
10121 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10122 oid, CEPH_OSD_FLAG_ACK,
10123 get_osdmap()->get_epoch(), tid, v);
10124 subop->ops = vector<OSDOp>(1);
10125 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10126
10127 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10128 }
10129
10130
10131 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10132 {
10133 dout(10) << "finish_degraded_object " << oid << dendl;
10134 ObjectContextRef obc(object_contexts.lookup(oid));
10135 if (callbacks_for_degraded_object.count(oid)) {
10136 list<Context*> contexts;
10137 contexts.swap(callbacks_for_degraded_object[oid]);
10138 callbacks_for_degraded_object.erase(oid);
10139 for (list<Context*>::iterator i = contexts.begin();
10140 i != contexts.end();
10141 ++i) {
10142 (*i)->complete(0);
10143 }
10144 }
10145 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10146 oid.get_head());
10147 if (i != objects_blocked_on_degraded_snap.end() &&
10148 i->second == oid.snap)
10149 objects_blocked_on_degraded_snap.erase(i);
10150 }
10151
10152 void PrimaryLogPG::_committed_pushed_object(
10153 epoch_t epoch, eversion_t last_complete)
10154 {
10155 lock();
10156 if (!pg_has_reset_since(epoch)) {
10157 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10158 last_complete_ondisk = last_complete;
10159
10160 if (last_complete_ondisk == info.last_update) {
10161 if (!is_primary()) {
10162 // Either we are a replica or backfill target.
10163 // we are fully up to date. tell the primary!
10164 osd->send_message_osd_cluster(
10165 get_primary().osd,
10166 new MOSDPGTrim(
10167 get_osdmap()->get_epoch(),
10168 spg_t(info.pgid.pgid, get_primary().shard),
10169 last_complete_ondisk),
10170 get_osdmap()->get_epoch());
10171 } else {
10172 calc_min_last_complete_ondisk();
10173 }
10174 }
10175
10176 } else {
10177 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10178 }
10179
10180 unlock();
10181 }
10182
10183 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10184 {
10185 lock();
10186 dout(10) << "_applied_recovered_object " << *obc << dendl;
10187
10188 assert(active_pushes >= 1);
10189 --active_pushes;
10190
10191 // requeue an active chunky scrub waiting on recovery ops
10192 if (!deleting && active_pushes == 0
10193 && scrubber.is_chunky_scrub_active()) {
10194 if (ops_blocked_by_scrub()) {
10195 requeue_scrub(true);
10196 } else {
10197 requeue_scrub(false);
10198 }
10199 }
10200
10201 unlock();
10202 }
10203
10204 void PrimaryLogPG::_applied_recovered_object_replica()
10205 {
10206 lock();
10207 dout(10) << "_applied_recovered_object_replica" << dendl;
10208
10209 assert(active_pushes >= 1);
10210 --active_pushes;
10211
10212 // requeue an active chunky scrub waiting on recovery ops
10213 if (!deleting && active_pushes == 0 &&
10214 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10215 scrubber.active_rep_scrub->get_req())->chunky) {
10216 osd->enqueue_back(
10217 info.pgid,
10218 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10219 scrubber.active_rep_scrub = OpRequestRef();
10220 }
10221
10222 unlock();
10223 }
10224
10225 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10226 {
10227 dout(10) << "got missing " << oid << " v " << v << dendl;
10228 pg_log.recover_got(oid, v, info);
10229 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10230 dout(10) << "last_complete now " << info.last_complete
10231 << " log.complete_to " << pg_log.get_log().complete_to->version
10232 << dendl;
10233 } else {
10234 dout(10) << "last_complete now " << info.last_complete
10235 << " log.complete_to at end" << dendl;
10236     // below is not true in the repair case.
10237 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10238 assert(info.last_complete == info.last_update);
10239 }
10240 }
10241
10242 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10243 {
10244 dout(20) << __func__ << ": " << soid << dendl;
10245 assert(recovering.count(soid));
10246 auto obc = recovering[soid];
10247 if (obc) {
10248 list<OpRequestRef> blocked_ops;
10249 obc->drop_recovery_read(&blocked_ops);
10250 requeue_ops(blocked_ops);
10251 }
10252 recovering.erase(soid);
10253 for (auto&& i : from)
10254 missing_loc.remove_location(soid, i);
10255 dout(0) << __func__ << " " << soid << " from shard " << from
10256 << ", reps on " << missing_loc.get_locations(soid)
10257 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10258   finish_recovery_op(soid);  // close out this attempt
10259 }
10260
10261 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10262 {
10263 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10264 assert(m->get_type() == MSG_OSD_SUBOP);
10265 dout(7) << "sub_op_remove " << m->poid << dendl;
10266
10267 op->mark_started();
10268
10269 ObjectStore::Transaction t;
10270 remove_snap_mapped_object(t, m->poid);
10271 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10272 assert(r == 0);
10273 }
10274
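// Return the newest version of oid that some shard in actingbackfill
// still has (the "have" side of its missing entry); used by
// LOST_REVERT below to choose a revert target.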
10275 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10276 {
10277 eversion_t v;
10278 pg_missing_item pmi;
10279 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10280 assert(is_missing);
10281 v = pmi.have;
10282 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10283
10284 assert(!actingbackfill.empty());
10285 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10286 i != actingbackfill.end();
10287 ++i) {
10288 if (*i == get_primary()) continue;
10289 pg_shard_t peer = *i;
10290 if (!peer_missing[peer].is_missing(oid)) {
10291 assert(is_backfill_targets(peer));
10292 continue;
10293 }
10294 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10295 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10296 if (h > v)
10297 v = h;
10298 }
10299
10300 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10301 return v;
10302 }
10303
10304 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10305 {
10306 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10307 op->get_req());
10308 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10309 ObjectStore::Transaction t;
10310 append_log_entries_update_missing(m->entries, t);
10311
10312 Context *complete = new FunctionContext(
10313 [=](int) {
10314 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10315 op->get_req());
10316 lock();
10317 if (!pg_has_reset_since(msg->get_epoch())) {
10318 MOSDPGUpdateLogMissingReply *reply =
10319 new MOSDPGUpdateLogMissingReply(
10320 spg_t(info.pgid.pgid, primary_shard().shard),
10321 pg_whoami.shard,
10322 msg->get_epoch(),
10323 msg->min_epoch,
10324 msg->get_tid());
10325 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10326 msg->get_connection()->send_message(reply);
10327 }
10328 unlock();
10329 });
10330
10331 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10332 t.register_on_commit(complete);
10333 } else {
10334 /* Hack to work around the fact that ReplicatedBackend sends
10335 * ack+commit if commit happens first
10336 *
10337 * This behavior is no longer necessary, but we preserve it so old
10338 * primaries can keep their repops in order */
10339 if (pool.info.ec_pool()) {
10340 t.register_on_complete(complete);
10341 } else {
10342 t.register_on_commit(complete);
10343 }
10344 }
10345 t.register_on_applied(
10346 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10347 int tr = osd->store->queue_transaction(
10348 osr.get(),
10349 std::move(t),
10350 nullptr);
10351 assert(tr == 0);
10352 }
10353
10354 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10355 {
10356 const MOSDPGUpdateLogMissingReply *m =
10357 static_cast<const MOSDPGUpdateLogMissingReply*>(
10358 op->get_req());
10359 dout(20) << __func__ << " got reply from "
10360 << m->get_from() << dendl;
10361
10362 auto it = log_entry_update_waiting_on.find(m->get_tid());
10363 if (it != log_entry_update_waiting_on.end()) {
10364 if (it->second.waiting_on.count(m->get_from())) {
10365 it->second.waiting_on.erase(m->get_from());
10366 } else {
10367 osd->clog->error()
10368 << info.pgid << " got reply "
10369 << *m << " from shard we are not waiting for "
10370 << m->get_from();
10371 }
10372
10373 if (it->second.waiting_on.empty()) {
10374 repop_all_committed(it->second.repop.get());
10375 log_entry_update_waiting_on.erase(it);
10376 }
10377 } else {
10378 osd->clog->error()
10379 << info.pgid << " got reply "
10380 << *m << " on unknown tid " << m->get_tid();
10381 }
10382 }
10383
10384 /* Mark all unfound objects as lost.
10385 */
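// Depending on `what`, each unfound object is either reverted to the
// newest version some shard still has (LOST_REVERT, via
// pick_newest_available()) or logged as deleted (LOST_DELETE);
// LOST_MARK is not implemented.  The resulting entries go out through
// submit_log_entries(), and waiters are requeued once they commit.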
10386 void PrimaryLogPG::mark_all_unfound_lost(
10387 int what,
10388 ConnectionRef con,
10389 ceph_tid_t tid)
10390 {
10391 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10392
10393 dout(30) << __func__ << ": log before:\n";
10394 pg_log.get_log().print(*_dout);
10395 *_dout << dendl;
10396
10397 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10398
10399 utime_t mtime = ceph_clock_now();
10400 map<hobject_t, pg_missing_item>::const_iterator m =
10401 missing_loc.get_needs_recovery().begin();
10402 map<hobject_t, pg_missing_item>::const_iterator mend =
10403 missing_loc.get_needs_recovery().end();
10404
10405 ObcLockManager manager;
10406 eversion_t v = get_next_version();
10407 v.epoch = get_osdmap()->get_epoch();
10408 uint64_t num_unfound = missing_loc.num_unfound();
10409 while (m != mend) {
10410 const hobject_t &oid(m->first);
10411 if (!missing_loc.is_unfound(oid)) {
10412 // We only care about unfound objects
10413 ++m;
10414 continue;
10415 }
10416
10417 ObjectContextRef obc;
10418 eversion_t prev;
10419
10420 switch (what) {
10421 case pg_log_entry_t::LOST_MARK:
10422 assert(0 == "actually, not implemented yet!");
10423 break;
10424
10425 case pg_log_entry_t::LOST_REVERT:
10426 prev = pick_newest_available(oid);
10427 if (prev > eversion_t()) {
10428 // log it
10429 pg_log_entry_t e(
10430 pg_log_entry_t::LOST_REVERT, oid, v,
10431 m->second.need, 0, osd_reqid_t(), mtime, 0);
10432 e.reverting_to = prev;
10433 e.mark_unrollbackable();
10434 log_entries.push_back(e);
10435 dout(10) << e << dendl;
10436
10437 // we are now missing the new version; recovery code will sort it out.
10438 ++v.version;
10439 ++m;
10440 break;
10441 }
10442
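      // fall through: with no previous version available to revert
      // to, the revert degenerates into a delete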
10443 case pg_log_entry_t::LOST_DELETE:
10444 {
10445 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10446 0, osd_reqid_t(), mtime, 0);
10447 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10448 if (pool.info.require_rollback()) {
10449 e.mod_desc.try_rmobject(v.version);
10450 } else {
10451 e.mark_unrollbackable();
10452 }
10453 } // otherwise, just do what we used to do
10454 dout(10) << e << dendl;
10455 log_entries.push_back(e);
10456
10457 ++v.version;
10458 ++m;
10459 }
10460 break;
10461
10462 default:
10463 ceph_abort();
10464 }
10465 }
10466
10467 info.stats.stats_invalid = true;
10468
10469 submit_log_entries(
10470 log_entries,
10471 std::move(manager),
10472 boost::optional<std::function<void(void)> >(
10473 [=]() {
10474 requeue_ops(waiting_for_all_missing);
10475 waiting_for_all_missing.clear();
10476 for (auto& p : waiting_for_unreadable_object) {
10477 release_backoffs(p.first);
10478 }
10479 requeue_object_waiters(waiting_for_unreadable_object);
10480 queue_recovery();
10481
10482 stringstream ss;
10483 ss << "pg has " << num_unfound
10484 << " objects unfound and apparently lost marking";
10485 string rs = ss.str();
10486 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10487 osd->clog->info() << rs;
10488 if (con) {
10489 MCommandReply *reply = new MCommandReply(0, rs);
10490 reply->set_tid(tid);
10491 con->send_message(reply);
10492 }
10493 }),
10494 OpRequestRef());
10495 }
10496
10497 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10498 {
10499 assert(repop_queue.empty());
10500 }
10501
10502 /*
10503 * pg status change notification
10504 */
10505
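// Abort every in-flight repop (e.g. across an interval change),
// optionally requeueing the originating client ops, and any dup
// waiters, in their original submission order.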
10506 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10507 {
10508 list<OpRequestRef> rq;
10509
10510 // apply all repops
10511 while (!repop_queue.empty()) {
10512 RepGather *repop = repop_queue.front();
10513 repop_queue.pop_front();
10514 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10515 repop->rep_aborted = true;
10516 repop->on_applied.clear();
10517 repop->on_committed.clear();
10518 repop->on_success.clear();
10519
10520 if (requeue) {
10521 if (repop->op) {
10522 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10523 rq.push_back(repop->op);
10524 repop->op = OpRequestRef();
10525 }
10526
10527 // also requeue any dups, interleaved into position
10528 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10529 waiting_for_ondisk.find(repop->v);
10530 if (p != waiting_for_ondisk.end()) {
10531 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10532 for (list<pair<OpRequestRef, version_t> >::iterator i =
10533 p->second.begin();
10534 i != p->second.end();
10535 ++i) {
10536 rq.push_back(i->first);
10537 }
10538 waiting_for_ondisk.erase(p);
10539 }
10540 }
10541
10542 remove_repop(repop);
10543 }
10544
10545 assert(repop_queue.empty());
10546
10547 if (requeue) {
10548 requeue_ops(rq);
10549 if (!waiting_for_ondisk.empty()) {
10550 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10551 waiting_for_ondisk.begin();
10552 i != waiting_for_ondisk.end();
10553 ++i) {
10554 for (list<pair<OpRequestRef, version_t> >::iterator j =
10555 i->second.begin();
10556 j != i->second.end();
10557 ++j) {
10558 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10559 << i->first << dendl;
10560 }
10561 }
10562 assert(waiting_for_ondisk.empty());
10563 }
10564 }
10565
10566 waiting_for_ondisk.clear();
10567 }
10568
10569 void PrimaryLogPG::on_flushed()
10570 {
10571 assert(flushes_in_progress > 0);
10572 flushes_in_progress--;
10573 if (flushes_in_progress == 0) {
10574 requeue_ops(waiting_for_peered);
10575 }
10576 if (!is_peered() || !is_primary()) {
10577 pair<hobject_t, ObjectContextRef> i;
10578 while (object_contexts.get_next(i.first, &i)) {
10579 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10580 }
10581 assert(object_contexts.empty());
10582 }
10583 pgbackend->on_flushed();
10584 }
10585
10586 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10587 {
10588 dout(10) << "on_removal" << dendl;
10589
10590 // adjust info to backfill
10591 info.set_last_backfill(hobject_t());
10592 pg_log.reset_backfill();
10593 dirty_info = true;
10594
10595
10596 // clear log
10597 PGLogEntryHandler rollbacker{this, t};
10598 pg_log.roll_forward(&rollbacker);
10599
10600 write_if_dirty(*t);
10601
10602 if (!deleting)
10603 on_shutdown();
10604 }
10605
10606 void PrimaryLogPG::on_shutdown()
10607 {
10608 dout(10) << "on_shutdown" << dendl;
10609
10610 // remove from queues
10611 osd->pg_stat_queue_dequeue(this);
10612 osd->peering_wq.dequeue(this);
10613
10614 // handles queue races
10615 deleting = true;
10616
10617 clear_scrub_reserved();
10618 scrub_clear_state();
10619
10620 unreg_next_scrub();
10621 cancel_copy_ops(false);
10622 cancel_flush_ops(false);
10623 cancel_proxy_ops(false);
10624 apply_and_flush_repops(false);
10625 cancel_log_updates();
10626   // we must remove PGRefs, so do this prior to release_backoffs() callers
10627 clear_backoffs();
10628 // clean up snap trim references
10629 snap_trimmer_machine.process_event(Reset());
10630
10631 pgbackend->on_change();
10632
10633 context_registry_on_change();
10634 object_contexts.clear();
10635
10636 osd->remote_reserver.cancel_reservation(info.pgid);
10637 osd->local_reserver.cancel_reservation(info.pgid);
10638
10639 clear_primary_state();
10640 cancel_recovery();
10641 }
10642
10643 void PrimaryLogPG::on_activate()
10644 {
10645 // all clean?
10646 if (needs_recovery()) {
10647 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10648 queue_peering_event(
10649 CephPeeringEvtRef(
10650 std::make_shared<CephPeeringEvt>(
10651 get_osdmap()->get_epoch(),
10652 get_osdmap()->get_epoch(),
10653 DoRecovery())));
10654 } else if (needs_backfill()) {
10655 dout(10) << "activate queueing backfill" << dendl;
10656 queue_peering_event(
10657 CephPeeringEvtRef(
10658 std::make_shared<CephPeeringEvt>(
10659 get_osdmap()->get_epoch(),
10660 get_osdmap()->get_epoch(),
10661 RequestBackfill())));
10662 } else {
10663 dout(10) << "activate all replicas clean, no recovery" << dendl;
10664 queue_peering_event(
10665 CephPeeringEvtRef(
10666 std::make_shared<CephPeeringEvt>(
10667 get_osdmap()->get_epoch(),
10668 get_osdmap()->get_epoch(),
10669 AllReplicasRecovered())));
10670 }
10671
10672 publish_stats_to_osd();
10673
10674 if (!backfill_targets.empty()) {
10675 last_backfill_started = earliest_backfill();
10676 new_backfill = true;
10677 assert(!last_backfill_started.is_max());
10678 dout(5) << "on activate: bft=" << backfill_targets
10679 << " from " << last_backfill_started << dendl;
10680 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
10681 i != backfill_targets.end();
10682 ++i) {
10683 dout(5) << "target shard " << *i
10684 << " from " << peer_info[*i].last_backfill
10685 << dendl;
10686 }
10687 }
10688
10689 hit_set_setup();
10690 agent_setup();
10691 }
10692
10693 void PrimaryLogPG::_on_new_interval()
10694 {
10695 }
10696
10697 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
10698 {
10699 dout(10) << "on_change" << dendl;
10700
10701 if (hit_set && hit_set->insert_count() == 0) {
10702 dout(20) << " discarding empty hit_set" << dendl;
10703 hit_set_clear();
10704 }
10705
10706 if (recovery_queued) {
10707 recovery_queued = false;
10708 osd->clear_queued_recovery(this);
10709 }
10710
10711 // requeue everything in the reverse order they should be
10712 // reexamined.
10713 requeue_ops(waiting_for_peered);
10714 requeue_ops(waiting_for_active);
10715
10716 clear_scrub_reserved();
10717
10718 cancel_copy_ops(is_primary());
10719 cancel_flush_ops(is_primary());
10720 cancel_proxy_ops(is_primary());
10721
10722 // requeue object waiters
10723 for (auto& p : waiting_for_unreadable_object) {
10724 release_backoffs(p.first);
10725 }
10726 if (is_primary()) {
10727 requeue_object_waiters(waiting_for_unreadable_object);
10728 } else {
10729 waiting_for_unreadable_object.clear();
10730 }
10731 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
10732 p != waiting_for_degraded_object.end();
10733 waiting_for_degraded_object.erase(p++)) {
10734 release_backoffs(p->first);
10735 if (is_primary())
10736 requeue_ops(p->second);
10737 else
10738 p->second.clear();
10739 finish_degraded_object(p->first);
10740 }
10741
10742 // requeues waiting_for_scrub
10743 scrub_clear_state();
10744
10745 for (auto p = waiting_for_blocked_object.begin();
10746 p != waiting_for_blocked_object.end();
10747 waiting_for_blocked_object.erase(p++)) {
10748 if (is_primary())
10749 requeue_ops(p->second);
10750 else
10751 p->second.clear();
10752 }
10753 for (auto i = callbacks_for_degraded_object.begin();
10754 i != callbacks_for_degraded_object.end();
10755 ) {
10756 finish_degraded_object((i++)->first);
10757 }
10758 assert(callbacks_for_degraded_object.empty());
10759
10760 if (is_primary()) {
10761 requeue_ops(waiting_for_cache_not_full);
10762 requeue_ops(waiting_for_all_missing);
10763 } else {
10764 waiting_for_cache_not_full.clear();
10765 waiting_for_all_missing.clear();
10766 }
10767 objects_blocked_on_cache_full.clear();
10768
10769 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
10770 in_progress_async_reads.begin();
10771 i != in_progress_async_reads.end();
10772 in_progress_async_reads.erase(i++)) {
10773 close_op_ctx(i->second);
10774 if (is_primary())
10775 requeue_op(i->first);
10776 }
10777
10778 // this will requeue ops we were working on but didn't finish, and
10779 // any dups
10780 apply_and_flush_repops(is_primary());
10781 cancel_log_updates();
10782
10783 // do this *after* apply_and_flush_repops so that we catch any newly
10784 // registered watches.
10785 context_registry_on_change();
10786
10787 pgbackend->on_change_cleanup(t);
10788 scrubber.cleanup_store(t);
10789 pgbackend->on_change();
10790
10791 // clear snap_trimmer state
10792 snap_trimmer_machine.process_event(Reset());
10793
10794 debug_op_order.clear();
10795 unstable_stats.clear();
10796
10797 // we don't want to cache object_contexts through the interval change
10798 // NOTE: we actually assert that all currently live references are dead
10799 // by the time the flush for the next interval completes.
10800 object_contexts.clear();
10801
10802 // should have been cleared above by finishing all of the degraded objects
10803 assert(objects_blocked_on_degraded_snap.empty());
10804 }
10805
10806 void PrimaryLogPG::on_role_change()
10807 {
10808 dout(10) << "on_role_change" << dendl;
10809 if (get_role() != 0 && hit_set) {
10810 dout(10) << " clearing hit set" << dendl;
10811 hit_set_clear();
10812 }
10813 }
10814
10815 void PrimaryLogPG::on_pool_change()
10816 {
10817 dout(10) << __func__ << dendl;
10818 // requeue cache full waiters just in case the cache_mode is
10819 // changing away from writeback mode. note that if we are not
10820 // active the normal requeuing machinery is sufficient (and properly
10821 // ordered).
10822 if (is_active() &&
10823 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10824 !waiting_for_cache_not_full.empty()) {
10825 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
10826 << dendl;
10827 requeue_ops(waiting_for_cache_not_full);
10828 objects_blocked_on_cache_full.clear();
10829 }
10830 hit_set_setup();
10831 agent_setup();
10832 }
10833
10834 // clear state. called on recovery completion AND cancellation.
10835 void PrimaryLogPG::_clear_recovery_state()
10836 {
10837 missing_loc.clear();
10838 #ifdef DEBUG_RECOVERY_OIDS
10839 recovering_oids.clear();
10840 #endif
10841 last_backfill_started = hobject_t();
10842 set<hobject_t>::iterator i = backfills_in_flight.begin();
10843 while (i != backfills_in_flight.end()) {
10844 assert(recovering.count(*i));
10845 backfills_in_flight.erase(i++);
10846 }
10847
10848 list<OpRequestRef> blocked_ops;
10849 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
10850 i != recovering.end();
10851 recovering.erase(i++)) {
10852 if (i->second) {
10853 i->second->drop_recovery_read(&blocked_ops);
10854 requeue_ops(blocked_ops);
10855 }
10856 }
10857 assert(backfills_in_flight.empty());
10858 pending_backfill_updates.clear();
10859 assert(recovering.empty());
10860 pgbackend->clear_recovery_state();
10861 }
10862
10863 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
10864 {
10865 dout(20) << __func__ << ": " << soid << dendl;
10866 assert(recovering.count(soid));
10867 ObjectContextRef obc = recovering[soid];
10868 if (obc) {
10869 list<OpRequestRef> blocked_ops;
10870 obc->drop_recovery_read(&blocked_ops);
10871 requeue_ops(blocked_ops);
10872 }
10873 recovering.erase(soid);
10874 finish_recovery_op(soid);
10875 release_backoffs(soid);
10876 if (waiting_for_degraded_object.count(soid)) {
10877 dout(20) << " kicking degraded waiters on " << soid << dendl;
10878 requeue_ops(waiting_for_degraded_object[soid]);
10879 waiting_for_degraded_object.erase(soid);
10880 }
10881 if (waiting_for_unreadable_object.count(soid)) {
10882 dout(20) << " kicking unreadable waiters on " << soid << dendl;
10883 requeue_ops(waiting_for_unreadable_object[soid]);
10884 waiting_for_unreadable_object.erase(soid);
10885 }
10886 if (is_missing_object(soid))
10887 pg_log.set_last_requested(0); // get recover_primary to start over
10888 finish_degraded_object(soid);
10889 }
10890
10891 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
10892 {
10893 /*
10894 * check that any peers we are planning to pull (or are currently
10895 * pulling) objects from are dealt with.
10896 */
10897 missing_loc.check_recovery_sources(osdmap);
10898 pgbackend->check_recovery_sources(osdmap);
10899
10900 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
10901 i != peer_log_requested.end();
10902 ) {
10903 if (!osdmap->is_up(i->osd)) {
10904 dout(10) << "peer_log_requested removing " << *i << dendl;
10905 peer_log_requested.erase(i++);
10906 } else {
10907 ++i;
10908 }
10909 }
10910
10911 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
10912 i != peer_missing_requested.end();
10913 ) {
10914 if (!osdmap->is_up(i->osd)) {
10915 dout(10) << "peer_missing_requested removing " << *i << dendl;
10916 peer_missing_requested.erase(i++);
10917 } else {
10918 ++i;
10919 }
10920 }
10921 }
10922
10923 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
10924 {
10925 set<pg_shard_t> now_down;
10926 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
10927 p != missing_loc_sources.end();
10928 ) {
10929 if (osdmap->is_up(p->osd)) {
10930 ++p;
10931 continue;
10932 }
10933 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
10934 now_down.insert(*p);
10935 missing_loc_sources.erase(p++);
10936 }
10937
10938 if (now_down.empty()) {
10939 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
10940 } else {
10941 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
10942 << missing_loc_sources << dendl;
10943
10944 // filter missing_loc
10945 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
10946 while (p != missing_loc.end()) {
10947 set<pg_shard_t>::iterator q = p->second.begin();
10948 while (q != p->second.end())
10949 if (now_down.count(*q)) {
10950 p->second.erase(q++);
10951 } else {
10952 ++q;
10953 }
10954 if (p->second.empty())
10955 missing_loc.erase(p++);
10956 else
10957 ++p;
10958 }
10959 }
10960 }
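
// A minimal stand-alone sketch of the two-level erase-while-iterating
// idiom used above: prune now-down shards from each object's location
// set, then drop any object left with no sources.  Plain std types stand
// in for hobject_t and pg_shard_t; this helper is hypothetical,
// illustration only.
#include <map>
#include <set>
#include <string>

static void prune_locations(std::map<std::string, std::set<int>> &locs,
                            const std::set<int> &down)
{
  for (auto p = locs.begin(); p != locs.end(); ) {
    for (auto q = p->second.begin(); q != p->second.end(); ) {
      if (down.count(*q))
        q = p->second.erase(q);   // drop the now-down source
      else
        ++q;
    }
    if (p->second.empty())
      p = locs.erase(p);          // object has no remaining sources
    else
      ++p;
  }
}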
10961
10962
10963 bool PrimaryLogPG::start_recovery_ops(
10964 uint64_t max,
10965 ThreadPool::TPHandle &handle,
10966 uint64_t *ops_started)
10967 {
10968 uint64_t& started = *ops_started;
10969 started = 0;
10970 bool work_in_progress = false;
10971 assert(is_primary());
10972
10973 if (!state_test(PG_STATE_RECOVERING) &&
10974 !state_test(PG_STATE_BACKFILL)) {
10975 /* TODO: I think this case is broken and will make do_recovery()
10976 * unhappy since we're returning false */
10977 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
10978 return false;
10979 }
10980
10981 const pg_missing_t &missing = pg_log.get_missing();
10982
10983 unsigned int num_missing = missing.num_missing();
10984 uint64_t num_unfound = get_num_unfound();
10985
10986 if (num_missing == 0) {
10987 info.last_complete = info.last_update;
10988 }
10989
10990 if (num_missing == num_unfound) {
10991 // All of the missing objects we have are unfound.
10992 // Recover the replicas.
10993 started = recover_replicas(max, handle);
10994 }
10995 if (!started) {
10996 // We still have missing objects that we should grab from replicas.
10997 started += recover_primary(max, handle);
10998 }
10999 if (!started && num_unfound != get_num_unfound()) {
11000 // second chance to recover replicas
11001 started = recover_replicas(max, handle);
11002 }
11003
11004 if (started)
11005 work_in_progress = true;
11006
11007 bool deferred_backfill = false;
11008 if (recovering.empty() &&
11009 state_test(PG_STATE_BACKFILL) &&
11010 !backfill_targets.empty() && started < max &&
11011 missing.num_missing() == 0 &&
11012 waiting_on_backfill.empty()) {
11013 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11014 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11015 deferred_backfill = true;
11016 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11017 !is_degraded()) {
11018 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11019 deferred_backfill = true;
11020 } else if (!backfill_reserved) {
11021 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11022 if (!backfill_reserving) {
11023 dout(10) << "queueing RequestBackfill" << dendl;
11024 backfill_reserving = true;
11025 queue_peering_event(
11026 CephPeeringEvtRef(
11027 std::make_shared<CephPeeringEvt>(
11028 get_osdmap()->get_epoch(),
11029 get_osdmap()->get_epoch(),
11030 RequestBackfill())));
11031 }
11032 deferred_backfill = true;
11033 } else {
11034 started += recover_backfill(max - started, handle, &work_in_progress);
11035 }
11036 }
11037
11038 dout(10) << " started " << started << dendl;
11039 osd->logger->inc(l_osd_rop, started);
11040
11041 if (!recovering.empty() ||
11042 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11043 return work_in_progress;
11044
11045 assert(recovering.empty());
11046 assert(recovery_ops_active == 0);
11047
11048 dout(10) << __func__ << " needs_recovery: "
11049 << missing_loc.get_needs_recovery()
11050 << dendl;
11051 dout(10) << __func__ << " missing_loc: "
11052 << missing_loc.get_missing_locs()
11053 << dendl;
11054 int unfound = get_num_unfound();
11055 if (unfound) {
11056 dout(10) << " still have " << unfound << " unfound" << dendl;
11057 return work_in_progress;
11058 }
11059
11060 if (missing.num_missing() > 0) {
11061 // this shouldn't happen!
11062 osd->clog->error() << info.pgid << " recovery ending with " << missing.num_missing()
11063 << ": " << missing.get_items();
11064 return work_in_progress;
11065 }
11066
11067 if (needs_recovery()) {
11068 // this shouldn't happen!
11069 // We already checked num_missing() so we must have missing replicas
11070 osd->clog->error() << info.pgid << " recovery ending with missing replicas";
11071 return work_in_progress;
11072 }
11073
11074 if (state_test(PG_STATE_RECOVERING)) {
11075 state_clear(PG_STATE_RECOVERING);
11076 if (needs_backfill()) {
11077 dout(10) << "recovery done, queuing backfill" << dendl;
11078 queue_peering_event(
11079 CephPeeringEvtRef(
11080 std::make_shared<CephPeeringEvt>(
11081 get_osdmap()->get_epoch(),
11082 get_osdmap()->get_epoch(),
11083 RequestBackfill())));
11084 } else {
11085 dout(10) << "recovery done, no backfill" << dendl;
11086 queue_peering_event(
11087 CephPeeringEvtRef(
11088 std::make_shared<CephPeeringEvt>(
11089 get_osdmap()->get_epoch(),
11090 get_osdmap()->get_epoch(),
11091 AllReplicasRecovered())));
11092 }
11093 } else { // backfilling
11094 state_clear(PG_STATE_BACKFILL);
11095 dout(10) << "recovery done, backfill done" << dendl;
11096 queue_peering_event(
11097 CephPeeringEvtRef(
11098 std::make_shared<CephPeeringEvt>(
11099 get_osdmap()->get_epoch(),
11100 get_osdmap()->get_epoch(),
11101 Backfilled())));
11102 }
11103
11104 return false;
11105 }
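
// The decision ladder above reduces, roughly, to a pure choice of phase:
// if everything missing locally is unfound, push to the replicas first;
// otherwise pull to the primary; backfill only proceeds once recovery
// proper has quiesced.  A simplified model of that ordering; the enum and
// helper are hypothetical, not Ceph API.
#include <cstdint>

enum class RecoveryPhase { Replicas, Primary, Backfill, Idle };

static RecoveryPhase choose_recovery_phase(uint64_t num_missing,
                                           uint64_t num_unfound,
                                           bool recovering_empty,
                                           bool backfill_pending)
{
  if (num_missing > 0 && num_missing == num_unfound)
    return RecoveryPhase::Replicas;  // all local misses are unfound
  if (num_missing > 0)
    return RecoveryPhase::Primary;   // pull missing objects to the primary
  if (recovering_empty && backfill_pending)
    return RecoveryPhase::Backfill;  // recovery quiesced; run backfill
  return RecoveryPhase::Idle;
}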
11106
11107 /**
11108 * start recovery ops on objects missing from the primary.
11109 * return the number of recovery ops started.
11110 */
11111 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11112 {
11113 assert(is_primary());
11114
11115 const pg_missing_t &missing = pg_log.get_missing();
11116
11117 dout(10) << "recover_primary recovering " << recovering.size()
11118 << " in pg" << dendl;
11119 dout(10) << "recover_primary " << missing << dendl;
11120 dout(25) << "recover_primary " << missing.get_items() << dendl;
11121
11122 // look at log!
11123 pg_log_entry_t *latest = 0;
11124 unsigned started = 0;
11125 int skipped = 0;
11126
11127 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11128 map<version_t, hobject_t>::const_iterator p =
11129 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11130 while (p != missing.get_rmissing().end()) {
11131 handle.reset_tp_timeout();
11132 hobject_t soid;
11133 version_t v = p->first;
11134
11135 if (pg_log.get_log().objects.count(p->second)) {
11136 latest = pg_log.get_log().objects.find(p->second)->second;
11137 assert(latest->is_update());
11138 soid = latest->soid;
11139 } else {
11140 latest = 0;
11141 soid = p->second;
11142 }
11143 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11144 ++p;
11145
11146 hobject_t head = soid;
11147 head.snap = CEPH_NOSNAP;
11148
11149 eversion_t need = item.need;
11150
11151 dout(10) << "recover_primary "
11152 << soid << " " << item.need
11153 << (missing.is_missing(soid) ? " (missing)":"")
11154 << (missing.is_missing(head) ? " (missing head)":"")
11155 << (recovering.count(soid) ? " (recovering)":"")
11156 << (recovering.count(head) ? " (recovering head)":"")
11157 << dendl;
11158
11159 if (latest) {
11160 switch (latest->op) {
11161 case pg_log_entry_t::CLONE:
11162 /*
11163 * Handling for this special case removed for now, until we
11164 * can correctly construct an accurate SnapSet from the old
11165 * one.
11166 */
11167 break;
11168
11169 case pg_log_entry_t::LOST_REVERT:
11170 {
11171 if (item.have == latest->reverting_to) {
11172 ObjectContextRef obc = get_object_context(soid, true);
11173
11174 if (obc->obs.oi.version == latest->version) {
11175 // I'm already reverting
11176 dout(10) << " already reverting " << soid << dendl;
11177 } else {
11178 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11179 obc->ondisk_write_lock();
11180 obc->obs.oi.version = latest->version;
11181
11182 ObjectStore::Transaction t;
11183 bufferlist b2;
11184 obc->obs.oi.encode(
11185 b2,
11186 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11187 assert(!pool.info.require_rollback());
11188 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11189
11190 recover_got(soid, latest->version);
11191 missing_loc.add_location(soid, pg_whoami);
11192
11193 ++active_pushes;
11194
11195 osd->store->queue_transaction(osr.get(), std::move(t),
11196 new C_OSD_AppliedRecoveredObject(this, obc),
11197 new C_OSD_CommittedPushedObject(
11198 this,
11199 get_osdmap()->get_epoch(),
11200 info.last_complete),
11201 new C_OSD_OndiskWriteUnlock(obc));
11202 continue;
11203 }
11204 } else {
11205 /*
11206 * Pull the old version of the object. Update missing_loc here to have the location
11207 * of the version we want.
11208 *
11209 * This doesn't use the usual missing_loc paths, but that's okay:
11210 * - if we have it locally, we hit the case above, and go from there.
11211 * - if we don't, we always pass through this case during recovery and set up the location
11212 * properly.
11213 * - this way we don't need to mangle the missing code to be general about needing an old
11214 * version...
11215 */
11216 eversion_t alternate_need = latest->reverting_to;
11217 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11218
11219 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11220 p != peer_missing.end();
11221 ++p)
11222 if (p->second.is_missing(soid, need) &&
11223 p->second.get_items().at(soid).have == alternate_need) {
11224 missing_loc.add_location(soid, p->first);
11225 }
11226 dout(10) << " will pull " << alternate_need << " or " << need
11227 << " from one of " << missing_loc.get_locations(soid)
11228 << dendl;
11229 }
11230 }
11231 break;
11232 }
11233 }
11234
11235 if (!recovering.count(soid)) {
11236 if (recovering.count(head)) {
11237 ++skipped;
11238 } else {
11239 int r = recover_missing(
11240 soid, need, get_recovery_op_priority(), h);
11241 switch (r) {
11242 case PULL_YES:
11243 ++started;
11244 break;
11245 case PULL_OTHER:
11246 ++started; // fall through: we pulled a different object; count soid as skipped too
11247 case PULL_NONE:
11248 ++skipped;
11249 break;
11250 default:
11251 ceph_abort();
11252 }
11253 if (started >= max)
11254 break;
11255 }
11256 }
11257
11258 // only advance last_requested if we haven't skipped anything
11259 if (!skipped)
11260 pg_log.set_last_requested(v);
11261 }
11262
11263 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11264 return started;
11265 }
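
// recover_primary() walks the missing set in version order from the
// last_requested cursor and only advances that cursor while nothing has
// been skipped, so skipped objects are revisited on the next pass.  A
// minimal stand-alone sketch of that cursor discipline (hypothetical
// helper; process() returns true if the entry was handled).
#include <cstdint>
#include <map>
#include <string>

template <typename Fn>
static uint64_t walk_missing(const std::map<uint64_t, std::string> &rmissing,
                             uint64_t last_requested, Fn process)
{
  unsigned skipped = 0;
  for (auto p = rmissing.lower_bound(last_requested);
       p != rmissing.end();
       ++p) {
    if (!process(p->second))
      ++skipped;
    if (!skipped)
      last_requested = p->first;  // only advance past fully handled entries
  }
  return last_requested;
}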
11266
11267 int PrimaryLogPG::prep_object_replica_pushes(
11268 const hobject_t& soid, eversion_t v,
11269 PGBackend::RecoveryHandle *h)
11270 {
11271 assert(is_primary());
11272 dout(10) << __func__ << ": on " << soid << dendl;
11273
11274 // NOTE: we know we will get a valid oloc off of disk here.
11275 ObjectContextRef obc = get_object_context(soid, false);
11276 if (!obc) {
11277 pg_log.missing_add(soid, v, eversion_t());
11278 missing_loc.remove_location(soid, pg_whoami);
11279 bool uhoh = true;
11280 assert(!actingbackfill.empty());
11281 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11282 i != actingbackfill.end();
11283 ++i) {
11284 if (*i == get_primary()) continue;
11285 pg_shard_t peer = *i;
11286 if (!peer_missing[peer].is_missing(soid, v)) {
11287 missing_loc.add_location(soid, peer);
11288 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11289 << ", there should be a copy on shard " << peer << dendl;
11290 uhoh = false;
11291 }
11292 }
11293 if (uhoh)
11294 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11295 else
11296 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11297 << ", will try copies on " << missing_loc.get_locations(soid);
11298 return 0;
11299 }
11300
11301 if (!obc->get_recovery_read()) {
11302 dout(20) << "recovery delayed on " << soid
11303 << "; could not get rw_manager lock" << dendl;
11304 return 0;
11305 } else {
11306 dout(20) << "recovery got recovery read lock on " << soid
11307 << dendl;
11308 }
11309
11310 start_recovery_op(soid);
11311 assert(!recovering.count(soid));
11312 recovering.insert(make_pair(soid, obc));
11313
11314 /* We need this in case there is an in progress write on the object. In fact,
11315 * the only possible write is an update to the xattr due to a lost_revert --
11316 * a client write would be blocked since the object is degraded.
11317 * In almost all cases, therefore, this lock should be uncontended.
11318 */
11319 obc->ondisk_read_lock();
11320 pgbackend->recover_object(
11321 soid,
11322 v,
11323 ObjectContextRef(),
11324 obc, // has snapset context
11325 h);
11326 obc->ondisk_read_unlock();
11327 return 1;
11328 }
11329
11330 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11331 {
11332 dout(10) << __func__ << "(" << max << ")" << dendl;
11333 uint64_t started = 0;
11334
11335 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11336
11337 // this is FAR from an optimal recovery order. pretty lame, really.
11338 assert(!actingbackfill.empty());
11339 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11340 i != actingbackfill.end();
11341 ++i) {
11342 if (*i == get_primary()) continue;
11343 pg_shard_t peer = *i;
11344 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11345 assert(pm != peer_missing.end());
11346 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11347 assert(pi != peer_info.end());
11348 size_t m_sz = pm->second.num_missing();
11349
11350 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11351 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11352
11353 // oldest first!
11354 const pg_missing_t &m(pm->second);
11355 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11356 p != m.get_rmissing().end() && started < max;
11357 ++p) {
11358 handle.reset_tp_timeout();
11359 const hobject_t soid(p->second);
11360
11361 if (soid > pi->second.last_backfill) {
11362 if (!recovering.count(soid)) {
11363 derr << __func__ << ": object added to missing set for backfill, but "
11364 << "is not in recovering, error!" << dendl;
11365 ceph_abort();
11366 }
11367 continue;
11368 }
11369
11370 if (recovering.count(soid)) {
11371 dout(10) << __func__ << ": already recovering " << soid << dendl;
11372 continue;
11373 }
11374
11375 if (missing_loc.is_unfound(soid)) {
11376 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11377 continue;
11378 }
11379
11380 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11381 dout(10) << __func__ << ": " << soid.get_head()
11382 << " still missing on primary" << dendl;
11383 continue;
11384 }
11385
11386 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11387 dout(10) << __func__ << ": " << soid.get_snapdir()
11388 << " still missing on primary" << dendl;
11389 continue;
11390 }
11391
11392 if (pg_log.get_missing().is_missing(soid)) {
11393 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11394 continue;
11395 }
11396
11397 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11398 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11399 started += prep_object_replica_pushes(soid, r->second.need,
11400 h);
11401 }
11402 }
11403
11404 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11405 return started;
11406 }
11407
11408 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11409 {
11410 hobject_t e = hobject_t::get_max();
11411 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11412 i != backfill_targets.end();
11413 ++i) {
11414 pg_shard_t peer = *i;
11415 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11416 peer_backfill_info.find(peer);
11417 assert(iter != peer_backfill_info.end());
11418 if (iter->second.begin < e)
11419 e = iter->second.begin;
11420 }
11421 return e;
11422 }
11423
11424 bool PrimaryLogPG::all_peer_done() const
11425 {
11426 // Primary hasn't got any more objects
11427 assert(backfill_info.empty());
11428
11429 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11430 i != backfill_targets.end();
11431 ++i) {
11432 pg_shard_t bt = *i;
11433 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11434 peer_backfill_info.find(bt);
11435 assert(piter != peer_backfill_info.end());
11436 const BackfillInterval& pbi = piter->second;
11437 // See if peer has more to process
11438 if (!pbi.extends_to_end() || !pbi.empty())
11439 return false;
11440 }
11441 return true;
11442 }
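
// Both helpers above are simple folds over the peer backfill intervals:
// the earliest interval begin, and "every peer interval is drained and
// extends to the end of the hash space".  A sketch with a trimmed-down
// interval type; MiniInterval and these helpers are hypothetical.
#include <algorithm>
#include <limits>
#include <map>

struct MiniInterval {
  int begin = 0;        // stands in for BackfillInterval::begin
  bool at_end = false;  // stands in for extends_to_end()
  bool drained = true;  // stands in for empty()
};

static int earliest_begin(const std::map<int, MiniInterval> &peers)
{
  int e = std::numeric_limits<int>::max();  // analogue of hobject_t::get_max()
  for (const auto &p : peers)
    e = std::min(e, p.second.begin);
  return e;
}

static bool all_done(const std::map<int, MiniInterval> &peers)
{
  for (const auto &p : peers)
    if (!p.second.at_end || !p.second.drained)
      return false;
  return true;
}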
11443
11444 /**
11445 * recover_backfill
11446 *
11447 * Invariants:
11448 *
11449 * backfilled: fully pushed to replica or present in replica's missing set (both
11450 * our copy and theirs).
11451 *
11452 * All objects on a backfill_target in
11453 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11454 * objects have been actually deleted and all logically-valid objects are replicated.
11455 * There may be PG objects in this interval yet to be backfilled.
11456 *
11457 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11458 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11459 *
11460 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11461 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11462 * interval remain on the backfill target.
11463 *
11464 * For a backfill target, all objects <= peer_info[target].last_backfill
11465 * have been backfilled to target
11466 *
11467 * There *MAY* be missing/outdated objects between last_backfill_started and
11468 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11469 * io created objects since the last scan. For this reason, we call
11470 * update_range() again before continuing backfill.
11471 */
11472 uint64_t PrimaryLogPG::recover_backfill(
11473 uint64_t max,
11474 ThreadPool::TPHandle &handle, bool *work_started)
11475 {
11476 dout(10) << "recover_backfill (" << max << ")"
11477 << " bft=" << backfill_targets
11478 << " last_backfill_started " << last_backfill_started
11479 << (new_backfill ? " new_backfill":"")
11480 << dendl;
11481 assert(!backfill_targets.empty());
11482
11483 // Initialize from prior backfill state
11484 if (new_backfill) {
11485 // on_activate() was called prior to getting here
11486 assert(last_backfill_started == earliest_backfill());
11487 new_backfill = false;
11488
11489 // initialize BackfillIntervals
11490 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11491 i != backfill_targets.end();
11492 ++i) {
11493 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11494 }
11495 backfill_info.reset(last_backfill_started);
11496
11497 backfills_in_flight.clear();
11498 pending_backfill_updates.clear();
11499 }
11500
11501 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11502 i != backfill_targets.end();
11503 ++i) {
11504 dout(10) << "peer osd." << *i
11505 << " info " << peer_info[*i]
11506 << " interval " << peer_backfill_info[*i].begin
11507 << "-" << peer_backfill_info[*i].end
11508 << " " << peer_backfill_info[*i].objects.size() << " objects"
11509 << dendl;
11510 }
11511
11512 // update our local interval to cope with recent changes
11513 backfill_info.begin = last_backfill_started;
11514 update_range(&backfill_info, handle);
11515
11516 unsigned ops = 0;
11517 vector<boost::tuple<hobject_t, eversion_t,
11518 ObjectContextRef, vector<pg_shard_t> > > to_push;
11519 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11520 set<hobject_t> add_to_stat;
11521
11522 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11523 i != backfill_targets.end();
11524 ++i) {
11525 peer_backfill_info[*i].trim_to(
11526 std::max(peer_info[*i].last_backfill, last_backfill_started));
11527 }
11528 backfill_info.trim_to(last_backfill_started);
11529
11530 while (ops < max) {
11531 if (backfill_info.begin <= earliest_peer_backfill() &&
11532 !backfill_info.extends_to_end() && backfill_info.empty()) {
11533 hobject_t next = backfill_info.end;
11534 backfill_info.reset(next);
11535 backfill_info.end = hobject_t::get_max();
11536 update_range(&backfill_info, handle);
11537 backfill_info.trim();
11538 }
11539
11540 dout(20) << " my backfill interval " << backfill_info << dendl;
11541
11542 bool sent_scan = false;
11543 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11544 i != backfill_targets.end();
11545 ++i) {
11546 pg_shard_t bt = *i;
11547 BackfillInterval& pbi = peer_backfill_info[bt];
11548
11549 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11550 if (pbi.begin <= backfill_info.begin &&
11551 !pbi.extends_to_end() && pbi.empty()) {
11552 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11553 epoch_t e = get_osdmap()->get_epoch();
11554 MOSDPGScan *m = new MOSDPGScan(
11555 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11556 spg_t(info.pgid.pgid, bt.shard),
11557 pbi.end, hobject_t());
11558 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11559 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11560 waiting_on_backfill.insert(bt);
11561 sent_scan = true;
11562 }
11563 }
11564
11565 // Count simultaneous scans as a single op and let those complete
11566 if (sent_scan) {
11567 ops++;
11568 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11569 break;
11570 }
11571
11572 if (backfill_info.empty() && all_peer_done()) {
11573 dout(10) << " reached end for both local and all peers" << dendl;
11574 break;
11575 }
11576
11577 // Get object within set of peers to operate on and
11578 // the set of targets to which that object applies.
11579 hobject_t check = earliest_peer_backfill();
11580
11581 if (check < backfill_info.begin) {
11582
11583 set<pg_shard_t> check_targets;
11584 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11585 i != backfill_targets.end();
11586 ++i) {
11587 pg_shard_t bt = *i;
11588 BackfillInterval& pbi = peer_backfill_info[bt];
11589 if (pbi.begin == check)
11590 check_targets.insert(bt);
11591 }
11592 assert(!check_targets.empty());
11593
11594 dout(20) << " BACKFILL removing " << check
11595 << " from peers " << check_targets << dendl;
11596 for (set<pg_shard_t>::iterator i = check_targets.begin();
11597 i != check_targets.end();
11598 ++i) {
11599 pg_shard_t bt = *i;
11600 BackfillInterval& pbi = peer_backfill_info[bt];
11601 assert(pbi.begin == check);
11602
11603 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11604 pbi.pop_front();
11605 }
11606
11607 /* This requires a bit of explanation. We compare head against
11608 * last_backfill to determine whether to send an operation
11609 * to the replica. A single write operation can touch up to three
11610 * objects: head, the snapdir, and a new clone which sorts closer to
11611 * head than any existing clone. If last_backfill points at a clone,
11612 * the transaction won't be sent and all 3 must lie on the right side
11613 * of the line (i.e., we'll backfill them later). If last_backfill
11614 * points at snapdir, it sorts greater than head, so we send the
11615 * transaction which is correct because all three must lie to the left
11616 * of the line.
11617 *
11618 * If it points at head, we have a bit of an issue. If head actually
11619 * exists, no problem, because any transaction which touches snapdir
11620 * must end up creating it (and deleting head), so sending the
11621 * operation won't pose a problem -- we'll end up having to scan it,
11622 * but it'll end up being the right version so we won't bother to
11623 * rebackfill it. However, if head doesn't exist, any write on head
11624 * will remove snapdir. For a replicated pool, this isn't a problem,
11625 * ENOENT on remove isn't an issue and it's in backfill future anyway.
11626 * It only poses a problem for EC pools, because we never just delete
11627 * an object, we rename it into a rollback object. That operation
11628 * will end up crashing the osd with ENOENT. Tolerating the failure
11629 * wouldn't work either, even if snapdir exists, we'd be creating a
11630 * rollback object past the last_backfill line which wouldn't get
11631 * cleaned up (no rollback objects past the last_backfill line is an
11632 * existing important invariant). Thus, let's avoid the whole issue
11633 * by just not updating last_backfill_started here if head doesn't
11634 * exist and snapdir does. We aren't using up a recovery count here,
11635 * so we're going to recover snapdir immediately anyway. We'll only
11636 * fail "backward" if we fail to get the rw lock and that just means
11637 * we'll re-process this section of the hash space again.
11638 *
11639 * I'm choosing this hack here because the really "correct" answer is
11640 * going to be to unify snapdir and head into a single object (a
11641 * snapdir is really just a confusing way to talk about head existing
11642 * as a whiteout), but doing that is going to be a somewhat larger
11643 * undertaking.
11644 *
11645 * @see http://tracker.ceph.com/issues/17668
11646 */
11647 if (!(check.is_head() &&
11648 backfill_info.begin.is_snapdir() &&
11649 check == backfill_info.begin.get_head()))
11650 last_backfill_started = check;
11651
11652 // Don't increment ops here: deletions are cheap and, unlike
11653 // real recovery_ops, are not replied to; and we can't
11654 // increment ops without requeueing ourselves
11655 // for recovery.
11656 } else {
11657 eversion_t& obj_v = backfill_info.objects.begin()->second;
11658
11659 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
11660 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11661 i != backfill_targets.end();
11662 ++i) {
11663 pg_shard_t bt = *i;
11664 BackfillInterval& pbi = peer_backfill_info[bt];
11665 // Find all check peers that have the wrong version
11666 if (check == backfill_info.begin && check == pbi.begin) {
11667 if (pbi.objects.begin()->second != obj_v) {
11668 need_ver_targs.push_back(bt);
11669 } else {
11670 keep_ver_targs.push_back(bt);
11671 }
11672 } else {
11673 pg_info_t& pinfo = peer_info[bt];
11674
11675 // Only include peers whose backfill line we've caught up to;
11676 // otherwise, they only appear to be missing this object
11677 // because their pbi.begin > backfill_info.begin.
11678 if (backfill_info.begin > pinfo.last_backfill)
11679 missing_targs.push_back(bt);
11680 else
11681 skip_targs.push_back(bt);
11682 }
11683 }
11684
11685 if (!keep_ver_targs.empty()) {
11686 // These peers have version obj_v
11687 dout(20) << " BACKFILL keeping " << check
11688 << " with ver " << obj_v
11689 << " on peers " << keep_ver_targs << dendl;
11690 //assert(!waiting_for_degraded_object.count(check));
11691 }
11692 if (!need_ver_targs.empty() || !missing_targs.empty()) {
11693 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11694 assert(obc);
11695 if (obc->get_recovery_read()) {
11696 if (!need_ver_targs.empty()) {
11697 dout(20) << " BACKFILL replacing " << check
11698 << " with ver " << obj_v
11699 << " to peers " << need_ver_targs << dendl;
11700 }
11701 if (!missing_targs.empty()) {
11702 dout(20) << " BACKFILL pushing " << backfill_info.begin
11703 << " with ver " << obj_v
11704 << " to peers " << missing_targs << dendl;
11705 }
11706 vector<pg_shard_t> all_push = need_ver_targs;
11707 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
11708
11709 to_push.push_back(
11710 boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
11711 (backfill_info.begin, obj_v, obc, all_push));
11712 // Count all simultaneous pushes of the same object as a single op
11713 ops++;
11714 } else {
11715 *work_started = true;
11716 dout(20) << "backfill blocking on " << backfill_info.begin
11717 << "; could not get rw_manager lock" << dendl;
11718 break;
11719 }
11720 }
11721 dout(20) << "need_ver_targs=" << need_ver_targs
11722 << " keep_ver_targs=" << keep_ver_targs << dendl;
11723 dout(20) << "backfill_targets=" << backfill_targets
11724 << " missing_targs=" << missing_targs
11725 << " skip_targs=" << skip_targs << dendl;
11726
11727 last_backfill_started = backfill_info.begin;
11728 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
11729 backfill_info.pop_front();
11730 vector<pg_shard_t> check_targets = need_ver_targs;
11731 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
11732 for (vector<pg_shard_t>::iterator i = check_targets.begin();
11733 i != check_targets.end();
11734 ++i) {
11735 pg_shard_t bt = *i;
11736 BackfillInterval& pbi = peer_backfill_info[bt];
11737 pbi.pop_front();
11738 }
11739 }
11740 }
11741
11742 hobject_t backfill_pos =
11743 std::min(backfill_info.begin, earliest_peer_backfill());
11744
11745 for (set<hobject_t>::iterator i = add_to_stat.begin();
11746 i != add_to_stat.end();
11747 ++i) {
11748 ObjectContextRef obc = get_object_context(*i, false);
11749 assert(obc);
11750 pg_stat_t stat;
11751 add_object_context_to_pg_stat(obc, &stat);
11752 pending_backfill_updates[*i] = stat;
11753 }
11754 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
11755 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
11756 for (unsigned i = 0; i < to_remove.size(); ++i) {
11757 handle.reset_tp_timeout();
11758 const hobject_t& oid = to_remove[i].get<0>();
11759 eversion_t v = to_remove[i].get<1>();
11760 pg_shard_t peer = to_remove[i].get<2>();
11761 MOSDPGBackfillRemove *m;
11762 auto it = reqs.find(peer);
11763 if (it != reqs.end()) {
11764 m = it->second;
11765 } else {
11766 m = reqs[peer] = new MOSDPGBackfillRemove(
11767 spg_t(info.pgid.pgid, peer.shard),
11768 get_osdmap()->get_epoch());
11769 }
11770 m->ls.push_back(make_pair(oid, v));
11771
11772 if (oid <= last_backfill_started)
11773 pending_backfill_updates[oid]; // add empty stat!
11774 }
11775 for (auto p : reqs) {
11776 osd->send_message_osd_cluster(p.first.osd, p.second,
11777 get_osdmap()->get_epoch());
11778 }
11779 } else {
11780 // for jewel targets
11781 for (unsigned i = 0; i < to_remove.size(); ++i) {
11782 handle.reset_tp_timeout();
11783
11784 // ordered before any subsequent updates
11785 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
11786 to_remove[i].get<2>());
11787
11788 if (to_remove[i].get<0>() <= last_backfill_started)
11789 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
11790 }
11791 }
11792
11793 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11794 for (unsigned i = 0; i < to_push.size(); ++i) {
11795 handle.reset_tp_timeout();
11796 prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(),
11797 to_push[i].get<2>(), to_push[i].get<3>(), h);
11798 }
11799 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11800
11801 dout(5) << "backfill_pos is " << backfill_pos << dendl;
11802 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
11803 i != backfills_in_flight.end();
11804 ++i) {
11805 dout(20) << *i << " is still in flight" << dendl;
11806 }
11807
11808 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
11809 backfill_pos : *(backfills_in_flight.begin());
11810 hobject_t new_last_backfill = earliest_backfill();
11811 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
11812 for (map<hobject_t, pg_stat_t>::iterator i =
11813 pending_backfill_updates.begin();
11814 i != pending_backfill_updates.end() &&
11815 i->first < next_backfill_to_complete;
11816 pending_backfill_updates.erase(i++)) {
11817 dout(20) << " pending_backfill_update " << i->first << dendl;
11818 assert(i->first > new_last_backfill);
11819 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
11820 j != backfill_targets.end();
11821 ++j) {
11822 pg_shard_t bt = *j;
11823 pg_info_t& pinfo = peer_info[bt];
11824 //Add stats to all peers that were missing object
11825 if (i->first > pinfo.last_backfill)
11826 pinfo.stats.add(i->second);
11827 }
11828 new_last_backfill = i->first;
11829 }
11830 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
11831
11832 assert(!pending_backfill_updates.empty() ||
11833 new_last_backfill == last_backfill_started);
11834 if (pending_backfill_updates.empty() &&
11835 backfill_pos.is_max()) {
11836 assert(backfills_in_flight.empty());
11837 new_last_backfill = backfill_pos;
11838 last_backfill_started = backfill_pos;
11839 }
11840 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
11841
11842 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
11843 // all the backfill targets. Otherwise, we will move last_backfill up on
11844 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
11845 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11846 i != backfill_targets.end();
11847 ++i) {
11848 pg_shard_t bt = *i;
11849 pg_info_t& pinfo = peer_info[bt];
11850
11851 if (new_last_backfill > pinfo.last_backfill) {
11852 pinfo.set_last_backfill(new_last_backfill);
11853 epoch_t e = get_osdmap()->get_epoch();
11854 MOSDPGBackfill *m = NULL;
11855 if (pinfo.last_backfill.is_max()) {
11856 m = new MOSDPGBackfill(
11857 MOSDPGBackfill::OP_BACKFILL_FINISH,
11858 e,
11859 last_peering_reset,
11860 spg_t(info.pgid.pgid, bt.shard));
11861 // Use default priority here, must match sub_op priority
11862 /* pinfo.stats might be wrong if we did log-based recovery on the
11863 * backfilled portion in addition to continuing backfill.
11864 */
11865 pinfo.stats = info.stats;
11866 start_recovery_op(hobject_t::get_max());
11867 } else {
11868 m = new MOSDPGBackfill(
11869 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
11870 e,
11871 last_peering_reset,
11872 spg_t(info.pgid.pgid, bt.shard));
11873 // Use default priority here, must match sub_op priority
11874 }
11875 m->last_backfill = pinfo.last_backfill;
11876 m->stats = pinfo.stats;
11877 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11878 dout(10) << " peer " << bt
11879 << " num_objects now " << pinfo.stats.stats.sum.num_objects
11880 << " / " << info.stats.stats.sum.num_objects << dendl;
11881 }
11882 }
11883
11884 if (ops)
11885 *work_started = true;
11886 return ops;
11887 }
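
// The last_backfill advancement at the end of recover_backfill() is a
// bounded scan: completed per-object stats are applied in order, and the
// high-water mark only moves up to (never past) the earliest push still
// in flight.  A minimal sketch of that bookkeeping with strings standing
// in for hobject_t; the helper is hypothetical.
#include <map>
#include <set>
#include <string>

static std::string advance_last_backfill(
  std::map<std::string, int> &pending,     // sorted: object -> stat delta
  const std::set<std::string> &in_flight,  // pushes not yet completed
  const std::string &scan_pos,             // analogue of backfill_pos
  std::string last_backfill)
{
  // we cannot complete past the earliest object still being pushed
  const std::string &bound =
    in_flight.empty() ? scan_pos : *in_flight.begin();
  for (auto i = pending.begin();
       i != pending.end() && i->first < bound;
       i = pending.erase(i)) {
    // a real implementation applies i->second to each lagging peer's stats
    last_backfill = i->first;
  }
  return last_backfill;
}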
11888
11889 void PrimaryLogPG::prep_backfill_object_push(
11890 hobject_t oid, eversion_t v,
11891 ObjectContextRef obc,
11892 vector<pg_shard_t> peers,
11893 PGBackend::RecoveryHandle *h)
11894 {
11895 dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl;
11896 assert(!peers.empty());
11897
11898 backfills_in_flight.insert(oid);
11899 for (unsigned int i = 0 ; i < peers.size(); ++i) {
11900 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
11901 assert(bpm != peer_missing.end());
11902 bpm->second.add(oid, eversion_t(), eversion_t());
11903 }
11904
11905 assert(!recovering.count(oid));
11906
11907 start_recovery_op(oid);
11908 recovering.insert(make_pair(oid, obc));
11909
11910 // We need to take the read_lock here in order to flush in-progress writes
11911 obc->ondisk_read_lock();
11912 pgbackend->recover_object(
11913 oid,
11914 v,
11915 ObjectContextRef(),
11916 obc,
11917 h);
11918 obc->ondisk_read_unlock();
11919 }
11920
11921 void PrimaryLogPG::update_range(
11922 BackfillInterval *bi,
11923 ThreadPool::TPHandle &handle)
11924 {
11925 int local_min = cct->_conf->osd_backfill_scan_min;
11926 int local_max = cct->_conf->osd_backfill_scan_max;
11927
11928 if (bi->version < info.log_tail) {
11929 dout(10) << __func__ << ": bi is old, rescanning local backfill_info"
11930 << dendl;
11931 if (last_update_applied >= info.log_tail) {
11932 bi->version = last_update_applied;
11933 } else {
11934 osr->flush();
11935 bi->version = info.last_update;
11936 }
11937 scan_range(local_min, local_max, bi, handle);
11938 }
11939
11940 if (bi->version >= projected_last_update) {
11941 dout(10) << __func__ << ": bi is current " << dendl;
11942 assert(bi->version == projected_last_update);
11943 } else if (bi->version >= info.log_tail) {
11944 if (pg_log.get_log().empty() && projected_log.empty()) {
11945 /* Because we don't move log_tail on split, the log might be
11946 * empty even if log_tail != last_update. However, the only
11947 * way to get here with an empty log is if log_tail is actually
11948 * eversion_t(), because otherwise the entry which changed
11949 * last_update since the last scan would have to be present.
11950 */
11951 assert(bi->version == eversion_t());
11952 return;
11953 }
11954
11955 dout(10) << __func__ << ": bi is old, (" << bi->version
11956 << ") can be updated with log to projected_last_update "
11957 << projected_last_update << dendl;
11958
11959 auto func = [&](const pg_log_entry_t &e) {
11960 dout(10) << __func__ << ": updating from version " << e.version
11961 << dendl;
11962 const hobject_t &soid = e.soid;
11963 if (soid >= bi->begin &&
11964 soid < bi->end) {
11965 if (e.is_update()) {
11966 dout(10) << __func__ << ": " << e.soid << " updated to version "
11967 << e.version << dendl;
11968 bi->objects.erase(e.soid);
11969 bi->objects.insert(
11970 make_pair(
11971 e.soid,
11972 e.version));
11973 } else if (e.is_delete()) {
11974 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
11975 bi->objects.erase(e.soid);
11976 }
11977 }
11978 };
11979 dout(10) << "scanning pg log first" << dendl;
11980 pg_log.get_log().scan_log_after(bi->version, func);
11981 dout(10) << "scanning projected log" << dendl;
11982 projected_log.scan_log_after(bi->version, func);
11983 bi->version = projected_last_update;
11984 } else {
11985 assert(0 == "scan_range should have raised bi->version past log_tail");
11986 }
11987 }
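
// The replay lambda in update_range() has a simple contract: entries
// outside [begin, end) are ignored, updates overwrite the recorded
// version, and deletes drop the object from the interval.  A stand-alone
// sketch with simplified types; MiniEntry and apply_entry are
// hypothetical.
#include <map>
#include <string>

struct MiniEntry {
  std::string oid;
  int version;
  bool is_delete;
};

static void apply_entry(std::map<std::string, int> &objects,
                        const std::string &begin, const std::string &end,
                        const MiniEntry &e)
{
  if (e.oid < begin || e.oid >= end)
    return;                       // outside the scanned interval
  if (e.is_delete)
    objects.erase(e.oid);         // object removed since the scan
  else
    objects[e.oid] = e.version;   // record the latest version
}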
11988
11989 void PrimaryLogPG::scan_range(
11990 int min, int max, BackfillInterval *bi,
11991 ThreadPool::TPHandle &handle)
11992 {
11993 assert(is_locked());
11994 dout(10) << "scan_range from " << bi->begin << dendl;
11995 bi->clear_objects();
11996
11997 vector<hobject_t> ls;
11998 ls.reserve(max);
11999 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12000 assert(r >= 0);
12001 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12002 dout(20) << ls << dendl;
12003
12004 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12005 handle.reset_tp_timeout();
12006 ObjectContextRef obc;
12007 if (is_primary())
12008 obc = object_contexts.lookup(*p);
12009 if (obc) {
12010 bi->objects[*p] = obc->obs.oi.version;
12011 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12012 } else {
12013 bufferlist bl;
12014 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12015
12016 /* If the object does not exist here, it must have been removed
12017 * between the objects_list_partial call and here. This can happen
12018 * for the first item in the range, which is usually last_backfill.
12019 */
12020 if (r == -ENOENT)
12021 continue;
12022
12023 assert(r >= 0);
12024 object_info_t oi(bl);
12025 bi->objects[*p] = oi.version;
12026 dout(20) << " " << *p << " " << oi.version << dendl;
12027 }
12028 }
12029 }
12030
12031
12032 /** check_local
12033 *
12034 * verifies that stray objects have been deleted
12035 */
12036 void PrimaryLogPG::check_local()
12037 {
12038 dout(10) << __func__ << dendl;
12039
12040 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12041
12042 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12043 return;
12044
12045 // just scan the log.
12046 set<hobject_t> did;
12047 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12048 p != pg_log.get_log().log.rend();
12049 ++p) {
12050 if (did.count(p->soid))
12051 continue;
12052 did.insert(p->soid);
12053
12054 if (p->is_delete()) {
12055 dout(10) << " checking " << p->soid
12056 << " at " << p->version << dendl;
12057 struct stat st;
12058 int r = osd->store->stat(
12059 ch,
12060 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12061 &st);
12062 if (r != -ENOENT) {
12063 derr << __func__ << " " << p->soid << " exists, but should have been "
12064 << "deleted" << dendl;
12065 assert(0 == "erroneously present object");
12066 }
12067 } else {
12068 // ignore old(+missing) objects
12069 }
12070 }
12071 }
12072
12073
12074
12075 // ===========================
12076 // hit sets
12077
12078 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12079 {
12080 ostringstream ss;
12081 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12082 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12083 info.pgid.ps(), info.pgid.pool(),
12084 cct->_conf->osd_hit_set_namespace);
12085 dout(20) << __func__ << " " << hoid << dendl;
12086 return hoid;
12087 }
12088
12089 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12090 utime_t end,
12091 bool using_gmt)
12092 {
12093 ostringstream ss;
12094 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12095 if (using_gmt) {
12096 start.gmtime(ss) << "_";
12097 end.gmtime(ss);
12098 } else {
12099 start.localtime(ss) << "_";
12100 end.localtime(ss);
12101 }
12102 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12103 info.pgid.ps(), info.pgid.pool(),
12104 cct->_conf->osd_hit_set_namespace);
12105 dout(20) << __func__ << " " << hoid << dendl;
12106 return hoid;
12107 }
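
// Archive hit sets are addressed purely by name:
// "hit_set_<pgid>_archive_<start>_<end>", with the two stamps rendered in
// GMT or local time depending on the pool's use_gmt_hitset setting.  A
// simplified name builder using std::put_time in place of utime_t's
// formatting; the exact timestamp format below is illustrative only.
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>

static std::string hit_set_archive_name(const std::string &pgid,
                                        std::time_t start, std::time_t end,
                                        bool using_gmt)
{
  std::ostringstream ss;
  auto stamp = [&ss, using_gmt](std::time_t t) {
    std::tm tm = using_gmt ? *std::gmtime(&t) : *std::localtime(&t);
    ss << std::put_time(&tm, "%Y-%m-%d_%H:%M:%S");  // illustrative format
  };
  ss << "hit_set_" << pgid << "_archive_";
  stamp(start);
  ss << "_";
  stamp(end);
  return ss.str();
}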
12108
12109 void PrimaryLogPG::hit_set_clear()
12110 {
12111 dout(20) << __func__ << dendl;
12112 hit_set.reset();
12113 hit_set_start_stamp = utime_t();
12114 }
12115
12116 void PrimaryLogPG::hit_set_setup()
12117 {
12118 if (!is_active() ||
12119 !is_primary()) {
12120 hit_set_clear();
12121 return;
12122 }
12123
12124 if (is_active() && is_primary() &&
12125 (!pool.info.hit_set_count ||
12126 !pool.info.hit_set_period ||
12127 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12128 hit_set_clear();
12129
12130 // only the primary is allowed to remove all the hit set objects
12131 hit_set_remove_all();
12132 return;
12133 }
12134
12135 // FIXME: discard any previous data for now
12136 hit_set_create();
12137
12138 // include any writes we know about from the pg log. this doesn't
12139 // capture reads, but it is better than nothing!
12140 hit_set_apply_log();
12141 }
12142
12143 void PrimaryLogPG::hit_set_remove_all()
12144 {
12145 // If any archives are degraded we skip this
12146 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12147 p != info.hit_set.history.end();
12148 ++p) {
12149 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12150
12151 // Once we hit a degraded object just skip
12152 if (is_degraded_or_backfilling_object(aoid))
12153 return;
12154 if (scrubber.write_blocked_by_scrub(aoid))
12155 return;
12156 }
12157
12158 if (!info.hit_set.history.empty()) {
12159 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12160 assert(p != info.hit_set.history.rend());
12161 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12162 assert(!is_degraded_or_backfilling_object(oid));
12163 ObjectContextRef obc = get_object_context(oid, false);
12164 assert(obc);
12165
12166 OpContextUPtr ctx = simple_opc_create(obc);
12167 ctx->at_version = get_next_version();
12168 ctx->updated_hset_history = info.hit_set;
12169 utime_t now = ceph_clock_now();
12170 ctx->mtime = now;
12171 hit_set_trim(ctx, 0);
12172 simple_opc_submit(std::move(ctx));
12173 }
12174
12175 info.hit_set = pg_hit_set_history_t();
12176 if (agent_state) {
12177 agent_state->discard_hit_sets();
12178 }
12179 }
12180
12181 void PrimaryLogPG::hit_set_create()
12182 {
12183 utime_t now = ceph_clock_now();
12184 // make a copy of the params to modify
12185 HitSet::Params params(pool.info.hit_set_params);
12186
12187 dout(20) << __func__ << " " << params << dendl;
12188 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12189 BloomHitSet::Params *p =
12190 static_cast<BloomHitSet::Params*>(params.impl.get());
12191
12192 // convert false positive rate so it holds up across the full period
12193 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12194 if (p->get_fpp() <= 0.0)
12195 p->set_fpp(.01); // fpp cannot be zero!
12196
12197 // if we don't have a specified size, estimate the target size
12198 // based on the previous bin!
12199 if (p->target_size == 0 && hit_set) {
12200 utime_t dur = now - hit_set_start_stamp;
12201 unsigned unique = hit_set->approx_unique_insert_count();
12202 dout(20) << __func__ << " previous set had approx " << unique
12203 << " unique items over " << dur << " seconds" << dendl;
12204 p->target_size = (double)unique * (double)pool.info.hit_set_period
12205 / (double)dur;
12206 }
12207 if (p->target_size <
12208 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12209 p->target_size = cct->_conf->osd_hit_set_min_size;
12210
12211 if (p->target_size
12212 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12213 p->target_size = cct->_conf->osd_hit_set_max_size;
12214
12215 p->seed = now.sec();
12216
12217 dout(10) << __func__ << " target_size " << p->target_size
12218 << " fpp " << p->get_fpp() << dendl;
12219 }
12220 hit_set.reset(new HitSet(params));
12221 hit_set_start_stamp = now;
12222 }
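
// Two tuning rules in hit_set_create() deserve emphasis: the per-bin
// false positive rate is divided by hit_set_count, so the union of all
// bins stays near the configured rate, and the target size is
// extrapolated from the previous bin's observed insert rate over the
// configured period.  The arithmetic in isolation; both helpers are
// hypothetical.
#include <algorithm>
#include <cstdint>

static double per_bin_fpp(double pool_fpp, unsigned hit_set_count)
{
  double fpp = pool_fpp / hit_set_count;
  return fpp > 0.0 ? fpp : 0.01;  // fpp cannot be zero!
}

static uint64_t estimate_target_size(uint64_t unique_inserts,
                                     double prev_duration_secs,
                                     double period_secs,
                                     uint64_t min_size, uint64_t max_size)
{
  // scale the previous bin's unique-insert count to a full period
  uint64_t target = static_cast<uint64_t>(
    static_cast<double>(unique_inserts) * period_secs / prev_duration_secs);
  return std::max(min_size, std::min(max_size, target));
}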
12223
12224 /**
12225 * apply log entries to set
12226 *
12227 * this would only happen after peering, to at least capture writes
12228 * during an interval that was potentially lost.
12229 */
12230 bool PrimaryLogPG::hit_set_apply_log()
12231 {
12232 if (!hit_set)
12233 return false;
12234
12235 eversion_t to = info.last_update;
12236 eversion_t from = info.hit_set.current_last_update;
12237 if (to <= from) {
12238 dout(20) << __func__ << " no update" << dendl;
12239 return false;
12240 }
12241
12242 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12243 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12244 while (p != pg_log.get_log().log.rend() && p->version > to)
12245 ++p;
12246 while (p != pg_log.get_log().log.rend() && p->version > from) {
12247 hit_set->insert(p->soid);
12248 ++p;
12249 }
12250
12251 return true;
12252 }
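
// The replay above scans the log newest to oldest: first skip entries
// newer than `to`, then insert every object touched in (from, to].  The
// same window scan in stand-alone form, with (version, oid) pairs
// standing in for pg_log_entry_t; the helper is hypothetical.
#include <list>
#include <set>
#include <string>
#include <utility>

static void replay_window(const std::list<std::pair<int, std::string>> &log,
                          int from, int to,
                          std::set<std::string> &hit_set)
{
  auto p = log.rbegin();
  while (p != log.rend() && p->first > to)
    ++p;                        // skip entries newer than the window
  while (p != log.rend() && p->first > from) {
    hit_set.insert(p->second);  // record the object as recently written
    ++p;
  }
}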
12253
12254 void PrimaryLogPG::hit_set_persist()
12255 {
12256 dout(10) << __func__ << dendl;
12257 bufferlist bl;
12258 unsigned max = pool.info.hit_set_count;
12259
12260 utime_t now = ceph_clock_now();
12261 hobject_t oid;
12262
12263 // If any archives are degraded we skip this persist request
12264 // account for the additional entry being added below
12265 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12266 p != info.hit_set.history.end();
12267 ++p) {
12268 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12269
12270 // Once we hit a degraded object just skip further trim
12271 if (is_degraded_or_backfilling_object(aoid))
12272 return;
12273 if (scrubber.write_blocked_by_scrub(aoid))
12274 return;
12275 }
12276
12277 // If backfill is in progress and we could possibly overlap with the
12278 // hit_set_* objects, back off. Since these all have
12279 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12280 // look just at that. This is necessary because our transactions
12281 // may include a modify of the new hit_set *and* a delete of the
12282 // old one, and this may span the backfill boundary.
12283 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12284 p != backfill_targets.end();
12285 ++p) {
12286 assert(peer_info.count(*p));
12287 const pg_info_t& pi = peer_info[*p];
12288 if (pi.last_backfill == hobject_t() ||
12289 pi.last_backfill.get_hash() == info.pgid.ps()) {
12290 dout(10) << __func__ << " backfill target osd." << *p
12291 << " last_backfill has not progressed past pgid ps"
12292 << dendl;
12293 return;
12294 }
12295 }
12296
12297
12298 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12299 new_hset.begin = hit_set_start_stamp;
12300 new_hset.end = now;
12301 oid = get_hit_set_archive_object(
12302 new_hset.begin,
12303 new_hset.end,
12304 new_hset.using_gmt);
12305
12306 // If writes to the new archive object are blocked by scrub we skip this persist request
12307 if (scrubber.write_blocked_by_scrub(oid))
12308 return;
12309
12310 hit_set->seal();
12311 ::encode(*hit_set, bl);
12312 dout(20) << __func__ << " archive " << oid << dendl;
12313
12314 if (agent_state) {
12315 agent_state->add_hit_set(new_hset.begin, hit_set);
12316 uint32_t size = agent_state->hit_set_map.size();
12317 if (size >= pool.info.hit_set_count) {
12318 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12319 }
12320 hit_set_in_memory_trim(size);
12321 }
12322
12323 ObjectContextRef obc = get_object_context(oid, true);
12324 OpContextUPtr ctx = simple_opc_create(obc);
12325
12326 ctx->at_version = get_next_version();
12327 ctx->updated_hset_history = info.hit_set;
12328 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12329
12330 updated_hit_set_hist.current_last_update = info.last_update;
12331 new_hset.version = ctx->at_version;
12332
12333 updated_hit_set_hist.history.push_back(new_hset);
12334 hit_set_create();
12335
12336 // fabricate an object_info_t and SnapSet
12337 obc->obs.oi.version = ctx->at_version;
12338 obc->obs.oi.mtime = now;
12339 obc->obs.oi.size = bl.length();
12340 obc->obs.exists = true;
12341 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12342
12343 ctx->new_obs = obc->obs;
12344
12345 obc->ssc->snapset.head_exists = true;
12346 ctx->new_snapset = obc->ssc->snapset;
12347
12348 ctx->delta_stats.num_objects++;
12349 ctx->delta_stats.num_objects_hit_set_archive++;
12350 ctx->delta_stats.num_bytes += bl.length();
12351 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12352
12353 bufferlist bss;
12354 ::encode(ctx->new_snapset, bss);
12355 bufferlist boi(sizeof(ctx->new_obs.oi));
12356 ::encode(ctx->new_obs.oi, boi,
12357 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12358
12359 ctx->op_t->create(oid);
12360 if (bl.length()) {
12361 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12362 }
12363 map <string, bufferlist> attrs;
12364 attrs[OI_ATTR].claim(boi);
12365 attrs[SS_ATTR].claim(bss);
12366 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12367 ctx->log.push_back(
12368 pg_log_entry_t(
12369 pg_log_entry_t::MODIFY,
12370 oid,
12371 ctx->at_version,
12372 eversion_t(),
12373 0,
12374 osd_reqid_t(),
12375 ctx->mtime,
12376 0)
12377 );
12378
12379 hit_set_trim(ctx, max);
12380
12381 simple_opc_submit(std::move(ctx));
12382 }
12383
12384 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12385 {
12386 assert(ctx->updated_hset_history);
12387 pg_hit_set_history_t &updated_hit_set_hist =
12388 *(ctx->updated_hset_history);
12389 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12390 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12391 assert(p != updated_hit_set_hist.history.end());
12392 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12393
12394 assert(!is_degraded_or_backfilling_object(oid));
12395
12396 dout(20) << __func__ << " removing " << oid << dendl;
12397 ++ctx->at_version.version;
12398 ctx->log.push_back(
12399 pg_log_entry_t(pg_log_entry_t::DELETE,
12400 oid,
12401 ctx->at_version,
12402 p->version,
12403 0,
12404 osd_reqid_t(),
12405 ctx->mtime,
12406 0));
12407
12408 ctx->op_t->remove(oid);
12409 updated_hit_set_hist.history.pop_front();
12410
12411 ObjectContextRef obc = get_object_context(oid, false);
12412 assert(obc);
12413 --ctx->delta_stats.num_objects;
12414 --ctx->delta_stats.num_objects_hit_set_archive;
12415 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12416 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12417 }
12418 }
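
// hit_set_trim() is a bounded pop-front: while more than `max` archives
// remain in the history, the oldest entry is logged as a delete, removed
// in the transaction, and its stats subtracted.  The control shape in
// isolation, with (name, size) pairs standing in for pg_hit_set_info_t;
// the helper is hypothetical.
#include <cstdint>
#include <list>
#include <string>
#include <utility>

static void trim_history(std::list<std::pair<std::string, int64_t>> &history,
                         unsigned max_entries,
                         int64_t &num_objects, int64_t &num_bytes)
{
  for (auto num = history.size(); num > max_entries; --num) {
    // a real implementation also appends a DELETE log entry and removes
    // the on-disk archive object here
    num_bytes -= history.front().second;
    --num_objects;
    history.pop_front();
  }
}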
12419
12420 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12421 {
12422 while (agent_state->hit_set_map.size() > max_in_memory) {
12423 agent_state->remove_oldest_hit_set();
12424 }
12425 }
12426
12427
12428 // =======================================
12429 // cache agent
12430
12431 void PrimaryLogPG::agent_setup()
12432 {
12433 assert(is_locked());
12434 if (!is_active() ||
12435 !is_primary() ||
12436 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12437 pool.info.tier_of < 0 ||
12438 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12439 agent_clear();
12440 return;
12441 }
12442 if (!agent_state) {
12443 agent_state.reset(new TierAgentState);
12444
12445 // choose random starting position
12446 agent_state->position = hobject_t();
12447 agent_state->position.pool = info.pgid.pool();
12448 agent_state->position.set_hash(pool.info.get_random_pg_position(
12449 info.pgid.pgid,
12450 rand()));
12451 agent_state->start = agent_state->position;
12452
12453 dout(10) << __func__ << " allocated new state, position "
12454 << agent_state->position << dendl;
12455 } else {
12456 dout(10) << __func__ << " keeping existing state" << dendl;
12457 }
12458
12459 if (info.stats.stats_invalid) {
12460 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12461 }
12462
12463 agent_choose_mode();
12464 }
12465
12466 void PrimaryLogPG::agent_clear()
12467 {
12468 agent_stop();
12469 agent_state.reset(NULL);
12470 }
12471
12472 // Return false if no objects have been operated on since the start of the object hash space
12473 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12474 {
12475 lock();
12476 if (!agent_state) {
12477 dout(10) << __func__ << " no agent state, stopping" << dendl;
12478 unlock();
12479 return true;
12480 }
12481
12482 assert(!deleting);
12483
12484 if (agent_state->is_idle()) {
12485 dout(10) << __func__ << " idle, stopping" << dendl;
12486 unlock();
12487 return true;
12488 }
12489
12490 osd->logger->inc(l_osd_agent_wake);
12491
12492 dout(10) << __func__
12493 << " max " << start_max
12494 << ", flush " << agent_state->get_flush_mode_name()
12495 << ", evict " << agent_state->get_evict_mode_name()
12496 << ", pos " << agent_state->position
12497 << dendl;
12498 assert(is_primary());
12499 assert(is_active());
12500
12501 agent_load_hit_sets();
12502
12503 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12504 assert(base_pool);
12505
12506 int ls_min = 1;
12507 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12508
12509 // list some objects. this conveniently lists clones (oldest to
12510 // newest) before heads... the same order we want to flush in.
12511 //
12512 // NOTE: do not flush the Sequencer. we will assume that the
12513 // listing we get back is imprecise.
12514 vector<hobject_t> ls;
12515 hobject_t next;
12516 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12517 &ls, &next);
12518 assert(r >= 0);
12519 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12520 int started = 0;
12521 for (vector<hobject_t>::iterator p = ls.begin();
12522 p != ls.end();
12523 ++p) {
12524 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12525 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12526 osd->logger->inc(l_osd_agent_skip);
12527 continue;
12528 }
12529 if (is_degraded_or_backfilling_object(*p)) {
12530 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12531 osd->logger->inc(l_osd_agent_skip);
12532 continue;
12533 }
12534 if (is_missing_object(p->get_head())) {
12535 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12536 osd->logger->inc(l_osd_agent_skip);
12537 continue;
12538 }
12539 ObjectContextRef obc = get_object_context(*p, false, NULL);
12540 if (!obc) {
12541 // we didn't flush; we may miss something here.
12542 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12543 osd->logger->inc(l_osd_agent_skip);
12544 continue;
12545 }
12546 if (!obc->obs.exists) {
12547 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12548 osd->logger->inc(l_osd_agent_skip);
12549 continue;
12550 }
12551 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12552 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12553 osd->logger->inc(l_osd_agent_skip);
12554 continue;
12555 }
12556 if (obc->is_blocked()) {
12557 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12558 osd->logger->inc(l_osd_agent_skip);
12559 continue;
12560 }
12561 if (obc->is_request_pending()) {
12562 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12563 osd->logger->inc(l_osd_agent_skip);
12564 continue;
12565 }
12566
12567 // be careful flushing omap to an EC pool.
12568 if (!base_pool->supports_omap() &&
12569 obc->obs.oi.is_omap()) {
12570 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12571 osd->logger->inc(l_osd_agent_skip);
12572 continue;
12573 }
12574
12575 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12576 agent_maybe_evict(obc, false))
12577 ++started;
12578 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12579 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12580 ++started;
12581 --agent_flush_quota;
12582 }
12583 if (started >= start_max) {
12584 // If finishing early, set "next" to the next object
12585 if (++p != ls.end())
12586 next = *p;
12587 break;
12588 }
12589 }
12590
12591 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12592 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12593 agent_state->hist_age = 0;
12594 agent_state->temp_hist.decay();
12595 }
12596
12597 // Total objects operated on so far
12598 int total_started = agent_state->started + started;
12599 bool need_delay = false;
12600
12601 dout(20) << __func__ << " start pos " << agent_state->position
12602 << " next start pos " << next
12603 << " started " << total_started << dendl;
12604
12605 // See if we've made a full pass over the object hash space
12606 // This might check at most ls_max objects a second time to notice that
12607 // we've checked every object at least once.
12608 if (agent_state->position < agent_state->start &&
12609 next >= agent_state->start) {
12610 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
12611 if (total_started == 0)
12612 need_delay = true;
12613 else
12614 total_started = 0;
12615 agent_state->start = next;
12616 }
12617 agent_state->started = total_started;
12618
12619 // See if we are starting from the beginning
12620 if (next.is_max())
12621 agent_state->position = hobject_t();
12622 else
12623 agent_state->position = next;
12624
12625 // Discard old in memory HitSets
12626 hit_set_in_memory_trim(pool.info.hit_set_count);
12627
12628 if (need_delay) {
12629 assert(agent_state->delaying == false);
12630 agent_delay();
12631 unlock();
12632 return false;
12633 }
12634 agent_choose_mode();
12635 unlock();
12636 return true;
12637 }
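// A worked example of the wrap-around test above (illustrative hash
// values, not from a real cluster): suppose the agent began its pass at
// start = 0x8000:0000.  Listing climbs through the hash space, wraps
// past the end back to hobject_t(), and eventually one call covers
// [position, next) with position = 0x7f00:0000 and next = 0x8100:0000.
// Then
//
//   agent_state->position < agent_state->start   // true
//   next >= agent_state->start                   // true
//
// so the pass has crossed its starting point: every object has been
// visited at least once.  If nothing was started over the whole pass
// (total_started == 0), the agent delays instead of spinning on an
// already-clean PG.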
12638
12639 void PrimaryLogPG::agent_load_hit_sets()
12640 {
12641 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
12642 return;
12643 }
12644
12645 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
12646 dout(10) << __func__ << dendl;
12647 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12648 p != info.hit_set.history.end(); ++p) {
12649 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
12650 dout(10) << __func__ << " loading " << p->begin << "-"
12651 << p->end << dendl;
12652 if (!pool.info.is_replicated()) {
12653 // FIXME: EC not supported here yet
12654 derr << __func__ << " on non-replicated pool" << dendl;
12655 break;
12656 }
12657
12658 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12659 if (is_unreadable_object(oid)) {
12660 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
12661 break;
12662 }
12663
12664 ObjectContextRef obc = get_object_context(oid, false);
12665 if (!obc) {
12666 derr << __func__ << ": could not load hitset " << oid << dendl;
12667 break;
12668 }
12669
12670 bufferlist bl;
12671 {
12672 obc->ondisk_read_lock();
12673 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
12674 assert(r >= 0);
12675 obc->ondisk_read_unlock();
12676 }
12677 HitSetRef hs(new HitSet);
12678 bufferlist::iterator pbl = bl.begin();
12679 ::decode(*hs, pbl);
12680 agent_state->add_hit_set(p->begin.sec(), hs);
12681 }
12682 }
12683 }
12684 }
12685
12686 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
12687 {
12688 if (!obc->obs.oi.is_dirty()) {
12689 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
12690 osd->logger->inc(l_osd_agent_skip);
12691 return false;
12692 }
12693 if (obc->obs.oi.is_cache_pinned()) {
12694 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12695 osd->logger->inc(l_osd_agent_skip);
12696 return false;
12697 }
12698
12699 utime_t now = ceph_clock_now();
12700 utime_t ob_local_mtime;
12701 if (obc->obs.oi.local_mtime != utime_t()) {
12702 ob_local_mtime = obc->obs.oi.local_mtime;
12703 } else {
12704 ob_local_mtime = obc->obs.oi.mtime;
12705 }
12706 bool evict_mode_full =
12707 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
12708 if (!evict_mode_full &&
12709 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
12710 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
12711 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12712 osd->logger->inc(l_osd_agent_skip);
12713 return false;
12714 }
12715
12716 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
12717 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
12718 osd->logger->inc(l_osd_agent_skip);
12719 return false;
12720 }
12721
12722 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
12723
12724 // FIXME: flush anything dirty, regardless of what distribution of
12725 // ages we expect.
12726
12727 hobject_t oid = obc->obs.oi.soid;
12728 osd->agent_start_op(oid);
12729 // no need to capture a pg ref, can't outlive fop or ctx
12730 std::function<void()> on_flush = [this, oid]() {
12731 osd->agent_finish_op(oid);
12732 };
12733
12734 int result = start_flush(
12735 OpRequestRef(), obc, false, NULL,
12736 on_flush);
12737 if (result != -EINPROGRESS) {
12738 on_flush();
12739 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
12740 << " with " << result << dendl;
12741 osd->logger->inc(l_osd_agent_skip);
12742 return false;
12743 }
12744
12745 osd->logger->inc(l_osd_agent_flush);
12746 return true;
12747 }
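// Worked example of the "too young" test above (illustrative values):
// with cache_min_flush_age = 600 and an object whose local_mtime is
// 09:55:00, a pass at now = 10:00:00 sees
//
//   ob_local_mtime + utime_t(600, 0) = 10:05:00 > now
//
// and skips the object; a pass after 10:05:00 would flush it.  The test
// is deliberately bypassed for snaps (immutable, so there are no writes
// to coalesce) and in EVICT_MODE_FULL, where freeing space takes
// priority over write coalescing.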
12748
12749 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
12750 {
12751 const hobject_t& soid = obc->obs.oi.soid;
12752 if (!after_flush && obc->obs.oi.is_dirty()) {
12753 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
12754 return false;
12755 }
12756 if (!obc->obs.oi.watchers.empty()) {
12757 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
12758 return false;
12759 }
12760 if (obc->is_blocked()) {
12761 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12762 return false;
12763 }
12764 if (obc->obs.oi.is_cache_pinned()) {
12765 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12766 return false;
12767 }
12768
12769 if (soid.snap == CEPH_NOSNAP) {
12770 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
12771 if (result < 0) {
12772 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
12773 return false;
12774 }
12775 }
12776
12777 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
12778 // is this object older than cache_min_evict_age?
12779 utime_t now = ceph_clock_now();
12780 utime_t ob_local_mtime;
12781 if (obc->obs.oi.local_mtime != utime_t()) {
12782 ob_local_mtime = obc->obs.oi.local_mtime;
12783 } else {
12784 ob_local_mtime = obc->obs.oi.mtime;
12785 }
12786 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
12787 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12788 osd->logger->inc(l_osd_agent_skip);
12789 return false;
12790 }
12791 // is this object old and/or cold enough?
12792 int temp = 0;
12793 uint64_t temp_upper = 0, temp_lower = 0;
12794 if (hit_set)
12795 agent_estimate_temp(soid, &temp);
12796 agent_state->temp_hist.add(temp);
12797 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
12798
12799 dout(20) << __func__
12800 << " temp " << temp
12801 << " pos " << temp_lower << "-" << temp_upper
12802 << ", evict_effort " << agent_state->evict_effort
12803 << dendl;
12804 dout(30) << "agent_state:\n";
12805 Formatter *f = Formatter::create("");
12806 f->open_object_section("agent_state");
12807 agent_state->dump(f);
12808 f->close_section();
12809 f->flush(*_dout);
12810 delete f;
12811 *_dout << dendl;
12812
12813 if (1000000 - temp_upper >= agent_state->evict_effort)
12814 return false;
12815 }
12816
12817 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
12818 OpContextUPtr ctx = simple_opc_create(obc);
12819
12820 if (!ctx->lock_manager.get_lock_type(
12821 ObjectContext::RWState::RWWRITE,
12822 obc->obs.oi.soid,
12823 obc,
12824 OpRequestRef())) {
12825 close_op_ctx(ctx.release());
12826 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
12827 return false;
12828 }
12829
12830 osd->agent_start_evict_op();
12831 ctx->register_on_finish(
12832 [this]() {
12833 osd->agent_finish_evict_op();
12834 });
12835
12836 ctx->at_version = get_next_version();
12837 assert(ctx->new_obs.exists);
12838 int r = _delete_oid(ctx.get(), true, false);
12839 if (obc->obs.oi.is_omap())
12840 ctx->delta_stats.num_objects_omap--;
12841 ctx->delta_stats.num_evict++;
12842 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
12843 if (obc->obs.oi.is_dirty())
12844 --ctx->delta_stats.num_objects_dirty;
12845 assert(r == 0);
12846 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
12847 simple_opc_submit(std::move(ctx));
12848 osd->logger->inc(l_osd_tier_evict);
12849 osd->logger->inc(l_osd_agent_evict);
12850 return true;
12851 }
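// Worked example of the effort test above (illustrative numbers): an
// object whose temperature lands at temp_upper = 850000 in the histogram
// leaves 1000000 - 850000 = 150000 micro above it, so it survives while
// evict_effort <= 150000 and becomes eligible once evict_effort exceeds
// 150000 (15%).  Raising evict_effort therefore widens the band of
// temperatures the agent will evict, all the way to EVICT_MODE_FULL,
// where any clean, unpinned, unwatched object may go.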
12852
12853 void PrimaryLogPG::agent_stop()
12854 {
12855 dout(20) << __func__ << dendl;
12856 if (agent_state && !agent_state->is_idle()) {
12857 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
12858 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12859 osd->agent_disable_pg(this, agent_state->evict_effort);
12860 }
12861 }
12862
12863 void PrimaryLogPG::agent_delay()
12864 {
12865 dout(20) << __func__ << dendl;
12866 if (agent_state && !agent_state->is_idle()) {
12867 assert(agent_state->delaying == false);
12868 agent_state->delaying = true;
12869 osd->agent_disable_pg(this, agent_state->evict_effort);
12870 }
12871 }
12872
12873 void PrimaryLogPG::agent_choose_mode_restart()
12874 {
12875 dout(20) << __func__ << dendl;
12876 lock();
12877 if (agent_state && agent_state->delaying) {
12878 agent_state->delaying = false;
12879 agent_choose_mode(true);
12880 }
12881 unlock();
12882 }
12883
12884 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
12885 {
12886 bool requeued = false;
12887 // Let delay play out
12888 if (agent_state->delaying) {
12889 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
12890 return requeued;
12891 }
12892
12893 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12894 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
12895 unsigned evict_effort = 0;
12896
12897 if (info.stats.stats_invalid) {
12898 // idle; stats can't be trusted until we scrub.
12899 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
12900 goto skip_calc;
12901 }
12902
12903 {
12904 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
12905 assert(divisor > 0);
12906
12907 // adjust (effective) user objects down based on the number
12908 // of HitSet objects, which should not count toward our total since
12909 // they cannot be flushed.
12910 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
12911
12912 // also exclude omap objects if ec backing pool
12913 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12914 assert(base_pool);
12915 if (!base_pool->supports_omap())
12916 unflushable += info.stats.stats.sum.num_objects_omap;
12917
12918 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
12919 if (num_user_objects > unflushable)
12920 num_user_objects -= unflushable;
12921 else
12922 num_user_objects = 0;
12923
12924 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
12925 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
12926 num_user_bytes -= unflushable_bytes;
12927 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
12928 num_user_bytes += num_overhead_bytes;
12929
12930 // also reduce the num_dirty by num_objects_omap
12931 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
12932 if (!base_pool->supports_omap()) {
12933 if (num_dirty > info.stats.stats.sum.num_objects_omap)
12934 num_dirty -= info.stats.stats.sum.num_objects_omap;
12935 else
12936 num_dirty = 0;
12937 }
12938
12939 dout(10) << __func__
12940 << " flush_mode: "
12941 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
12942 << " evict_mode: "
12943 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
12944 << " num_objects: " << info.stats.stats.sum.num_objects
12945 << " num_bytes: " << info.stats.stats.sum.num_bytes
12946 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
12947 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
12948 << " num_dirty: " << num_dirty
12949 << " num_user_objects: " << num_user_objects
12950 << " num_user_bytes: " << num_user_bytes
12951 << " num_overhead_bytes: " << num_overhead_bytes
12952 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
12953 << " pool.info.target_max_objects: " << pool.info.target_max_objects
12954 << dendl;
12955
12956 // get dirty, full ratios
12957 uint64_t dirty_micro = 0;
12958 uint64_t full_micro = 0;
12959 if (pool.info.target_max_bytes && num_user_objects > 0) {
12960 uint64_t avg_size = num_user_bytes / num_user_objects;
12961 dirty_micro =
12962 num_dirty * avg_size * 1000000 /
12963 MAX(pool.info.target_max_bytes / divisor, 1);
12964 full_micro =
12965 num_user_objects * avg_size * 1000000 /
12966 MAX(pool.info.target_max_bytes / divisor, 1);
12967 }
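// Worked example with illustrative numbers (not from a real pool):
// target_max_bytes = 1 TiB over divisor = 128 PGs gives each PG an
// 8 GiB share.  If this PG holds num_user_objects = 20000 at
// avg_size = 256 KiB with num_dirty = 8000, then
//
//   dirty_micro = 8000 * 262144 * 1000000 / 8589934592  ~= 244140  (24.4%)
//   full_micro  = 20000 * 262144 * 1000000 / 8589934592 ~= 610351  (61.0%)
//
// The object-count targets below are computed the same way, and the
// larger of the byte-based and object-based ratios wins.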
12968 if (pool.info.target_max_objects > 0) {
12969 uint64_t dirty_objects_micro =
12970 num_dirty * 1000000 /
12971 MAX(pool.info.target_max_objects / divisor, 1);
12972 if (dirty_objects_micro > dirty_micro)
12973 dirty_micro = dirty_objects_micro;
12974 uint64_t full_objects_micro =
12975 num_user_objects * 1000000 /
12976 MAX(pool.info.target_max_objects / divisor, 1);
12977 if (full_objects_micro > full_micro)
12978 full_micro = full_objects_micro;
12979 }
12980 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
12981 << " full " << ((float)full_micro / 1000000.0)
12982 << dendl;
12983
12984 // flush mode
12985 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
12986 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
12987 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
12988 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
12989 flush_target += flush_slop;
12990 flush_high_target += flush_slop;
12991 } else {
12992 flush_target -= MIN(flush_target, flush_slop);
12993 flush_high_target -= MIN(flush_high_target, flush_slop);
12994 }
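// The +/- slop above is hysteresis.  Illustrative numbers: with
// flush_target = 400000 (40%) and osd_agent_slop = 0.02,
// flush_slop = 8000.  An idle agent only starts flushing once
// dirty_micro exceeds 408000, and an active one keeps flushing until it
// falls to 392000, so small oscillations around 40% do not flap the
// flush mode on and off.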
12995
12996 if (dirty_micro > flush_high_target) {
12997 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
12998 } else if (dirty_micro > flush_target) {
12999 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13000 }
13001
13002 // evict mode
13003 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13004 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13005 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13006 evict_target += evict_slop;
13007 else
13008 evict_target -= MIN(evict_target, evict_slop);
13009
13010 if (full_micro > 1000000) {
13011 // evict anything clean
13012 evict_mode = TierAgentState::EVICT_MODE_FULL;
13013 evict_effort = 1000000;
13014 } else if (full_micro > evict_target) {
13015 // set effort in [0..1] range based on where we are between the evict target and 100% full
13016 evict_mode = TierAgentState::EVICT_MODE_SOME;
13017 uint64_t over = full_micro - evict_target;
13018 uint64_t span = 1000000 - evict_target;
13019 evict_effort = MAX(over * 1000000 / span,
13020 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13021
13022 // quantize effort to avoid too much reordering in the agent_queue.
13023 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13024 assert(inc > 0);
13025 uint64_t was = evict_effort;
13026 evict_effort -= evict_effort % inc;
13027 if (evict_effort < inc)
13028 evict_effort = inc;
13029 assert(evict_effort >= inc && evict_effort <= 1000000);
13030 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13031 }
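// Quantization example (illustrative config): with
// osd_agent_quantize_effort = 0.1, inc = 100000, so a raw effort of
// 437000 rounds down to 400000 and anything under 100000 clamps up to
// 100000.  Coarse steps keep small load changes from reshuffling this
// PG's position in the agent_queue every pass.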
13032 }
13033
13034 skip_calc:
13035 bool old_idle = agent_state->is_idle();
13036 if (flush_mode != agent_state->flush_mode) {
13037 dout(5) << __func__ << " flush_mode "
13038 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13039 << " -> "
13040 << TierAgentState::get_flush_mode_name(flush_mode)
13041 << dendl;
13042 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13043 osd->agent_inc_high_count();
13044 info.stats.stats.sum.num_flush_mode_high = 1;
13045 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13046 info.stats.stats.sum.num_flush_mode_low = 1;
13047 }
13048 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13049 osd->agent_dec_high_count();
13050 info.stats.stats.sum.num_flush_mode_high = 0;
13051 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13052 info.stats.stats.sum.num_flush_mode_low = 0;
13053 }
13054 agent_state->flush_mode = flush_mode;
13055 }
13056 if (evict_mode != agent_state->evict_mode) {
13057 dout(5) << __func__ << " evict_mode "
13058 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13059 << " -> "
13060 << TierAgentState::get_evict_mode_name(evict_mode)
13061 << dendl;
13062 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13063 is_active()) {
13064 if (op)
13065 requeue_op(op);
13066 requeue_ops(waiting_for_active);
13067 requeue_ops(waiting_for_scrub);
13068 requeue_ops(waiting_for_cache_not_full);
13069 objects_blocked_on_cache_full.clear();
13070 requeued = true;
13071 }
13072 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13073 info.stats.stats.sum.num_evict_mode_some = 1;
13074 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13075 info.stats.stats.sum.num_evict_mode_full = 1;
13076 }
13077 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13078 info.stats.stats.sum.num_evict_mode_some = 0;
13079 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13080 info.stats.stats.sum.num_evict_mode_full = 0;
13081 }
13082 agent_state->evict_mode = evict_mode;
13083 }
13084 uint64_t old_effort = agent_state->evict_effort;
13085 if (evict_effort != agent_state->evict_effort) {
13086 dout(5) << __func__ << " evict_effort "
13087 << ((float)agent_state->evict_effort / 1000000.0)
13088 << " -> "
13089 << ((float)evict_effort / 1000000.0)
13090 << dendl;
13091 agent_state->evict_effort = evict_effort;
13092 }
13093
13094 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13095 // (including flush). This is probably fine (they should be
13096 // correlated) but it is not precisely correct.
13097 if (agent_state->is_idle()) {
13098 if (!restart && !old_idle) {
13099 osd->agent_disable_pg(this, old_effort);
13100 }
13101 } else {
13102 if (restart || old_idle) {
13103 osd->agent_enable_pg(this, agent_state->evict_effort);
13104 } else if (old_effort != agent_state->evict_effort) {
13105 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13106 }
13107 }
13108 return requeued;
13109 }
13110
13111 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13112 {
13113 assert(hit_set);
13114 assert(temp);
13115 *temp = 0;
13116 if (hit_set->contains(oid))
13117 *temp = 1000000;
13118 unsigned i = 0;
13119 int last_n = pool.info.hit_set_search_last_n;
13120 for (map<time_t,HitSetRef>::reverse_iterator p =
13121 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13122 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13123 if (p->second->contains(oid)) {
13124 *temp += pool.info.get_grade(i);
13125 --last_n;
13126 }
13127 }
13128 }
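// Worked example (grades are pool-dependent; the values below are
// assumptions): an object in the current hit set starts at
// temp = 1000000; if it also appears in the two newest archived hit sets
// with get_grade(0) = 500000 and get_grade(1) = 250000, it ends at
// temp = 1750000.  An object seen only in older archives collects only
// the smaller grades, and one seen nowhere stays at 0, landing at the
// cold end of the histogram consulted by agent_maybe_evict().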
13129
13130 // Dup op detection
13131
13132 bool PrimaryLogPG::already_complete(eversion_t v)
13133 {
13134 dout(20) << __func__ << ": " << v << dendl;
13135 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13136 !i.end();
13137 ++i) {
13138 dout(20) << __func__ << ": " << **i << dendl;
13139 // skip copy from temp object ops
13140 if ((*i)->v == eversion_t()) {
13141 dout(20) << __func__ << ": " << **i
13142 << " version is empty" << dendl;
13143 continue;
13144 }
13145 if ((*i)->v > v) {
13146 dout(20) << __func__ << ": " << **i
13147 << " (*i)->v past v" << dendl;
13148 break;
13149 }
13150 if (!(*i)->all_committed) {
13151 dout(20) << __func__ << ": " << **i
13152 << " not committed, returning false"
13153 << dendl;
13154 return false;
13155 }
13156 }
13157 dout(20) << __func__ << ": returning true" << dendl;
13158 return true;
13159 }
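// Illustrative trace: with repop_queue holding versions 5'10, 5'11,
// 5'12 in submission order,
//
//   already_complete(5'11)
//
// examines 5'10 and 5'11 and returns true only if both have
// all_committed set; the loop breaks at 5'12 (past v) without looking
// at it.  Temp-object copies carry an empty eversion_t and are skipped.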
13160
13161 bool PrimaryLogPG::already_ack(eversion_t v)
13162 {
13163 dout(20) << __func__ << ": " << v << dendl;
13164 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13165 !i.end();
13166 ++i) {
13167 // skip copy from temp object ops
13168 if ((*i)->v == eversion_t()) {
13169 dout(20) << __func__ << ": " << **i
13170 << " version is empty" << dendl;
13171 continue;
13172 }
13173 if ((*i)->v > v) {
13174 dout(20) << __func__ << ": " << **i
13175 << " (*i)->v past v" << dendl;
13176 break;
13177 }
13178 if (!(*i)->all_applied) {
13179 dout(20) << __func__ << ": " << **i
13180 << " not applied, returning false"
13181 << dendl;
13182 return false;
13183 }
13184 }
13185 dout(20) << __func__ << ": returning true" << dendl;
13186 return true;
13187 }
13188
13189
13190 // ==========================================================================================
13191 // SCRUB
13192
13193
13194 bool PrimaryLogPG::_range_available_for_scrub(
13195 const hobject_t &begin, const hobject_t &end)
13196 {
13197 pair<hobject_t, ObjectContextRef> next;
13198 next.second = object_contexts.lookup(begin);
13199 next.first = begin;
13200 bool more = true;
13201 while (more && next.first < end) {
13202 if (next.second && next.second->is_blocked()) {
13203 next.second->requeue_scrub_on_unblock = true;
13204 dout(10) << __func__ << ": scrub delayed, "
13205 << next.first << " is blocked"
13206 << dendl;
13207 return false;
13208 }
13209 more = object_contexts.get_next(next.first, &next);
13210 }
13211 return true;
13212 }
13213
13214 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13215 const vector<snapid_t>::reverse_iterator &curclone) {
13216 return snapset && curclone != snapset.get().clones.rend();
13217 }
13218
13219 void PrimaryLogPG::log_missing(unsigned missing,
13220 const boost::optional<hobject_t> &head,
13221 LogChannelRef clog,
13222 const spg_t &pgid,
13223 const char *func,
13224 const char *mode,
13225 bool allow_incomplete_clones)
13226 {
13227 assert(head);
13228 if (allow_incomplete_clones) {
13229 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13230 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13231 } else {
13232 clog->info() << mode << " " << pgid << " " << head.get()
13233 << " " << missing << " missing clone(s)";
13234 }
13235 }
13236
13237 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13238 const boost::optional<SnapSet> &snapset,
13239 LogChannelRef clog,
13240 const spg_t &pgid,
13241 const char *mode,
13242 bool allow_incomplete_clones,
13243 boost::optional<snapid_t> target,
13244 vector<snapid_t>::reverse_iterator *curclone,
13245 inconsistent_snapset_wrapper &e)
13246 {
13247 assert(head);
13248 assert(snapset);
13249 unsigned missing = 0;
13250
13251 // NOTE: clones are in descending order, hence the **curclone > target test here
13252 hobject_t next_clone(head.get());
13253 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13254 ++missing;
13255 // it is okay to be missing one or more clones in a cache tier.
13256 // skip higher-numbered clones in the list.
13257 if (!allow_incomplete_clones) {
13258 next_clone.snap = **curclone;
13259 clog->error() << mode << " " << pgid << " " << head.get()
13260 << " expected clone " << next_clone;
13261 ++scrubber.shallow_errors;
13262 e.set_clone_missing(next_clone.snap);
13263 }
13264 // Clones are descending
13265 ++(*curclone);
13266 }
13267 return missing;
13268 }
13269
13270 /*
13271 * Validate consistency of the object info and snap sets.
13272 *
13273 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13274 * the comparison of the objects is against multiple snapset.clones. There are
13275 * multiple clone lists and in between lists we expect head or snapdir.
13276 *
13277 * Example
13278 *
13279 * objects expected
13280 * ======= =======
13281 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13282 * obj2 head head/snapdir, head ok
13283 * [SnapSet clones 6 4 2 1]
13284 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13285 * obj2 snap 6 obj2 snap 6, match
13286 * obj2 snap 4 obj2 snap 4, match
13287 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13288 * [Snapset clones 3 1]
13289 * obj3 snap 3 obj3 snap 3 match
13290 * obj3 snap 1 obj3 snap 1 match
13291 * obj4 snapdir head/snapdir, snapdir ok
13292 * [Snapset clones 4]
13293 * EOL obj4 snap 4, (expected)
13294 */
13295 void PrimaryLogPG::scrub_snapshot_metadata(
13296 ScrubMap &scrubmap,
13297 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13298 {
13299 dout(10) << __func__ << dendl;
13300
13301 coll_t c(info.pgid);
13302 bool repair = state_test(PG_STATE_REPAIR);
13303 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13304 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13305 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13306
13307 /// snapsets to repair
13308 map<hobject_t,SnapSet> snapset_to_repair;
13309
13310 // traverse in reverse order.
13311 boost::optional<hobject_t> head;
13312 boost::optional<SnapSet> snapset; // If this is initialized, head (above) will be too
13313 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13314 unsigned missing = 0;
13315 inconsistent_snapset_wrapper soid_error, head_error;
13316
13317 bufferlist last_data;
13318
13319 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13320 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13321 const hobject_t& soid = p->first;
13322 soid_error = inconsistent_snapset_wrapper{soid};
13323 object_stat_sum_t stat;
13324 boost::optional<object_info_t> oi;
13325
13326 if (!soid.is_snapdir())
13327 stat.num_objects++;
13328
13329 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13330 stat.num_objects_hit_set_archive++;
13331
13332 if (soid.is_snap()) {
13333 // it's a clone
13334 stat.num_object_clones++;
13335 }
13336
13337 // basic checks.
13338 if (p->second.attrs.count(OI_ATTR) == 0) {
13339 oi = boost::none;
13340 osd->clog->error() << mode << " " << info.pgid << " " << soid
13341 << " no '" << OI_ATTR << "' attr";
13342 ++scrubber.shallow_errors;
13343 soid_error.set_oi_attr_missing();
13344 } else {
13345 bufferlist bv;
13346 bv.push_back(p->second.attrs[OI_ATTR]);
13347 try {
13348 oi = object_info_t(); // Initialize optional<> before decode into it
13349 oi.get().decode(bv);
13350 } catch (buffer::error& e) {
13351 oi = boost::none;
13352 osd->clog->error() << mode << " " << info.pgid << " " << soid
13353 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13354 ++scrubber.shallow_errors;
13355 soid_error.set_oi_attr_corrupted();
13356 soid_error.set_oi_attr_missing(); // Not available too
13357 }
13358 }
13359
13360 if (oi) {
13361 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13362 osd->clog->error() << mode << " " << info.pgid << " " << soid
13363 << " on disk size (" << p->second.size
13364 << ") does not match object info size ("
13365 << oi->size << ") adjusted for ondisk to ("
13366 << pgbackend->be_get_ondisk_size(oi->size)
13367 << ")";
13368 soid_error.set_size_mismatch();
13369 ++scrubber.shallow_errors;
13370 }
13371
13372 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13373
13374 // A clone num_bytes will be added later when we have snapset
13375 if (!soid.is_snap()) {
13376 stat.num_bytes += oi->size;
13377 }
13378 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13379 stat.num_bytes_hit_set_archive += oi->size;
13380
13381 if (!soid.is_snapdir()) {
13382 if (oi->is_dirty())
13383 ++stat.num_objects_dirty;
13384 if (oi->is_whiteout())
13385 ++stat.num_whiteouts;
13386 if (oi->is_omap())
13387 ++stat.num_objects_omap;
13388 if (oi->is_cache_pinned())
13389 ++stat.num_objects_pinned;
13390 }
13391 } else {
13392 // pessimistic assumption that this object might contain a
13393 // legacy SnapSet
13394 stat.num_legacy_snapsets++;
13395 }
13396
13397 // Check for any problems while processing clones
13398 if (doing_clones(snapset, curclone)) {
13399 boost::optional<snapid_t> target;
13400 // Expecting an object with snap for current head
13401 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13402
13403 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13404 << soid << " while processing " << head.get() << dendl;
13405
13406 target = all_clones;
13407 } else {
13408 assert(soid.is_snap());
13409 target = soid.snap;
13410 }
13411
13412 // Log any clones we were expecting to be there up to target
13413 // This will set missing, but will be a no-op if soid.snap == *curclone.
13414 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13415 pool.info.allow_incomplete_clones(), target, &curclone,
13416 head_error);
13417 }
13418 bool expected;
13419 // Check doing_clones() again in case we ran process_clones_to()
13420 if (doing_clones(snapset, curclone)) {
13421 // A head/snapdir would have processed all clones above
13422 // or all greater than *curclone.
13423 assert(soid.is_snap() && *curclone <= soid.snap);
13424
13425 // After processing above clone snap should match the expected curclone
13426 expected = (*curclone == soid.snap);
13427 } else {
13428 // If we aren't doing clones any longer, then expecting head/snapdir
13429 expected = soid.has_snapset();
13430 }
13431 if (!expected) {
13432 // If we couldn't read the head's snapset, just ignore clones
13433 if (head && !snapset) {
13434 osd->clog->error() << mode << " " << info.pgid << " " << soid
13435 << " clone ignored due to missing snapset";
13436 } else {
13437 osd->clog->error() << mode << " " << info.pgid << " " << soid
13438 << " is an unexpected clone";
13439 }
13440 ++scrubber.shallow_errors;
13441 soid_error.set_headless();
13442 scrubber.store->add_snap_error(pool.id, soid_error);
13443 if (head && soid.get_head() == head->get_head())
13444 head_error.set_clone(soid.snap);
13445 continue;
13446 }
13447
13448 // new snapset?
13449 if (soid.has_snapset()) {
13450
13451 if (missing) {
13452 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13453 pool.info.allow_incomplete_clones());
13454 }
13455
13456 // Save previous head error information
13457 if (head && head_error.errors)
13458 scrubber.store->add_snap_error(pool.id, head_error);
13459 // Set this as a new head object
13460 head = soid;
13461 missing = 0;
13462 head_error = soid_error;
13463
13464 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13465
13466 if (p->second.attrs.count(SS_ATTR) == 0) {
13467 osd->clog->error() << mode << " " << info.pgid << " " << soid
13468 << " no '" << SS_ATTR << "' attr";
13469 ++scrubber.shallow_errors;
13470 snapset = boost::none;
13471 head_error.set_ss_attr_missing();
13472 } else {
13473 bufferlist bl;
13474 bl.push_back(p->second.attrs[SS_ATTR]);
13475 bufferlist::iterator blp = bl.begin();
13476 try {
13477 snapset = SnapSet(); // Initialize optional<> before decoding into it
13478 ::decode(snapset.get(), blp);
13479 } catch (buffer::error& e) {
13480 snapset = boost::none;
13481 osd->clog->error() << mode << " " << info.pgid << " " << soid
13482 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13483 ++scrubber.shallow_errors;
13484 head_error.set_ss_attr_corrupted();
13485 }
13486 }
13487
13488 if (snapset) {
13489 // what will be next?
13490 curclone = snapset->clones.rbegin();
13491
13492 if (!snapset->clones.empty()) {
13493 dout(20) << " snapset " << snapset.get() << dendl;
13494 if (snapset->seq == 0) {
13495 osd->clog->error() << mode << " " << info.pgid << " " << soid
13496 << " snaps.seq not set";
13497 ++scrubber.shallow_errors;
13498 head_error.set_snapset_mismatch();
13499 }
13500 }
13501
13502 if (soid.is_head() && !snapset->head_exists) {
13503 osd->clog->error() << mode << " " << info.pgid << " " << soid
13504 << " snapset.head_exists=false, but head exists";
13505 ++scrubber.shallow_errors;
13506 head_error.set_head_mismatch();
13507 }
13508 if (soid.is_snapdir() && snapset->head_exists) {
13509 osd->clog->error() << mode << " " << info.pgid << " " << soid
13510 << " snapset.head_exists=true, but snapdir exists";
13511 ++scrubber.shallow_errors;
13512 head_error.set_head_mismatch();
13513 }
13514
13515 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13516 if (soid.is_snapdir()) {
13517 dout(10) << " will move snapset to head from " << soid << dendl;
13518 snapset_to_repair[soid.get_head()] = *snapset;
13519 } else if (snapset->is_legacy()) {
13520 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13521 << dendl;
13522 snapset_to_repair[soid.get_head()] = *snapset;
13523 }
13524 } else {
13525 stat.num_legacy_snapsets++;
13526 }
13527 } else {
13528 // pessimistic assumption that this object might contain a
13529 // legacy SnapSet
13530 stat.num_legacy_snapsets++;
13531 }
13532 } else {
13533 assert(soid.is_snap());
13534 assert(head);
13535 assert(snapset);
13536 assert(soid.snap == *curclone);
13537
13538 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13539
13540 if (snapset->clone_size.count(soid.snap) == 0) {
13541 osd->clog->error() << mode << " " << info.pgid << " " << soid
13542 << " is missing in clone_size";
13543 ++scrubber.shallow_errors;
13544 soid_error.set_size_mismatch();
13545 } else {
13546 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13547 osd->clog->error() << mode << " " << info.pgid << " " << soid
13548 << " size " << oi->size << " != clone_size "
13549 << snapset->clone_size[*curclone];
13550 ++scrubber.shallow_errors;
13551 soid_error.set_size_mismatch();
13552 }
13553
13554 if (snapset->clone_overlap.count(soid.snap) == 0) {
13555 osd->clog->error() << mode << " " << info.pgid << " " << soid
13556 << " is missing in clone_overlap";
13557 ++scrubber.shallow_errors;
13558 soid_error.set_size_mismatch();
13559 } else {
13560 // This checking is based on get_clone_bytes(). The first 2 asserts
13561 // can't happen because we know we have a clone_size and
13562 // a clone_overlap. Now we check that the interval_set won't
13563 // cause the last assert.
13564 uint64_t size = snapset->clone_size.find(soid.snap)->second;
13565 const interval_set<uint64_t> &overlap =
13566 snapset->clone_overlap.find(soid.snap)->second;
13567 bool bad_interval_set = false;
13568 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13569 i != overlap.end(); ++i) {
13570 if (size < i.get_len()) {
13571 bad_interval_set = true;
13572 break;
13573 }
13574 size -= i.get_len();
13575 }
13576
13577 if (bad_interval_set) {
13578 osd->clog->error() << mode << " " << info.pgid << " " << soid
13579 << " bad interval_set in clone_overlap";
13580 ++scrubber.shallow_errors;
13581 soid_error.set_size_mismatch();
13582 } else {
13583 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13584 }
13585 }
13586 }
13587
13588 // migrate legacy_snaps to snapset?
13589 auto p = snapset_to_repair.find(soid.get_head());
13590 if (p != snapset_to_repair.end()) {
13591 if (!oi || oi->legacy_snaps.empty()) {
13592 osd->clog->error() << mode << " " << info.pgid << " " << soid
13593 << " has no oi or legacy_snaps; cannot convert "
13594 << *snapset;
13595 ++scrubber.shallow_errors;
13596 } else {
13597 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
13598 << " to snapset " << p->second << dendl;
13599 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13600 }
13601 }
13602
13603 // what's next?
13604 ++curclone;
13605 if (soid_error.errors)
13606 scrubber.store->add_snap_error(pool.id, soid_error);
13607 }
13608
13609 scrub_cstat.add(stat);
13610 }
13611
13612 if (doing_clones(snapset, curclone)) {
13613 dout(10) << __func__ << " " << mode << " " << info.pgid
13614 << " No more objects while processing " << head.get() << dendl;
13615
13616 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13617 pool.info.allow_incomplete_clones(), all_clones, &curclone,
13618 head_error);
13619 }
13620 // There could be missing clones counted by the test above, or even
13621 // from before dropping out of the loop for the last head.
13622 if (missing) {
13623 log_missing(missing, head, osd->clog, info.pgid, __func__,
13624 mode, pool.info.allow_incomplete_clones());
13625 }
13626 if (head && head_error.errors)
13627 scrubber.store->add_snap_error(pool.id, head_error);
13628
13629 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
13630 missing_digest.begin();
13631 p != missing_digest.end();
13632 ++p) {
13633 if (p->first.is_snapdir())
13634 continue;
13635 dout(10) << __func__ << " recording digests for " << p->first << dendl;
13636 ObjectContextRef obc = get_object_context(p->first, false);
13637 if (!obc) {
13638 osd->clog->error() << info.pgid << " " << mode
13639 << " cannot get object context for "
13640 << p->first;
13641 continue;
13642 } else if (obc->obs.oi.soid != p->first) {
13643 osd->clog->error() << info.pgid << " " << mode
13644 << " object " << p->first
13645 << " has a valid oi attr with a mismatched name, "
13646 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13647 continue;
13648 }
13649 OpContextUPtr ctx = simple_opc_create(obc);
13650 ctx->at_version = get_next_version();
13651 ctx->mtime = utime_t(); // do not update mtime
13652 ctx->new_obs.oi.set_data_digest(p->second.first);
13653 ctx->new_obs.oi.set_omap_digest(p->second.second);
13654 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13655
13656 ctx->register_on_success(
13657 [this]() {
13658 dout(20) << "updating scrub digest" << dendl;
13659 if (--scrubber.num_digest_updates_pending == 0) {
13660 requeue_scrub();
13661 }
13662 });
13663
13664 simple_opc_submit(std::move(ctx));
13665 ++scrubber.num_digest_updates_pending;
13666 }
13667 for (auto& p : snapset_to_repair) {
13668 // cache pools may not have the clones, which means we won't know
13669 // what snaps they have. fake out the clone_snaps entries anyway (with
13670 // blank snap lists).
13671 p.second.head_exists = true;
13672 if (pool.info.allow_incomplete_clones()) {
13673 for (auto s : p.second.clones) {
13674 if (p.second.clone_snaps.count(s) == 0) {
13675 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
13676 << s << dendl;
13677 p.second.clone_snaps[s];
13678 }
13679 }
13680 }
13681 if (p.second.clones.size() != p.second.clone_snaps.size() ||
13682 p.second.is_legacy()) {
13683 // this happens if we encounter other errors above, like a missing
13684 // or extra clone.
13685 dout(10) << __func__ << " not writing snapset to " << p.first
13686 << " snapset " << p.second << " clones " << p.second.clones
13687 << "; didn't convert fully" << dendl;
13688 scrub_cstat.sum.num_legacy_snapsets++;
13689 continue;
13690 }
13691 dout(10) << __func__ << " writing snapset to " << p.first
13692 << " " << p.second << dendl;
13693 ObjectContextRef obc = get_object_context(p.first, true);
13694 if (!obc) {
13695 osd->clog->error() << info.pgid << " " << mode
13696 << " cannot get object context for "
13697 << p.first;
13698 continue;
13699 } else if (obc->obs.oi.soid != p.first) {
13700 osd->clog->error() << info.pgid << " " << mode
13701 << " object " << p.first
13702 << " has a valid oi attr with a mismatched name, "
13703 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13704 continue;
13705 }
13706 ObjectContextRef snapset_obc;
13707 if (!obc->obs.exists) {
13708 snapset_obc = get_object_context(p.first.get_snapdir(), false);
13709 if (!snapset_obc) {
13710 osd->clog->error() << info.pgid << " " << mode
13711 << " cannot get object context for "
13712 << p.first.get_snapdir();
13713 continue;
13714 }
13715 }
13716 OpContextUPtr ctx = simple_opc_create(obc);
13717 PGTransaction *t = ctx->op_t.get();
13718 ctx->snapset_obc = snapset_obc;
13719 ctx->at_version = get_next_version();
13720 ctx->mtime = utime_t(); // do not update mtime
13721 ctx->new_snapset = p.second;
13722 if (!ctx->new_obs.exists) {
13723 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
13724 ctx->new_obs.exists = true;
13725 ctx->new_snapset.head_exists = true;
13726 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
13727 ++ctx->delta_stats.num_whiteouts;
13728 ++ctx->delta_stats.num_objects;
13729 t->create(p.first);
13730 if (p.first < scrubber.start) {
13731 dout(20) << __func__ << " kludging around update outside of scrub range"
13732 << dendl;
13733 } else {
13734 scrub_cstat.add(ctx->delta_stats);
13735 }
13736 }
13737 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
13738 assert(!ctx->new_snapset.is_legacy());
13739 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13740 ctx->register_on_success(
13741 [this]() {
13742 dout(20) << "updating snapset" << dendl;
13743 if (--scrubber.num_digest_updates_pending == 0) {
13744 requeue_scrub();
13745 }
13746 });
13747
13748 simple_opc_submit(std::move(ctx));
13749 ++scrubber.num_digest_updates_pending;
13750 }
13751
13752 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
13753 }
13754
13755 void PrimaryLogPG::_scrub_clear_state()
13756 {
13757 scrub_cstat = object_stat_collection_t();
13758 }
13759
13760 void PrimaryLogPG::_scrub_finish()
13761 {
13762 bool repair = state_test(PG_STATE_REPAIR);
13763 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13764 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13765
13766 if (info.stats.stats_invalid) {
13767 info.stats.stats = scrub_cstat;
13768 info.stats.stats_invalid = false;
13769
13770 if (agent_state)
13771 agent_choose_mode();
13772 }
13773
13774 dout(10) << mode << " got "
13775 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13776 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13777 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13778 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13779 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13780 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13781 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13782 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
13783 << dendl;
13784
13785 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
13786 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
13787 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
13788 !info.stats.dirty_stats_invalid) ||
13789 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
13790 !info.stats.omap_stats_invalid) ||
13791 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
13792 !info.stats.pin_stats_invalid) ||
13793 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
13794 !info.stats.hitset_stats_invalid) ||
13795 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
13796 !info.stats.hitset_bytes_stats_invalid) ||
13797 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
13798 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
13799 osd->clog->error() << info.pgid << " " << mode
13800 << " stat mismatch, got "
13801 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13802 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13803 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13804 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13805 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13806 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13807 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
13808 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13809 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
13810 ++scrubber.shallow_errors;
13811
13812 if (repair) {
13813 ++scrubber.fixed;
13814 info.stats.stats = scrub_cstat;
13815 info.stats.dirty_stats_invalid = false;
13816 info.stats.omap_stats_invalid = false;
13817 info.stats.hitset_stats_invalid = false;
13818 info.stats.hitset_bytes_stats_invalid = false;
13819 publish_stats_to_osd();
13820 share_pg_info();
13821 }
13822 } else if (scrub_cstat.sum.num_legacy_snapsets !=
13823 info.stats.stats.sum.num_legacy_snapsets) {
13824 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
13825 << " from " << info.stats.stats.sum.num_legacy_snapsets
13826 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
13827 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
13828 publish_stats_to_osd();
13829 share_pg_info();
13830 }
13831 }
13832
13833 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
13834 {
13835 return osd->check_osdmap_full(missing_on);
13836 }
13837
13838 /*---SnapTrimmer Logging---*/
13839 #undef dout_prefix
13840 #define dout_prefix *_dout << pg->gen_prefix()
13841
13842 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
13843 {
13844 ldout(pg->cct, 20) << "enter " << state_name << dendl;
13845 }
13846
13847 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
13848 {
13849 ldout(pg->cct, 20) << "exit " << state_name << dendl;
13850 }
13851
13852 /*---SnapTrimmer states---*/
13853 #undef dout_prefix
13854 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
13855 << "SnapTrimmer state<" << get_state_name() << ">: ")
13856
13857 /* NotTrimming */
13858 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
13859 : my_base(ctx),
13860 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
13861 {
13862 context< SnapTrimmer >().log_enter(state_name);
13863 }
13864
13865 void PrimaryLogPG::NotTrimming::exit()
13866 {
13867 context< SnapTrimmer >().log_exit(state_name, enter_time);
13868 }
13869
13870 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
13871 {
13872 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
13873 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
13874
13875 if (!(pg->is_primary() && pg->is_active())) {
13876 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
13877 return discard_event();
13878 }
13879 if (!pg->is_clean() ||
13880 pg->snap_trimq.empty()) {
13881 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
13882 return discard_event();
13883 }
13884 if (pg->scrubber.active) {
13885 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
13886 pg->scrubber.queue_snap_trim = true;
13887 return transit< WaitScrub >();
13888 } else {
13889 return transit< Trimming >();
13890 }
13891 }
13892
13893 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
13894 {
13895 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
13896 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
13897
13898 pending = nullptr;
13899 if (!context< SnapTrimmer >().can_trim()) {
13900 post_event(KickTrim());
13901 return transit< NotTrimming >();
13902 }
13903
13904 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
13905 ldout(pg->cct, 10) << "NotTrimming: trimming "
13906 << pg->snap_trimq.range_start()
13907 << dendl;
13908 return transit< AwaitAsyncWork >();
13909 }
13910
13911 /* AwaitAsyncWork */
13912 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
13913 : my_base(ctx),
13914 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
13915 {
13916 auto *pg = context< SnapTrimmer >().pg;
13917 context< SnapTrimmer >().log_enter(state_name);
13918 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
13919 pg->state_set(PG_STATE_SNAPTRIM);
13920 pg->publish_stats_to_osd();
13921 }
13922
13923 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
13924 {
13925 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
13926 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
13927 auto &in_flight = context<Trimming>().in_flight;
13928 assert(in_flight.empty());
13929
13930 assert(pg->is_primary() && pg->is_active());
13931 if (!context< SnapTrimmer >().can_trim()) {
13932 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
13933 post_event(KickTrim());
13934 return transit< NotTrimming >();
13935 }
13936
13937 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
13938
13939 vector<hobject_t> to_trim;
13940 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
13941 to_trim.reserve(max);
13942 int r = pg->snap_mapper.get_next_objects_to_trim(
13943 snap_to_trim,
13944 max,
13945 &to_trim);
13946 if (r != 0 && r != -ENOENT) {
13947 lderr(pg->cct) << "get_next_objects_to_trim returned "
13948 << cpp_strerror(r) << dendl;
13949 assert(0 == "get_next_objects_to_trim returned an invalid code");
13950 } else if (r == -ENOENT) {
13951 // Done!
13952 ldout(pg->cct, 10) << "got ENOENT" << dendl;
13953
13954 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
13955 << " to purged_snaps"
13956 << dendl;
13957 pg->info.purged_snaps.insert(snap_to_trim);
13958 pg->snap_trimq.erase(snap_to_trim);
13959 ldout(pg->cct, 10) << "purged_snaps now "
13960 << pg->info.purged_snaps << ", snap_trimq now "
13961 << pg->snap_trimq << dendl;
13962
13963 ObjectStore::Transaction t;
13964 pg->dirty_big_info = true;
13965 pg->write_if_dirty(t);
13966 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
13967 assert(tr == 0);
13968
13969 pg->share_pg_info();
13970 post_event(KickTrim());
13971 return transit< NotTrimming >();
13972 }
13973 assert(!to_trim.empty());
13974
13975 for (auto &&object: to_trim) {
13976 // Get next
13977 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
13978 OpContextUPtr ctx = pg->trim_object(in_flight.empty(), object);
13979 if (!ctx) {
13980 ldout(pg->cct, 10) << "could not get write lock on obj "
13981 << object << dendl;
13982 if (in_flight.empty()) {
13983 ldout(pg->cct, 10) << "waiting for it to clear"
13984 << dendl;
13985 return transit< WaitRWLock >();
13986
13987 } else {
13988 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
13989 return transit< WaitRepops >();
13990 }
13991 }
13992
13993 in_flight.insert(object);
13994 ctx->register_on_success(
13995 [pg, object, &in_flight]() {
13996 assert(in_flight.find(object) != in_flight.end());
13997 in_flight.erase(object);
13998 if (in_flight.empty())
13999 pg->snap_trimmer_machine.process_event(RepopsComplete());
14000 });
14001
14002 pg->simple_opc_submit(std::move(ctx));
14003 }
14004
14005 return transit< WaitRepops >();
14006 }
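// How the batch above drains (sketch of the callback flow already in
// the code): each object enters in_flight before its repop is
// submitted, and its on_success callback erases it again, e.g.
//
//   in_flight = {A, B, C}  ->  A completes  ->  {B, C}  ->  ...  ->  {}
//                                                   -> RepopsComplete()
//
// Only the callback that empties the set posts the event, so the state
// machine waits in WaitRepops until the whole batch (up to
// osd_pg_max_concurrent_snap_trims objects) has completed.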
14007
14008 void PrimaryLogPG::setattr_maybe_cache(
14009 ObjectContextRef obc,
14010 OpContext *op,
14011 PGTransaction *t,
14012 const string &key,
14013 bufferlist &val)
14014 {
14015 t->setattr(obc->obs.oi.soid, key, val);
14016 }
14017
14018 void PrimaryLogPG::setattrs_maybe_cache(
14019 ObjectContextRef obc,
14020 OpContext *op,
14021 PGTransaction *t,
14022 map<string, bufferlist> &attrs)
14023 {
14024 t->setattrs(obc->obs.oi.soid, attrs);
14025 }
14026
14027 void PrimaryLogPG::rmattr_maybe_cache(
14028 ObjectContextRef obc,
14029 OpContext *op,
14030 PGTransaction *t,
14031 const string &key)
14032 {
14033 t->rmattr(obc->obs.oi.soid, key);
14034 }
14035
14036 int PrimaryLogPG::getattr_maybe_cache(
14037 ObjectContextRef obc,
14038 const string &key,
14039 bufferlist *val)
14040 {
14041 if (pool.info.require_rollback()) {
14042 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14043 if (i != obc->attr_cache.end()) {
14044 if (val)
14045 *val = i->second;
14046 return 0;
14047 } else {
14048 return -ENODATA;
14049 }
14050 }
14051 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14052 }
14053
14054 int PrimaryLogPG::getattrs_maybe_cache(
14055 ObjectContextRef obc,
14056 map<string, bufferlist> *out,
14057 bool user_only)
14058 {
14059 int r = 0;
14060 if (pool.info.require_rollback()) {
14061 if (out)
14062 *out = obc->attr_cache;
14063 } else {
14064 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14065 }
14066 if (out && user_only) {
14067 map<string, bufferlist> tmp;
14068 for (map<string, bufferlist>::iterator i = out->begin();
14069 i != out->end();
14070 ++i) {
14071 if (i->first.size() > 1 && i->first[0] == '_')
14072 tmp[i->first.substr(1, i->first.size())].claim(i->second);
14073 }
14074 tmp.swap(*out);
14075 }
14076 return r;
14077 }
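// Example of the user_only filter above (attr names follow the usual
// Ceph conventions; the object is hypothetical):
//
//   in:  {"_": <object_info>, "_mykey": "v", "snapset": <SnapSet>}
//   out: {"mykey": "v"}
//
// User attrs are stored with a leading '_', which is stripped; the bare
// "_" (the object-info attr, too short to qualify) and internal attrs
// like "snapset" are dropped.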
14078
14079 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14080 return osd->check_failsafe_full(ss);
14081 }
14082
14083 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14084 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14085
14086 #ifdef PG_DEBUG_REFS
14087 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14088 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14089 #endif
14090
14091 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14092 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }