]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2013 Inktank Storage, Inc. | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | #include "common/errno.h" | |
15 | #include "ReplicatedBackend.h" | |
16 | #include "messages/MOSDOp.h" | |
17 | #include "messages/MOSDSubOp.h" | |
18 | #include "messages/MOSDRepOp.h" | |
19 | #include "messages/MOSDSubOpReply.h" | |
20 | #include "messages/MOSDRepOpReply.h" | |
21 | #include "messages/MOSDPGPush.h" | |
22 | #include "messages/MOSDPGPull.h" | |
23 | #include "messages/MOSDPGPushReply.h" | |
24 | #include "common/EventTrace.h" | |
25 | ||
26 | #define dout_context cct | |
27 | #define dout_subsys ceph_subsys_osd | |
28 | #define DOUT_PREFIX_ARGS this | |
29 | #undef dout_prefix | |
30 | #define dout_prefix _prefix(_dout, this) | |
// Build the per-PG debug-log prefix consumed by the dout() macros above.
static ostream& _prefix(std::ostream *_dout, ReplicatedBackend *pgb) {
  return *_dout << pgb->get_parent()->gen_dbg_prefix();
}
34 | ||
namespace {
// Transaction completion that forwards a prepared reply Message over the
// captured connection.  Ownership of `reply` passes to the send call.
class PG_SendMessageOnConn: public Context {
  PGBackend::Listener *pg;
  Message *reply;
  ConnectionRef conn;
public:
  PG_SendMessageOnConn(
    PGBackend::Listener *pg,
    Message *reply,
    ConnectionRef conn) : pg(pg), reply(reply), conn(conn) {}
  void finish(int) override {
    pg->send_message_osd_cluster(reply, conn.get());
  }
};

// Completion that hands a recovery GenContext back to the listener so it
// runs on the recovery work queue.  The unique_ptr guards against leaks
// if finish() never runs; c.release() transfers ownership on requeue.
class PG_RecoveryQueueAsync : public Context {
  PGBackend::Listener *pg;
  unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
public:
  PG_RecoveryQueueAsync(
    PGBackend::Listener *pg,
    GenContext<ThreadPool::TPHandle&> *c) : pg(pg), c(c) {}
  void finish(int) override {
    pg->schedule_recovery_work(c.release());
  }
};
}
62 | ||
// Transaction callback: notifies the backend that a replicated modify
// (RepModify) has been applied locally on this replica.
struct ReplicatedBackend::C_OSD_RepModifyApply : public Context {
  ReplicatedBackend *pg;
  RepModifyRef rm;
  C_OSD_RepModifyApply(ReplicatedBackend *pg, RepModifyRef r)
    : pg(pg), rm(r) {}
  void finish(int r) override {
    pg->repop_applied(rm);
  }
};
72 | ||
// Transaction callback: notifies the backend that a replicated modify
// (RepModify) has committed to disk locally on this replica.
struct ReplicatedBackend::C_OSD_RepModifyCommit : public Context {
  ReplicatedBackend *pg;
  RepModifyRef rm;
  C_OSD_RepModifyCommit(ReplicatedBackend *pg, RepModifyRef r)
    : pg(pg), rm(r) {}
  void finish(int r) override {
    pg->repop_commit(rm);
  }
};
82 | ||
83 | static void log_subop_stats( | |
84 | PerfCounters *logger, | |
85 | OpRequestRef op, int subop) | |
86 | { | |
87 | utime_t now = ceph_clock_now(); | |
88 | utime_t latency = now; | |
89 | latency -= op->get_req()->get_recv_stamp(); | |
90 | ||
91 | ||
92 | logger->inc(l_osd_sop); | |
93 | logger->tinc(l_osd_sop_lat, latency); | |
94 | logger->inc(subop); | |
95 | ||
96 | if (subop != l_osd_sop_pull) { | |
97 | uint64_t inb = op->get_req()->get_data().length(); | |
98 | logger->inc(l_osd_sop_inb, inb); | |
99 | if (subop == l_osd_sop_w) { | |
100 | logger->inc(l_osd_sop_w_inb, inb); | |
101 | logger->tinc(l_osd_sop_w_lat, latency); | |
102 | } else if (subop == l_osd_sop_push) { | |
103 | logger->inc(l_osd_sop_push_inb, inb); | |
104 | logger->tinc(l_osd_sop_push_lat, latency); | |
105 | } else | |
106 | assert("no support subop" == 0); | |
107 | } else { | |
108 | logger->tinc(l_osd_sop_pull_lat, latency); | |
109 | } | |
110 | } | |
111 | ||
// Constructor: all shared state lives in the PGBackend base; this class
// adds only the replication-specific maps declared in the header.
ReplicatedBackend::ReplicatedBackend(
  PGBackend::Listener *pg,
  coll_t coll,
  ObjectStore::CollectionHandle &c,
  ObjectStore *store,
  CephContext *cct) :
  PGBackend(cct, pg, store, coll, c) {}
119 | ||
120 | void ReplicatedBackend::run_recovery_op( | |
121 | PGBackend::RecoveryHandle *_h, | |
122 | int priority) | |
123 | { | |
124 | RPGHandle *h = static_cast<RPGHandle *>(_h); | |
125 | send_pushes(priority, h->pushes); | |
126 | send_pulls(priority, h->pulls); | |
127 | delete h; | |
128 | } | |
129 | ||
130 | void ReplicatedBackend::recover_object( | |
131 | const hobject_t &hoid, | |
132 | eversion_t v, | |
133 | ObjectContextRef head, | |
134 | ObjectContextRef obc, | |
135 | RecoveryHandle *_h | |
136 | ) | |
137 | { | |
138 | dout(10) << __func__ << ": " << hoid << dendl; | |
139 | RPGHandle *h = static_cast<RPGHandle *>(_h); | |
140 | if (get_parent()->get_local_missing().is_missing(hoid)) { | |
141 | assert(!obc); | |
142 | // pull | |
143 | prepare_pull( | |
144 | v, | |
145 | hoid, | |
146 | head, | |
147 | h); | |
148 | return; | |
149 | } else { | |
150 | assert(obc); | |
151 | int started = start_pushes( | |
152 | hoid, | |
153 | obc, | |
154 | h); | |
155 | assert(started > 0); | |
156 | } | |
157 | } | |
158 | ||
// React to a new osdmap: for every pull source that the map now marks
// down, cancel each in-flight pull targeting it and drop the peer from
// pull_from_peer.  The erase(i++) idiom keeps the outer iterator valid
// while removing the current entry.
void ReplicatedBackend::check_recovery_sources(const OSDMapRef& osdmap)
{
  for(map<pg_shard_t, set<hobject_t> >::iterator i = pull_from_peer.begin();
      i != pull_from_peer.end();
      ) {
    if (osdmap->is_down(i->first.osd)) {
      dout(10) << "check_recovery_sources resetting pulls from osd." << i->first
               << ", osdmap has it marked down" << dendl;
      for (set<hobject_t>::iterator j = i->second.begin();
           j != i->second.end();
           ++j) {
        get_parent()->cancel_pull(*j);
        // `false`: presumably tells clear_pull not to touch
        // pull_from_peer itself — we erase the whole entry below.
        // TODO(review): confirm against clear_pull's signature.
        clear_pull(pulling.find(*j), false);
      }
      pull_from_peer.erase(i++);
    } else {
      ++i;
    }
  }
}
179 | ||
180 | bool ReplicatedBackend::can_handle_while_inactive(OpRequestRef op) | |
181 | { | |
182 | dout(10) << __func__ << ": " << op << dendl; | |
183 | switch (op->get_req()->get_type()) { | |
184 | case MSG_OSD_PG_PULL: | |
185 | return true; | |
186 | default: | |
187 | return false; | |
188 | } | |
189 | } | |
190 | ||
// Dispatch a backend-relevant message to its handler.  Returns true if
// the message was consumed here; false tells the caller to handle it.
bool ReplicatedBackend::handle_message(
  OpRequestRef op
  )
{
  dout(10) << __func__ << ": " << op << dendl;
  switch (op->get_req()->get_type()) {
  case MSG_OSD_PG_PUSH:
    do_push(op);
    return true;

  case MSG_OSD_PG_PULL:
    do_pull(op);
    return true;

  case MSG_OSD_PG_PUSH_REPLY:
    do_push_reply(op);
    return true;

  case MSG_OSD_SUBOP: {
    const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
    if (m->ops.size() == 0) {
      // A subop with no ops is never expected from a supported peer.
      assert(0);
    }
    // Non-empty legacy subops fall through to the caller (return false).
    break;
  }

  case MSG_OSD_REPOP: {
    do_repop(op);
    return true;
  }

  case MSG_OSD_REPOPREPLY: {
    do_repop_reply(op);
    return true;
  }

  default:
    break;
  }
  return false;
}
232 | ||
233 | void ReplicatedBackend::clear_recovery_state() | |
234 | { | |
235 | // clear pushing/pulling maps | |
236 | for (auto &&i: pushing) { | |
237 | for (auto &&j: i.second) { | |
238 | get_parent()->release_locks(j.second.lock_manager); | |
239 | } | |
240 | } | |
241 | pushing.clear(); | |
242 | ||
243 | for (auto &&i: pulling) { | |
244 | get_parent()->release_locks(i.second.lock_manager); | |
245 | } | |
246 | pulling.clear(); | |
247 | pull_from_peer.clear(); | |
248 | } | |
249 | ||
250 | void ReplicatedBackend::on_change() | |
251 | { | |
252 | dout(10) << __func__ << dendl; | |
253 | for (map<ceph_tid_t, InProgressOp>::iterator i = in_progress_ops.begin(); | |
254 | i != in_progress_ops.end(); | |
255 | in_progress_ops.erase(i++)) { | |
256 | if (i->second.on_commit) | |
257 | delete i->second.on_commit; | |
258 | if (i->second.on_applied) | |
259 | delete i->second.on_applied; | |
260 | } | |
261 | clear_recovery_state(); | |
262 | } | |
263 | ||
// The replicated backend has no flush-time bookkeeping to perform.
void ReplicatedBackend::on_flushed()
{
}
267 | ||
// Synchronously read [off, off+len) of hoid from the local object store
// into *bl.  Returns the store's result (bytes read, or negative errno).
int ReplicatedBackend::objects_read_sync(
  const hobject_t &hoid,
  uint64_t off,
  uint64_t len,
  uint32_t op_flags,
  bufferlist *bl)
{
  return store->read(ch, ghobject_t(hoid), off, len, *bl, op_flags);
}
277 | ||
// Adapter delivering a stored read result to a plain Context from the
// recovery work queue.  finish() completes (and thereby frees) c and
// nulls it; if finish() never runs, the destructor frees the pending
// Context instead, so c is released exactly once either way.
struct AsyncReadCallback : public GenContext<ThreadPool::TPHandle&> {
  int r;
  Context *c;
  AsyncReadCallback(int r, Context *c) : r(r), c(c) {}
  void finish(ThreadPool::TPHandle&) override {
    c->complete(r);
    c = NULL;
  }
  ~AsyncReadCallback() override {
    delete c;
  }
};
// Perform a batch of reads on hoid.  Each read is executed synchronously
// right here, but per-extent completions (if supplied) and the final
// on_complete are delivered asynchronously via the recovery work queue.
// on_complete receives the last error seen across the batch, or 0.
// Each tuple in to_read is (offset, length, op_flags) paired with the
// destination bufferlist and an optional per-read Context.
void ReplicatedBackend::objects_read_async(
  const hobject_t &hoid,
  const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
		  pair<bufferlist*, Context*> > > &to_read,
  Context *on_complete,
  bool fast_read)
{
  // There is no fast read implementation for replication backend yet
  assert(!fast_read);

  int r = 0;
  for (list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
		 pair<bufferlist*, Context*> > >::const_iterator i =
	   to_read.begin();
       i != to_read.end() && r >= 0;  // stop issuing reads after an error
       ++i) {
    int _r = store->read(ch, ghobject_t(hoid), i->first.get<0>(),
			 i->first.get<1>(), *(i->second.first),
			 i->first.get<2>());
    if (i->second.second) {
      // Per-read completion gets this read's own result.
      get_parent()->schedule_recovery_work(
	get_parent()->bless_gencontext(
	  new AsyncReadCallback(_r, i->second.second)));
    }
    if (_r < 0)
      r = _r;
  }
  get_parent()->schedule_recovery_work(
    get_parent()->bless_gencontext(
      new AsyncReadCallback(r, on_complete)));
}
321 | ||
// Transaction callback: routes the local commit notification for an
// in-progress op back into the backend (op_commit).
class C_OSD_OnOpCommit : public Context {
  ReplicatedBackend *pg;
  ReplicatedBackend::InProgressOp *op;
public:
  C_OSD_OnOpCommit(ReplicatedBackend *pg, ReplicatedBackend::InProgressOp *op)
    : pg(pg), op(op) {}
  void finish(int) override {
    pg->op_commit(op);
  }
};
332 | ||
// Transaction callback: routes the local apply notification for an
// in-progress op back into the backend (op_applied).
class C_OSD_OnOpApplied : public Context {
  ReplicatedBackend *pg;
  ReplicatedBackend::InProgressOp *op;
public:
  C_OSD_OnOpApplied(ReplicatedBackend *pg, ReplicatedBackend::InProgressOp *op)
    : pg(pg), op(op) {}
  void finish(int) override {
    pg->op_applied(op);
  }
};
343 | ||
// Flatten a PGTransaction into a concrete ObjectStore::Transaction *t.
// Every log entry is marked unrollbackable (replicated pools cannot
// roll operations back), and any updated snap set is encoded into its
// entry.  Temp objects created or deleted by the transaction are
// reported through *added / *removed so the caller can track them.
// NOTE(review): legacy_log_entries is accepted but unused in this body
// — confirm whether it is still needed by any caller.
void generate_transaction(
  PGTransactionUPtr &pgt,
  const coll_t &coll,
  bool legacy_log_entries,
  vector<pg_log_entry_t> &log_entries,
  ObjectStore::Transaction *t,
  set<hobject_t> *added,
  set<hobject_t> *removed)
{
  assert(t);
  assert(added);
  assert(removed);

  for (auto &&le: log_entries) {
    le.mark_unrollbackable();
    auto oiter = pgt->op_map.find(le.soid);
    if (oiter != pgt->op_map.end() && oiter->second.updated_snaps) {
      // Persist the object's new snap membership in the log entry.
      vector<snapid_t> snaps(
	oiter->second.updated_snaps->second.begin(),
	oiter->second.updated_snaps->second.end());
      ::encode(snaps, le.snaps);
    }
  }

  // Visit each per-object operation; safe_create_traverse presumably
  // orders the visit so cross-object dependencies (clone/rename
  // sources) are handled safely — see PGTransaction.
  pgt->safe_create_traverse(
    [&](pair<const hobject_t, PGTransaction::ObjectOperation> &obj_op) {
      const hobject_t &oid = obj_op.first;
      const ghobject_t goid =
	ghobject_t(oid, ghobject_t::NO_GEN, shard_id_t::NO_SHARD);
      const PGTransaction::ObjectOperation &op = obj_op.second;

      // Track the temp-object lifecycle for the caller.
      if (oid.is_temp()) {
	if (op.is_fresh_object()) {
	  added->insert(oid);
	} else if (op.is_delete()) {
	  removed->insert(oid);
	}
      }

      if (op.delete_first) {
	t->remove(coll, goid);
      }

      // How the object comes into being: nothing, plain create (touch),
      // clone of another object, or rename from a temp object.
      match(
	op.init_type,
	[&](const PGTransaction::ObjectOperation::Init::None &) {
	},
	[&](const PGTransaction::ObjectOperation::Init::Create &op) {
	  t->touch(coll, goid);
	},
	[&](const PGTransaction::ObjectOperation::Init::Clone &op) {
	  t->clone(
	    coll,
	    ghobject_t(
	      op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
	    goid);
	},
	[&](const PGTransaction::ObjectOperation::Init::Rename &op) {
	  assert(op.source.is_temp());
	  t->collection_move_rename(
	    coll,
	    ghobject_t(
	      op.source, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
	    coll,
	    goid);
	});

      // op.truncate is a (first, second) pair; when they differ a
      // second truncate sets the final size after the first cut.
      if (op.truncate) {
	t->truncate(coll, goid, op.truncate->first);
	if (op.truncate->first != op.truncate->second)
	  t->truncate(coll, goid, op.truncate->second);
      }

      if (!op.attr_updates.empty()) {
	// A present optional sets the xattr; an absent one removes it.
	map<string, bufferlist> attrs;
	for (auto &&p: op.attr_updates) {
	  if (p.second)
	    attrs[p.first] = *(p.second);
	  else
	    t->rmattr(coll, goid, p.first);
	}
	t->setattrs(coll, goid, attrs);
      }

      if (op.clear_omap)
	t->omap_clear(coll, goid);
      if (op.omap_header)
	t->omap_setheader(coll, goid, *(op.omap_header));

      for (auto &&up: op.omap_updates) {
	using UpdateType = PGTransaction::ObjectOperation::OmapUpdateType;
	switch (up.first) {
	case UpdateType::Remove:
	  t->omap_rmkeys(coll, goid, up.second);
	  break;
	case UpdateType::Insert:
	  t->omap_setkeys(coll, goid, up.second);
	  break;
	}
      }

      // updated_snaps doesn't matter since we marked unrollbackable

      if (op.alloc_hint) {
	auto &hint = *(op.alloc_hint);
	t->set_alloc_hint(
	  coll,
	  goid,
	  hint.expected_object_size,
	  hint.expected_write_size,
	  hint.flags);
      }

      // Data mutations: writes, zero ranges, and clone_range copies
      // from another object into this extent.
      for (auto &&extent: op.buffer_updates) {
	using BufferUpdate = PGTransaction::ObjectOperation::BufferUpdate;
	match(
	  extent.get_val(),
	  [&](const BufferUpdate::Write &op) {
	    t->write(
	      coll,
	      goid,
	      extent.get_off(),
	      extent.get_len(),
	      op.buffer);
	  },
	  [&](const BufferUpdate::Zero &op) {
	    t->zero(
	      coll,
	      goid,
	      extent.get_off(),
	      extent.get_len());
	  },
	  [&](const BufferUpdate::CloneRange &op) {
	    assert(op.len == extent.get_len());
	    t->clone_range(
	      coll,
	      ghobject_t(op.from, ghobject_t::NO_GEN, shard_id_t::NO_SHARD),
	      goid,
	      op.offset,
	      extent.get_len(),
	      extent.get_off());
	  });
      }
    });
}
489 | ||
// Submit a client write: apply the stats delta, flatten the
// PGTransaction, register an InProgressOp that waits on every
// acting/backfill shard for both apply and commit, ship the op to the
// replicas (issue_op), record it in the local PG log, and finally queue
// the local transaction with callbacks wired to op_applied/op_commit.
// on_all_acked / on_all_commit fire when the corresponding waiting set
// drains; on_local_applied_sync fires synchronously on local apply.
void ReplicatedBackend::submit_transaction(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats,
  const eversion_t &at_version,
  PGTransactionUPtr &&_t,
  const eversion_t &trim_to,
  const eversion_t &roll_forward_to,
  const vector<pg_log_entry_t> &_log_entries,
  boost::optional<pg_hit_set_history_t> &hset_history,
  Context *on_local_applied_sync,
  Context *on_all_acked,
  Context *on_all_commit,
  ceph_tid_t tid,
  osd_reqid_t reqid,
  OpRequestRef orig_op)
{
  parent->apply_stats(
    soid,
    delta_stats);

  vector<pg_log_entry_t> log_entries(_log_entries);
  ObjectStore::Transaction op_t;
  PGTransactionUPtr t(std::move(_t));
  set<hobject_t> added, removed;
  generate_transaction(
    t,
    coll,
    !get_osdmap()->test_flag(CEPH_OSDMAP_REQUIRE_KRAKEN),
    log_entries,
    &op_t,
    &added,
    &removed);
  // A single transaction can create/remove at most one temp object.
  assert(added.size() <= 1);
  assert(removed.size() <= 1);

  assert(!in_progress_ops.count(tid));
  InProgressOp &op = in_progress_ops.insert(
    make_pair(
      tid,
      InProgressOp(
	tid, on_all_commit, on_all_acked,
	orig_op, at_version)
      )
    ).first->second;

  // Both apply and commit must be heard from every acting/backfill
  // shard (including ourselves) before the callbacks fire.
  op.waiting_for_applied.insert(
    parent->get_actingbackfill_shards().begin(),
    parent->get_actingbackfill_shards().end());
  op.waiting_for_commit.insert(
    parent->get_actingbackfill_shards().begin(),
    parent->get_actingbackfill_shards().end());

  // NOTE(review): roll_forward_to is not forwarded here — at_version is
  // passed as pg_roll_forward_to instead.  Confirm this is intentional.
  issue_op(
    soid,
    at_version,
    tid,
    reqid,
    trim_to,
    at_version,
    added.size() ? *(added.begin()) : hobject_t(),
    removed.size() ? *(removed.begin()) : hobject_t(),
    log_entries,
    hset_history,
    &op,
    op_t);

  add_temp_objs(added);
  clear_temp_objs(removed);

  parent->log_operation(
    log_entries,
    hset_history,
    trim_to,
    at_version,
    true,
    op_t);

  op_t.register_on_applied_sync(on_local_applied_sync);
  op_t.register_on_applied(
    parent->bless_context(
      new C_OSD_OnOpApplied(this, &op)));
  op_t.register_on_commit(
    parent->bless_context(
      new C_OSD_OnOpCommit(this, &op)));

  vector<ObjectStore::Transaction> tls;
  tls.push_back(std::move(op_t));

  parent->queue_transactions(tls, op.op);
}
580 | ||
// Local apply finished for an in-progress op: remove our own shard from
// waiting_for_applied and, once every shard has applied, fire
// on_applied.  The op is erased only when both the applied and commit
// sides have fully drained (op->done()).
void ReplicatedBackend::op_applied(
  InProgressOp *op)
{
  FUNCTRACE();
  OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_APPLIED_BEGIN", true);
  dout(10) << __func__ << ": " << op->tid << dendl;
  if (op->op) {
    op->op->mark_event("op_applied");
    op->op->pg_trace.event("op applied");
  }

  op->waiting_for_applied.erase(get_parent()->whoami_shard());
  parent->op_applied(op->v);

  if (op->waiting_for_applied.empty()) {
    // complete() deletes the context; null the pointer so done() can
    // see the applied side is finished.
    op->on_applied->complete(0);
    op->on_applied = 0;
  }
  if (op->done()) {
    assert(!op->on_commit && !op->on_applied);
    in_progress_ops.erase(op->tid);
  }
}
604 | ||
// Local commit finished for an in-progress op: remove our own shard
// from waiting_for_commit and, once every shard has committed, fire
// on_commit.  The op is erased only when both the applied and commit
// sides have fully drained (op->done()).
void ReplicatedBackend::op_commit(
  InProgressOp *op)
{
  FUNCTRACE();
  OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true);
  dout(10) << __func__ << ": " << op->tid << dendl;
  if (op->op) {
    op->op->mark_event("op_commit");
    op->op->pg_trace.event("op commit");
  }

  op->waiting_for_commit.erase(get_parent()->whoami_shard());

  if (op->waiting_for_commit.empty()) {
    // complete() deletes the context; null the pointer so done() can
    // see the commit side is finished.
    op->on_commit->complete(0);
    op->on_commit = 0;
  }
  if (op->done()) {
    assert(!op->on_commit && !op->on_applied);
    in_progress_ops.erase(op->tid);
  }
}
627 | ||
// Handle a replica's acknowledgement of a rep op.  An ONDISK ack counts
// as both commit and apply (waiting_for_applied is erased for the peer
// unconditionally below); any other ack counts as apply only.  When a
// waiting set drains, the matching completion fires; the op is erased
// once both sides are done.  Unknown tids are ignored (stale replies).
void ReplicatedBackend::do_repop_reply(OpRequestRef op)
{
  static_cast<MOSDRepOpReply*>(op->get_nonconst_req())->finish_decode();
  const MOSDRepOpReply *r = static_cast<const MOSDRepOpReply *>(op->get_req());
  assert(r->get_header().type == MSG_OSD_REPOPREPLY);

  op->mark_started();

  // must be replication.
  ceph_tid_t rep_tid = r->get_tid();
  pg_shard_t from = r->from;

  if (in_progress_ops.count(rep_tid)) {
    map<ceph_tid_t, InProgressOp>::iterator iter =
      in_progress_ops.find(rep_tid);
    InProgressOp &ip_op = iter->second;
    const MOSDOp *m = NULL;
    if (ip_op.op)
      m = static_cast<const MOSDOp *>(ip_op.op->get_req());

    if (m)
      dout(7) << __func__ << ": tid " << ip_op.tid << " op " //<< *m
	      << " ack_type " << (int)r->ack_type
	      << " from " << from
	      << dendl;
    else
      dout(7) << __func__ << ": tid " << ip_op.tid << " (no op) "
	      << " ack_type " << (int)r->ack_type
	      << " from " << from
	      << dendl;

    // oh, good.

    if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
      assert(ip_op.waiting_for_commit.count(from));
      ip_op.waiting_for_commit.erase(from);
      if (ip_op.op) {
	ostringstream ss;
	ss << "sub_op_commit_rec from " << from;
	ip_op.op->mark_event_string(ss.str());
	ip_op.op->pg_trace.event("sub_op_commit_rec");
      }
    } else {
      assert(ip_op.waiting_for_applied.count(from));
      if (ip_op.op) {
	ostringstream ss;
	ss << "sub_op_applied_rec from " << from;
	ip_op.op->mark_event_string(ss.str());
	ip_op.op->pg_trace.event("sub_op_applied_rec");
      }
    }
    // ONDISK implies applied, so the peer leaves the applied set in
    // either branch.
    ip_op.waiting_for_applied.erase(from);

    parent->update_peer_last_complete_ondisk(
      from,
      r->get_last_complete_ondisk());

    if (ip_op.waiting_for_applied.empty() &&
	ip_op.on_applied) {
      ip_op.on_applied->complete(0);
      ip_op.on_applied = 0;
    }
    if (ip_op.waiting_for_commit.empty() &&
	ip_op.on_commit) {
      ip_op.on_commit->complete(0);
      ip_op.on_commit= 0;
    }
    if (ip_op.done()) {
      assert(!ip_op.on_commit && !ip_op.on_applied);
      in_progress_ops.erase(iter);
    }
  }
}
701 | ||
// Deep-scrub one object: CRC the full object data into o.digest and the
// omap header plus every key/value pair into o.omap_digest, both seeded
// with `seed`.  Any -EIO (or omap iterator error) sets o.read_error and
// abandons the scrub of this object.
void ReplicatedBackend::be_deep_scrub(
  const hobject_t &poid,
  uint32_t seed,
  ScrubMap::object &o,
  ThreadPool::TPHandle &handle)
{
  dout(10) << __func__ << " " << poid << " seed "
	   << std::hex << seed << std::dec << dendl;
  bufferhash h(seed), oh(seed);
  bufferlist bl, hdrbl;
  int r;
  __u64 pos = 0;

  // One sequential pass we will not revisit: hint the cache accordingly.
  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

  // Read the object in osd_deep_scrub_stride-sized chunks, hashing as
  // we go; reset the thread-pool timeout each iteration so long objects
  // don't trip the heartbeat.
  while (true) {
    handle.reset_tp_timeout();
    r = store->read(
	  ch,
	  ghobject_t(
	    poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
	  pos,
	  cct->_conf->osd_deep_scrub_stride, bl,
	  fadvise_flags, true);
    if (r <= 0)
      break;

    h << bl;
    pos += bl.length();
    bl.clear();
  }
  if (r == -EIO) {
    dout(25) << __func__ << " " << poid << " got "
	     << r << " on read, read_error" << dendl;
    o.read_error = true;
    return;
  }
  o.digest = h.digest();
  o.digest_present = true;

  bl.clear();
  r = store->omap_get_header(
    coll,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    &hdrbl, true);
  // NOTE: bobtail to giant, we would crc the head as (len, head).
  // that changes at the same time we start using a non-zero seed.
  if (r == 0 && hdrbl.length()) {
    dout(25) << "CRC header " << string(hdrbl.c_str(), hdrbl.length())
	     << dendl;
    if (seed == 0) {
      // legacy: hash the length-prefixed encoding of the header
      bufferlist bl;
      ::encode(hdrbl, bl);
      oh << bl;
    } else {
      oh << hdrbl;
    }
  } else if (r == -EIO) {
    dout(25) << __func__ << " " << poid << " got "
	     << r << " on omap header read, read_error" << dendl;
    o.read_error = true;
    return;
  }

  // Hash every omap key/value pair, pinging the thread pool every
  // osd_scan_list_ping_tp_interval keys.
  ObjectMap::ObjectMapIterator iter = store->get_omap_iterator(
    coll,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  assert(iter);
  uint64_t keys_scanned = 0;
  for (iter->seek_to_first(); iter->status() == 0 && iter->valid();
       iter->next(false)) {
    if (cct->_conf->osd_scan_list_ping_tp_interval &&
	(keys_scanned % cct->_conf->osd_scan_list_ping_tp_interval == 0)) {
      handle.reset_tp_timeout();
    }
    ++keys_scanned;

    dout(25) << "CRC key " << iter->key() << " value:\n";
    iter->value().hexdump(*_dout);
    *_dout << dendl;

    ::encode(iter->key(), bl);
    ::encode(iter->value(), bl);
    oh << bl;
    bl.clear();
  }

  if (iter->status() < 0) {
    dout(25) << __func__ << " " << poid
	     << " on omap scan, db status error" << dendl;
    o.read_error = true;
    return;
  }

  //Store final calculated CRC32 of omap header & key/values
  o.omap_digest = oh.digest();
  o.omap_digest_present = true;
  dout(20) << __func__ << " " << poid << " omap_digest "
	   << std::hex << o.omap_digest << std::dec << dendl;
}
805 | ||
// Replica side of recovery push: apply every PushOp in the message via
// one local transaction, then send a MOSDPGPushReply with one reply per
// push once that transaction completes.  Aborts the OSD if the failsafe
// full threshold has been hit, rather than filling the disk further.
void ReplicatedBackend::_do_push(OpRequestRef op)
{
  const MOSDPGPush *m = static_cast<const MOSDPGPush *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_PUSH);
  pg_shard_t from = m->from;

  op->mark_started();

  vector<PushReplyOp> replies;
  ObjectStore::Transaction t;
  ostringstream ss;
  if (get_parent()->check_failsafe_full(ss)) {
    dout(10) << __func__ << " Out of space (failsafe) processing push request: " << ss.str() << dendl;
    ceph_abort();
  }
  for (vector<PushOp>::const_iterator i = m->pushes.begin();
       i != m->pushes.end();
       ++i) {
    replies.push_back(PushReplyOp());
    handle_push(from, *i, &(replies.back()), &t);
  }

  MOSDPGPushReply *reply = new MOSDPGPushReply;
  reply->from = get_parent()->whoami_shard();
  reply->set_priority(m->get_priority());
  reply->pgid = get_info().pgid;
  reply->map_epoch = m->map_epoch;
  reply->min_epoch = m->min_epoch;
  reply->replies.swap(replies);
  reply->compute_cost(cct);

  // Only reply once the pushed data is safely in the transaction's
  // completion path.
  t.register_on_complete(
    new PG_SendMessageOnConn(
      get_parent(), reply, m->get_connection()));

  get_parent()->queue_transaction(std::move(t));
}
843 | ||
// Runs on the recovery queue after a batch of pulls has been applied:
// for each completed object, clear its pull state and either push it
// onward to peers that still need it or declare it globally recovered,
// then submit the resulting recovery op at the original priority.
struct C_ReplicatedBackend_OnPullComplete : GenContext<ThreadPool::TPHandle&> {
  ReplicatedBackend *bc;
  list<ReplicatedBackend::pull_complete_info> to_continue;
  int priority;
  C_ReplicatedBackend_OnPullComplete(ReplicatedBackend *bc, int priority)
    : bc(bc), priority(priority) {}

  void finish(ThreadPool::TPHandle &handle) override {
    ReplicatedBackend::RPGHandle *h = bc->_open_recovery_op();
    for (auto &&i: to_continue) {
      auto j = bc->pulling.find(i.hoid);
      assert(j != bc->pulling.end());
      // Grab the obc before clear_pull() drops the pulling entry.
      ObjectContextRef obc = j->second.obc;
      bc->clear_pull(j, false /* already did it */);
      if (!bc->start_pushes(i.hoid, obc, h)) {
	// No peer still needs this object: it is fully recovered.
	bc->get_parent()->on_global_recover(
	  i.hoid, i.stat);
      }
      handle.reset_tp_timeout();
    }
    bc->run_recovery_op(h, priority);
  }
};
867 | ||
// Primary side of recovery pull: apply the pushed data from a peer in
// one transaction.  `replies` starts with one sentinel PullOp; each
// response that needs a follow-up pull appends another, and the unused
// trailing element is erased before sending.  Objects whose pull is
// fully satisfied go to C_ReplicatedBackend_OnPullComplete for onward
// pushes.  Aborts if the failsafe full threshold has been reached.
void ReplicatedBackend::_do_pull_response(OpRequestRef op)
{
  const MOSDPGPush *m = static_cast<const MOSDPGPush *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_PUSH);
  pg_shard_t from = m->from;

  op->mark_started();

  vector<PullOp> replies(1);

  ostringstream ss;
  if (get_parent()->check_failsafe_full(ss)) {
    dout(10) << __func__ << " Out of space (failsafe) processing pull response (push): " << ss.str() << dendl;
    ceph_abort();
  }

  ObjectStore::Transaction t;
  list<pull_complete_info> to_continue;
  for (vector<PushOp>::const_iterator i = m->pushes.begin();
       i != m->pushes.end();
       ++i) {
    // `more` means another pull round is needed for this object.
    bool more = handle_pull_response(from, *i, &(replies.back()), &to_continue, &t);
    if (more)
      replies.push_back(PullOp());
  }
  if (!to_continue.empty()) {
    C_ReplicatedBackend_OnPullComplete *c =
      new C_ReplicatedBackend_OnPullComplete(
	this,
	m->get_priority());
    c->to_continue.swap(to_continue);
    t.register_on_complete(
      new PG_RecoveryQueueAsync(
	get_parent(),
	get_parent()->bless_gencontext(c)));
  }
  // Drop the unused sentinel slot.
  replies.erase(replies.end() - 1);

  if (replies.size()) {
    MOSDPGPull *reply = new MOSDPGPull;
    reply->from = parent->whoami_shard();
    reply->set_priority(m->get_priority());
    reply->pgid = get_info().pgid;
    reply->map_epoch = m->map_epoch;
    reply->min_epoch = m->min_epoch;
    reply->set_pulls(&replies);
    reply->compute_cost(cct);

    t.register_on_complete(
      new PG_SendMessageOnConn(
	get_parent(), reply, m->get_connection()));
  }

  get_parent()->queue_transaction(std::move(t));
}
923 | ||
924 | void ReplicatedBackend::do_pull(OpRequestRef op) | |
925 | { | |
926 | MOSDPGPull *m = static_cast<MOSDPGPull *>(op->get_nonconst_req()); | |
927 | assert(m->get_type() == MSG_OSD_PG_PULL); | |
928 | pg_shard_t from = m->from; | |
929 | ||
930 | map<pg_shard_t, vector<PushOp> > replies; | |
931 | vector<PullOp> pulls; | |
932 | m->take_pulls(&pulls); | |
933 | for (auto& i : pulls) { | |
934 | replies[from].push_back(PushOp()); | |
935 | handle_pull(from, i, &(replies[from].back())); | |
936 | } | |
937 | send_pushes(m->get_priority(), replies); | |
938 | } | |
939 | ||
// Handle a replica's acknowledgement of our pushes.  `replies` starts
// with one sentinel PushOp; each reply that requires a further push
// round appends another, and the unused trailing element is erased
// before the follow-up pushes are sent back to the same peer.
void ReplicatedBackend::do_push_reply(OpRequestRef op)
{
  const MOSDPGPushReply *m = static_cast<const MOSDPGPushReply *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_PUSH_REPLY);
  pg_shard_t from = m->from;

  vector<PushOp> replies(1);
  for (vector<PushReplyOp>::const_iterator i = m->replies.begin();
       i != m->replies.end();
       ++i) {
    // `more` means this object needs another push round.
    bool more = handle_push_reply(from, *i, &(replies.back()));
    if (more)
      replies.push_back(PushOp());
  }
  // Drop the unused sentinel slot.
  replies.erase(replies.end() - 1);

  map<pg_shard_t, vector<PushOp> > _replies;
  _replies[from].swap(replies);
  send_pushes(m->get_priority(), _replies);
}
960 | ||
// Build the MOSDRepOp shipped to one replica for this update.  Peers we
// should not send real data to (e.g. past their backfill boundary, per
// should_send_op) receive an empty transaction — they only need the log
// entries — while everyone else gets the fully encoded transaction.
Message * ReplicatedBackend::generate_subop(
  const hobject_t &soid,
  const eversion_t &at_version,
  ceph_tid_t tid,
  osd_reqid_t reqid,
  eversion_t pg_trim_to,
  eversion_t pg_roll_forward_to,
  hobject_t new_temp_oid,
  hobject_t discard_temp_oid,
  const vector<pg_log_entry_t> &log_entries,
  boost::optional<pg_hit_set_history_t> &hset_hist,
  ObjectStore::Transaction &op_t,
  pg_shard_t peer,
  const pg_info_t &pinfo)
{
  int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
  // forward the write/update/whatever
  MOSDRepOp *wr = new MOSDRepOp(
    reqid, parent->whoami_shard(),
    spg_t(get_info().pgid.pgid, peer.shard),
    soid, acks_wanted,
    get_osdmap()->get_epoch(),
    parent->get_last_peering_reset_epoch(),
    tid, at_version);

  // ship resulting transaction, log entries, and pg_stats
  if (!parent->should_send_op(peer, soid)) {
    dout(10) << "issue_repop shipping empty opt to osd." << peer
	     <<", object " << soid
	     << " beyond MAX(last_backfill_started "
	     << ", pinfo.last_backfill "
	     << pinfo.last_backfill << ")" << dendl;
    ObjectStore::Transaction t;
    ::encode(t, wr->get_data());
  } else {
    ::encode(op_t, wr->get_data());
    wr->get_header().data_off = op_t.get_data_alignment();
  }

  ::encode(log_entries, wr->logbl);

  if (pinfo.is_incomplete())
    wr->pg_stats = pinfo.stats;  // reflects backfill progress
  else
    wr->pg_stats = get_info().stats;

  wr->pg_trim_to = pg_trim_to;
  wr->pg_roll_forward_to = pg_roll_forward_to;

  wr->new_temp_oid = new_temp_oid;
  wr->discard_temp_oid = discard_temp_oid;
  wr->updated_hit_set_history = hset_hist;
  return wr;
}
1015 | ||
// Send the replicated-write message built by generate_subop() to every
// acting/backfill shard other than ourselves, and note the peers we are
// now waiting on for op tracking.
void ReplicatedBackend::issue_op(
  const hobject_t &soid,
  const eversion_t &at_version,
  ceph_tid_t tid,
  osd_reqid_t reqid,
  eversion_t pg_trim_to,
  eversion_t pg_roll_forward_to,
  hobject_t new_temp_oid,
  hobject_t discard_temp_oid,
  const vector<pg_log_entry_t> &log_entries,
  boost::optional<pg_hit_set_history_t> &hset_hist,
  InProgressOp *op,
  ObjectStore::Transaction &op_t)
{
  if (op->op)
    op->op->pg_trace.event("issue replication ops");

  // record which replicas we are waiting on (size > 1 means there is at
  // least one shard besides ourselves)
  if (parent->get_actingbackfill_shards().size() > 1) {
    ostringstream ss;
    set<pg_shard_t> replicas = parent->get_actingbackfill_shards();
    replicas.erase(parent->whoami_shard());
    ss << "waiting for subops from " << replicas;
    if (op->op)
      op->op->mark_sub_op_sent(ss.str());
  }
  for (set<pg_shard_t>::const_iterator i =
	 parent->get_actingbackfill_shards().begin();
       i != parent->get_actingbackfill_shards().end();
       ++i) {
    if (*i == parent->whoami_shard()) continue;  // never send to self
    pg_shard_t peer = *i;
    const pg_info_t &pinfo = parent->get_shard_info().find(peer)->second;

    Message *wr;
    wr = generate_subop(
      soid,
      at_version,
      tid,
      reqid,
      pg_trim_to,
      pg_roll_forward_to,
      new_temp_oid,
      discard_temp_oid,
      log_entries,
      hset_hist,
      op_t,
      peer,
      pinfo);
    if (op->op)
      wr->trace.init("replicated op", nullptr, &op->op->pg_trace);
    get_parent()->send_message_osd_cluster(
      peer.osd, wr, get_osdmap()->get_epoch());
  }
}
1070 | ||
1071 | // sub op modify | |
1072 | void ReplicatedBackend::do_repop(OpRequestRef op) | |
1073 | { | |
1074 | static_cast<MOSDRepOp*>(op->get_nonconst_req())->finish_decode(); | |
1075 | const MOSDRepOp *m = static_cast<const MOSDRepOp *>(op->get_req()); | |
1076 | int msg_type = m->get_type(); | |
1077 | assert(MSG_OSD_REPOP == msg_type); | |
1078 | ||
1079 | const hobject_t& soid = m->poid; | |
1080 | ||
1081 | dout(10) << __func__ << " " << soid | |
1082 | << " v " << m->version | |
1083 | << (m->logbl.length() ? " (transaction)" : " (parallel exec") | |
1084 | << " " << m->logbl.length() | |
1085 | << dendl; | |
1086 | ||
1087 | // sanity checks | |
1088 | assert(m->map_epoch >= get_info().history.same_interval_since); | |
1089 | ||
1090 | // we better not be missing this. | |
1091 | assert(!parent->get_log().get_missing().is_missing(soid)); | |
1092 | ||
1093 | int ackerosd = m->get_source().num(); | |
1094 | ||
1095 | op->mark_started(); | |
1096 | ||
1097 | RepModifyRef rm(std::make_shared<RepModify>()); | |
1098 | rm->op = op; | |
1099 | rm->ackerosd = ackerosd; | |
1100 | rm->last_complete = get_info().last_complete; | |
1101 | rm->epoch_started = get_osdmap()->get_epoch(); | |
1102 | ||
1103 | assert(m->logbl.length()); | |
1104 | // shipped transaction and log entries | |
1105 | vector<pg_log_entry_t> log; | |
1106 | ||
1107 | bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin(); | |
1108 | ::decode(rm->opt, p); | |
1109 | ||
1110 | if (m->new_temp_oid != hobject_t()) { | |
1111 | dout(20) << __func__ << " start tracking temp " << m->new_temp_oid << dendl; | |
1112 | add_temp_obj(m->new_temp_oid); | |
1113 | } | |
1114 | if (m->discard_temp_oid != hobject_t()) { | |
1115 | dout(20) << __func__ << " stop tracking temp " << m->discard_temp_oid << dendl; | |
1116 | if (rm->opt.empty()) { | |
1117 | dout(10) << __func__ << ": removing object " << m->discard_temp_oid | |
1118 | << " since we won't get the transaction" << dendl; | |
1119 | rm->localt.remove(coll, ghobject_t(m->discard_temp_oid)); | |
1120 | } | |
1121 | clear_temp_obj(m->discard_temp_oid); | |
1122 | } | |
1123 | ||
1124 | p = const_cast<bufferlist&>(m->logbl).begin(); | |
1125 | ::decode(log, p); | |
1126 | rm->opt.set_fadvise_flag(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); | |
1127 | ||
1128 | bool update_snaps = false; | |
1129 | if (!rm->opt.empty()) { | |
1130 | // If the opt is non-empty, we infer we are before | |
1131 | // last_backfill (according to the primary, not our | |
1132 | // not-quite-accurate value), and should update the | |
1133 | // collections now. Otherwise, we do it later on push. | |
1134 | update_snaps = true; | |
1135 | } | |
1136 | parent->update_stats(m->pg_stats); | |
1137 | parent->log_operation( | |
1138 | log, | |
1139 | m->updated_hit_set_history, | |
1140 | m->pg_trim_to, | |
1141 | m->pg_roll_forward_to, | |
1142 | update_snaps, | |
1143 | rm->localt); | |
1144 | ||
1145 | rm->opt.register_on_commit( | |
1146 | parent->bless_context( | |
1147 | new C_OSD_RepModifyCommit(this, rm))); | |
1148 | rm->localt.register_on_applied( | |
1149 | parent->bless_context( | |
1150 | new C_OSD_RepModifyApply(this, rm))); | |
1151 | vector<ObjectStore::Transaction> tls; | |
1152 | tls.reserve(2); | |
1153 | tls.push_back(std::move(rm->localt)); | |
1154 | tls.push_back(std::move(rm->opt)); | |
1155 | parent->queue_transactions(tls, op); | |
1156 | // op is cleaned up by oncommit/onapply when both are executed | |
1157 | } | |
1158 | ||
1159 | void ReplicatedBackend::repop_applied(RepModifyRef rm) | |
1160 | { | |
1161 | rm->op->mark_event("sub_op_applied"); | |
1162 | rm->applied = true; | |
1163 | rm->op->pg_trace.event("sup_op_applied"); | |
1164 | ||
1165 | dout(10) << __func__ << " on " << rm << " op " | |
1166 | << *rm->op->get_req() << dendl; | |
1167 | const Message *m = rm->op->get_req(); | |
1168 | const MOSDRepOp *req = static_cast<const MOSDRepOp*>(m); | |
1169 | eversion_t version = req->version; | |
1170 | ||
1171 | // send ack to acker only if we haven't sent a commit already | |
1172 | if (!rm->committed) { | |
1173 | Message *ack = new MOSDRepOpReply( | |
1174 | req, parent->whoami_shard(), | |
1175 | 0, get_osdmap()->get_epoch(), req->min_epoch, CEPH_OSD_FLAG_ACK); | |
1176 | ack->set_priority(CEPH_MSG_PRIO_HIGH); // this better match commit priority! | |
1177 | ack->trace = rm->op->pg_trace; | |
1178 | get_parent()->send_message_osd_cluster( | |
1179 | rm->ackerosd, ack, get_osdmap()->get_epoch()); | |
1180 | } | |
1181 | ||
1182 | parent->op_applied(version); | |
1183 | } | |
1184 | ||
1185 | void ReplicatedBackend::repop_commit(RepModifyRef rm) | |
1186 | { | |
1187 | rm->op->mark_commit_sent(); | |
1188 | rm->op->pg_trace.event("sup_op_commit"); | |
1189 | rm->committed = true; | |
1190 | ||
1191 | // send commit. | |
1192 | const MOSDRepOp *m = static_cast<const MOSDRepOp*>(rm->op->get_req()); | |
1193 | assert(m->get_type() == MSG_OSD_REPOP); | |
1194 | dout(10) << __func__ << " on op " << *m | |
1195 | << ", sending commit to osd." << rm->ackerosd | |
1196 | << dendl; | |
1197 | assert(get_osdmap()->is_up(rm->ackerosd)); | |
1198 | ||
1199 | get_parent()->update_last_complete_ondisk(rm->last_complete); | |
1200 | ||
1201 | MOSDRepOpReply *reply = new MOSDRepOpReply( | |
1202 | m, | |
1203 | get_parent()->whoami_shard(), | |
1204 | 0, get_osdmap()->get_epoch(), m->get_min_epoch(), CEPH_OSD_FLAG_ONDISK); | |
1205 | reply->set_last_complete_ondisk(rm->last_complete); | |
1206 | reply->set_priority(CEPH_MSG_PRIO_HIGH); // this better match ack priority! | |
1207 | reply->trace = rm->op->pg_trace; | |
1208 | get_parent()->send_message_osd_cluster( | |
1209 | rm->ackerosd, reply, get_osdmap()->get_epoch()); | |
1210 | ||
1211 | log_subop_stats(get_parent()->get_logger(), rm->op, l_osd_sop_w); | |
1212 | } | |
1213 | ||
1214 | ||
1215 | // =========================================================== | |
1216 | ||
// Compute which byte ranges of `head` must actually be pushed
// (data_subset) versus which can be cloned on the target from an
// existing clone object (clone_subsets).  Read locks taken on any clone
// we plan to clone from are recorded in `manager`.
void ReplicatedBackend::calc_head_subsets(
  ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
  const pg_missing_t& missing,
  const hobject_t &last_backfill,
  interval_set<uint64_t>& data_subset,
  map<hobject_t, interval_set<uint64_t>>& clone_subsets,
  ObcLockManager &manager)
{
  dout(10) << "calc_head_subsets " << head
	   << " clone_overlap " << snapset.clone_overlap << dendl;

  // start by assuming we must push the whole object
  uint64_t size = obc->obs.oi.size;
  if (size)
    data_subset.insert(0, size);

  // pools that allow incomplete clones can't be used as clone sources
  if (get_parent()->get_pool().allow_incomplete_clones()) {
    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
    return;
  }

  if (!cct->_conf->osd_recover_clone_overlap) {
    dout(10) << "calc_head_subsets " << head << " -- osd_recover_clone_overlap disabled" << dendl;
    return;
  }


  interval_set<uint64_t> cloning;
  interval_set<uint64_t> prev;
  if (size)
    prev.insert(0, size);

  // walk clones from newest to oldest, narrowing `prev` to the bytes
  // still overlapping the head; the first usable clone (present, inside
  // last_backfill, read-lockable) becomes the clone source.
  for (int j=snapset.clones.size()-1; j>=0; j--) {
    hobject_t c = head;
    c.snap = snapset.clones[j];
    prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
    if (!missing.is_missing(c) &&
	c < last_backfill &&
	get_parent()->try_lock_for_read(c, manager)) {
      dout(10) << "calc_head_subsets " << head << " has prev " << c
	       << " overlap " << prev << dendl;
      clone_subsets[c] = prev;
      cloning.union_of(prev);
      break;
    }
    dout(10) << "calc_head_subsets " << head << " does not have prev " << c
	     << " overlap " << prev << dendl;
  }


  // too fragmented to be worth cloning? fall back to pushing raw data
  if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
    dout(10) << "skipping clone, too many holes" << dendl;
    get_parent()->release_locks(manager);
    clone_subsets.clear();
    cloning.clear();
  }

  // what's left for us to push?
  data_subset.subtract(cloning);

  dout(10) << "calc_head_subsets " << head
	   << " data_subset " << data_subset
	   << " clone_subsets " << clone_subsets << dendl;
}
1280 | ||
// Like calc_head_subsets(), but for a clone object: both the next-older
// and next-newer clone may serve as a local clone source on the target.
// Read locks on any chosen source clone are recorded in `manager`.
void ReplicatedBackend::calc_clone_subsets(
  SnapSet& snapset, const hobject_t& soid,
  const pg_missing_t& missing,
  const hobject_t &last_backfill,
  interval_set<uint64_t>& data_subset,
  map<hobject_t, interval_set<uint64_t>>& clone_subsets,
  ObcLockManager &manager)
{
  dout(10) << "calc_clone_subsets " << soid
	   << " clone_overlap " << snapset.clone_overlap << dendl;

  // start by assuming we must push the whole clone
  uint64_t size = snapset.clone_size[soid.snap];
  if (size)
    data_subset.insert(0, size);

  // pools that allow incomplete clones can't be used as clone sources
  if (get_parent()->get_pool().allow_incomplete_clones()) {
    dout(10) << __func__ << ": caching (was) enabled, skipping clone subsets" << dendl;
    return;
  }

  if (!cct->_conf->osd_recover_clone_overlap) {
    dout(10) << "calc_clone_subsets " << soid << " -- osd_recover_clone_overlap disabled" << dendl;
    return;
  }

  // locate soid.snap's position in the clone list
  unsigned i;
  for (i=0; i < snapset.clones.size(); i++)
    if (snapset.clones[i] == soid.snap)
      break;

  // any overlap with next older clone?
  interval_set<uint64_t> cloning;
  interval_set<uint64_t> prev;
  if (size)
    prev.insert(0, size);
  for (int j=i-1; j>=0; j--) {
    hobject_t c = soid;
    c.snap = snapset.clones[j];
    prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]);
    if (!missing.is_missing(c) &&
	c < last_backfill &&
	get_parent()->try_lock_for_read(c, manager)) {
      dout(10) << "calc_clone_subsets " << soid << " has prev " << c
	       << " overlap " << prev << dendl;
      clone_subsets[c] = prev;
      cloning.union_of(prev);
      break;
    }
    dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c
	     << " overlap " << prev << dendl;
  }

  // overlap with next newest?
  interval_set<uint64_t> next;
  if (size)
    next.insert(0, size);
  for (unsigned j=i+1; j<snapset.clones.size(); j++) {
    hobject_t c = soid;
    c.snap = snapset.clones[j];
    next.intersection_of(snapset.clone_overlap[snapset.clones[j-1]]);
    if (!missing.is_missing(c) &&
	c < last_backfill &&
	get_parent()->try_lock_for_read(c, manager)) {
      dout(10) << "calc_clone_subsets " << soid << " has next " << c
	       << " overlap " << next << dendl;
      clone_subsets[c] = next;
      cloning.union_of(next);
      break;
    }
    dout(10) << "calc_clone_subsets " << soid << " does not have next " << c
	     << " overlap " << next << dendl;
  }

  // too fragmented to be worth cloning? fall back to pushing raw data
  if (cloning.num_intervals() > cct->_conf->osd_recover_clone_overlap_limit) {
    dout(10) << "skipping clone, too many holes" << dendl;
    get_parent()->release_locks(manager);
    clone_subsets.clear();
    cloning.clear();
  }


  // what's left for us to push?
  data_subset.subtract(cloning);

  dout(10) << "calc_clone_subsets " << soid
	   << " data_subset " << data_subset
	   << " clone_subsets " << clone_subsets << dendl;
}
1369 | ||
// Queue a pull of `soid` at version `v` from a randomly chosen peer
// known to have it: record the in-flight state in `pulling` /
// `pull_from_peer` and append the PullOp to h->pulls for that shard.
void ReplicatedBackend::prepare_pull(
  eversion_t v,
  const hobject_t& soid,
  ObjectContextRef headctx,
  RPGHandle *h)
{
  assert(get_parent()->get_local_missing().get_items().count(soid));
  eversion_t _v = get_parent()->get_local_missing().get_items().find(
    soid)->second.need;
  assert(_v == v);
  const map<hobject_t, set<pg_shard_t>> &missing_loc(
    get_parent()->get_missing_loc_shards());
  const map<pg_shard_t, pg_missing_t > &peer_missing(
    get_parent()->get_shard_missing());
  map<hobject_t, set<pg_shard_t>>::const_iterator q = missing_loc.find(soid);
  assert(q != missing_loc.end());
  assert(!q->second.empty());

  // pick a pullee at random among the shards that have the object
  vector<pg_shard_t> shuffle(q->second.begin(), q->second.end());
  random_shuffle(shuffle.begin(), shuffle.end());
  vector<pg_shard_t>::iterator p = shuffle.begin();
  assert(get_osdmap()->is_up(p->osd));
  pg_shard_t fromshard = *p;

  dout(7) << "pull " << soid
	  << " v " << v
	  << " on osds " << q->second
	  << " from osd." << fromshard
	  << dendl;

  assert(peer_missing.count(fromshard));
  const pg_missing_t &pmissing = peer_missing.find(fromshard)->second;
  if (pmissing.is_missing(soid, v)) {
    assert(pmissing.get_items().find(soid)->second.have != v);
    dout(10) << "pulling soid " << soid << " from osd " << fromshard
	     << " at version " << pmissing.get_items().find(soid)->second.have
	     << " rather than at version " << v << dendl;
    v = pmissing.get_items().find(soid)->second.have;
    // only legal when the log shows a LOST_REVERT reverting to exactly
    // the version the peer has
    assert(get_parent()->get_log().get_log().objects.count(soid) &&
	   (get_parent()->get_log().get_log().objects.find(soid)->second->op ==
	    pg_log_entry_t::LOST_REVERT) &&
	   (get_parent()->get_log().get_log().objects.find(
	     soid)->second->reverting_to ==
	    v));
  }

  ObjectRecoveryInfo recovery_info;
  ObcLockManager lock_manager;

  if (soid.is_snap()) {
    // pulling a clone requires the local head (or snapdir) so we can
    // consult the SnapSet
    assert(!get_parent()->get_local_missing().is_missing(
	     soid.get_head()) ||
	   !get_parent()->get_local_missing().is_missing(
	     soid.get_snapdir()));
    assert(headctx);
    // check snapset
    SnapSetContext *ssc = headctx->ssc;
    assert(ssc);
    dout(10) << " snapset " << ssc->snapset << dendl;
    recovery_info.ss = ssc->snapset;
    calc_clone_subsets(
      ssc->snapset, soid, get_parent()->get_local_missing(),
      get_info().last_backfill,
      recovery_info.copy_subset,
      recovery_info.clone_subset,
      lock_manager);
    // FIXME: this may overestimate if we are pulling multiple clones in parallel...
    dout(10) << " pulling " << recovery_info << dendl;

    assert(ssc->snapset.clone_size.count(soid.snap));
    recovery_info.size = ssc->snapset.clone_size[soid.snap];
  } else {
    // pulling head or unversioned object.
    // always pull the whole thing.
    recovery_info.copy_subset.insert(0, (uint64_t)-1);
    recovery_info.size = ((uint64_t)-1);  // unknown; peer will tell us
  }

  h->pulls[fromshard].push_back(PullOp());
  PullOp &op = h->pulls[fromshard].back();
  op.soid = soid;

  op.recovery_info = recovery_info;
  op.recovery_info.soid = soid;
  op.recovery_info.version = v;
  op.recovery_progress.data_complete = false;
  op.recovery_progress.omap_complete = false;
  op.recovery_progress.data_recovered_to = 0;
  op.recovery_progress.first = true;

  // track the in-flight pull
  assert(!pulling.count(soid));
  pull_from_peer[fromshard].insert(soid);
  PullInfo &pi = pulling[soid];
  pi.from = fromshard;
  pi.soid = soid;
  pi.head_ctx = headctx;
  pi.recovery_info = op.recovery_info;
  pi.recovery_progress = op.recovery_progress;
  pi.cache_dont_need = h->cache_dont_need;
  pi.lock_manager = std::move(lock_manager);
}
1472 | ||
1473 | /* | |
1474 | * intelligently push an object to a replica. make use of existing | |
1475 | * clones/heads and dup data ranges where possible. | |
1476 | */ | |
// Prepare a push of `soid` to `peer`, using the peer's existing
// clones/head to avoid re-sending overlapping data where possible;
// falls back to a raw whole-object push when the local head/snapdir
// (and thus the SnapSet) is unavailable.
void ReplicatedBackend::prep_push_to_replica(
  ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
  PushOp *pop, bool cache_dont_need)
{
  const object_info_t& oi = obc->obs.oi;
  uint64_t size = obc->obs.oi.size;

  dout(10) << __func__ << ": " << soid << " v" << oi.version
	   << " size " << size << " to osd." << peer << dendl;

  map<hobject_t, interval_set<uint64_t>> clone_subsets;
  interval_set<uint64_t> data_subset;

  ObcLockManager lock_manager;
  // are we doing a clone on the replica?
  if (soid.snap && soid.snap < CEPH_NOSNAP) {
    hobject_t head = soid;
    head.snap = CEPH_NOSNAP;

    // try to base push off of clones that succeed/preceed poid
    // we need the head (and current SnapSet) locally to do that.
    if (get_parent()->get_local_missing().is_missing(head)) {
      dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl;
      return prep_push(obc, soid, peer, pop, cache_dont_need);
    }
    hobject_t snapdir = head;
    snapdir.snap = CEPH_SNAPDIR;
    if (get_parent()->get_local_missing().is_missing(snapdir)) {
      dout(15) << "push_to_replica missing snapdir " << snapdir
	       << ", pushing raw clone" << dendl;
      return prep_push(obc, soid, peer, pop, cache_dont_need);
    }

    SnapSetContext *ssc = obc->ssc;
    assert(ssc);
    dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
    pop->recovery_info.ss = ssc->snapset;
    map<pg_shard_t, pg_missing_t>::const_iterator pm =
      get_parent()->get_shard_missing().find(peer);
    assert(pm != get_parent()->get_shard_missing().end());
    map<pg_shard_t, pg_info_t>::const_iterator pi =
      get_parent()->get_shard_info().find(peer);
    assert(pi != get_parent()->get_shard_info().end());
    // send only what the peer can't clone from objects it already has
    calc_clone_subsets(
      ssc->snapset, soid,
      pm->second,
      pi->second.last_backfill,
      data_subset, clone_subsets,
      lock_manager);
  } else if (soid.snap == CEPH_NOSNAP) {
    // pushing head or unversioned object.
    // base this on partially on replica's clones?
    SnapSetContext *ssc = obc->ssc;
    assert(ssc);
    dout(15) << "push_to_replica snapset is " << ssc->snapset << dendl;
    calc_head_subsets(
      obc,
      ssc->snapset, soid, get_parent()->get_shard_missing().find(peer)->second,
      get_parent()->get_shard_info().find(peer)->second.last_backfill,
      data_subset, clone_subsets,
      lock_manager);
  }

  prep_push(
    obc,
    soid,
    peer,
    oi.version,
    data_subset,
    clone_subsets,
    pop,
    cache_dont_need,
    std::move(lock_manager));
}
1551 | ||
1552 | void ReplicatedBackend::prep_push(ObjectContextRef obc, | |
1553 | const hobject_t& soid, pg_shard_t peer, | |
1554 | PushOp *pop, bool cache_dont_need) | |
1555 | { | |
1556 | interval_set<uint64_t> data_subset; | |
1557 | if (obc->obs.oi.size) | |
1558 | data_subset.insert(0, obc->obs.oi.size); | |
1559 | map<hobject_t, interval_set<uint64_t>> clone_subsets; | |
1560 | ||
1561 | prep_push(obc, soid, peer, | |
1562 | obc->obs.oi.version, data_subset, clone_subsets, | |
1563 | pop, cache_dont_need, ObcLockManager()); | |
1564 | } | |
1565 | ||
// Record push state for (soid, peer) in `pushing` and build the first
// PushOp.  `data_subset`/`clone_subsets` normally come from
// calc_head_subsets()/calc_clone_subsets(), or cover the whole object.
void ReplicatedBackend::prep_push(
  ObjectContextRef obc,
  const hobject_t& soid, pg_shard_t peer,
  eversion_t version,
  interval_set<uint64_t> &data_subset,
  map<hobject_t, interval_set<uint64_t>>& clone_subsets,
  PushOp *pop,
  bool cache_dont_need,
  ObcLockManager &&lock_manager)
{
  get_parent()->begin_peer_recover(peer, soid);
  // take note.
  PushInfo &pi = pushing[soid][peer];
  pi.obc = obc;
  pi.recovery_info.size = obc->obs.oi.size;
  pi.recovery_info.copy_subset = data_subset;
  pi.recovery_info.clone_subset = clone_subsets;
  pi.recovery_info.soid = soid;
  pi.recovery_info.oi = obc->obs.oi;
  pi.recovery_info.ss = pop->recovery_info.ss;
  pi.recovery_info.version = version;
  pi.lock_manager = std::move(lock_manager);

  // build the first chunk and remember how far it got us
  ObjectRecoveryProgress new_progress;
  int r = build_push_op(pi.recovery_info,
			pi.recovery_progress,
			&new_progress,
			pop,
			&(pi.stat), cache_dont_need);
  assert(r == 0);
  pi.recovery_progress = new_progress;
}
1598 | ||
// Apply one chunk of pushed object data/omap/attrs to the store via *t.
//
// Unless the whole object arrives in one chunk (first && complete), the
// data is staged in a temp recovery object; on the final chunk the temp
// object is renamed into place and submit_push_complete() fills in any
// locally-cloned ranges.
void ReplicatedBackend::submit_push_data(
  const ObjectRecoveryInfo &recovery_info,
  bool first,
  bool complete,
  bool cache_dont_need,
  const interval_set<uint64_t> &intervals_included,
  bufferlist data_included,
  bufferlist omap_header,
  const map<string, bufferlist> &attrs,
  const map<string, bufferlist> &omap_entries,
  ObjectStore::Transaction *t)
{
  hobject_t target_oid;
  if (first && complete) {
    // single-chunk push: write straight to the final oid
    target_oid = recovery_info.soid;
  } else {
    target_oid = get_parent()->get_temp_recovery_object(recovery_info.soid,
							recovery_info.version);
    if (first) {
      dout(10) << __func__ << ": Adding oid "
	       << target_oid << " in the temp collection" << dendl;
      add_temp_obj(target_oid);
    }
  }

  if (first) {
    // start from a clean object of the right size, with its omap header
    // and allocation hints in place
    t->remove(coll, ghobject_t(target_oid));
    t->touch(coll, ghobject_t(target_oid));
    t->truncate(coll, ghobject_t(target_oid), recovery_info.size);
    if (omap_header.length())
      t->omap_setheader(coll, ghobject_t(target_oid), omap_header);

    bufferlist bv = attrs.at(OI_ATTR);
    object_info_t oi(bv);
    t->set_alloc_hint(coll, ghobject_t(target_oid),
		      oi.expected_object_size,
		      oi.expected_write_size,
		      oi.alloc_hint_flags);
  }
  // write each pushed extent; data_included holds the intervals'
  // payloads concatenated in order
  uint64_t off = 0;
  uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
  if (cache_dont_need)
    fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
  for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
       p != intervals_included.end();
       ++p) {
    bufferlist bit;
    bit.substr_of(data_included, off, p.get_len());
    t->write(coll, ghobject_t(target_oid),
	     p.get_start(), p.get_len(), bit, fadvise_flags);
    off += p.get_len();
  }

  if (!omap_entries.empty())
    t->omap_setkeys(coll, ghobject_t(target_oid), omap_entries);
  if (!attrs.empty())
    t->setattrs(coll, ghobject_t(target_oid), attrs);

  if (complete) {
    if (!first) {
      // final chunk: move the staged temp object into its final place
      dout(10) << __func__ << ": Removing oid "
	       << target_oid << " from the temp collection" << dendl;
      clear_temp_obj(target_oid);
      t->remove(coll, ghobject_t(recovery_info.soid));
      t->collection_move_rename(coll, ghobject_t(target_oid),
				coll, ghobject_t(recovery_info.soid));
    }

    submit_push_complete(recovery_info, t);
  }
}
1670 | ||
1671 | void ReplicatedBackend::submit_push_complete( | |
1672 | const ObjectRecoveryInfo &recovery_info, | |
1673 | ObjectStore::Transaction *t) | |
1674 | { | |
1675 | for (map<hobject_t, interval_set<uint64_t>>::const_iterator p = | |
1676 | recovery_info.clone_subset.begin(); | |
1677 | p != recovery_info.clone_subset.end(); | |
1678 | ++p) { | |
1679 | for (interval_set<uint64_t>::const_iterator q = p->second.begin(); | |
1680 | q != p->second.end(); | |
1681 | ++q) { | |
1682 | dout(15) << " clone_range " << p->first << " " | |
1683 | << q.get_start() << "~" << q.get_len() << dendl; | |
1684 | t->clone_range(coll, ghobject_t(p->first), ghobject_t(recovery_info.soid), | |
1685 | q.get_start(), q.get_len(), q.get_start()); | |
1686 | } | |
1687 | } | |
1688 | } | |
1689 | ||
// Recompute copy/clone subsets for a clone once its SnapSet is known
// locally (the estimate made at prepare_pull() time may predate it).
// Head and snapdir objects are returned unchanged.  Any previously held
// read locks in `manager` are released and re-taken as needed.
ObjectRecoveryInfo ReplicatedBackend::recalc_subsets(
  const ObjectRecoveryInfo& recovery_info,
  SnapSetContext *ssc,
  ObcLockManager &manager)
{
  if (!recovery_info.soid.snap || recovery_info.soid.snap >= CEPH_NOSNAP)
    return recovery_info;
  ObjectRecoveryInfo new_info = recovery_info;
  new_info.copy_subset.clear();
  new_info.clone_subset.clear();
  assert(ssc);
  get_parent()->release_locks(manager); // might already have locks
  calc_clone_subsets(
    ssc->snapset, new_info.soid, get_parent()->get_local_missing(),
    get_info().last_backfill,
    new_info.copy_subset, new_info.clone_subset,
    manager);
  return new_info;
}
1709 | ||
// Process one PushOp received in answer to one of our pulls.
//
// Returns true (with *response filled in) when more chunks must be
// requested from the peer; returns false when the pull is over --
// either complete (the object is appended to *to_continue and
// on_local_recover() is queued on *t) or aborted (the peer didn't have
// the object, or we are no longer pulling it).
bool ReplicatedBackend::handle_pull_response(
  pg_shard_t from, const PushOp &pop, PullOp *response,
  list<pull_complete_info> *to_continue,
  ObjectStore::Transaction *t)
{
  interval_set<uint64_t> data_included = pop.data_included;
  bufferlist data;
  data = pop.data;
  dout(10) << "handle_pull_response "
	   << pop.recovery_info
	   << pop.after_progress
	   << " data.size() is " << data.length()
	   << " data_included: " << data_included
	   << dendl;
  if (pop.version == eversion_t()) {
    // replica doesn't have it!
    _failed_push(from, pop.soid);
    return false;
  }

  const hobject_t &hoid = pop.soid;
  // intervals and payload must be consistent: both empty or both present
  assert((data_included.empty() && data.length() == 0) ||
	 (!data_included.empty() && data.length() > 0));

  auto piter = pulling.find(hoid);
  if (piter == pulling.end()) {
    // we are no longer pulling this object; drop the stale response
    return false;
  }

  PullInfo &pi = piter->second;
  if (pi.recovery_info.size == (uint64_t(-1))) {
    // size was unknown when the pull was issued; adopt the peer's
    pi.recovery_info.size = pop.recovery_info.size;
    pi.recovery_info.copy_subset.intersection_of(
      pop.recovery_info.copy_subset);
  }

  bool first = pi.recovery_progress.first;
  if (first) {
    // attrs only reference the origin bufferlist (decode from
    // MOSDPGPush message) whose size is much greater than attrs in
    // recovery. If obc cache it (get_obc maybe cache the attr), this
    // causes the whole origin bufferlist would not be free until obc
    // is evicted from obc cache. So rebuild the bufferlists before
    // cache it.
    auto attrset = pop.attrset;
    for (auto& a : attrset) {
      a.second.rebuild();
    }
    pi.obc = get_parent()->get_obc(pi.recovery_info.soid, attrset);
    pi.recovery_info.oi = pi.obc->obs.oi;
    pi.recovery_info = recalc_subsets(
      pi.recovery_info,
      pi.obc->ssc,
      pi.lock_manager);
  }


  // discard any pushed bytes outside the ranges we actually need
  interval_set<uint64_t> usable_intervals;
  bufferlist usable_data;
  trim_pushed_data(pi.recovery_info.copy_subset,
		   data_included,
		   data,
		   &usable_intervals,
		   &usable_data);
  data_included = usable_intervals;
  data.claim(usable_data);


  pi.recovery_progress = pop.after_progress;

  dout(10) << "new recovery_info " << pi.recovery_info
	   << ", new progress " << pi.recovery_progress
	   << dendl;

  bool complete = pi.is_complete();

  submit_push_data(pi.recovery_info, first,
		   complete, pi.cache_dont_need,
		   data_included, data,
		   pop.omap_header,
		   pop.attrset,
		   pop.omap_entries,
		   t);

  pi.stat.num_keys_recovered += pop.omap_entries.size();
  pi.stat.num_bytes_recovered += data.length();

  if (complete) {
    pi.stat.num_objects_recovered++;
    clear_pull_from(piter);
    to_continue->push_back({hoid, pi.stat});
    get_parent()->on_local_recover(
      hoid, pi.recovery_info, pi.obc, t);
    return false;
  } else {
    // ask the peer for the next chunk
    response->soid = pop.soid;
    response->recovery_info = pi.recovery_info;
    response->recovery_progress = pi.recovery_progress;
    return true;
  }
}
1811 | ||
1812 | void ReplicatedBackend::handle_push( | |
1813 | pg_shard_t from, const PushOp &pop, PushReplyOp *response, | |
1814 | ObjectStore::Transaction *t) | |
1815 | { | |
1816 | dout(10) << "handle_push " | |
1817 | << pop.recovery_info | |
1818 | << pop.after_progress | |
1819 | << dendl; | |
1820 | bufferlist data; | |
1821 | data = pop.data; | |
1822 | bool first = pop.before_progress.first; | |
1823 | bool complete = pop.after_progress.data_complete && | |
1824 | pop.after_progress.omap_complete; | |
1825 | ||
1826 | response->soid = pop.recovery_info.soid; | |
1827 | submit_push_data(pop.recovery_info, | |
1828 | first, | |
1829 | complete, | |
1830 | true, // must be replicate | |
1831 | pop.data_included, | |
1832 | data, | |
1833 | pop.omap_header, | |
1834 | pop.attrset, | |
1835 | pop.omap_entries, | |
1836 | t); | |
1837 | ||
1838 | if (complete) | |
1839 | get_parent()->on_local_recover( | |
1840 | pop.recovery_info.soid, | |
1841 | pop.recovery_info, | |
1842 | ObjectContextRef(), // ok, is replica | |
1843 | t); | |
1844 | } | |
1845 | ||
1846 | void ReplicatedBackend::send_pushes(int prio, map<pg_shard_t, vector<PushOp> > &pushes) | |
1847 | { | |
1848 | for (map<pg_shard_t, vector<PushOp> >::iterator i = pushes.begin(); | |
1849 | i != pushes.end(); | |
1850 | ++i) { | |
1851 | ConnectionRef con = get_parent()->get_con_osd_cluster( | |
1852 | i->first.osd, | |
1853 | get_osdmap()->get_epoch()); | |
1854 | if (!con) | |
1855 | continue; | |
1856 | vector<PushOp>::iterator j = i->second.begin(); | |
1857 | while (j != i->second.end()) { | |
1858 | uint64_t cost = 0; | |
1859 | uint64_t pushes = 0; | |
1860 | MOSDPGPush *msg = new MOSDPGPush(); | |
1861 | msg->from = get_parent()->whoami_shard(); | |
1862 | msg->pgid = get_parent()->primary_spg_t(); | |
1863 | msg->map_epoch = get_osdmap()->get_epoch(); | |
1864 | msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); | |
1865 | msg->set_priority(prio); | |
1866 | for (; | |
1867 | (j != i->second.end() && | |
1868 | cost < cct->_conf->osd_max_push_cost && | |
1869 | pushes < cct->_conf->osd_max_push_objects) ; | |
1870 | ++j) { | |
1871 | dout(20) << __func__ << ": sending push " << *j | |
1872 | << " to osd." << i->first << dendl; | |
1873 | cost += j->cost(cct); | |
1874 | pushes += 1; | |
1875 | msg->pushes.push_back(*j); | |
1876 | } | |
1877 | msg->set_cost(cost); | |
1878 | get_parent()->send_message_osd_cluster(msg, con); | |
1879 | } | |
1880 | } | |
1881 | } | |
1882 | ||
1883 | void ReplicatedBackend::send_pulls(int prio, map<pg_shard_t, vector<PullOp> > &pulls) | |
1884 | { | |
1885 | for (map<pg_shard_t, vector<PullOp> >::iterator i = pulls.begin(); | |
1886 | i != pulls.end(); | |
1887 | ++i) { | |
1888 | ConnectionRef con = get_parent()->get_con_osd_cluster( | |
1889 | i->first.osd, | |
1890 | get_osdmap()->get_epoch()); | |
1891 | if (!con) | |
1892 | continue; | |
1893 | dout(20) << __func__ << ": sending pulls " << i->second | |
1894 | << " to osd." << i->first << dendl; | |
1895 | MOSDPGPull *msg = new MOSDPGPull(); | |
1896 | msg->from = parent->whoami_shard(); | |
1897 | msg->set_priority(prio); | |
1898 | msg->pgid = get_parent()->primary_spg_t(); | |
1899 | msg->map_epoch = get_osdmap()->get_epoch(); | |
1900 | msg->min_epoch = get_parent()->get_last_peering_reset_epoch(); | |
1901 | msg->set_pulls(&i->second); | |
1902 | msg->compute_cost(cct); | |
1903 | get_parent()->send_message_osd_cluster(msg, con); | |
1904 | } | |
1905 | } | |
1906 | ||
// Build the next PushOp chunk for recovery_info.soid, resuming from
// 'progress'. Fills out_op with (on the first chunk) the omap header and
// xattrs, then as many omap entries and data extents as fit within
// osd_recovery_max_chunk. Updates *out_progress to where the next call
// should resume; updates *stat (if non-null) with recovered counts.
// Returns 0 on success or a negative errno.
int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
				     const ObjectRecoveryProgress &progress,
				     ObjectRecoveryProgress *out_progress,
				     PushOp *out_op,
				     object_stat_sum_t *stat,
				     bool cache_dont_need)
{
  ObjectRecoveryProgress _new_progress;
  if (!out_progress)
    out_progress = &_new_progress;
  ObjectRecoveryProgress &new_progress = *out_progress;
  new_progress = progress;

  dout(7) << "send_push_op " << recovery_info.soid
	  << " v " << recovery_info.version
	  << " size " << recovery_info.size
	  << " recovery_info: " << recovery_info
	  << dendl;

  if (progress.first) {
    // First chunk: ship the omap header and the full xattr set.
    int r = store->omap_get_header(coll, ghobject_t(recovery_info.soid), &out_op->omap_header);
    if(r < 0) {
      dout(1) << __func__ << " get omap header failed: " << cpp_strerror(-r) << dendl;
      return r;
    }
    r = store->getattrs(ch, ghobject_t(recovery_info.soid), out_op->attrset);
    if(r < 0) {
      dout(1) << __func__ << " getattrs failed: " << cpp_strerror(-r) << dendl;
      return r;
    }

    // Debug
    // Sanity-check that the local object's version matches what we were
    // asked to push; a mismatch means our copy is not the one the peer
    // expects, so abort rather than push bad data.
    bufferlist bv = out_op->attrset[OI_ATTR];
    object_info_t oi(bv);

    if (oi.version != recovery_info.version) {
      get_parent()->clog_error() << get_info().pgid << " push "
				 << recovery_info.soid << " v "
				 << recovery_info.version
				 << " failed because local copy is "
				 << oi.version;
      return -EINVAL;
    }

    new_progress.first = false;
  }

  // 'available' is the remaining per-chunk budget; omap entries consume
  // it first, then data extents get whatever is left.
  uint64_t available = cct->_conf->osd_recovery_max_chunk;
  if (!progress.omap_complete) {
    ObjectMap::ObjectMapIterator iter =
      store->get_omap_iterator(coll,
			       ghobject_t(recovery_info.soid));
    assert(iter);
    for (iter->lower_bound(progress.omap_recovered_to);
	 iter->valid();
	 iter->next(false)) {
      // Stop before this entry if we already have some and either the
      // per-chunk entry-count limit or the byte budget would be exceeded
      // (the check only triggers when non-empty, so at least one entry
      // always fits even if it alone exceeds the budget).
      if (!out_op->omap_entries.empty() &&
	  ((cct->_conf->osd_recovery_max_omap_entries_per_chunk > 0 &&
	    out_op->omap_entries.size() >= cct->_conf->osd_recovery_max_omap_entries_per_chunk) ||
	   available <= iter->key().size() + iter->value().length()))
	break;
      out_op->omap_entries.insert(make_pair(iter->key(), iter->value()));

      if ((iter->key().size() + iter->value().length()) <= available)
	available -= (iter->key().size() + iter->value().length());
      else
	available = 0;
    }
    if (!iter->valid())
      new_progress.omap_complete = true;
    else
      new_progress.omap_recovered_to = iter->key();
  }

  if (available > 0) {
    if (!recovery_info.copy_subset.empty()) {
      interval_set<uint64_t> copy_subset = recovery_info.copy_subset;
      // Restrict to extents that actually exist on disk; holes need not
      // be shipped.
      map<uint64_t, uint64_t> m;
      int r = store->fiemap(ch, ghobject_t(recovery_info.soid), 0,
                            copy_subset.range_end(), m);
      if (r >= 0)  {
        interval_set<uint64_t> fiemap_included(m);
        copy_subset.intersection_of(fiemap_included);
      } else {
        // intersection of copy_subset and empty interval_set would be empty anyway
        copy_subset.clear();
      }

      out_op->data_included.span_of(copy_subset, progress.data_recovered_to,
                                    available);
      if (out_op->data_included.empty()) // zero filled section, skip to end!
        new_progress.data_recovered_to = recovery_info.copy_subset.range_end();
      else
        new_progress.data_recovered_to = out_op->data_included.range_end();
    }
  } else {
    out_op->data_included.clear();
  }

  // Read each selected extent. A short read means we hit the current
  // end of the object, so trim/drop the remaining intervals and mark
  // the data phase complete.
  for (interval_set<uint64_t>::iterator p = out_op->data_included.begin();
       p != out_op->data_included.end();
       ++p) {
    bufferlist bit;
    store->read(ch, ghobject_t(recovery_info.soid),
		p.get_start(), p.get_len(), bit,
                cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
    if (p.get_len() != bit.length()) {
      dout(10) << " extent " << p.get_start() << "~" << p.get_len()
	       << " is actually " << p.get_start() << "~" << bit.length()
	       << dendl;
      interval_set<uint64_t>::iterator save = p++;
      if (bit.length() == 0)
        out_op->data_included.erase(save);  //Remove this empty interval
      else
        save.set_len(bit.length());
      // Remove any other intervals present
      while (p != out_op->data_included.end()) {
        interval_set<uint64_t>::iterator save = p++;
        out_op->data_included.erase(save);
      }
      new_progress.data_complete = true;
      out_op->data.claim_append(bit);
      break;
    }
    out_op->data.claim_append(bit);
  }

  if (new_progress.is_complete(recovery_info)) {
    new_progress.data_complete = true;
    if (stat)
      stat->num_objects_recovered++;
  }

  if (stat) {
    stat->num_keys_recovered += out_op->omap_entries.size();
    stat->num_bytes_recovered += out_op->data.length();
  }

  get_parent()->get_logger()->inc(l_osd_push);
  get_parent()->get_logger()->inc(l_osd_push_outb, out_op->data.length());
  
  // send
  out_op->version = recovery_info.version;
  out_op->soid = recovery_info.soid;
  out_op->recovery_info = recovery_info;
  out_op->after_progress = new_progress;
  out_op->before_progress = progress;
  return 0;
}
2056 | ||
2057 | void ReplicatedBackend::prep_push_op_blank(const hobject_t& soid, PushOp *op) | |
2058 | { | |
2059 | op->recovery_info.version = eversion_t(); | |
2060 | op->version = eversion_t(); | |
2061 | op->soid = soid; | |
2062 | } | |
2063 | ||
2064 | bool ReplicatedBackend::handle_push_reply( | |
2065 | pg_shard_t peer, const PushReplyOp &op, PushOp *reply) | |
2066 | { | |
2067 | const hobject_t &soid = op.soid; | |
2068 | if (pushing.count(soid) == 0) { | |
2069 | dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer | |
2070 | << ", or anybody else" | |
2071 | << dendl; | |
2072 | return false; | |
2073 | } else if (pushing[soid].count(peer) == 0) { | |
2074 | dout(10) << "huh, i wasn't pushing " << soid << " to osd." << peer | |
2075 | << dendl; | |
2076 | return false; | |
2077 | } else { | |
2078 | PushInfo *pi = &pushing[soid][peer]; | |
2079 | ||
2080 | if (!pi->recovery_progress.data_complete) { | |
2081 | dout(10) << " pushing more from, " | |
2082 | << pi->recovery_progress.data_recovered_to | |
2083 | << " of " << pi->recovery_info.copy_subset << dendl; | |
2084 | ObjectRecoveryProgress new_progress; | |
2085 | int r = build_push_op( | |
2086 | pi->recovery_info, | |
2087 | pi->recovery_progress, &new_progress, reply, | |
2088 | &(pi->stat)); | |
2089 | assert(r == 0); | |
2090 | pi->recovery_progress = new_progress; | |
2091 | return true; | |
2092 | } else { | |
2093 | // done! | |
2094 | get_parent()->on_peer_recover( | |
2095 | peer, soid, pi->recovery_info); | |
2096 | ||
2097 | get_parent()->release_locks(pi->lock_manager); | |
2098 | object_stat_sum_t stat = pi->stat; | |
2099 | pushing[soid].erase(peer); | |
2100 | pi = NULL; | |
2101 | ||
2102 | if (pushing[soid].empty()) { | |
2103 | get_parent()->on_global_recover(soid, stat); | |
2104 | pushing.erase(soid); | |
2105 | } else { | |
2106 | dout(10) << "pushed " << soid << ", still waiting for push ack from " | |
2107 | << pushing[soid].size() << " others" << dendl; | |
2108 | } | |
2109 | return false; | |
2110 | } | |
2111 | } | |
2112 | } | |
2113 | ||
2114 | void ReplicatedBackend::handle_pull(pg_shard_t peer, PullOp &op, PushOp *reply) | |
2115 | { | |
2116 | const hobject_t &soid = op.soid; | |
2117 | struct stat st; | |
2118 | int r = store->stat(ch, ghobject_t(soid), &st); | |
2119 | if (r != 0) { | |
2120 | get_parent()->clog_error() << get_info().pgid << " " | |
2121 | << peer << " tried to pull " << soid | |
2122 | << " but got " << cpp_strerror(-r); | |
2123 | prep_push_op_blank(soid, reply); | |
2124 | } else { | |
2125 | ObjectRecoveryInfo &recovery_info = op.recovery_info; | |
2126 | ObjectRecoveryProgress &progress = op.recovery_progress; | |
2127 | if (progress.first && recovery_info.size == ((uint64_t)-1)) { | |
2128 | // Adjust size and copy_subset | |
2129 | recovery_info.size = st.st_size; | |
2130 | recovery_info.copy_subset.clear(); | |
2131 | if (st.st_size) | |
2132 | recovery_info.copy_subset.insert(0, st.st_size); | |
2133 | assert(recovery_info.clone_subset.empty()); | |
2134 | } | |
2135 | ||
2136 | r = build_push_op(recovery_info, progress, 0, reply); | |
2137 | if (r < 0) | |
2138 | prep_push_op_blank(soid, reply); | |
2139 | } | |
2140 | } | |
2141 | ||
/**
 * trim received data to remove what we don't want
 *
 * @param copy_subset intervals we want
 * @param intervals_received intervals we got
 * @param data_received data we got
 * @param intervals_usable intervals we want to keep
 * @param data_usable matching data we want to keep
 */
2151 | void ReplicatedBackend::trim_pushed_data( | |
2152 | const interval_set<uint64_t> ©_subset, | |
2153 | const interval_set<uint64_t> &intervals_received, | |
2154 | bufferlist data_received, | |
2155 | interval_set<uint64_t> *intervals_usable, | |
2156 | bufferlist *data_usable) | |
2157 | { | |
2158 | if (intervals_received.subset_of(copy_subset)) { | |
2159 | *intervals_usable = intervals_received; | |
2160 | *data_usable = data_received; | |
2161 | return; | |
2162 | } | |
2163 | ||
2164 | intervals_usable->intersection_of(copy_subset, | |
2165 | intervals_received); | |
2166 | ||
2167 | uint64_t off = 0; | |
2168 | for (interval_set<uint64_t>::const_iterator p = intervals_received.begin(); | |
2169 | p != intervals_received.end(); | |
2170 | ++p) { | |
2171 | interval_set<uint64_t> x; | |
2172 | x.insert(p.get_start(), p.get_len()); | |
2173 | x.intersection_of(copy_subset); | |
2174 | for (interval_set<uint64_t>::const_iterator q = x.begin(); | |
2175 | q != x.end(); | |
2176 | ++q) { | |
2177 | bufferlist sub; | |
2178 | uint64_t data_off = off + (q.get_start() - p.get_start()); | |
2179 | sub.substr_of(data_received, data_off, q.get_len()); | |
2180 | data_usable->claim_append(sub); | |
2181 | } | |
2182 | off += p.get_len(); | |
2183 | } | |
2184 | } | |
2185 | ||
2186 | void ReplicatedBackend::_failed_push(pg_shard_t from, const hobject_t &soid) | |
2187 | { | |
2188 | list<pg_shard_t> fl = { from }; | |
2189 | get_parent()->failed_push(fl, soid); | |
2190 | ||
2191 | clear_pull(pulling.find(soid)); | |
2192 | } | |
2193 | ||
2194 | void ReplicatedBackend::clear_pull_from( | |
2195 | map<hobject_t, PullInfo>::iterator piter) | |
2196 | { | |
2197 | auto from = piter->second.from; | |
2198 | pull_from_peer[from].erase(piter->second.soid); | |
2199 | if (pull_from_peer[from].empty()) | |
2200 | pull_from_peer.erase(from); | |
2201 | } | |
2202 | ||
// Tear down the pull state at piter: optionally detach it from the
// per-peer index, release the recovery locks it held, and erase it
// from 'pulling'. piter is invalid afterwards.
void ReplicatedBackend::clear_pull(
  map<hobject_t, PullInfo>::iterator piter,
  bool clear_pull_from_peer)
{
  if (clear_pull_from_peer) {
    clear_pull_from(piter);
  }
  get_parent()->release_locks(piter->second.lock_manager);
  pulling.erase(piter);
}
2213 | ||
2214 | int ReplicatedBackend::start_pushes( | |
2215 | const hobject_t &soid, | |
2216 | ObjectContextRef obc, | |
2217 | RPGHandle *h) | |
2218 | { | |
2219 | int pushes = 0; | |
2220 | // who needs it? | |
2221 | assert(get_parent()->get_actingbackfill_shards().size() > 0); | |
2222 | for (set<pg_shard_t>::iterator i = | |
2223 | get_parent()->get_actingbackfill_shards().begin(); | |
2224 | i != get_parent()->get_actingbackfill_shards().end(); | |
2225 | ++i) { | |
2226 | if (*i == get_parent()->whoami_shard()) continue; | |
2227 | pg_shard_t peer = *i; | |
2228 | map<pg_shard_t, pg_missing_t>::const_iterator j = | |
2229 | get_parent()->get_shard_missing().find(peer); | |
2230 | assert(j != get_parent()->get_shard_missing().end()); | |
2231 | if (j->second.is_missing(soid)) { | |
2232 | ++pushes; | |
2233 | h->pushes[peer].push_back(PushOp()); | |
2234 | prep_push_to_replica(obc, soid, peer, | |
2235 | &(h->pushes[peer].back()), h->cache_dont_need); | |
2236 | } | |
2237 | } | |
2238 | return pushes; | |
2239 | } |