]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
17 | #ifndef CEPH_PG_LOG_H | |
18 | #define CEPH_PG_LOG_H | |
19 | ||
20 | // re-include our assert to clobber boost's | |
21 | #include "include/assert.h" | |
22 | #include "osd_types.h" | |
23 | #include "os/ObjectStore.h" | |
24 | #include <list> | |
25 | using namespace std; | |
26 | ||
// Bit flags naming the IndexedLog lookup tables (objects, caller_ops,
// extra_caller_ops). IndexedLog::index() ORs the flags it has built into
// IndexedLog::indexed_data; PGLOG_INDEXED_ALL is the union of all three.
27 | #define PGLOG_INDEXED_OBJECTS (1 << 0) | |
28 | #define PGLOG_INDEXED_CALLER_OPS (1 << 1) | |
29 | #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2) | |
30 | #define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | PGLOG_INDEXED_CALLER_OPS | PGLOG_INDEXED_EXTRA_CALLER_OPS) | |
31 | ||
32 | class CephContext; | |
33 | ||
34 | struct PGLog : DoutPrefixProvider { | |
35 | DoutPrefixProvider *prefix_provider; | |
36 | string gen_prefix() const override { | |
37 | return prefix_provider ? prefix_provider->gen_prefix() : ""; | |
38 | } | |
39 | unsigned get_subsys() const override { | |
40 | return prefix_provider ? prefix_provider->get_subsys() : | |
41 | (unsigned)ceph_subsys_osd; | |
42 | } | |
43 | CephContext *get_cct() const override { | |
44 | return cct; | |
45 | } | |
46 | ||
47 | ////////////////////////////// sub classes ////////////////////////////// | |
48 | struct LogEntryHandler { | |
49 | virtual void rollback( | |
50 | const pg_log_entry_t &entry) = 0; | |
51 | virtual void rollforward( | |
52 | const pg_log_entry_t &entry) = 0; | |
53 | virtual void trim( | |
54 | const pg_log_entry_t &entry) = 0; | |
55 | virtual void remove( | |
56 | const hobject_t &hoid) = 0; | |
57 | virtual void try_stash( | |
58 | const hobject_t &hoid, | |
59 | version_t v) = 0; | |
60 | virtual ~LogEntryHandler() {} | |
61 | }; | |
62 | ||
63 | /* Exceptions */ | |
64 | class read_log_and_missing_error : public buffer::error { | |
65 | public: | |
66 | explicit read_log_and_missing_error(const char *what) { | |
67 | snprintf(buf, sizeof(buf), "read_log_and_missing_error: %s", what); | |
68 | } | |
69 | const char *what() const throw () override { | |
70 | return buf; | |
71 | } | |
72 | private: | |
73 | char buf[512]; | |
74 | }; | |
75 | ||
76 | public: | |
77 | /** | |
78 | * IndexedLog - adds an in-memory index of the log, by oid, | |
79 | * plus some methods to manipulate it all. | |
80 | */ | |
81 | struct IndexedLog : public pg_log_t { | |
82 | mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful! | |
83 | mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops; | |
84 | mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops; | |
85 | ||
86 | // recovery pointers | |
87 | list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item | |
88 | version_t last_requested = 0; // last object requested by primary | |
89 | ||
90 | // | |
91 | private: | |
92 | mutable __u16 indexed_data = 0; | |
93 | /** | |
94 | * rollback_info_trimmed_to_riter points to the first log entry <= | |
95 | * rollback_info_trimmed_to | |
96 | * | |
97 | * It's a reverse_iterator because rend() is a natural representation for | |
98 | * tail, and rbegin() works nicely for head. | |
99 | */ | |
31f18b77 | 100 | mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator |
7c673cae FG |
101 | rollback_info_trimmed_to_riter; |
102 | ||
103 | template <typename F> | |
104 | void advance_can_rollback_to(eversion_t to, F &&f) { | |
105 | if (to > can_rollback_to) | |
106 | can_rollback_to = to; | |
107 | ||
108 | if (to > rollback_info_trimmed_to) | |
109 | rollback_info_trimmed_to = to; | |
110 | ||
111 | while (rollback_info_trimmed_to_riter != log.rbegin()) { | |
112 | --rollback_info_trimmed_to_riter; | |
113 | if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { | |
114 | ++rollback_info_trimmed_to_riter; | |
115 | break; | |
116 | } | |
117 | f(*rollback_info_trimmed_to_riter); | |
118 | } | |
119 | } | |
120 | ||
121 | void reset_rollback_info_trimmed_to_riter() { | |
122 | rollback_info_trimmed_to_riter = log.rbegin(); | |
123 | while (rollback_info_trimmed_to_riter != log.rend() && | |
124 | rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) | |
125 | ++rollback_info_trimmed_to_riter; | |
126 | } | |
127 | ||
128 | // indexes objects, caller ops and extra caller ops | |
129 | public: | |
130 | IndexedLog() : | |
131 | complete_to(log.end()), | |
132 | last_requested(0), | |
133 | indexed_data(0), | |
134 | rollback_info_trimmed_to_riter(log.rbegin()) | |
135 | {} | |
136 | ||
137 | template <typename... Args> | |
138 | IndexedLog(Args&&... args) : | |
139 | pg_log_t(std::forward<Args>(args)...), | |
140 | complete_to(log.end()), | |
141 | last_requested(0), | |
142 | indexed_data(0), | |
143 | rollback_info_trimmed_to_riter(log.rbegin()) { | |
144 | reset_rollback_info_trimmed_to_riter(); | |
145 | index(); | |
146 | } | |
147 | ||
148 | IndexedLog(const IndexedLog &rhs) : | |
149 | pg_log_t(rhs), | |
150 | complete_to(log.end()), | |
151 | last_requested(rhs.last_requested), | |
152 | indexed_data(0), | |
153 | rollback_info_trimmed_to_riter(log.rbegin()) { | |
154 | reset_rollback_info_trimmed_to_riter(); | |
155 | index(rhs.indexed_data); | |
156 | } | |
157 | IndexedLog &operator=(const IndexedLog &rhs) { | |
158 | this->~IndexedLog(); | |
159 | new (this) IndexedLog(rhs); | |
160 | return *this; | |
161 | } | |
162 | ||
163 | void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { | |
164 | advance_can_rollback_to( | |
165 | to, | |
166 | [&](pg_log_entry_t &entry) { | |
167 | h->trim(entry); | |
168 | }); | |
169 | } | |
170 | void roll_forward_to(eversion_t to, LogEntryHandler *h) { | |
171 | advance_can_rollback_to( | |
172 | to, | |
173 | [&](pg_log_entry_t &entry) { | |
174 | h->rollforward(entry); | |
175 | }); | |
176 | } | |
177 | ||
178 | void skip_can_rollback_to_to_head() { | |
179 | advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); | |
180 | } | |
181 | ||
31f18b77 | 182 | mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { |
7c673cae FG |
183 | auto divergent = pg_log_t::rewind_from_head(newhead); |
184 | index(); | |
185 | reset_rollback_info_trimmed_to_riter(); | |
186 | return divergent; | |
187 | } | |
188 | ||
189 | template <typename T> | |
190 | void scan_log_after( | |
191 | const eversion_t &bound, ///< [in] scan entries > bound | |
192 | T &&f) const { | |
193 | auto iter = log.rbegin(); | |
194 | while (iter != log.rend() && iter->version > bound) | |
195 | ++iter; | |
196 | ||
197 | while (true) { | |
198 | if (iter == log.rbegin()) | |
199 | break; | |
200 | f(*(--iter)); | |
201 | } | |
202 | } | |
203 | ||
204 | /****/ | |
205 | void claim_log_and_clear_rollback_info(const pg_log_t& o) { | |
206 | // we must have already trimmed the old entries | |
207 | assert(rollback_info_trimmed_to == head); | |
208 | assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
209 | ||
210 | *this = IndexedLog(o); | |
211 | ||
212 | skip_can_rollback_to_to_head(); | |
213 | index(); | |
214 | } | |
215 | ||
216 | void split_out_child( | |
217 | pg_t child_pgid, | |
218 | unsigned split_bits, | |
219 | IndexedLog *target); | |
220 | ||
221 | void zero() { | |
222 | // we must have already trimmed the old entries | |
223 | assert(rollback_info_trimmed_to == head); | |
224 | assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
225 | ||
226 | unindex(); | |
227 | pg_log_t::clear(); | |
228 | rollback_info_trimmed_to_riter = log.rbegin(); | |
229 | reset_recovery_pointers(); | |
230 | } | |
231 | void clear() { | |
232 | skip_can_rollback_to_to_head(); | |
233 | zero(); | |
234 | } | |
235 | void reset_recovery_pointers() { | |
236 | complete_to = log.end(); | |
237 | last_requested = 0; | |
238 | } | |
239 | ||
240 | bool logged_object(const hobject_t& oid) const { | |
241 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
242 | index_objects(); | |
243 | } | |
244 | return objects.count(oid); | |
245 | } | |
246 | ||
247 | bool logged_req(const osd_reqid_t &r) const { | |
248 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { | |
249 | index_caller_ops(); | |
250 | } | |
251 | if (!caller_ops.count(r)) { | |
252 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
253 | index_extra_caller_ops(); | |
254 | } | |
255 | return extra_caller_ops.count(r); | |
256 | } | |
257 | return true; | |
258 | } | |
259 | ||
260 | bool get_request( | |
261 | const osd_reqid_t &r, | |
262 | eversion_t *version, | |
263 | version_t *user_version, | |
264 | int *return_code) const { | |
265 | assert(version); | |
266 | assert(user_version); | |
267 | assert(return_code); | |
268 | ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p; | |
269 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { | |
270 | index_caller_ops(); | |
271 | } | |
272 | p = caller_ops.find(r); | |
273 | if (p != caller_ops.end()) { | |
274 | *version = p->second->version; | |
275 | *user_version = p->second->user_version; | |
276 | *return_code = p->second->return_code; | |
277 | return true; | |
278 | } | |
279 | ||
280 | // warning: we will return *a* request for this reqid, but not | |
281 | // necessarily the most recent. | |
282 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
283 | index_extra_caller_ops(); | |
284 | } | |
285 | p = extra_caller_ops.find(r); | |
286 | if (p != extra_caller_ops.end()) { | |
31f18b77 | 287 | for (auto i = p->second->extra_reqids.begin(); |
7c673cae FG |
288 | i != p->second->extra_reqids.end(); |
289 | ++i) { | |
290 | if (i->first == r) { | |
291 | *version = p->second->version; | |
292 | *user_version = i->second; | |
293 | *return_code = p->second->return_code; | |
294 | return true; | |
295 | } | |
296 | } | |
297 | assert(0 == "in extra_caller_ops but not extra_reqids"); | |
298 | } | |
299 | return false; | |
300 | } | |
301 | ||
302 | /// get a (bounded) list of recent reqids for the given object | |
303 | void get_object_reqids(const hobject_t& oid, unsigned max, | |
31f18b77 | 304 | mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls) const { |
7c673cae FG |
305 | // make sure object is present at least once before we do an |
306 | // O(n) search. | |
307 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
308 | index_objects(); | |
309 | } | |
310 | if (objects.count(oid) == 0) | |
311 | return; | |
312 | for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin(); | |
313 | i != log.rend(); | |
314 | ++i) { | |
315 | if (i->soid == oid) { | |
316 | if (i->reqid_is_indexed()) | |
317 | pls->push_back(make_pair(i->reqid, i->user_version)); | |
318 | pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); | |
319 | if (pls->size() >= max) { | |
320 | if (pls->size() > max) { | |
321 | pls->resize(max); | |
322 | } | |
323 | return; | |
324 | } | |
325 | } | |
326 | } | |
327 | } | |
328 | ||
329 | void index(__u16 to_index = PGLOG_INDEXED_ALL) const { | |
330 | if (to_index & PGLOG_INDEXED_OBJECTS) | |
331 | objects.clear(); | |
332 | if (to_index & PGLOG_INDEXED_CALLER_OPS) | |
333 | caller_ops.clear(); | |
334 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) | |
335 | extra_caller_ops.clear(); | |
336 | ||
337 | for (list<pg_log_entry_t>::const_iterator i = log.begin(); | |
338 | i != log.end(); | |
339 | ++i) { | |
340 | if (to_index & PGLOG_INDEXED_OBJECTS) { | |
341 | if (i->object_is_indexed()) { | |
342 | objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i)); | |
343 | } | |
344 | } | |
345 | ||
346 | if (to_index & PGLOG_INDEXED_CALLER_OPS) { | |
347 | if (i->reqid_is_indexed()) { | |
348 | caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i)); | |
349 | } | |
350 | } | |
351 | ||
352 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 353 | for (auto j = i->extra_reqids.begin(); |
7c673cae FG |
354 | j != i->extra_reqids.end(); |
355 | ++j) { | |
356 | extra_caller_ops.insert( | |
357 | make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i)))); | |
358 | } | |
359 | } | |
360 | } | |
361 | ||
362 | indexed_data |= to_index; | |
363 | } | |
364 | ||
365 | void index_objects() const { | |
366 | index(PGLOG_INDEXED_OBJECTS); | |
367 | } | |
368 | ||
369 | void index_caller_ops() const { | |
370 | index(PGLOG_INDEXED_CALLER_OPS); | |
371 | } | |
372 | ||
373 | void index_extra_caller_ops() const { | |
374 | index(PGLOG_INDEXED_EXTRA_CALLER_OPS); | |
375 | } | |
376 | ||
377 | void index(pg_log_entry_t& e) { | |
378 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
379 | if (objects.count(e.soid) == 0 || | |
380 | objects[e.soid]->version < e.version) | |
381 | objects[e.soid] = &e; | |
382 | } | |
383 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
384 | // divergent merge_log indexes new before unindexing old | |
385 | if (e.reqid_is_indexed()) { | |
386 | caller_ops[e.reqid] = &e; | |
387 | } | |
388 | } | |
389 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 390 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
391 | j != e.extra_reqids.end(); |
392 | ++j) { | |
393 | extra_caller_ops.insert(make_pair(j->first, &e)); | |
394 | } | |
395 | } | |
396 | } | |
397 | void unindex() { | |
398 | objects.clear(); | |
399 | caller_ops.clear(); | |
400 | extra_caller_ops.clear(); | |
401 | indexed_data = 0; | |
402 | } | |
403 | void unindex(pg_log_entry_t& e) { | |
404 | // NOTE: this only works if we remove from the _tail_ of the log! | |
405 | if (indexed_data & PGLOG_INDEXED_OBJECTS) { | |
406 | if (objects.count(e.soid) && objects[e.soid]->version == e.version) | |
407 | objects.erase(e.soid); | |
408 | } | |
409 | if (e.reqid_is_indexed()) { | |
410 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
411 | // divergent merge_log indexes new before unindexing old | |
412 | if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e) | |
413 | caller_ops.erase(e.reqid); | |
414 | } | |
415 | } | |
416 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 417 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
418 | j != e.extra_reqids.end(); |
419 | ++j) { | |
420 | for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k = | |
421 | extra_caller_ops.find(j->first); | |
422 | k != extra_caller_ops.end() && k->first == j->first; | |
423 | ++k) { | |
424 | if (k->second == &e) { | |
425 | extra_caller_ops.erase(k); | |
426 | break; | |
427 | } | |
428 | } | |
429 | } | |
430 | } | |
431 | } | |
432 | ||
433 | // actors | |
434 | void add(const pg_log_entry_t& e, bool applied = true) { | |
435 | if (!applied) { | |
436 | assert(get_can_rollback_to() == head); | |
437 | } | |
438 | ||
31f18b77 FG |
439 | // make sure our buffers don't pin bigger buffers |
440 | e.mod_desc.trim_bl(); | |
441 | ||
7c673cae FG |
442 | // add to log |
443 | log.push_back(e); | |
444 | ||
445 | // riter previously pointed to the previous entry | |
446 | if (rollback_info_trimmed_to_riter == log.rbegin()) | |
447 | ++rollback_info_trimmed_to_riter; | |
448 | ||
449 | assert(e.version > head); | |
450 | assert(head.version == 0 || e.version.version > head.version); | |
451 | head = e.version; | |
452 | ||
453 | // to our index | |
454 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
455 | objects[e.soid] = &(log.back()); | |
456 | } | |
457 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
458 | if (e.reqid_is_indexed()) { | |
459 | caller_ops[e.reqid] = &(log.back()); | |
460 | } | |
461 | } | |
462 | ||
463 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 464 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
465 | j != e.extra_reqids.end(); |
466 | ++j) { | |
467 | extra_caller_ops.insert(make_pair(j->first, &(log.back()))); | |
468 | } | |
469 | } | |
470 | ||
471 | if (!applied) { | |
472 | skip_can_rollback_to_to_head(); | |
473 | } | |
474 | } | |
475 | ||
476 | void trim( | |
477 | CephContext* cct, | |
478 | eversion_t s, | |
479 | set<eversion_t> *trimmed); | |
480 | ||
481 | ostream& print(ostream& out) const; | |
482 | }; | |
483 | ||
484 | ||
485 | protected: | |
486 | //////////////////// data members //////////////////// | |
487 | ||
488 | pg_missing_tracker_t missing; | |
489 | IndexedLog log; | |
490 | ||
491 | eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to | |
492 | eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from | |
493 | eversion_t writeout_from; ///< must write out keys >= writeout_from | |
494 | set<eversion_t> trimmed; ///< must clear keys in trimmed | |
495 | CephContext *cct; | |
496 | bool pg_log_debug; | |
497 | /// The log is clean on [dirty_to, dirty_from). touched_log itself is set by undirty() and cleared by mark_log_for_rewrite(); while it is false is_dirty() reports true. | |
498 | bool touched_log; | |
499 | bool clear_divergent_priors; | |
500 | ||
501 | void mark_dirty_to(eversion_t to) { | |
502 | if (to > dirty_to) | |
503 | dirty_to = to; | |
504 | } | |
505 | void mark_dirty_from(eversion_t from) { | |
506 | if (from < dirty_from) | |
507 | dirty_from = from; | |
508 | } | |
509 | void mark_writeout_from(eversion_t from) { | |
510 | if (from < writeout_from) | |
511 | writeout_from = from; | |
512 | } | |
513 | public: | |
514 | bool is_dirty() const { | |
515 | return !touched_log || | |
516 | (dirty_to != eversion_t()) || | |
517 | (dirty_from != eversion_t::max()) || | |
518 | (writeout_from != eversion_t::max()) || | |
519 | !(trimmed.empty()) || | |
520 | !missing.is_clean(); | |
521 | } | |
522 | void mark_log_for_rewrite() { | |
523 | mark_dirty_to(eversion_t::max()); | |
524 | mark_dirty_from(eversion_t()); | |
525 | touched_log = false; | |
526 | } | |
527 | protected: | |
528 | ||
529 | /// DEBUG | |
530 | set<string> log_keys_debug; | |
/// Remove every key >= lb from *log_keys_debug.
/// @param log_keys_debug set to prune; a null pointer makes this a no-op
/// @param lb             inclusive lower bound of the keys to remove
static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
  if (!log_keys_debug)
    return;
  // One range erase instead of an element-by-element loop.
  log_keys_debug->erase(log_keys_debug->lower_bound(lb), log_keys_debug->end());
}
/// Remove every key < ub from *log_keys_debug.
/// @param log_keys_debug set to prune; a null pointer makes this a no-op
/// @param ub             exclusive upper bound of the keys to remove
static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
  if (!log_keys_debug)
    return;
  // lower_bound(ub) is the first key that is not < ub, i.e. the first one
  // to keep; everything in front of it goes in a single range erase.
  log_keys_debug->erase(log_keys_debug->begin(), log_keys_debug->lower_bound(ub));
}
545 | ||
546 | void check(); | |
547 | void undirty() { | |
548 | dirty_to = eversion_t(); | |
549 | dirty_from = eversion_t::max(); | |
550 | touched_log = true; | |
551 | trimmed.clear(); | |
552 | writeout_from = eversion_t::max(); | |
553 | check(); | |
554 | missing.flush(); | |
555 | } | |
556 | public: | |
557 | // cppcheck-suppress noExplicitConstructor | |
558 | PGLog(CephContext *cct, DoutPrefixProvider *dpp = 0) : | |
559 | prefix_provider(dpp), | |
560 | dirty_from(eversion_t::max()), | |
561 | writeout_from(eversion_t::max()), | |
562 | cct(cct), | |
563 | pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), | |
564 | touched_log(false), | |
565 | clear_divergent_priors(false) {} | |
566 | ||
567 | ||
568 | void reset_backfill(); | |
569 | ||
570 | void clear(); | |
571 | ||
572 | //////////////////// get or set missing //////////////////// | |
573 | ||
574 | const pg_missing_tracker_t& get_missing() const { return missing; } | |
575 | void revise_have(hobject_t oid, eversion_t have) { | |
576 | missing.revise_have(oid, have); | |
577 | } | |
578 | ||
579 | void revise_need(hobject_t oid, eversion_t need) { | |
580 | missing.revise_need(oid, need); | |
581 | } | |
582 | ||
583 | void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) { | |
584 | missing.add(oid, need, have); | |
585 | } | |
586 | ||
7c673cae FG |
587 | //////////////////// get or set log //////////////////// |
588 | ||
589 | const IndexedLog &get_log() const { return log; } | |
590 | ||
591 | const eversion_t &get_tail() const { return log.tail; } | |
592 | ||
593 | void set_tail(eversion_t tail) { log.tail = tail; } | |
594 | ||
595 | const eversion_t &get_head() const { return log.head; } | |
596 | ||
597 | void set_head(eversion_t head) { log.head = head; } | |
598 | ||
599 | void set_last_requested(version_t last_requested) { | |
600 | log.last_requested = last_requested; | |
601 | } | |
602 | ||
603 | void index() { log.index(); } | |
604 | ||
605 | void unindex() { log.unindex(); } | |
606 | ||
607 | void add(const pg_log_entry_t& e, bool applied = true) { | |
608 | mark_writeout_from(e.version); | |
609 | log.add(e, applied); | |
610 | } | |
611 | ||
612 | void reset_recovery_pointers() { log.reset_recovery_pointers(); } | |
613 | ||
614 | static void clear_info_log( | |
615 | spg_t pgid, | |
616 | ObjectStore::Transaction *t); | |
617 | ||
618 | void trim( | |
619 | eversion_t trim_to, | |
620 | pg_info_t &info); | |
621 | ||
622 | void roll_forward_to( | |
623 | eversion_t roll_forward_to, | |
624 | LogEntryHandler *h) { | |
625 | log.roll_forward_to( | |
626 | roll_forward_to, | |
627 | h); | |
628 | } | |
629 | ||
630 | eversion_t get_can_rollback_to() const { | |
631 | return log.get_can_rollback_to(); | |
632 | } | |
633 | ||
634 | void roll_forward(LogEntryHandler *h) { | |
635 | roll_forward_to( | |
636 | log.head, | |
637 | h); | |
638 | } | |
639 | ||
640 | //////////////////// get or set log & missing //////////////////// | |
641 | ||
642 | void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { | |
643 | log.trim_rollback_info_to(log.head, h); | |
644 | log.claim_log_and_clear_rollback_info(o); | |
645 | missing.clear(); | |
646 | mark_dirty_to(eversion_t::max()); | |
647 | } | |
648 | ||
649 | void split_into( | |
650 | pg_t child_pgid, | |
651 | unsigned split_bits, | |
652 | PGLog *opg_log) { | |
653 | log.split_out_child(child_pgid, split_bits, &opg_log->log); | |
654 | missing.split_into(child_pgid, split_bits, &(opg_log->missing)); | |
655 | opg_log->mark_dirty_to(eversion_t::max()); | |
656 | mark_dirty_to(eversion_t::max()); | |
657 | } | |
658 | ||
659 | void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { | |
660 | if (missing.is_missing(oid, v)) { | |
661 | missing.got(oid, v); | |
662 | ||
663 | // raise last_complete? | |
664 | if (missing.get_items().empty()) { | |
665 | log.complete_to = log.log.end(); | |
666 | info.last_complete = info.last_update; | |
667 | } | |
668 | while (log.complete_to != log.log.end()) { | |
669 | if (missing.get_items().at( | |
670 | missing.get_rmissing().begin()->second | |
671 | ).need <= log.complete_to->version) | |
672 | break; | |
673 | if (info.last_complete < log.complete_to->version) | |
674 | info.last_complete = log.complete_to->version; | |
675 | ++log.complete_to; | |
676 | } | |
677 | } | |
678 | ||
679 | assert(log.get_can_rollback_to() >= v); | |
680 | } | |
681 | ||
682 | void activate_not_complete(pg_info_t &info) { | |
683 | log.complete_to = log.log.begin(); | |
684 | while (log.complete_to->version < | |
685 | missing.get_items().at( | |
686 | missing.get_rmissing().begin()->second | |
687 | ).need) | |
688 | ++log.complete_to; | |
689 | assert(log.complete_to != log.log.end()); | |
690 | if (log.complete_to == log.log.begin()) { | |
691 | info.last_complete = eversion_t(); | |
692 | } else { | |
693 | --log.complete_to; | |
694 | info.last_complete = log.complete_to->version; | |
695 | ++log.complete_to; | |
696 | } | |
697 | log.last_requested = 0; | |
698 | } | |
699 | ||
700 | void proc_replica_log(pg_info_t &oinfo, | |
701 | const pg_log_t &olog, | |
702 | pg_missing_t& omissing, pg_shard_t from) const; | |
703 | ||
704 | protected: | |
705 | static void split_by_object( | |
31f18b77 FG |
706 | mempool::osd_pglog::list<pg_log_entry_t> &entries, |
707 | map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) { | |
7c673cae | 708 | while (!entries.empty()) { |
31f18b77 | 709 | auto &out_list = (*out_entries)[entries.front().soid]; |
7c673cae FG |
710 | out_list.splice(out_list.end(), entries, entries.begin()); |
711 | } | |
712 | } | |
713 | ||
714 | /** | |
715 | * _merge_object_divergent_entries | |
716 | * | |
717 | * There are 5 distinct cases: | |
718 | * 1) There is a more recent update: in this case we assume we adjusted the | |
719 | * store and missing during merge_log | |
720 | * 2) The first entry in the divergent sequence is a create. This might | |
721 | * either be because the object is a clone or because prior_version is | |
722 | * eversion_t(). In this case the object does not exist and we must | |
723 | * adjust missing and the store to match. | |
724 | * 3) We are currently missing the object. In this case, we adjust the | |
725 | * missing to our prior_version taking care to add a divergent_prior | |
726 | * if necessary | |
727 | * 4) We can rollback all of the entries. In this case, we do so using | |
728 | * the rollbacker and return -- the object does not go into missing. | |
729 | * 5) We cannot rollback at least 1 of the entries. In this case, we | |
730 | * clear the object out of the store and add a missing entry at | |
731 | * prior_version taking care to add a divergent_prior if | |
732 | * necessary. | |
733 | */ | |
734 | template <typename missing_type> | |
735 | static void _merge_object_divergent_entries( | |
736 | const IndexedLog &log, ///< [in] log to merge against | |
737 | const hobject_t &hoid, ///< [in] object we are merging | |
31f18b77 | 738 | const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge |
7c673cae FG |
739 | const pg_info_t &info, ///< [in] info for merging entries |
740 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary | |
741 | missing_type &missing, ///< [in,out] missing to adjust, use | |
742 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object | |
743 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
744 | ) { | |
745 | ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid | |
31f18b77 | 746 | << " entries: " << orig_entries << dendl; |
7c673cae FG |
747 | |
748 | if (hoid > info.last_backfill) { | |
749 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" | |
750 | << dendl; | |
751 | return; | |
752 | } | |
753 | ||
754 | // entries is non-empty | |
31f18b77 FG |
755 | assert(!orig_entries.empty()); |
756 | // strip out and ignore ERROR entries | |
757 | mempool::osd_pglog::list<pg_log_entry_t> entries; | |
7c673cae | 758 | eversion_t last; |
31f18b77 FG |
759 | for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin(); |
760 | i != orig_entries.end(); | |
7c673cae FG |
761 | ++i) { |
762 | // all entries are on hoid | |
763 | assert(i->soid == hoid); | |
31f18b77 | 764 | if (i != orig_entries.begin() && i->prior_version != eversion_t()) { |
7c673cae FG |
765 | // in increasing order of version |
766 | assert(i->version > last); | |
31f18b77 FG |
767 | // prior_version correct (unless it is an ERROR entry) |
768 | assert(i->prior_version == last || i->is_error()); | |
7c673cae FG |
769 | } |
770 | last = i->version; | |
31f18b77 FG |
771 | if (i->is_error()) { |
772 | ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; | |
773 | } else { | |
774 | ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; | |
775 | entries.push_back(*i); | |
776 | } | |
777 | } | |
778 | if (entries.empty()) { | |
779 | ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; | |
780 | return; | |
7c673cae FG |
781 | } |
782 | ||
783 | const eversion_t prior_version = entries.begin()->prior_version; | |
784 | const eversion_t first_divergent_update = entries.begin()->version; | |
785 | const eversion_t last_divergent_update = entries.rbegin()->version; | |
786 | const bool object_not_in_store = | |
787 | !missing.is_missing(hoid) && | |
788 | entries.rbegin()->is_delete(); | |
789 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
790 | << " prior_version: " << prior_version | |
791 | << " first_divergent_update: " << first_divergent_update | |
792 | << " last_divergent_update: " << last_divergent_update | |
793 | << dendl; | |
794 | ||
795 | ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter = | |
796 | log.objects.find(hoid); | |
797 | if (objiter != log.objects.end() && | |
798 | objiter->second->version >= first_divergent_update) { | |
799 | /// Case 1) | |
800 | ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " | |
801 | << *objiter->second << ", already merged" << dendl; | |
802 | ||
803 | assert(objiter->second->version > last_divergent_update); | |
804 | ||
805 | // ensure missing has been updated appropriately | |
806 | if (objiter->second->is_update()) { | |
807 | assert(missing.is_missing(hoid) && | |
808 | missing.get_items().at(hoid).need == objiter->second->version); | |
809 | } else { | |
810 | assert(!missing.is_missing(hoid)); | |
811 | } | |
812 | missing.revise_have(hoid, eversion_t()); | |
813 | if (rollbacker) { | |
814 | if (!object_not_in_store) { | |
815 | rollbacker->remove(hoid); | |
816 | } | |
817 | for (auto &&i: entries) { | |
818 | rollbacker->trim(i); | |
819 | } | |
820 | } | |
821 | return; | |
822 | } | |
823 | ||
824 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
825 | <<" has no more recent entries in log" << dendl; | |
826 | if (prior_version == eversion_t() || entries.front().is_clone()) { | |
827 | /// Case 2) | |
828 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
829 | << " prior_version or op type indicates creation," | |
830 | << " deleting" | |
831 | << dendl; | |
832 | if (missing.is_missing(hoid)) | |
833 | missing.rm(missing.get_items().find(hoid)); | |
834 | if (rollbacker) { | |
835 | if (!object_not_in_store) { | |
836 | rollbacker->remove(hoid); | |
837 | } | |
838 | for (auto &&i: entries) { | |
839 | rollbacker->trim(i); | |
840 | } | |
841 | } | |
842 | return; | |
843 | } | |
844 | ||
845 | if (missing.is_missing(hoid)) { | |
846 | /// Case 3) | |
847 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
848 | << " missing, " << missing.get_items().at(hoid) | |
849 | << " adjusting" << dendl; | |
850 | ||
851 | if (missing.get_items().at(hoid).have == prior_version) { | |
852 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
853 | << " missing.have is prior_version " << prior_version | |
854 | << " removing from missing" << dendl; | |
855 | missing.rm(missing.get_items().find(hoid)); | |
856 | } else { | |
857 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
858 | << " missing.have is " << missing.get_items().at(hoid).have | |
859 | << ", adjusting" << dendl; | |
860 | missing.revise_need(hoid, prior_version); | |
861 | if (prior_version <= info.log_tail) { | |
862 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
863 | << " prior_version " << prior_version | |
864 | << " <= info.log_tail " | |
865 | << info.log_tail << dendl; | |
866 | } | |
867 | } | |
868 | if (rollbacker) { | |
869 | for (auto &&i: entries) { | |
870 | rollbacker->trim(i); | |
871 | } | |
872 | } | |
873 | return; | |
874 | } | |
875 | ||
876 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
877 | << " must be rolled back or recovered," | |
878 | << " attempting to rollback" | |
879 | << dendl; | |
880 | bool can_rollback = true; | |
881 | /// Distinguish between 4) and 5) | |
882 | for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); | |
883 | i != entries.rend(); | |
884 | ++i) { | |
885 | if (!i->can_rollback() || i->version <= olog_can_rollback_to) { | |
886 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " | |
887 | << *i << dendl; | |
888 | can_rollback = false; | |
889 | break; | |
890 | } | |
891 | } | |
892 | ||
893 | if (can_rollback) { | |
894 | /// Case 4) | |
895 | for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); | |
896 | i != entries.rend(); | |
897 | ++i) { | |
898 | assert(i->can_rollback() && i->version > olog_can_rollback_to); | |
899 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
900 | << " rolling back " << *i << dendl; | |
901 | if (rollbacker) | |
902 | rollbacker->rollback(*i); | |
903 | } | |
904 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
905 | << " rolled back" << dendl; | |
906 | return; | |
907 | } else { | |
908 | /// Case 5) | |
909 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " | |
910 | << "removing and adding to missing" << dendl; | |
911 | if (rollbacker) { | |
912 | if (!object_not_in_store) | |
913 | rollbacker->remove(hoid); | |
914 | for (auto &&i: entries) { | |
915 | rollbacker->trim(i); | |
916 | } | |
917 | } | |
918 | missing.add(hoid, prior_version, eversion_t()); | |
919 | if (prior_version <= info.log_tail) { | |
920 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
921 | << " prior_version " << prior_version | |
922 | << " <= info.log_tail " | |
923 | << info.log_tail << dendl; | |
924 | } | |
925 | } | |
926 | } | |
927 | ||
928 | /// Merge all entries using above | |
929 | template <typename missing_type> | |
930 | static void _merge_divergent_entries( | |
931 | const IndexedLog &log, ///< [in] log to merge against | |
31f18b77 | 932 | mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge |
7c673cae FG |
933 | const pg_info_t &oinfo, ///< [in] info for merging entries |
934 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary | |
935 | missing_type &omissing, ///< [in,out] missing to adjust, use | |
936 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object | |
937 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
938 | ) { | |
31f18b77 | 939 | map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split; |
7c673cae | 940 | split_by_object(entries, &split); |
31f18b77 | 941 | for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin(); |
7c673cae FG |
942 | i != split.end(); |
943 | ++i) { | |
944 | _merge_object_divergent_entries( | |
945 | log, | |
946 | i->first, | |
947 | i->second, | |
948 | oinfo, | |
949 | olog_can_rollback_to, | |
950 | omissing, | |
951 | rollbacker, | |
952 | dpp); | |
953 | } | |
954 | } | |
955 | ||
956 | /** | |
957 | * Exists for use in TestPGLog for simply testing single divergent log | |
958 | * cases | |
959 | */ | |
960 | void merge_old_entry( | |
961 | ObjectStore::Transaction& t, | |
962 | const pg_log_entry_t& oe, | |
963 | const pg_info_t& info, | |
964 | LogEntryHandler *rollbacker) { | |
31f18b77 | 965 | mempool::osd_pglog::list<pg_log_entry_t> entries; |
7c673cae FG |
966 | entries.push_back(oe); |
967 | _merge_object_divergent_entries( | |
968 | log, | |
969 | oe.soid, | |
970 | entries, | |
971 | info, | |
972 | log.get_can_rollback_to(), | |
973 | missing, | |
974 | rollbacker, | |
975 | this); | |
976 | } | |
977 | public: | |
/// Declaration only; the definition is not part of this header section.
/// Takes the new head version, the pg info to update, a LogEntryHandler
/// pointer, and two bool references (dirty_info, dirty_big_info).
/// NOTE(review): presumably the two bools are output flags telling the
/// caller which info needs persisting -- confirm against the definition.
978 | void rewind_divergent_log(eversion_t newhead, | |
979 | pg_info_t &info, | |
980 | LogEntryHandler *rollbacker, | |
981 | bool &dirty_info, | |
982 | bool &dirty_big_info); | |
983 | ||
/// Declaration only; the definition is not part of this header section.
/// Takes another log (olog) with its info (oinfo) and the shard it came
/// from, plus our own info, a LogEntryHandler pointer, and the same pair
/// of bool references as rewind_divergent_log.
/// NOTE(review): oinfo and olog are non-const references, so the callee
/// may modify them -- confirm against the definition.
984 | void merge_log(pg_info_t &oinfo, | |
985 | pg_log_t &olog, | |
986 | pg_shard_t from, | |
987 | pg_info_t &info, LogEntryHandler *rollbacker, | |
988 | bool &dirty_info, bool &dirty_big_info); | |
989 | ||
990 | template <typename missing_type> | |
991 | static bool append_log_entries_update_missing( | |
992 | const hobject_t &last_backfill, | |
993 | bool last_backfill_bitwise, | |
31f18b77 | 994 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
995 | bool maintain_rollback, |
996 | IndexedLog *log, | |
997 | missing_type &missing, | |
998 | LogEntryHandler *rollbacker, | |
999 | const DoutPrefixProvider *dpp) { | |
1000 | bool invalidate_stats = false; | |
1001 | if (log && !entries.empty()) { | |
1002 | assert(log->head < entries.begin()->version); | |
1003 | } | |
1004 | for (list<pg_log_entry_t>::const_iterator p = entries.begin(); | |
1005 | p != entries.end(); | |
1006 | ++p) { | |
1007 | invalidate_stats = invalidate_stats || !p->is_error(); | |
1008 | if (log) { | |
1009 | ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; | |
1010 | log->add(*p); | |
1011 | } | |
1012 | if (p->soid <= last_backfill && | |
1013 | !p->is_error()) { | |
1014 | missing.add_next_event(*p); | |
1015 | if (rollbacker) { | |
1016 | // hack to match PG::mark_all_unfound_lost | |
1017 | if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { | |
1018 | rollbacker->try_stash(p->soid, p->version.version); | |
1019 | } else if (p->is_delete()) { | |
1020 | rollbacker->remove(p->soid); | |
1021 | } | |
1022 | } | |
1023 | } | |
1024 | } | |
1025 | return invalidate_stats; | |
1026 | } | |
1027 | bool append_new_log_entries( | |
1028 | const hobject_t &last_backfill, | |
1029 | bool last_backfill_bitwise, | |
31f18b77 | 1030 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
1031 | LogEntryHandler *rollbacker) { |
1032 | bool invalidate_stats = append_log_entries_update_missing( | |
1033 | last_backfill, | |
1034 | last_backfill_bitwise, | |
1035 | entries, | |
1036 | true, | |
1037 | &log, | |
1038 | missing, | |
1039 | rollbacker, | |
1040 | this); | |
1041 | if (!entries.empty()) { | |
1042 | mark_writeout_from(entries.begin()->version); | |
1043 | } | |
1044 | return invalidate_stats; | |
1045 | } | |
1046 | ||
/// Member overload; declaration only, defined outside this header section.
/// Receives the transaction, a key/value map pointer (km), the collection
/// and log object to target, and a require_rollback flag.
/// NOTE(review): presumably persists this PGLog's own log and missing
/// state -- confirm against the definition.
1047 | void write_log_and_missing(ObjectStore::Transaction& t, | |
1048 | map<string,bufferlist> *km, | |
1049 | const coll_t& coll, | |
1050 | const ghobject_t &log_oid, | |
1051 | bool require_rollback); | |
1052 | ||
/// Static variant; declaration only, defined outside this header section.
/// Operates on an explicitly passed pg_log_t and a divergent_priors map
/// instead of a missing set.
/// NOTE(review): log and divergent_priors are non-const references, so
/// the callee may modify them -- confirm against the definition.
1053 | static void write_log_and_missing_wo_missing( | |
1054 | ObjectStore::Transaction& t, | |
1055 | map<string,bufferlist>* km, | |
1056 | pg_log_t &log, | |
1057 | const coll_t& coll, | |
1058 | const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors, | |
1059 | bool require_rollback); | |
1060 | ||
/// Static overload; declaration only, defined outside this header section.
/// Same leading parameters as the member overload, but the log and the
/// missing tracker are passed explicitly (the missing tracker is const).
1061 | static void write_log_and_missing( | |
1062 | ObjectStore::Transaction& t, | |
1063 | map<string,bufferlist>* km, | |
1064 | pg_log_t &log, | |
1065 | const coll_t& coll, | |
1066 | const ghobject_t &log_oid, | |
1067 | const pg_missing_tracker_t &missing, | |
1068 | bool require_rollback); | |
1069 | ||
/// Static helper; declaration only, defined outside this header section.
/// Compared with write_log_and_missing_wo_missing it additionally takes
/// the dirty_to / dirty_from / writeout_from versions, the set of trimmed
/// versions, the dirty_divergent_priors and touch_log flags, and an
/// optional log_keys_debug set.
/// NOTE(review): presumably the implementation behind the public
/// wo_missing writer -- confirm against the definition.
1070 | static void _write_log_and_missing_wo_missing( | |
1071 | ObjectStore::Transaction& t, | |
1072 | map<string,bufferlist>* km, | |
1073 | pg_log_t &log, | |
1074 | const coll_t& coll, const ghobject_t &log_oid, | |
1075 | map<eversion_t, hobject_t> &divergent_priors, | |
1076 | eversion_t dirty_to, | |
1077 | eversion_t dirty_from, | |
1078 | eversion_t writeout_from, | |
1079 | const set<eversion_t> &trimmed, | |
1080 | bool dirty_divergent_priors, | |
1081 | bool touch_log, | |
1082 | bool require_rollback, | |
1083 | set<string> *log_keys_debug | |
1084 | ); | |
1085 | ||
/// Static helper; declaration only, defined outside this header section.
/// Takes the same dirty range / writeout / trimmed arguments as
/// _write_log_and_missing_wo_missing, but works with a const missing
/// tracker and a clear_divergent_priors flag instead of a
/// divergent_priors map.
/// NOTE(review): presumably the implementation behind the public
/// write_log_and_missing overloads -- confirm against the definition.
1086 | static void _write_log_and_missing( | |
1087 | ObjectStore::Transaction& t, | |
1088 | map<string,bufferlist>* km, | |
1089 | pg_log_t &log, | |
1090 | const coll_t& coll, const ghobject_t &log_oid, | |
1091 | eversion_t dirty_to, | |
1092 | eversion_t dirty_from, | |
1093 | eversion_t writeout_from, | |
1094 | const set<eversion_t> &trimmed, | |
1095 | const pg_missing_tracker_t &missing, | |
1096 | bool touch_log, | |
1097 | bool require_rollback, | |
1098 | bool clear_divergent_priors, | |
1099 | set<string> *log_keys_debug | |
1100 | ); | |
1101 | ||
1102 | void read_log_and_missing( | |
1103 | ObjectStore *store, coll_t pg_coll, | |
1104 | coll_t log_coll, ghobject_t log_oid, | |
1105 | const pg_info_t &info, | |
1106 | ostringstream &oss, | |
1107 | bool tolerate_divergent_missing_log, | |
1108 | bool debug_verify_stored_missing = false | |
1109 | ) { | |
1110 | return read_log_and_missing( | |
1111 | store, pg_coll, log_coll, log_oid, info, | |
1112 | log, missing, oss, | |
1113 | tolerate_divergent_missing_log, | |
1114 | &clear_divergent_priors, | |
1115 | this, | |
1116 | (pg_log_debug ? &log_keys_debug : 0), | |
1117 | debug_verify_stored_missing); | |
1118 | } | |
1119 | ||
1120 | template <typename missing_type> | |
1121 | static void read_log_and_missing(ObjectStore *store, coll_t pg_coll, | |
1122 | coll_t log_coll, ghobject_t log_oid, | |
1123 | const pg_info_t &info, | |
1124 | IndexedLog &log, | |
1125 | missing_type &missing, ostringstream &oss, | |
1126 | bool tolerate_divergent_missing_log, | |
1127 | bool *clear_divergent_priors = NULL, | |
1128 | const DoutPrefixProvider *dpp = NULL, | |
1129 | set<string> *log_keys_debug = 0, | |
1130 | bool debug_verify_stored_missing = false | |
1131 | ) { | |
1132 | ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll | |
1133 | << " log_oid " << log_oid << dendl; | |
1134 | ||
1135 | // legacy? | |
1136 | struct stat st; | |
1137 | int r = store->stat(log_coll, log_oid, &st); | |
1138 | assert(r == 0); | |
1139 | assert(st.st_size == 0); | |
1140 | ||
1141 | // will get overridden below if it had been recorded | |
1142 | eversion_t on_disk_can_rollback_to = info.last_update; | |
1143 | eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); | |
1144 | ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid); | |
1145 | map<eversion_t, hobject_t> divergent_priors; | |
1146 | bool has_divergent_priors = false; | |
1147 | list<pg_log_entry_t> entries; | |
1148 | if (p) { | |
1149 | for (p->seek_to_first(); p->valid() ; p->next(false)) { | |
1150 | // non-log pgmeta_oid keys are prefixed with _; skip those | |
1151 | if (p->key()[0] == '_') | |
1152 | continue; | |
1153 | bufferlist bl = p->value();//Copy bufferlist before creating iterator | |
1154 | bufferlist::iterator bp = bl.begin(); | |
1155 | if (p->key() == "divergent_priors") { | |
1156 | ::decode(divergent_priors, bp); | |
1157 | ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() | |
1158 | << " divergent_priors" << dendl; | |
1159 | has_divergent_priors = true; | |
1160 | debug_verify_stored_missing = false; | |
1161 | } else if (p->key() == "can_rollback_to") { | |
1162 | ::decode(on_disk_can_rollback_to, bp); | |
1163 | } else if (p->key() == "rollback_info_trimmed_to") { | |
1164 | ::decode(on_disk_rollback_info_trimmed_to, bp); | |
1165 | } else if (p->key().substr(0, 7) == string("missing")) { | |
1166 | pair<hobject_t, pg_missing_item> p; | |
1167 | ::decode(p, bp); | |
1168 | missing.add(p.first, p.second.need, p.second.have); | |
1169 | } else { | |
1170 | pg_log_entry_t e; | |
1171 | e.decode_with_checksum(bp); | |
1172 | ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; | |
1173 | if (!entries.empty()) { | |
1174 | pg_log_entry_t last_e(entries.back()); | |
1175 | assert(last_e.version.version < e.version.version); | |
1176 | assert(last_e.version.epoch <= e.version.epoch); | |
1177 | } | |
1178 | entries.push_back(e); | |
1179 | if (log_keys_debug) | |
1180 | log_keys_debug->insert(e.get_key_name()); | |
1181 | } | |
1182 | } | |
1183 | } | |
1184 | log = IndexedLog( | |
1185 | info.last_update, | |
1186 | info.log_tail, | |
1187 | on_disk_can_rollback_to, | |
1188 | on_disk_rollback_info_trimmed_to, | |
1189 | std::move(entries)); | |
1190 | ||
1191 | if (has_divergent_priors || debug_verify_stored_missing) { | |
1192 | // build missing | |
1193 | if (debug_verify_stored_missing || info.last_complete < info.last_update) { | |
1194 | ldpp_dout(dpp, 10) << "read_log_and_missing checking for missing items over interval (" | |
1195 | << info.last_complete | |
1196 | << "," << info.last_update << "]" << dendl; | |
1197 | ||
1198 | set<hobject_t> did; | |
1199 | set<hobject_t> checked; | |
1200 | set<hobject_t> skipped; | |
1201 | for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); | |
1202 | i != log.log.rend(); | |
1203 | ++i) { | |
1204 | if (!debug_verify_stored_missing && i->version <= info.last_complete) break; | |
1205 | if (i->soid > info.last_backfill) | |
1206 | continue; | |
1207 | if (i->is_error()) | |
1208 | continue; | |
1209 | if (did.count(i->soid)) continue; | |
1210 | did.insert(i->soid); | |
1211 | ||
1212 | if (i->is_delete()) continue; | |
1213 | ||
1214 | bufferlist bv; | |
1215 | int r = store->getattr( | |
1216 | pg_coll, | |
1217 | ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), | |
1218 | OI_ATTR, | |
1219 | bv); | |
1220 | if (r >= 0) { | |
1221 | object_info_t oi(bv); | |
1222 | if (oi.version < i->version) { | |
1223 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i | |
1224 | << " (have " << oi.version << ")" << dendl; | |
1225 | if (debug_verify_stored_missing) { | |
1226 | auto miter = missing.get_items().find(i->soid); | |
1227 | assert(miter != missing.get_items().end()); | |
1228 | assert(miter->second.need == i->version); | |
1229 | assert(miter->second.have == oi.version); | |
1230 | checked.insert(i->soid); | |
1231 | } else { | |
1232 | missing.add(i->soid, i->version, oi.version); | |
1233 | } | |
1234 | } | |
1235 | } else { | |
1236 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
1237 | if (debug_verify_stored_missing) { | |
1238 | auto miter = missing.get_items().find(i->soid); | |
1239 | assert(miter != missing.get_items().end()); | |
1240 | assert(miter->second.need == i->version); | |
1241 | assert(miter->second.have == eversion_t()); | |
1242 | checked.insert(i->soid); | |
1243 | } else { | |
1244 | missing.add(i->soid, i->version, eversion_t()); | |
1245 | } | |
1246 | } | |
1247 | } | |
1248 | if (debug_verify_stored_missing) { | |
1249 | for (auto &&i: missing.get_items()) { | |
1250 | if (checked.count(i.first)) | |
1251 | continue; | |
1252 | if (i.second.need > log.tail || | |
1253 | i.first > info.last_backfill) { | |
1254 | ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry found " | |
1255 | << i.first | |
1256 | << dendl; | |
1257 | assert(0 == "invalid missing set entry found"); | |
1258 | } | |
1259 | bufferlist bv; | |
1260 | int r = store->getattr( | |
1261 | pg_coll, | |
1262 | ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), | |
1263 | OI_ATTR, | |
1264 | bv); | |
1265 | if (r >= 0) { | |
1266 | object_info_t oi(bv); | |
1267 | assert(oi.version == i.second.have); | |
1268 | } else { | |
1269 | assert(eversion_t() == i.second.have); | |
1270 | } | |
1271 | } | |
1272 | } else { | |
1273 | assert(has_divergent_priors); | |
1274 | for (map<eversion_t, hobject_t>::reverse_iterator i = | |
1275 | divergent_priors.rbegin(); | |
1276 | i != divergent_priors.rend(); | |
1277 | ++i) { | |
1278 | if (i->first <= info.last_complete) break; | |
1279 | if (i->second > info.last_backfill) | |
1280 | continue; | |
1281 | if (did.count(i->second)) continue; | |
1282 | did.insert(i->second); | |
1283 | bufferlist bv; | |
1284 | int r = store->getattr( | |
1285 | pg_coll, | |
1286 | ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), | |
1287 | OI_ATTR, | |
1288 | bv); | |
1289 | if (r >= 0) { | |
1290 | object_info_t oi(bv); | |
1291 | /** | |
1292 | * 1) we see this entry in the divergent priors mapping | |
1293 | * 2) we didn't see an entry for this object in the log | |
1294 | * | |
1295 | * From 1 & 2 we know that either the object does not exist | |
1296 | * or it is at the version specified in the divergent_priors | |
1297 | * map since the object would have been deleted atomically | |
1298 | * with the addition of the divergent_priors entry, an older | |
1299 | * version would not have been recovered, and a newer version | |
1300 | * would show up in the log above. | |
1301 | */ | |
1302 | /** | |
1303 | * Unfortunately the assessment above is incorrect because of | |
1304 | * http://tracker.ceph.com/issues/17916 (we were incorrectly | |
1305 | * not removing the divergent_priors set from disk state!), | |
1306 | * so let's check that. | |
1307 | */ | |
1308 | if (oi.version > i->first && tolerate_divergent_missing_log) { | |
1309 | ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i | |
1310 | << ") inconsistent with disk state (" << oi | |
1311 | << "), assuming it is tracker.ceph.com/issues/17916" | |
1312 | << dendl; | |
1313 | } else { | |
1314 | assert(oi.version == i->first); | |
1315 | } | |
1316 | } else { | |
1317 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
1318 | missing.add(i->second, i->first, eversion_t()); | |
1319 | } | |
1320 | } | |
1321 | } | |
1322 | if (clear_divergent_priors) | |
1323 | (*clear_divergent_priors) = true; | |
1324 | } | |
1325 | } | |
1326 | ||
1327 | if (!has_divergent_priors) { | |
1328 | if (clear_divergent_priors) | |
1329 | (*clear_divergent_priors) = false; | |
1330 | missing.flush(); | |
1331 | } | |
1332 | ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl; | |
1333 | } | |
1334 | }; | |
1335 | ||
1336 | #endif // CEPH_PG_LOG_H |