]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
c07f9fc5 | 17 | #pragma once |
7c673cae FG |
18 | |
19 | // re-include our assert to clobber boost's | |
20 | #include "include/assert.h" | |
21 | #include "osd_types.h" | |
22 | #include "os/ObjectStore.h" | |
23 | #include <list> | |
24 | using namespace std; | |
25 | ||
26 | #define PGLOG_INDEXED_OBJECTS (1 << 0) | |
27 | #define PGLOG_INDEXED_CALLER_OPS (1 << 1) | |
28 | #define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2) | |
c07f9fc5 FG |
29 | #define PGLOG_INDEXED_DUPS (1 << 3) |
30 | #define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | \ | |
31 | PGLOG_INDEXED_CALLER_OPS | \ | |
32 | PGLOG_INDEXED_EXTRA_CALLER_OPS | \ | |
33 | PGLOG_INDEXED_DUPS) | |
7c673cae FG |
34 | |
35 | class CephContext; | |
36 | ||
37 | struct PGLog : DoutPrefixProvider { | |
38 | DoutPrefixProvider *prefix_provider; | |
39 | string gen_prefix() const override { | |
40 | return prefix_provider ? prefix_provider->gen_prefix() : ""; | |
41 | } | |
42 | unsigned get_subsys() const override { | |
43 | return prefix_provider ? prefix_provider->get_subsys() : | |
44 | (unsigned)ceph_subsys_osd; | |
45 | } | |
46 | CephContext *get_cct() const override { | |
47 | return cct; | |
48 | } | |
49 | ||
50 | ////////////////////////////// sub classes ////////////////////////////// | |
51 | struct LogEntryHandler { | |
52 | virtual void rollback( | |
53 | const pg_log_entry_t &entry) = 0; | |
54 | virtual void rollforward( | |
55 | const pg_log_entry_t &entry) = 0; | |
56 | virtual void trim( | |
57 | const pg_log_entry_t &entry) = 0; | |
58 | virtual void remove( | |
59 | const hobject_t &hoid) = 0; | |
60 | virtual void try_stash( | |
61 | const hobject_t &hoid, | |
62 | version_t v) = 0; | |
63 | virtual ~LogEntryHandler() {} | |
64 | }; | |
65 | ||
66 | /* Exceptions */ | |
67 | class read_log_and_missing_error : public buffer::error { | |
68 | public: | |
69 | explicit read_log_and_missing_error(const char *what) { | |
70 | snprintf(buf, sizeof(buf), "read_log_and_missing_error: %s", what); | |
71 | } | |
72 | const char *what() const throw () override { | |
73 | return buf; | |
74 | } | |
75 | private: | |
76 | char buf[512]; | |
77 | }; | |
78 | ||
79 | public: | |
80 | /** | |
81 | * IndexLog - adds in-memory index of the log, by oid. | |
82 | * plus some methods to manipulate it all. | |
83 | */ | |
84 | struct IndexedLog : public pg_log_t { | |
85 | mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // ptrs into log. be careful! | |
86 | mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops; | |
87 | mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops; | |
c07f9fc5 | 88 | mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index; |
7c673cae FG |
89 | |
90 | // recovery pointers | |
91 | list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item | |
92 | version_t last_requested = 0; // last object requested by primary | |
93 | ||
94 | // | |
95 | private: | |
96 | mutable __u16 indexed_data = 0; | |
97 | /** | |
98 | * rollback_info_trimmed_to_riter points to the first log entry <= | |
99 | * rollback_info_trimmed_to | |
100 | * | |
101 | * It's a reverse_iterator because rend() is a natural representation for | |
102 | * tail, and rbegin() works nicely for head. | |
103 | */ | |
31f18b77 | 104 | mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator |
7c673cae FG |
105 | rollback_info_trimmed_to_riter; |
106 | ||
107 | template <typename F> | |
108 | void advance_can_rollback_to(eversion_t to, F &&f) { | |
109 | if (to > can_rollback_to) | |
110 | can_rollback_to = to; | |
111 | ||
112 | if (to > rollback_info_trimmed_to) | |
113 | rollback_info_trimmed_to = to; | |
114 | ||
115 | while (rollback_info_trimmed_to_riter != log.rbegin()) { | |
116 | --rollback_info_trimmed_to_riter; | |
117 | if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { | |
118 | ++rollback_info_trimmed_to_riter; | |
119 | break; | |
120 | } | |
121 | f(*rollback_info_trimmed_to_riter); | |
122 | } | |
123 | } | |
124 | ||
125 | void reset_rollback_info_trimmed_to_riter() { | |
126 | rollback_info_trimmed_to_riter = log.rbegin(); | |
127 | while (rollback_info_trimmed_to_riter != log.rend() && | |
128 | rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) | |
129 | ++rollback_info_trimmed_to_riter; | |
130 | } | |
131 | ||
132 | // indexes objects, caller ops and extra caller ops | |
133 | public: | |
134 | IndexedLog() : | |
135 | complete_to(log.end()), | |
136 | last_requested(0), | |
137 | indexed_data(0), | |
138 | rollback_info_trimmed_to_riter(log.rbegin()) | |
c07f9fc5 | 139 | { } |
7c673cae FG |
140 | |
141 | template <typename... Args> | |
142 | IndexedLog(Args&&... args) : | |
143 | pg_log_t(std::forward<Args>(args)...), | |
144 | complete_to(log.end()), | |
145 | last_requested(0), | |
146 | indexed_data(0), | |
c07f9fc5 FG |
147 | rollback_info_trimmed_to_riter(log.rbegin()) |
148 | { | |
7c673cae FG |
149 | reset_rollback_info_trimmed_to_riter(); |
150 | index(); | |
151 | } | |
152 | ||
153 | IndexedLog(const IndexedLog &rhs) : | |
154 | pg_log_t(rhs), | |
155 | complete_to(log.end()), | |
156 | last_requested(rhs.last_requested), | |
157 | indexed_data(0), | |
c07f9fc5 FG |
158 | rollback_info_trimmed_to_riter(log.rbegin()) |
159 | { | |
7c673cae FG |
160 | reset_rollback_info_trimmed_to_riter(); |
161 | index(rhs.indexed_data); | |
162 | } | |
c07f9fc5 | 163 | |
7c673cae FG |
164 | IndexedLog &operator=(const IndexedLog &rhs) { |
165 | this->~IndexedLog(); | |
166 | new (this) IndexedLog(rhs); | |
167 | return *this; | |
168 | } | |
169 | ||
170 | void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { | |
171 | advance_can_rollback_to( | |
172 | to, | |
173 | [&](pg_log_entry_t &entry) { | |
174 | h->trim(entry); | |
175 | }); | |
176 | } | |
177 | void roll_forward_to(eversion_t to, LogEntryHandler *h) { | |
178 | advance_can_rollback_to( | |
179 | to, | |
180 | [&](pg_log_entry_t &entry) { | |
181 | h->rollforward(entry); | |
182 | }); | |
183 | } | |
184 | ||
185 | void skip_can_rollback_to_to_head() { | |
186 | advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); | |
187 | } | |
188 | ||
31f18b77 | 189 | mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { |
7c673cae FG |
190 | auto divergent = pg_log_t::rewind_from_head(newhead); |
191 | index(); | |
192 | reset_rollback_info_trimmed_to_riter(); | |
193 | return divergent; | |
194 | } | |
195 | ||
196 | template <typename T> | |
197 | void scan_log_after( | |
198 | const eversion_t &bound, ///< [in] scan entries > bound | |
199 | T &&f) const { | |
200 | auto iter = log.rbegin(); | |
201 | while (iter != log.rend() && iter->version > bound) | |
202 | ++iter; | |
203 | ||
204 | while (true) { | |
205 | if (iter == log.rbegin()) | |
206 | break; | |
207 | f(*(--iter)); | |
208 | } | |
209 | } | |
210 | ||
211 | /****/ | |
212 | void claim_log_and_clear_rollback_info(const pg_log_t& o) { | |
213 | // we must have already trimmed the old entries | |
214 | assert(rollback_info_trimmed_to == head); | |
215 | assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
216 | ||
217 | *this = IndexedLog(o); | |
218 | ||
219 | skip_can_rollback_to_to_head(); | |
220 | index(); | |
221 | } | |
222 | ||
223 | void split_out_child( | |
224 | pg_t child_pgid, | |
225 | unsigned split_bits, | |
226 | IndexedLog *target); | |
227 | ||
228 | void zero() { | |
229 | // we must have already trimmed the old entries | |
230 | assert(rollback_info_trimmed_to == head); | |
231 | assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
232 | ||
233 | unindex(); | |
234 | pg_log_t::clear(); | |
235 | rollback_info_trimmed_to_riter = log.rbegin(); | |
236 | reset_recovery_pointers(); | |
237 | } | |
238 | void clear() { | |
239 | skip_can_rollback_to_to_head(); | |
240 | zero(); | |
241 | } | |
242 | void reset_recovery_pointers() { | |
243 | complete_to = log.end(); | |
244 | last_requested = 0; | |
245 | } | |
246 | ||
247 | bool logged_object(const hobject_t& oid) const { | |
248 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
249 | index_objects(); | |
250 | } | |
251 | return objects.count(oid); | |
252 | } | |
253 | ||
254 | bool logged_req(const osd_reqid_t &r) const { | |
255 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { | |
256 | index_caller_ops(); | |
257 | } | |
258 | if (!caller_ops.count(r)) { | |
259 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
260 | index_extra_caller_ops(); | |
261 | } | |
262 | return extra_caller_ops.count(r); | |
263 | } | |
264 | return true; | |
265 | } | |
266 | ||
267 | bool get_request( | |
268 | const osd_reqid_t &r, | |
269 | eversion_t *version, | |
270 | version_t *user_version, | |
c07f9fc5 FG |
271 | int *return_code) const |
272 | { | |
7c673cae FG |
273 | assert(version); |
274 | assert(user_version); | |
275 | assert(return_code); | |
276 | ceph::unordered_map<osd_reqid_t,pg_log_entry_t*>::const_iterator p; | |
277 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { | |
278 | index_caller_ops(); | |
279 | } | |
280 | p = caller_ops.find(r); | |
281 | if (p != caller_ops.end()) { | |
282 | *version = p->second->version; | |
283 | *user_version = p->second->user_version; | |
284 | *return_code = p->second->return_code; | |
285 | return true; | |
286 | } | |
287 | ||
288 | // warning: we will return *a* request for this reqid, but not | |
289 | // necessarily the most recent. | |
290 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
291 | index_extra_caller_ops(); | |
292 | } | |
293 | p = extra_caller_ops.find(r); | |
294 | if (p != extra_caller_ops.end()) { | |
31f18b77 | 295 | for (auto i = p->second->extra_reqids.begin(); |
7c673cae FG |
296 | i != p->second->extra_reqids.end(); |
297 | ++i) { | |
298 | if (i->first == r) { | |
299 | *version = p->second->version; | |
300 | *user_version = i->second; | |
301 | *return_code = p->second->return_code; | |
302 | return true; | |
303 | } | |
304 | } | |
305 | assert(0 == "in extra_caller_ops but not extra_reqids"); | |
306 | } | |
c07f9fc5 FG |
307 | |
308 | if (!(indexed_data & PGLOG_INDEXED_DUPS)) { | |
309 | index_dups(); | |
310 | } | |
311 | auto q = dup_index.find(r); | |
312 | if (q != dup_index.end()) { | |
313 | *version = q->second->version; | |
314 | *user_version = q->second->user_version; | |
315 | *return_code = q->second->return_code; | |
316 | return true; | |
317 | } | |
318 | ||
7c673cae FG |
319 | return false; |
320 | } | |
321 | ||
322 | /// get a (bounded) list of recent reqids for the given object | |
323 | void get_object_reqids(const hobject_t& oid, unsigned max, | |
31f18b77 | 324 | mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > *pls) const { |
7c673cae FG |
325 | // make sure object is present at least once before we do an |
326 | // O(n) search. | |
327 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
328 | index_objects(); | |
329 | } | |
330 | if (objects.count(oid) == 0) | |
331 | return; | |
332 | for (list<pg_log_entry_t>::const_reverse_iterator i = log.rbegin(); | |
333 | i != log.rend(); | |
334 | ++i) { | |
335 | if (i->soid == oid) { | |
336 | if (i->reqid_is_indexed()) | |
337 | pls->push_back(make_pair(i->reqid, i->user_version)); | |
338 | pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); | |
339 | if (pls->size() >= max) { | |
340 | if (pls->size() > max) { | |
341 | pls->resize(max); | |
342 | } | |
343 | return; | |
344 | } | |
345 | } | |
346 | } | |
347 | } | |
c07f9fc5 | 348 | |
7c673cae | 349 | void index(__u16 to_index = PGLOG_INDEXED_ALL) const { |
c07f9fc5 FG |
350 | // if to_index is 0, no need to run any of this code, especially |
351 | // loop below; this can happen with copy constructor for | |
352 | // IndexedLog (and indirectly through assignment operator) | |
353 | if (!to_index) return; | |
354 | ||
7c673cae FG |
355 | if (to_index & PGLOG_INDEXED_OBJECTS) |
356 | objects.clear(); | |
357 | if (to_index & PGLOG_INDEXED_CALLER_OPS) | |
358 | caller_ops.clear(); | |
359 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) | |
360 | extra_caller_ops.clear(); | |
c07f9fc5 FG |
361 | if (to_index & PGLOG_INDEXED_DUPS) { |
362 | dup_index.clear(); | |
363 | for (auto& i : dups) { | |
364 | dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i); | |
365 | } | |
366 | } | |
7c673cae | 367 | |
c07f9fc5 FG |
368 | constexpr __u16 any_log_entry_index = |
369 | PGLOG_INDEXED_OBJECTS | | |
370 | PGLOG_INDEXED_CALLER_OPS | | |
371 | PGLOG_INDEXED_EXTRA_CALLER_OPS; | |
372 | ||
373 | if (to_index & any_log_entry_index) { | |
374 | for (list<pg_log_entry_t>::const_iterator i = log.begin(); | |
375 | i != log.end(); | |
376 | ++i) { | |
377 | if (to_index & PGLOG_INDEXED_OBJECTS) { | |
378 | if (i->object_is_indexed()) { | |
379 | objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i)); | |
380 | } | |
7c673cae | 381 | } |
7c673cae | 382 | |
c07f9fc5 FG |
383 | if (to_index & PGLOG_INDEXED_CALLER_OPS) { |
384 | if (i->reqid_is_indexed()) { | |
385 | caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i)); | |
386 | } | |
7c673cae | 387 | } |
c07f9fc5 FG |
388 | |
389 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
390 | for (auto j = i->extra_reqids.begin(); | |
391 | j != i->extra_reqids.end(); | |
392 | ++j) { | |
393 | extra_caller_ops.insert( | |
394 | make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i)))); | |
395 | } | |
7c673cae FG |
396 | } |
397 | } | |
398 | } | |
c07f9fc5 | 399 | |
7c673cae FG |
400 | indexed_data |= to_index; |
401 | } | |
402 | ||
403 | void index_objects() const { | |
404 | index(PGLOG_INDEXED_OBJECTS); | |
405 | } | |
406 | ||
407 | void index_caller_ops() const { | |
408 | index(PGLOG_INDEXED_CALLER_OPS); | |
409 | } | |
410 | ||
411 | void index_extra_caller_ops() const { | |
412 | index(PGLOG_INDEXED_EXTRA_CALLER_OPS); | |
413 | } | |
414 | ||
c07f9fc5 FG |
415 | void index_dups() const { |
416 | index(PGLOG_INDEXED_DUPS); | |
417 | } | |
418 | ||
7c673cae FG |
419 | void index(pg_log_entry_t& e) { |
420 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
421 | if (objects.count(e.soid) == 0 || | |
422 | objects[e.soid]->version < e.version) | |
423 | objects[e.soid] = &e; | |
424 | } | |
425 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
426 | // divergent merge_log indexes new before unindexing old | |
427 | if (e.reqid_is_indexed()) { | |
428 | caller_ops[e.reqid] = &e; | |
429 | } | |
430 | } | |
431 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 432 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
433 | j != e.extra_reqids.end(); |
434 | ++j) { | |
435 | extra_caller_ops.insert(make_pair(j->first, &e)); | |
436 | } | |
437 | } | |
438 | } | |
c07f9fc5 | 439 | |
7c673cae FG |
440 | void unindex() { |
441 | objects.clear(); | |
442 | caller_ops.clear(); | |
443 | extra_caller_ops.clear(); | |
c07f9fc5 | 444 | dup_index.clear(); |
7c673cae FG |
445 | indexed_data = 0; |
446 | } | |
c07f9fc5 FG |
447 | |
448 | void unindex(const pg_log_entry_t& e) { | |
7c673cae FG |
449 | // NOTE: this only works if we remove from the _tail_ of the log! |
450 | if (indexed_data & PGLOG_INDEXED_OBJECTS) { | |
451 | if (objects.count(e.soid) && objects[e.soid]->version == e.version) | |
452 | objects.erase(e.soid); | |
453 | } | |
454 | if (e.reqid_is_indexed()) { | |
455 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
456 | // divergent merge_log indexes new before unindexing old | |
457 | if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e) | |
c07f9fc5 | 458 | caller_ops.erase(e.reqid); |
7c673cae FG |
459 | } |
460 | } | |
461 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 462 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
463 | j != e.extra_reqids.end(); |
464 | ++j) { | |
465 | for (ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*>::iterator k = | |
466 | extra_caller_ops.find(j->first); | |
467 | k != extra_caller_ops.end() && k->first == j->first; | |
468 | ++k) { | |
469 | if (k->second == &e) { | |
470 | extra_caller_ops.erase(k); | |
471 | break; | |
472 | } | |
473 | } | |
474 | } | |
475 | } | |
476 | } | |
477 | ||
c07f9fc5 | 478 | void index(pg_log_dup_t& e) { |
d2e6a577 | 479 | if (indexed_data & PGLOG_INDEXED_DUPS) { |
c07f9fc5 FG |
480 | dup_index[e.reqid] = &e; |
481 | } | |
482 | } | |
483 | ||
484 | void unindex(const pg_log_dup_t& e) { | |
d2e6a577 | 485 | if (indexed_data & PGLOG_INDEXED_DUPS) { |
c07f9fc5 FG |
486 | auto i = dup_index.find(e.reqid); |
487 | if (i != dup_index.end()) { | |
488 | dup_index.erase(i); | |
489 | } | |
490 | } | |
491 | } | |
492 | ||
7c673cae FG |
493 | // actors |
494 | void add(const pg_log_entry_t& e, bool applied = true) { | |
495 | if (!applied) { | |
496 | assert(get_can_rollback_to() == head); | |
497 | } | |
498 | ||
31f18b77 FG |
499 | // make sure our buffers don't pin bigger buffers |
500 | e.mod_desc.trim_bl(); | |
501 | ||
7c673cae FG |
502 | // add to log |
503 | log.push_back(e); | |
504 | ||
505 | // riter previously pointed to the previous entry | |
506 | if (rollback_info_trimmed_to_riter == log.rbegin()) | |
507 | ++rollback_info_trimmed_to_riter; | |
508 | ||
509 | assert(e.version > head); | |
510 | assert(head.version == 0 || e.version.version > head.version); | |
511 | head = e.version; | |
512 | ||
513 | // to our index | |
514 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
515 | objects[e.soid] = &(log.back()); | |
516 | } | |
517 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
518 | if (e.reqid_is_indexed()) { | |
519 | caller_ops[e.reqid] = &(log.back()); | |
520 | } | |
521 | } | |
c07f9fc5 | 522 | |
7c673cae | 523 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { |
31f18b77 | 524 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
525 | j != e.extra_reqids.end(); |
526 | ++j) { | |
527 | extra_caller_ops.insert(make_pair(j->first, &(log.back()))); | |
528 | } | |
529 | } | |
530 | ||
531 | if (!applied) { | |
532 | skip_can_rollback_to_to_head(); | |
533 | } | |
c07f9fc5 | 534 | } // add |
7c673cae FG |
535 | |
536 | void trim( | |
537 | CephContext* cct, | |
538 | eversion_t s, | |
c07f9fc5 FG |
539 | set<eversion_t> *trimmed, |
540 | set<string>* trimmed_dups, | |
541 | bool* dirty_dups); | |
7c673cae FG |
542 | |
543 | ostream& print(ostream& out) const; | |
c07f9fc5 | 544 | }; // IndexedLog |
7c673cae FG |
545 | |
546 | ||
547 | protected: | |
548 | //////////////////// data members //////////////////// | |
549 | ||
550 | pg_missing_tracker_t missing; | |
551 | IndexedLog log; | |
552 | ||
553 | eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to | |
554 | eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from | |
555 | eversion_t writeout_from; ///< must writout keys >= writeout_from | |
556 | set<eversion_t> trimmed; ///< must clear keys in trimmed | |
c07f9fc5 | 557 | set<string> trimmed_dups; ///< must clear keys in trimmed_dups |
7c673cae FG |
558 | CephContext *cct; |
559 | bool pg_log_debug; | |
560 | /// Log is clean on [dirty_to, dirty_from) | |
561 | bool touched_log; | |
562 | bool clear_divergent_priors; | |
c07f9fc5 FG |
563 | bool dirty_dups; /// log.dups is updated |
564 | bool rebuilt_missing_with_deletes = false; | |
7c673cae FG |
565 | |
566 | void mark_dirty_to(eversion_t to) { | |
567 | if (to > dirty_to) | |
568 | dirty_to = to; | |
569 | } | |
570 | void mark_dirty_from(eversion_t from) { | |
571 | if (from < dirty_from) | |
572 | dirty_from = from; | |
573 | } | |
574 | void mark_writeout_from(eversion_t from) { | |
575 | if (from < writeout_from) | |
576 | writeout_from = from; | |
577 | } | |
578 | public: | |
579 | bool is_dirty() const { | |
580 | return !touched_log || | |
581 | (dirty_to != eversion_t()) || | |
582 | (dirty_from != eversion_t::max()) || | |
583 | (writeout_from != eversion_t::max()) || | |
584 | !(trimmed.empty()) || | |
c07f9fc5 FG |
585 | !missing.is_clean() || |
586 | !(trimmed_dups.empty()) || | |
587 | dirty_dups || | |
588 | rebuilt_missing_with_deletes; | |
7c673cae FG |
589 | } |
590 | void mark_log_for_rewrite() { | |
591 | mark_dirty_to(eversion_t::max()); | |
592 | mark_dirty_from(eversion_t()); | |
593 | touched_log = false; | |
594 | } | |
c07f9fc5 FG |
595 | bool get_rebuilt_missing_with_deletes() const { |
596 | return rebuilt_missing_with_deletes; | |
597 | } | |
7c673cae FG |
598 | protected: |
599 | ||
600 | /// DEBUG | |
601 | set<string> log_keys_debug; | |
/// Erase every key >= lb from *log_keys_debug; a null set is ignored.
static void clear_after(set<string> *log_keys_debug, const string &lb) {
  if (log_keys_debug) {
    log_keys_debug->erase(log_keys_debug->lower_bound(lb),
                          log_keys_debug->end());
  }
}
/// Erase every key < ub from *log_keys_debug; a null set is ignored.
static void clear_up_to(set<string> *log_keys_debug, const string &ub) {
  if (log_keys_debug) {
    // the set is ordered, so the keys below ub form a prefix
    log_keys_debug->erase(log_keys_debug->begin(),
                          log_keys_debug->lower_bound(ub));
  }
}
616 | ||
617 | void check(); | |
618 | void undirty() { | |
619 | dirty_to = eversion_t(); | |
620 | dirty_from = eversion_t::max(); | |
621 | touched_log = true; | |
622 | trimmed.clear(); | |
c07f9fc5 | 623 | trimmed_dups.clear(); |
7c673cae FG |
624 | writeout_from = eversion_t::max(); |
625 | check(); | |
626 | missing.flush(); | |
c07f9fc5 | 627 | dirty_dups = false; |
7c673cae FG |
628 | } |
629 | public: | |
c07f9fc5 | 630 | |
7c673cae | 631 | // cppcheck-suppress noExplicitConstructor |
c07f9fc5 | 632 | PGLog(CephContext *cct, DoutPrefixProvider *dpp = nullptr) : |
7c673cae FG |
633 | prefix_provider(dpp), |
634 | dirty_from(eversion_t::max()), | |
635 | writeout_from(eversion_t::max()), | |
636 | cct(cct), | |
637 | pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), | |
638 | touched_log(false), | |
c07f9fc5 FG |
639 | clear_divergent_priors(false), |
640 | dirty_dups(false) | |
641 | { } | |
7c673cae FG |
642 | |
643 | void reset_backfill(); | |
644 | ||
645 | void clear(); | |
646 | ||
647 | //////////////////// get or set missing //////////////////// | |
648 | ||
649 | const pg_missing_tracker_t& get_missing() const { return missing; } | |
650 | void revise_have(hobject_t oid, eversion_t have) { | |
651 | missing.revise_have(oid, have); | |
652 | } | |
653 | ||
7c673cae | 654 | void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) { |
c07f9fc5 | 655 | missing.add(oid, need, have, false); |
7c673cae FG |
656 | } |
657 | ||
7c673cae FG |
658 | //////////////////// get or set log //////////////////// |
659 | ||
660 | const IndexedLog &get_log() const { return log; } | |
661 | ||
662 | const eversion_t &get_tail() const { return log.tail; } | |
663 | ||
664 | void set_tail(eversion_t tail) { log.tail = tail; } | |
665 | ||
666 | const eversion_t &get_head() const { return log.head; } | |
667 | ||
668 | void set_head(eversion_t head) { log.head = head; } | |
669 | ||
670 | void set_last_requested(version_t last_requested) { | |
671 | log.last_requested = last_requested; | |
672 | } | |
673 | ||
674 | void index() { log.index(); } | |
675 | ||
676 | void unindex() { log.unindex(); } | |
677 | ||
678 | void add(const pg_log_entry_t& e, bool applied = true) { | |
679 | mark_writeout_from(e.version); | |
680 | log.add(e, applied); | |
681 | } | |
682 | ||
683 | void reset_recovery_pointers() { log.reset_recovery_pointers(); } | |
684 | ||
685 | static void clear_info_log( | |
686 | spg_t pgid, | |
687 | ObjectStore::Transaction *t); | |
688 | ||
689 | void trim( | |
690 | eversion_t trim_to, | |
691 | pg_info_t &info); | |
692 | ||
693 | void roll_forward_to( | |
694 | eversion_t roll_forward_to, | |
695 | LogEntryHandler *h) { | |
696 | log.roll_forward_to( | |
697 | roll_forward_to, | |
698 | h); | |
699 | } | |
700 | ||
701 | eversion_t get_can_rollback_to() const { | |
702 | return log.get_can_rollback_to(); | |
703 | } | |
704 | ||
705 | void roll_forward(LogEntryHandler *h) { | |
706 | roll_forward_to( | |
707 | log.head, | |
708 | h); | |
709 | } | |
710 | ||
711 | //////////////////// get or set log & missing //////////////////// | |
712 | ||
713 | void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { | |
714 | log.trim_rollback_info_to(log.head, h); | |
715 | log.claim_log_and_clear_rollback_info(o); | |
716 | missing.clear(); | |
717 | mark_dirty_to(eversion_t::max()); | |
718 | } | |
719 | ||
720 | void split_into( | |
721 | pg_t child_pgid, | |
722 | unsigned split_bits, | |
c07f9fc5 | 723 | PGLog *opg_log) { |
7c673cae FG |
724 | log.split_out_child(child_pgid, split_bits, &opg_log->log); |
725 | missing.split_into(child_pgid, split_bits, &(opg_log->missing)); | |
726 | opg_log->mark_dirty_to(eversion_t::max()); | |
727 | mark_dirty_to(eversion_t::max()); | |
c07f9fc5 FG |
728 | if (missing.may_include_deletes) |
729 | opg_log->rebuilt_missing_with_deletes = true; | |
7c673cae FG |
730 | } |
731 | ||
732 | void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { | |
733 | if (missing.is_missing(oid, v)) { | |
734 | missing.got(oid, v); | |
c07f9fc5 | 735 | |
7c673cae FG |
736 | // raise last_complete? |
737 | if (missing.get_items().empty()) { | |
738 | log.complete_to = log.log.end(); | |
739 | info.last_complete = info.last_update; | |
740 | } | |
741 | while (log.complete_to != log.log.end()) { | |
742 | if (missing.get_items().at( | |
743 | missing.get_rmissing().begin()->second | |
744 | ).need <= log.complete_to->version) | |
745 | break; | |
746 | if (info.last_complete < log.complete_to->version) | |
747 | info.last_complete = log.complete_to->version; | |
748 | ++log.complete_to; | |
749 | } | |
750 | } | |
751 | ||
752 | assert(log.get_can_rollback_to() >= v); | |
753 | } | |
754 | ||
c07f9fc5 | 755 | void reset_complete_to(pg_info_t *info) { |
7c673cae | 756 | log.complete_to = log.log.begin(); |
c07f9fc5 | 757 | while (!missing.get_items().empty() && log.complete_to->version < |
7c673cae FG |
758 | missing.get_items().at( |
759 | missing.get_rmissing().begin()->second | |
c07f9fc5 | 760 | ).need) { |
d2e6a577 | 761 | assert(log.complete_to != log.log.end()); |
7c673cae | 762 | ++log.complete_to; |
c07f9fc5 | 763 | } |
7c673cae FG |
764 | assert(log.complete_to != log.log.end()); |
765 | if (log.complete_to == log.log.begin()) { | |
c07f9fc5 FG |
766 | if (info) |
767 | info->last_complete = eversion_t(); | |
7c673cae FG |
768 | } else { |
769 | --log.complete_to; | |
c07f9fc5 FG |
770 | if (info) |
771 | info->last_complete = log.complete_to->version; | |
7c673cae FG |
772 | ++log.complete_to; |
773 | } | |
c07f9fc5 FG |
774 | } |
775 | ||
776 | void activate_not_complete(pg_info_t &info) { | |
777 | reset_complete_to(&info); | |
7c673cae FG |
778 | log.last_requested = 0; |
779 | } | |
780 | ||
781 | void proc_replica_log(pg_info_t &oinfo, | |
782 | const pg_log_t &olog, | |
783 | pg_missing_t& omissing, pg_shard_t from) const; | |
784 | ||
c07f9fc5 FG |
785 | void rebuild_missing_set_with_deletes(ObjectStore *store, |
786 | coll_t pg_coll, | |
787 | const pg_info_t &info); | |
788 | ||
7c673cae FG |
789 | protected: |
790 | static void split_by_object( | |
31f18b77 FG |
791 | mempool::osd_pglog::list<pg_log_entry_t> &entries, |
792 | map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) { | |
7c673cae | 793 | while (!entries.empty()) { |
31f18b77 | 794 | auto &out_list = (*out_entries)[entries.front().soid]; |
7c673cae FG |
795 | out_list.splice(out_list.end(), entries, entries.begin()); |
796 | } | |
797 | } | |
798 | ||
799 | /** | |
800 | * _merge_object_divergent_entries | |
801 | * | |
802 | * There are 5 distinct cases: | |
803 | * 1) There is a more recent update: in this case we assume we adjusted the | |
804 | * store and missing during merge_log | |
805 | * 2) The first entry in the divergent sequence is a create. This might | |
806 | * either be because the object is a clone or because prior_version is | |
807 | * eversion_t(). In this case the object does not exist and we must | |
808 | * adjust missing and the store to match. | |
809 | * 3) We are currently missing the object. In this case, we adjust the | |
810 | * missing to our prior_version taking care to add a divergent_prior | |
811 | * if necessary | |
812 | * 4) We can rollback all of the entries. In this case, we do so using | |
813 | * the rollbacker and return -- the object does not go into missing. | |
814 | * 5) We cannot rollback at least 1 of the entries. In this case, we | |
815 | * clear the object out of the store and add a missing entry at | |
816 | * prior_version taking care to add a divergent_prior if | |
817 | * necessary. | |
818 | */ | |
819 | template <typename missing_type> | |
820 | static void _merge_object_divergent_entries( | |
821 | const IndexedLog &log, ///< [in] log to merge against | |
822 | const hobject_t &hoid, ///< [in] object we are merging | |
31f18b77 | 823 | const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge |
7c673cae FG |
824 | const pg_info_t &info, ///< [in] info for merging entries |
825 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary | |
c07f9fc5 | 826 | missing_type &missing, ///< [in,out] missing to adjust, use |
7c673cae FG |
827 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object |
828 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
829 | ) { | |
830 | ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid | |
31f18b77 | 831 | << " entries: " << orig_entries << dendl; |
7c673cae FG |
832 | |
833 | if (hoid > info.last_backfill) { | |
834 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" | |
835 | << dendl; | |
836 | return; | |
837 | } | |
838 | ||
839 | // entries is non-empty | |
31f18b77 FG |
840 | assert(!orig_entries.empty()); |
841 | // strip out and ignore ERROR entries | |
842 | mempool::osd_pglog::list<pg_log_entry_t> entries; | |
7c673cae | 843 | eversion_t last; |
d2e6a577 | 844 | bool seen_non_error = false; |
31f18b77 FG |
845 | for (list<pg_log_entry_t>::const_iterator i = orig_entries.begin(); |
846 | i != orig_entries.end(); | |
7c673cae FG |
847 | ++i) { |
848 | // all entries are on hoid | |
849 | assert(i->soid == hoid); | |
d2e6a577 FG |
850 | // did not see error entries before this entry and this entry is not error |
851 | // then this entry is the first non error entry | |
852 | bool first_non_error = ! seen_non_error && ! i->is_error(); | |
853 | if (! i->is_error() ) { | |
854 | // see a non error entry now | |
855 | seen_non_error = true; | |
856 | } | |
857 | ||
858 | // No need to check the first entry since it prior_version is unavailable | |
859 | // in the list | |
860 | // No need to check if the prior_version is the minimal version | |
861 | // No need to check the first non-error entry since the leading error | |
862 | // entries are not its prior version | |
863 | if (i != orig_entries.begin() && i->prior_version != eversion_t() && | |
864 | ! first_non_error) { | |
7c673cae FG |
865 | // in increasing order of version |
866 | assert(i->version > last); | |
31f18b77 FG |
867 | // prior_version correct (unless it is an ERROR entry) |
868 | assert(i->prior_version == last || i->is_error()); | |
7c673cae | 869 | } |
31f18b77 FG |
870 | if (i->is_error()) { |
871 | ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; | |
872 | } else { | |
873 | ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; | |
874 | entries.push_back(*i); | |
d2e6a577 | 875 | last = i->version; |
31f18b77 FG |
876 | } |
877 | } | |
878 | if (entries.empty()) { | |
879 | ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; | |
880 | return; | |
7c673cae FG |
881 | } |
882 | ||
883 | const eversion_t prior_version = entries.begin()->prior_version; | |
884 | const eversion_t first_divergent_update = entries.begin()->version; | |
885 | const eversion_t last_divergent_update = entries.rbegin()->version; | |
886 | const bool object_not_in_store = | |
887 | !missing.is_missing(hoid) && | |
888 | entries.rbegin()->is_delete(); | |
889 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
890 | << " prior_version: " << prior_version | |
891 | << " first_divergent_update: " << first_divergent_update | |
892 | << " last_divergent_update: " << last_divergent_update | |
893 | << dendl; | |
894 | ||
895 | ceph::unordered_map<hobject_t, pg_log_entry_t*>::const_iterator objiter = | |
896 | log.objects.find(hoid); | |
897 | if (objiter != log.objects.end() && | |
898 | objiter->second->version >= first_divergent_update) { | |
899 | /// Case 1) | |
900 | ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " | |
901 | << *objiter->second << ", already merged" << dendl; | |
902 | ||
903 | assert(objiter->second->version > last_divergent_update); | |
904 | ||
905 | // ensure missing has been updated appropriately | |
c07f9fc5 FG |
906 | if (objiter->second->is_update() || |
907 | (missing.may_include_deletes && objiter->second->is_delete())) { | |
7c673cae FG |
908 | assert(missing.is_missing(hoid) && |
909 | missing.get_items().at(hoid).need == objiter->second->version); | |
910 | } else { | |
911 | assert(!missing.is_missing(hoid)); | |
912 | } | |
913 | missing.revise_have(hoid, eversion_t()); | |
914 | if (rollbacker) { | |
915 | if (!object_not_in_store) { | |
916 | rollbacker->remove(hoid); | |
917 | } | |
918 | for (auto &&i: entries) { | |
919 | rollbacker->trim(i); | |
920 | } | |
921 | } | |
922 | return; | |
923 | } | |
924 | ||
925 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
926 | <<" has no more recent entries in log" << dendl; | |
927 | if (prior_version == eversion_t() || entries.front().is_clone()) { | |
928 | /// Case 2) | |
929 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
930 | << " prior_version or op type indicates creation," | |
931 | << " deleting" | |
932 | << dendl; | |
933 | if (missing.is_missing(hoid)) | |
934 | missing.rm(missing.get_items().find(hoid)); | |
935 | if (rollbacker) { | |
936 | if (!object_not_in_store) { | |
937 | rollbacker->remove(hoid); | |
938 | } | |
939 | for (auto &&i: entries) { | |
940 | rollbacker->trim(i); | |
941 | } | |
942 | } | |
943 | return; | |
944 | } | |
945 | ||
946 | if (missing.is_missing(hoid)) { | |
947 | /// Case 3) | |
948 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
949 | << " missing, " << missing.get_items().at(hoid) | |
950 | << " adjusting" << dendl; | |
951 | ||
952 | if (missing.get_items().at(hoid).have == prior_version) { | |
953 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
954 | << " missing.have is prior_version " << prior_version | |
955 | << " removing from missing" << dendl; | |
956 | missing.rm(missing.get_items().find(hoid)); | |
957 | } else { | |
958 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
959 | << " missing.have is " << missing.get_items().at(hoid).have | |
960 | << ", adjusting" << dendl; | |
c07f9fc5 | 961 | missing.revise_need(hoid, prior_version, false); |
7c673cae FG |
962 | if (prior_version <= info.log_tail) { |
963 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
964 | << " prior_version " << prior_version | |
965 | << " <= info.log_tail " | |
966 | << info.log_tail << dendl; | |
967 | } | |
968 | } | |
969 | if (rollbacker) { | |
970 | for (auto &&i: entries) { | |
971 | rollbacker->trim(i); | |
972 | } | |
973 | } | |
974 | return; | |
975 | } | |
976 | ||
977 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
978 | << " must be rolled back or recovered," | |
979 | << " attempting to rollback" | |
980 | << dendl; | |
981 | bool can_rollback = true; | |
982 | /// Distinguish between 4) and 5) | |
983 | for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); | |
984 | i != entries.rend(); | |
985 | ++i) { | |
986 | if (!i->can_rollback() || i->version <= olog_can_rollback_to) { | |
987 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " | |
988 | << *i << dendl; | |
989 | can_rollback = false; | |
990 | break; | |
991 | } | |
992 | } | |
993 | ||
994 | if (can_rollback) { | |
995 | /// Case 4) | |
996 | for (list<pg_log_entry_t>::const_reverse_iterator i = entries.rbegin(); | |
997 | i != entries.rend(); | |
998 | ++i) { | |
999 | assert(i->can_rollback() && i->version > olog_can_rollback_to); | |
1000 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1001 | << " rolling back " << *i << dendl; | |
1002 | if (rollbacker) | |
1003 | rollbacker->rollback(*i); | |
1004 | } | |
1005 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1006 | << " rolled back" << dendl; | |
1007 | return; | |
1008 | } else { | |
1009 | /// Case 5) | |
1010 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " | |
1011 | << "removing and adding to missing" << dendl; | |
1012 | if (rollbacker) { | |
1013 | if (!object_not_in_store) | |
1014 | rollbacker->remove(hoid); | |
1015 | for (auto &&i: entries) { | |
1016 | rollbacker->trim(i); | |
1017 | } | |
1018 | } | |
c07f9fc5 | 1019 | missing.add(hoid, prior_version, eversion_t(), false); |
7c673cae FG |
1020 | if (prior_version <= info.log_tail) { |
1021 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1022 | << " prior_version " << prior_version | |
1023 | << " <= info.log_tail " | |
1024 | << info.log_tail << dendl; | |
1025 | } | |
1026 | } | |
1027 | } | |
1028 | ||
1029 | /// Merge all entries using above | |
1030 | template <typename missing_type> | |
1031 | static void _merge_divergent_entries( | |
1032 | const IndexedLog &log, ///< [in] log to merge against | |
31f18b77 | 1033 | mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge |
7c673cae FG |
1034 | const pg_info_t &oinfo, ///< [in] info for merging entries |
1035 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary | |
1036 | missing_type &omissing, ///< [in,out] missing to adjust, use | |
1037 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object | |
1038 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
1039 | ) { | |
31f18b77 | 1040 | map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split; |
7c673cae | 1041 | split_by_object(entries, &split); |
31f18b77 | 1042 | for (map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>>::iterator i = split.begin(); |
7c673cae FG |
1043 | i != split.end(); |
1044 | ++i) { | |
1045 | _merge_object_divergent_entries( | |
1046 | log, | |
1047 | i->first, | |
1048 | i->second, | |
1049 | oinfo, | |
1050 | olog_can_rollback_to, | |
1051 | omissing, | |
1052 | rollbacker, | |
1053 | dpp); | |
1054 | } | |
1055 | } | |
1056 | ||
1057 | /** | |
1058 | * Exists for use in TestPGLog for simply testing single divergent log | |
1059 | * cases | |
1060 | */ | |
1061 | void merge_old_entry( | |
1062 | ObjectStore::Transaction& t, | |
1063 | const pg_log_entry_t& oe, | |
1064 | const pg_info_t& info, | |
1065 | LogEntryHandler *rollbacker) { | |
31f18b77 | 1066 | mempool::osd_pglog::list<pg_log_entry_t> entries; |
7c673cae FG |
1067 | entries.push_back(oe); |
1068 | _merge_object_divergent_entries( | |
1069 | log, | |
1070 | oe.soid, | |
1071 | entries, | |
1072 | info, | |
1073 | log.get_can_rollback_to(), | |
1074 | missing, | |
1075 | rollbacker, | |
1076 | this); | |
1077 | } | |
c07f9fc5 FG |
1078 | |
1079 | bool merge_log_dups(const pg_log_t& olog); | |
1080 | ||
7c673cae | 1081 | public: |
c07f9fc5 | 1082 | |
7c673cae FG |
1083 | void rewind_divergent_log(eversion_t newhead, |
1084 | pg_info_t &info, | |
1085 | LogEntryHandler *rollbacker, | |
1086 | bool &dirty_info, | |
1087 | bool &dirty_big_info); | |
1088 | ||
1089 | void merge_log(pg_info_t &oinfo, | |
1090 | pg_log_t &olog, | |
1091 | pg_shard_t from, | |
1092 | pg_info_t &info, LogEntryHandler *rollbacker, | |
1093 | bool &dirty_info, bool &dirty_big_info); | |
1094 | ||
1095 | template <typename missing_type> | |
1096 | static bool append_log_entries_update_missing( | |
1097 | const hobject_t &last_backfill, | |
1098 | bool last_backfill_bitwise, | |
31f18b77 | 1099 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
1100 | bool maintain_rollback, |
1101 | IndexedLog *log, | |
1102 | missing_type &missing, | |
1103 | LogEntryHandler *rollbacker, | |
1104 | const DoutPrefixProvider *dpp) { | |
1105 | bool invalidate_stats = false; | |
1106 | if (log && !entries.empty()) { | |
1107 | assert(log->head < entries.begin()->version); | |
1108 | } | |
1109 | for (list<pg_log_entry_t>::const_iterator p = entries.begin(); | |
1110 | p != entries.end(); | |
1111 | ++p) { | |
1112 | invalidate_stats = invalidate_stats || !p->is_error(); | |
1113 | if (log) { | |
1114 | ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; | |
1115 | log->add(*p); | |
1116 | } | |
1117 | if (p->soid <= last_backfill && | |
1118 | !p->is_error()) { | |
c07f9fc5 FG |
1119 | if (missing.may_include_deletes) { |
1120 | missing.add_next_event(*p); | |
1121 | } else { | |
1122 | if (p->is_delete()) { | |
1123 | missing.rm(p->soid, p->version); | |
1124 | } else { | |
1125 | missing.add_next_event(*p); | |
1126 | } | |
1127 | if (rollbacker) { | |
1128 | // hack to match PG::mark_all_unfound_lost | |
1129 | if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { | |
1130 | rollbacker->try_stash(p->soid, p->version.version); | |
1131 | } else if (p->is_delete()) { | |
1132 | rollbacker->remove(p->soid); | |
1133 | } | |
7c673cae FG |
1134 | } |
1135 | } | |
1136 | } | |
1137 | } | |
1138 | return invalidate_stats; | |
1139 | } | |
1140 | bool append_new_log_entries( | |
1141 | const hobject_t &last_backfill, | |
1142 | bool last_backfill_bitwise, | |
31f18b77 | 1143 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
1144 | LogEntryHandler *rollbacker) { |
1145 | bool invalidate_stats = append_log_entries_update_missing( | |
1146 | last_backfill, | |
1147 | last_backfill_bitwise, | |
1148 | entries, | |
1149 | true, | |
1150 | &log, | |
1151 | missing, | |
1152 | rollbacker, | |
1153 | this); | |
1154 | if (!entries.empty()) { | |
1155 | mark_writeout_from(entries.begin()->version); | |
c07f9fc5 FG |
1156 | if (entries.begin()->is_lost_delete()) { |
1157 | // hack: since lost deletes queue recovery directly, and don't | |
1158 | // go through activate_not_complete() again, our complete_to | |
1159 | // iterator may still point at log.end(). Reset it to point | |
1160 | // before these new lost_delete entries. This only occurs | |
1161 | // when lost+delete entries are initially added, which is | |
1162 | // always in a list of solely lost_delete entries, so it is | |
1163 | // sufficient to check whether the first entry is a | |
1164 | // lost_delete | |
1165 | reset_complete_to(nullptr); | |
1166 | } | |
7c673cae FG |
1167 | } |
1168 | return invalidate_stats; | |
1169 | } | |
1170 | ||
c07f9fc5 FG |
1171 | void write_log_and_missing( |
1172 | ObjectStore::Transaction& t, | |
1173 | map<string,bufferlist> *km, | |
1174 | const coll_t& coll, | |
1175 | const ghobject_t &log_oid, | |
1176 | bool require_rollback); | |
7c673cae FG |
1177 | |
1178 | static void write_log_and_missing_wo_missing( | |
1179 | ObjectStore::Transaction& t, | |
1180 | map<string,bufferlist>* km, | |
1181 | pg_log_t &log, | |
1182 | const coll_t& coll, | |
1183 | const ghobject_t &log_oid, map<eversion_t, hobject_t> &divergent_priors, | |
c07f9fc5 FG |
1184 | bool require_rollback, |
1185 | bool dirty_dups); | |
7c673cae FG |
1186 | |
1187 | static void write_log_and_missing( | |
1188 | ObjectStore::Transaction& t, | |
1189 | map<string,bufferlist>* km, | |
1190 | pg_log_t &log, | |
1191 | const coll_t& coll, | |
1192 | const ghobject_t &log_oid, | |
1193 | const pg_missing_tracker_t &missing, | |
c07f9fc5 FG |
1194 | bool require_rollback, |
1195 | bool dirty_dups, | |
1196 | bool *rebuilt_missing_set_with_deletes); | |
7c673cae FG |
1197 | |
1198 | static void _write_log_and_missing_wo_missing( | |
1199 | ObjectStore::Transaction& t, | |
1200 | map<string,bufferlist>* km, | |
1201 | pg_log_t &log, | |
1202 | const coll_t& coll, const ghobject_t &log_oid, | |
1203 | map<eversion_t, hobject_t> &divergent_priors, | |
1204 | eversion_t dirty_to, | |
1205 | eversion_t dirty_from, | |
1206 | eversion_t writeout_from, | |
1207 | const set<eversion_t> &trimmed, | |
c07f9fc5 | 1208 | const set<string> &trimmed_dups, |
7c673cae FG |
1209 | bool dirty_divergent_priors, |
1210 | bool touch_log, | |
1211 | bool require_rollback, | |
c07f9fc5 | 1212 | bool dirty_dups, |
7c673cae FG |
1213 | set<string> *log_keys_debug |
1214 | ); | |
1215 | ||
1216 | static void _write_log_and_missing( | |
1217 | ObjectStore::Transaction& t, | |
1218 | map<string,bufferlist>* km, | |
1219 | pg_log_t &log, | |
1220 | const coll_t& coll, const ghobject_t &log_oid, | |
1221 | eversion_t dirty_to, | |
1222 | eversion_t dirty_from, | |
1223 | eversion_t writeout_from, | |
1224 | const set<eversion_t> &trimmed, | |
c07f9fc5 | 1225 | const set<string> &trimmed_dups, |
7c673cae FG |
1226 | const pg_missing_tracker_t &missing, |
1227 | bool touch_log, | |
1228 | bool require_rollback, | |
1229 | bool clear_divergent_priors, | |
c07f9fc5 FG |
1230 | bool dirty_dups, |
1231 | bool *rebuilt_missing_with_deletes, | |
7c673cae FG |
1232 | set<string> *log_keys_debug |
1233 | ); | |
1234 | ||
1235 | void read_log_and_missing( | |
c07f9fc5 FG |
1236 | ObjectStore *store, |
1237 | coll_t pg_coll, | |
1238 | coll_t log_coll, | |
1239 | ghobject_t log_oid, | |
7c673cae | 1240 | const pg_info_t &info, |
d2e6a577 | 1241 | bool force_rebuild_missing, |
7c673cae FG |
1242 | ostringstream &oss, |
1243 | bool tolerate_divergent_missing_log, | |
1244 | bool debug_verify_stored_missing = false | |
1245 | ) { | |
1246 | return read_log_and_missing( | |
1247 | store, pg_coll, log_coll, log_oid, info, | |
d2e6a577 | 1248 | log, missing, force_rebuild_missing, oss, |
7c673cae FG |
1249 | tolerate_divergent_missing_log, |
1250 | &clear_divergent_priors, | |
1251 | this, | |
c07f9fc5 | 1252 | (pg_log_debug ? &log_keys_debug : nullptr), |
7c673cae FG |
1253 | debug_verify_stored_missing); |
1254 | } | |
1255 | ||
/// Load a PG log and its missing set from the omap of (log_coll, log_oid).
///
/// Every omap key is decoded into log entries, dup entries, stored missing
/// items, the two rollback markers, or the divergent_priors map, and `log`
/// is then replaced by an IndexedLog built from the result.  If
/// force_rebuild_missing or debug_verify_stored_missing is set, the log is
/// walked newest-first and each object's on-disk object_info (OI_ATTR in
/// pg_coll) is compared with the log: without debug_verify_stored_missing
/// the result is added to `missing`, with it the already-loaded `missing`
/// is checked by asserts instead.
///
/// store                          - object store that is read
/// pg_coll                        - collection used for the OI_ATTR lookups
/// log_coll, log_oid              - object whose omap holds the log
/// info                           - supplies last_update, log_tail,
///                                  last_complete, last_backfill, pgid.shard
/// log                            - [out] overwritten with the loaded log
/// missing                        - [in,out] missing set to fill or verify
/// force_rebuild_missing          - rebuild missing from log and
///                                  divergent_priors; asserted to be set when
///                                  a divergent_priors key is present
/// oss                            - not referenced by this body
/// tolerate_divergent_missing_log - log instead of assert when an object is
///                                  newer than its divergent_priors entry
/// clear_divergent_priors         - [out, optional] set to true after the
///                                  rebuild/verify pass ran, and to false
///                                  whenever force_rebuild_missing is false
/// dpp                            - logging provider
/// log_keys_debug                 - [out, optional] receives the key name of
///                                  every decoded log entry
/// debug_verify_stored_missing    - verify the stored missing set; cleared
///                                  when a divergent_priors key is present
1256 | template <typename missing_type> | |
c07f9fc5 FG |
1257 | static void read_log_and_missing( | |
1258 | ObjectStore *store, | |
1259 | coll_t pg_coll, | |
1260 | coll_t log_coll, | |
1261 | ghobject_t log_oid, | |
7c673cae FG |
1262 | const pg_info_t &info, | |
1263 | IndexedLog &log, | |
c07f9fc5 | 1264 | missing_type &missing, |
d2e6a577 | 1265 | bool force_rebuild_missing, |
c07f9fc5 | 1266 | ostringstream &oss, |
7c673cae | 1267 | bool tolerate_divergent_missing_log, |
c07f9fc5 FG |
1268 | bool *clear_divergent_priors = nullptr, | |
1269 | const DoutPrefixProvider *dpp = nullptr, | |
1270 | set<string> *log_keys_debug = nullptr, | |
7c673cae FG |
1271 | bool debug_verify_stored_missing = false | |
1272 | ) { | |
1273 | ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll | |
1274 | << " log_oid " << log_oid << dendl; | |
1275 | ||
1276 | // legacy? | |
1277 | struct stat st; | |
1278 | int r = store->stat(log_coll, log_oid, &st); | |
1279 | assert(r == 0); | |
1280 | assert(st.st_size == 0); | |
// the asserts above require the log object to exist and to hold no byte data
1281 | ||
1282 | // will get overridden below if it had been recorded | |
1283 | eversion_t on_disk_can_rollback_to = info.last_update; | |
1284 | eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); | |
1285 | ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid); | |
1286 | map<eversion_t, hobject_t> divergent_priors; | |
c07f9fc5 | 1287 | missing.may_include_deletes = false; |
7c673cae | 1288 | list<pg_log_entry_t> entries; |
c07f9fc5 | 1289 | list<pg_log_dup_t> dups; |
7c673cae FG |
1290 | if (p) { | |
// dispatch on the omap key: fixed names first, then the "missing" and "dup_"
// prefixes; any other key is decoded as a checksummed log entry
1291 | for (p->seek_to_first(); p->valid() ; p->next(false)) { | |
1292 | // non-log pgmeta_oid keys are prefixed with _; skip those | |
1293 | if (p->key()[0] == '_') | |
1294 | continue; | |
1295 | bufferlist bl = p->value();//Copy bufferlist before creating iterator | |
1296 | bufferlist::iterator bp = bl.begin(); | |
1297 | if (p->key() == "divergent_priors") { | |
1298 | ::decode(divergent_priors, bp); | |
1299 | ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() | |
1300 | << " divergent_priors" << dendl; | |
d2e6a577 | 1301 | assert(force_rebuild_missing); |
7c673cae FG |
1302 | debug_verify_stored_missing = false; | |
1303 | } else if (p->key() == "can_rollback_to") { | |
1304 | ::decode(on_disk_can_rollback_to, bp); | |
1305 | } else if (p->key() == "rollback_info_trimmed_to") { | |
1306 | ::decode(on_disk_rollback_info_trimmed_to, bp); | |
c07f9fc5 FG |
1307 | } else if (p->key() == "may_include_deletes_in_missing") { | |
1308 | missing.may_include_deletes = true; | |
7c673cae | 1309 | } else if (p->key().substr(0, 7) == string("missing")) { |
c07f9fc5 FG |
1310 | hobject_t oid; | |
1311 | pg_missing_item item; | |
1312 | ::decode(oid, bp); | |
1313 | ::decode(item, bp); | |
1314 | if (item.is_delete()) { | |
1315 | assert(missing.may_include_deletes); | |
1316 | } | |
1317 | missing.add(oid, item.need, item.have, item.is_delete()); | |
1318 | } else if (p->key().substr(0, 4) == string("dup_")) { | |
1319 | pg_log_dup_t dup; | |
1320 | ::decode(dup, bp); | |
1321 | if (!dups.empty()) { | |
1322 | assert(dups.back().version < dup.version); | |
1323 | } | |
1324 | dups.push_back(dup); | |
7c673cae FG |
1325 | } else { | |
1326 | pg_log_entry_t e; | |
1327 | e.decode_with_checksum(bp); | |
1328 | ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; | |
1329 | if (!entries.empty()) { | |
1330 | pg_log_entry_t last_e(entries.back()); | |
1331 | assert(last_e.version.version < e.version.version); | |
1332 | assert(last_e.version.epoch <= e.version.epoch); | |
1333 | } | |
1334 | entries.push_back(e); | |
1335 | if (log_keys_debug) | |
1336 | log_keys_debug->insert(e.get_key_name()); | |
1337 | } | |
1338 | } | |
1339 | } | |
// replace the caller's log with one built from info.last_update,
// info.log_tail and what was decoded above
1340 | log = IndexedLog( | |
1341 | info.last_update, | |
1342 | info.log_tail, | |
1343 | on_disk_can_rollback_to, | |
1344 | on_disk_rollback_info_trimmed_to, | |
c07f9fc5 FG |
1345 | std::move(entries), | |
1346 | std::move(dups)); | |
7c673cae | 1347 | |
d2e6a577 | 1348 | if (force_rebuild_missing || debug_verify_stored_missing) { |
7c673cae FG |
1349 | // build missing | |
1350 | if (debug_verify_stored_missing || info.last_complete < info.last_update) { | |
c07f9fc5 FG |
1351 | ldpp_dout(dpp, 10) | |
1352 | << "read_log_and_missing checking for missing items over interval (" | |
1353 | << info.last_complete | |
1354 | << "," << info.last_update << "]" << dendl; | |
7c673cae FG |
1355 | ||
1356 | set<hobject_t> did; | |
1357 | set<hobject_t> checked; | |
1358 | set<hobject_t> skipped; | |
// walk the log newest-first; `did` makes sure that, after the last_backfill
// and error filters, only the first entry reached for each object is examined
1359 | for (list<pg_log_entry_t>::reverse_iterator i = log.log.rbegin(); | |
1360 | i != log.log.rend(); | |
1361 | ++i) { | |
1362 | if (!debug_verify_stored_missing && i->version <= info.last_complete) break; | |
1363 | if (i->soid > info.last_backfill) | |
1364 | continue; | |
1365 | if (i->is_error()) | |
1366 | continue; | |
1367 | if (did.count(i->soid)) continue; | |
1368 | did.insert(i->soid); | |
1369 | ||
c07f9fc5 FG |
1370 | if (!missing.may_include_deletes && i->is_delete()) | |
1371 | continue; | |
7c673cae FG |
1372 | ||
1373 | bufferlist bv; | |
1374 | int r = store->getattr( | |
1375 | pg_coll, | |
1376 | ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), | |
1377 | OI_ATTR, | |
1378 | bv); | |
1379 | if (r >= 0) { | |
1380 | object_info_t oi(bv); | |
1381 | if (oi.version < i->version) { | |
1382 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i | |
1383 | << " (have " << oi.version << ")" << dendl; | |
1384 | if (debug_verify_stored_missing) { | |
1385 | auto miter = missing.get_items().find(i->soid); | |
1386 | assert(miter != missing.get_items().end()); | |
1387 | assert(miter->second.need == i->version); | |
c07f9fc5 FG |
1388 | // the 'have' version is reset if an object is deleted, | |
1389 | // then created again | |
1390 | assert(miter->second.have == oi.version || miter->second.have == eversion_t()); | |
7c673cae FG |
1391 | checked.insert(i->soid); | |
1392 | } else { | |
c07f9fc5 | 1393 | missing.add(i->soid, i->version, oi.version, i->is_delete()); |
7c673cae FG |
1394 | } | |
1395 | } | |
1396 | } else { | |
1397 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
1398 | if (debug_verify_stored_missing) { | |
1399 | auto miter = missing.get_items().find(i->soid); | |
c07f9fc5 FG |
1400 | if (i->is_delete()) { | |
1401 | assert(miter == missing.get_items().end() || | |
1402 | (miter->second.need == i->version && | |
1403 | miter->second.have == eversion_t())); | |
1404 | } else { | |
1405 | assert(miter != missing.get_items().end()); | |
1406 | assert(miter->second.need == i->version); | |
1407 | assert(miter->second.have == eversion_t()); | |
1408 | } | |
7c673cae FG |
1409 | checked.insert(i->soid); | |
1410 | } else { | |
c07f9fc5 | 1411 | missing.add(i->soid, i->version, eversion_t(), i->is_delete()); |
7c673cae FG |
1412 | } | |
1413 | } | |
1414 | } | |
1415 | if (debug_verify_stored_missing) { | |
// stored missing items that the log walk above did not check must still
// agree with the on-disk object_info (or with its absence)
1416 | for (auto &&i: missing.get_items()) { | |
1417 | if (checked.count(i.first)) | |
1418 | continue; | |
c07f9fc5 FG |
1419 | if (i.first > info.last_backfill) { | |
1420 | ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry " | |
1421 | << "found before last_backfill: " | |
1422 | << i.first << " " << i.second | |
1423 | << " last_backfill = " << info.last_backfill | |
1424 | << dendl; | |
7c673cae FG |
1425 | assert(0 == "invalid missing set entry found"); | |
1426 | } | |
1427 | bufferlist bv; | |
1428 | int r = store->getattr( | |
1429 | pg_coll, | |
1430 | ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), | |
1431 | OI_ATTR, | |
1432 | bv); | |
1433 | if (r >= 0) { | |
1434 | object_info_t oi(bv); | |
1435 | assert(oi.version == i.second.have); | |
1436 | } else { | |
c07f9fc5 | 1437 | assert(i.second.is_delete() || eversion_t() == i.second.have); |
7c673cae FG |
1438 | } | |
1439 | } | |
1440 | } else { | |
d2e6a577 | 1441 | assert(force_rebuild_missing); |
7c673cae FG |
1442 | for (map<eversion_t, hobject_t>::reverse_iterator i = | |
1443 | divergent_priors.rbegin(); | |
1444 | i != divergent_priors.rend(); | |
1445 | ++i) { | |
1446 | if (i->first <= info.last_complete) break; | |
1447 | if (i->second > info.last_backfill) | |
1448 | continue; | |
1449 | if (did.count(i->second)) continue; | |
1450 | did.insert(i->second); | |
1451 | bufferlist bv; | |
1452 | int r = store->getattr( | |
1453 | pg_coll, | |
1454 | ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), | |
1455 | OI_ATTR, | |
1456 | bv); | |
1457 | if (r >= 0) { | |
1458 | object_info_t oi(bv); | |
1459 | /** | |
1460 | * 1) we see this entry in the divergent priors mapping | |
1461 | * 2) we didn't see an entry for this object in the log | |
1462 | * | |
1463 | * From 1 & 2 we know that either the object does not exist | |
1464 | * or it is at the version specified in the divergent_priors | |
1465 | * map since the object would have been deleted atomically | |
1466 | * with the addition of the divergent_priors entry, an older | |
1467 | * version would not have been recovered, and a newer version | |
1468 | * would show up in the log above. | |
1469 | */ | |
1470 | /** | |
1471 | * Unfortunately the assessment above is incorrect because of | |
1472 | * http://tracker.ceph.com/issues/17916 (we were incorrectly | |
1473 | * not removing the divergent_priors set from disk state!), | |
1474 | * so let's check that. | |
1475 | */ | |
1476 | if (oi.version > i->first && tolerate_divergent_missing_log) { | |
1477 | ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i | |
1478 | << ") inconsistent with disk state (" << oi | |
1479 | << "), assuming it is tracker.ceph.com/issues/17916" | |
1480 | << dendl; | |
1481 | } else { | |
1482 | assert(oi.version == i->first); | |
1483 | } | |
1484 | } else { | |
1485 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
c07f9fc5 | 1486 | missing.add(i->second, i->first, eversion_t(), false); |
7c673cae FG |
1487 | } | |
1488 | } | |
1489 | } | |
1490 | if (clear_divergent_priors) | |
1491 | (*clear_divergent_priors) = true; | |
1492 | } | |
1493 | } | |
1494 | ||
d2e6a577 | 1495 | if (!force_rebuild_missing) { |
7c673cae FG |
1496 | if (clear_divergent_priors) | |
1497 | (*clear_divergent_priors) = false; | |
1498 | missing.flush(); | |
1499 | } | |
1500 | ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl; | |
c07f9fc5 FG |
1501 | } // static read_log_and_missing | |
1502 | }; // struct PGLog |