]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
c07f9fc5 | 17 | #pragma once |
7c673cae FG |
18 | |
19 | // re-include our assert to clobber boost's | |
11fdf7f2 | 20 | #include "include/ceph_assert.h" |
9f95a23c | 21 | #include "include/common_fwd.h" |
7c673cae FG |
22 | #include "osd_types.h" |
23 | #include "os/ObjectStore.h" | |
24 | #include <list> | |
7c673cae | 25 | |
9f95a23c TL |
26 | #ifdef WITH_SEASTAR |
27 | #include <seastar/core/future.hh> | |
28 | #include "crimson/os/futurized_store.h" | |
29 | #include "crimson/os/cyanstore/cyan_collection.h" | |
30 | #endif | |
31 | ||
f67539c2 TL |
32 | /** @name PG Log |
33 | * | |
34 | * The pg log serves three primary purposes: | |
35 | * | |
36 | * 1) improving recovery speed | |
37 | * | |
38 | * 2) detecting duplicate ops | |
39 | * | |
40 | * 3) making erasure coded updates safe | |
41 | * | |
42 | * For (1), the main data type is pg_log_entry_t. this is indexed in | |
43 | * memory by the IndexedLog class - this is where most of the logic | |
44 | * surrounding pg log is kept, even though the low level types are in | |
45 | * src/osd/osd_types.h | |
46 | * | |
47 | * (2) uses a type which is a subset of the full log entry, containing | |
48 | * just the pieces we need to identify and respond to a duplicate | |
49 | * request. | |
50 | * | |
51 | * As we trim the log, we convert pg_log_entry_t to smaller | |
52 | * pg_log_dup_t, and finally remove them once we reach a higher | |
53 | * limit. This is controlled by a few options: | |
54 | * | |
55 | * osd_min_pg_log_entries, osd_max_pg_log_entries, | |
56 | * osd_pg_log_dups_tracked | |
57 | * | |
58 | * For example, with a min of 100, max of 1000, and dups tracked of | |
59 | * 3000, the log entries and dups stored would span the following | |
60 | * versions, assuming the current earliest is version 1: | |
61 | * | |
62 | * version: 3000 .. 2001 [ pg log entries ], 2000 .. 1 [ pg log dups ] | |
63 | * | |
64 | * after osd_pg_log_trim_min subsequent writes to this PG, the log | |
65 | * would be trimmed to look like: | |
66 | * | |
67 | * version: 3100 .. 2101 [ pg log entries ], 2100 .. 101 [ pg log dups ] | |
68 | * | |
69 | * (3) means tracking the previous state of an object, so that we can | |
70 | * rollback to that prior state if necessary. It's only used for | |
71 | * erasure coding. Consider an erasure code of 4+2, for example. | |
72 | * | |
73 | * This means we split the object into 4 pieces (called shards) and | |
74 | * compute 2 parity shards. Each of these shards is stored on a | |
75 | * separate OSD. As long as 4 shards are the same version, we can | |
76 | * recover the remaining 2 by computation. Imagine during a write, 3 | |
77 | * of the osds go down and restart, resulting in shards 0,1,2 | |
78 | * reflecting version A and shards 3,4,5 reflecting version B, after | |
79 | * the write. | |
80 | * | |
81 | * If we had no way to reconstruct version A for another shard, we | |
82 | * would have lost the object. | |
83 | * | |
84 | * The actual data for rollback is stored in a look-aside object and | |
85 | * is removed once the EC write commits on all shards. The pg log just | |
86 | * stores the versions so we can tell how far we can rollback, and a | |
87 | * description of the type of operation for each log entry. Beyond | |
88 | * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor | |
89 | * for more details on this. | |
90 | * | |
91 | * An important implication of this is that although the pg log length | |
92 | * is normally bounded, under extreme conditions, with many EC I/Os | |
93 | * outstanding, the log may grow beyond that point because we need to | |
94 | * keep the rollback information for all outstanding EC I/O. | |
95 | * | |
96 | * For more on pg log bounds, see where it is calculated in | |
97 | * PeeringState::calc_trim_to_aggressive(). | |
98 | * | |
99 | * For more details on how peering uses the pg log, and architectural | |
100 | * reasons for its existence, see: | |
101 | * | |
102 | * doc/dev/osd_internals/log_based_pg.rst | |
103 | * | |
104 | */ | |
105 | ||
11fdf7f2 TL |
// Bit flags, one per lookup index an IndexedLog can maintain.
constexpr auto PGLOG_INDEXED_OBJECTS          = 1 << 0;  // IndexedLog::objects
constexpr auto PGLOG_INDEXED_CALLER_OPS       = 1 << 1;  // IndexedLog::caller_ops
constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2;  // IndexedLog::extra_caller_ops
constexpr auto PGLOG_INDEXED_DUPS             = 1 << 3;  // IndexedLog::dup_index

// Every index flag above, or-ed together.
constexpr auto PGLOG_INDEXED_ALL =
  PGLOG_INDEXED_OBJECTS |
  PGLOG_INDEXED_CALLER_OPS |
  PGLOG_INDEXED_EXTRA_CALLER_OPS |
  PGLOG_INDEXED_DUPS;
7c673cae | 114 | |
7c673cae | 115 | struct PGLog : DoutPrefixProvider { |
11fdf7f2 TL |
116 | std::ostream& gen_prefix(std::ostream& out) const override { |
117 | return out; | |
7c673cae FG |
118 | } |
119 | unsigned get_subsys() const override { | |
11fdf7f2 | 120 | return static_cast<unsigned>(ceph_subsys_osd); |
7c673cae FG |
121 | } |
122 | CephContext *get_cct() const override { | |
123 | return cct; | |
124 | } | |
125 | ||
126 | ////////////////////////////// sub classes ////////////////////////////// | |
127 | struct LogEntryHandler { | |
128 | virtual void rollback( | |
129 | const pg_log_entry_t &entry) = 0; | |
130 | virtual void rollforward( | |
131 | const pg_log_entry_t &entry) = 0; | |
132 | virtual void trim( | |
133 | const pg_log_entry_t &entry) = 0; | |
134 | virtual void remove( | |
135 | const hobject_t &hoid) = 0; | |
136 | virtual void try_stash( | |
137 | const hobject_t &hoid, | |
138 | version_t v) = 0; | |
139 | virtual ~LogEntryHandler() {} | |
140 | }; | |
f67539c2 | 141 | using LogEntryHandlerRef = std::unique_ptr<LogEntryHandler>; |
7c673cae | 142 | |
7c673cae FG |
143 | public: |
144 | /** | |
145 | * IndexedLog - adds in-memory index of the log, by oid. | |
146 | * plus some methods to manipulate it all. | |
147 | */ | |
148 | struct IndexedLog : public pg_log_t { | |
149 | mutable ceph::unordered_map<hobject_t,pg_log_entry_t*> objects; // soid -> log entry; ptrs into log. be careful! |
150 | mutable ceph::unordered_map<osd_reqid_t,pg_log_entry_t*> caller_ops; // reqid -> log entry, for entries whose reqid_is_indexed() | |
151 | mutable ceph::unordered_multimap<osd_reqid_t,pg_log_entry_t*> extra_caller_ops; // each of an entry's extra_reqids -> that log entry | |
c07f9fc5 | 152 | mutable ceph::unordered_map<osd_reqid_t,pg_log_dup_t*> dup_index; // reqid -> dup; ptrs into dups |
7c673cae FG |
153 | |
154 | // recovery pointers | |
f67539c2 | 155 | std::list<pg_log_entry_t>::iterator complete_to; // not inclusive of referenced item |
7c673cae FG |
156 | version_t last_requested = 0; // last object requested by primary |
157 | ||
158 | // | |
159 | private: | |
160 | mutable __u16 indexed_data = 0; | |
161 | /** | |
162 | * rollback_info_trimmed_to_riter points to the first log entry <= | |
163 | * rollback_info_trimmed_to | |
164 | * | |
165 | * It's a reverse_iterator because rend() is a natural representation for | |
166 | * tail, and rbegin() works nicely for head. | |
167 | */ | |
31f18b77 | 168 | mempool::osd_pglog::list<pg_log_entry_t>::reverse_iterator |
7c673cae FG |
169 | rollback_info_trimmed_to_riter; |
170 | ||
eafe8130 TL |
171 | /* |
172 | * return true if we need to mark the pglog as dirty | |
173 | */ | |
7c673cae | 174 | template <typename F> |
eafe8130 TL |
175 | bool advance_can_rollback_to(eversion_t to, F &&f) { |
176 | bool dirty_log = to > can_rollback_to || to > rollback_info_trimmed_to; | |
177 | if (dirty_log) { | |
178 | if (to > can_rollback_to) | |
179 | can_rollback_to = to; | |
180 | ||
181 | if (to > rollback_info_trimmed_to) | |
182 | rollback_info_trimmed_to = to; | |
183 | } | |
7c673cae FG |
184 | |
185 | while (rollback_info_trimmed_to_riter != log.rbegin()) { | |
186 | --rollback_info_trimmed_to_riter; | |
187 | if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { | |
188 | ++rollback_info_trimmed_to_riter; | |
189 | break; | |
190 | } | |
191 | f(*rollback_info_trimmed_to_riter); | |
192 | } | |
eafe8130 TL |
193 | |
194 | return dirty_log; | |
7c673cae FG |
195 | } |
196 | ||
197 | void reset_rollback_info_trimmed_to_riter() { | |
198 | rollback_info_trimmed_to_riter = log.rbegin(); | |
199 | while (rollback_info_trimmed_to_riter != log.rend() && | |
200 | rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) | |
201 | ++rollback_info_trimmed_to_riter; | |
202 | } | |
203 | ||
204 | // indexes objects, caller ops and extra caller ops | |
205 | public: | |
206 | IndexedLog() : | |
207 | complete_to(log.end()), | |
208 | last_requested(0), | |
209 | indexed_data(0), | |
210 | rollback_info_trimmed_to_riter(log.rbegin()) | |
c07f9fc5 | 211 | { } |
7c673cae FG |
212 | |
213 | template <typename... Args> | |
11fdf7f2 | 214 | explicit IndexedLog(Args&&... args) : |
7c673cae FG |
215 | pg_log_t(std::forward<Args>(args)...), |
216 | complete_to(log.end()), | |
217 | last_requested(0), | |
218 | indexed_data(0), | |
c07f9fc5 FG |
219 | rollback_info_trimmed_to_riter(log.rbegin()) |
220 | { | |
7c673cae FG |
221 | reset_rollback_info_trimmed_to_riter(); |
222 | index(); | |
223 | } | |
224 | ||
225 | IndexedLog(const IndexedLog &rhs) : | |
226 | pg_log_t(rhs), | |
227 | complete_to(log.end()), | |
228 | last_requested(rhs.last_requested), | |
229 | indexed_data(0), | |
c07f9fc5 FG |
230 | rollback_info_trimmed_to_riter(log.rbegin()) |
231 | { | |
7c673cae FG |
232 | reset_rollback_info_trimmed_to_riter(); |
233 | index(rhs.indexed_data); | |
234 | } | |
c07f9fc5 | 235 | |
7c673cae FG |
236 | IndexedLog &operator=(const IndexedLog &rhs) { |
237 | this->~IndexedLog(); | |
238 | new (this) IndexedLog(rhs); | |
239 | return *this; | |
240 | } | |
241 | ||
242 | void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { | |
243 | advance_can_rollback_to( | |
244 | to, | |
245 | [&](pg_log_entry_t &entry) { | |
246 | h->trim(entry); | |
247 | }); | |
248 | } | |
eafe8130 TL |
249 | bool roll_forward_to(eversion_t to, LogEntryHandler *h) { |
250 | return advance_can_rollback_to( | |
7c673cae FG |
251 | to, |
252 | [&](pg_log_entry_t &entry) { | |
253 | h->rollforward(entry); | |
254 | }); | |
255 | } | |
256 | ||
257 | void skip_can_rollback_to_to_head() { | |
258 | advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); | |
259 | } | |
260 | ||
31f18b77 | 261 | mempool::osd_pglog::list<pg_log_entry_t> rewind_from_head(eversion_t newhead) { |
7c673cae FG |
262 | auto divergent = pg_log_t::rewind_from_head(newhead); |
263 | index(); | |
264 | reset_rollback_info_trimmed_to_riter(); | |
265 | return divergent; | |
266 | } | |
267 | ||
268 | template <typename T> | |
269 | void scan_log_after( | |
270 | const eversion_t &bound, ///< [in] scan entries > bound | |
271 | T &&f) const { | |
272 | auto iter = log.rbegin(); | |
273 | while (iter != log.rend() && iter->version > bound) | |
274 | ++iter; | |
275 | ||
276 | while (true) { | |
277 | if (iter == log.rbegin()) | |
278 | break; | |
279 | f(*(--iter)); | |
280 | } | |
281 | } | |
282 | ||
283 | /****/ | |
284 | void claim_log_and_clear_rollback_info(const pg_log_t& o) { | |
285 | // we must have already trimmed the old entries | |
11fdf7f2 TL |
286 | ceph_assert(rollback_info_trimmed_to == head); |
287 | ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
7c673cae FG |
288 | |
289 | *this = IndexedLog(o); | |
290 | ||
291 | skip_can_rollback_to_to_head(); | |
292 | index(); | |
293 | } | |
294 | ||
295 | void split_out_child( | |
296 | pg_t child_pgid, | |
297 | unsigned split_bits, | |
298 | IndexedLog *target); | |
299 | ||
300 | void zero() { | |
301 | // we must have already trimmed the old entries | |
11fdf7f2 TL |
302 | ceph_assert(rollback_info_trimmed_to == head); |
303 | ceph_assert(rollback_info_trimmed_to_riter == log.rbegin()); | |
7c673cae FG |
304 | |
305 | unindex(); | |
306 | pg_log_t::clear(); | |
307 | rollback_info_trimmed_to_riter = log.rbegin(); | |
308 | reset_recovery_pointers(); | |
309 | } | |
310 | void clear() { | |
311 | skip_can_rollback_to_to_head(); | |
312 | zero(); | |
313 | } | |
314 | void reset_recovery_pointers() { | |
315 | complete_to = log.end(); | |
316 | last_requested = 0; | |
317 | } | |
318 | ||
319 | bool logged_object(const hobject_t& oid) const { | |
320 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
321 | index_objects(); | |
322 | } | |
323 | return objects.count(oid); | |
324 | } | |
325 | ||
326 | bool logged_req(const osd_reqid_t &r) const { | |
327 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { | |
328 | index_caller_ops(); | |
329 | } | |
330 | if (!caller_ops.count(r)) { | |
331 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
332 | index_extra_caller_ops(); | |
333 | } | |
334 | return extra_caller_ops.count(r); | |
335 | } | |
336 | return true; | |
337 | } | |
338 | ||
339 | bool get_request( | |
340 | const osd_reqid_t &r, | |
341 | eversion_t *version, | |
342 | version_t *user_version, | |
9f95a23c | 343 | int *return_code, |
f67539c2 | 344 | std::vector<pg_log_op_return_item_t> *op_returns) const |
c07f9fc5 | 345 | { |
11fdf7f2 TL |
346 | ceph_assert(version); |
347 | ceph_assert(user_version); | |
348 | ceph_assert(return_code); | |
7c673cae FG |
349 | if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { |
350 | index_caller_ops(); | |
351 | } | |
f67539c2 | 352 | auto p = caller_ops.find(r); |
7c673cae FG |
353 | if (p != caller_ops.end()) { |
354 | *version = p->second->version; | |
355 | *user_version = p->second->user_version; | |
356 | *return_code = p->second->return_code; | |
9f95a23c | 357 | *op_returns = p->second->op_returns; |
7c673cae FG |
358 | return true; |
359 | } | |
360 | ||
361 | // warning: we will return *a* request for this reqid, but not | |
362 | // necessarily the most recent. | |
363 | if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { | |
364 | index_extra_caller_ops(); | |
365 | } | |
366 | p = extra_caller_ops.find(r); | |
367 | if (p != extra_caller_ops.end()) { | |
11fdf7f2 | 368 | uint32_t idx = 0; |
31f18b77 | 369 | for (auto i = p->second->extra_reqids.begin(); |
7c673cae | 370 | i != p->second->extra_reqids.end(); |
11fdf7f2 | 371 | ++idx, ++i) { |
7c673cae FG |
372 | if (i->first == r) { |
373 | *version = p->second->version; | |
374 | *user_version = i->second; | |
375 | *return_code = p->second->return_code; | |
9f95a23c | 376 | *op_returns = p->second->op_returns; |
11fdf7f2 TL |
377 | if (*return_code >= 0) { |
378 | auto it = p->second->extra_reqid_return_codes.find(idx); | |
379 | if (it != p->second->extra_reqid_return_codes.end()) { | |
380 | *return_code = it->second; | |
381 | } | |
382 | } | |
7c673cae FG |
383 | return true; |
384 | } | |
385 | } | |
11fdf7f2 | 386 | ceph_abort_msg("in extra_caller_ops but not extra_reqids"); |
7c673cae | 387 | } |
c07f9fc5 FG |
388 | |
389 | if (!(indexed_data & PGLOG_INDEXED_DUPS)) { | |
390 | index_dups(); | |
391 | } | |
392 | auto q = dup_index.find(r); | |
393 | if (q != dup_index.end()) { | |
394 | *version = q->second->version; | |
395 | *user_version = q->second->user_version; | |
396 | *return_code = q->second->return_code; | |
9f95a23c | 397 | *op_returns = q->second->op_returns; |
c07f9fc5 FG |
398 | return true; |
399 | } | |
400 | ||
7c673cae FG |
401 | return false; |
402 | } | |
403 | ||
9f95a23c TL |
404 | bool has_write_since(const hobject_t &oid, const eversion_t &bound) const { |
405 | for (auto i = log.rbegin(); i != log.rend(); ++i) { | |
406 | if (i->version <= bound) | |
407 | return false; | |
408 | if (i->soid.get_head() == oid.get_head()) | |
409 | return true; | |
410 | } | |
411 | return false; | |
412 | } | |
413 | ||
f67539c2 | 414 | /// get a (bounded) std::list of recent reqids for the given object |
7c673cae | 415 | void get_object_reqids(const hobject_t& oid, unsigned max, |
f67539c2 | 416 | mempool::osd_pglog::vector<std::pair<osd_reqid_t, version_t> > *pls, |
11fdf7f2 | 417 | mempool::osd_pglog::map<uint32_t, int> *return_codes) const { |
7c673cae FG |
418 | // make sure object is present at least once before we do an |
419 | // O(n) search. | |
420 | if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { | |
421 | index_objects(); | |
422 | } | |
423 | if (objects.count(oid) == 0) | |
424 | return; | |
11fdf7f2 | 425 | |
f67539c2 | 426 | for (auto i = log.rbegin(); i != log.rend(); ++i) { |
7c673cae | 427 | if (i->soid == oid) { |
11fdf7f2 TL |
428 | if (i->reqid_is_indexed()) { |
429 | if (i->op == pg_log_entry_t::ERROR) { | |
430 | // propagate op errors to the cache tier's PG log | |
431 | return_codes->emplace(pls->size(), i->return_code); | |
432 | } | |
f67539c2 | 433 | pls->push_back(std::make_pair(i->reqid, i->user_version)); |
11fdf7f2 TL |
434 | } |
435 | ||
7c673cae FG |
436 | pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); |
437 | if (pls->size() >= max) { | |
438 | if (pls->size() > max) { | |
439 | pls->resize(max); | |
440 | } | |
441 | return; | |
442 | } | |
443 | } | |
444 | } | |
445 | } | |
c07f9fc5 | 446 | |
7c673cae | 447 | void index(__u16 to_index = PGLOG_INDEXED_ALL) const { |
c07f9fc5 FG |
448 | // if to_index is 0, no need to run any of this code, especially |
449 | // loop below; this can happen with copy constructor for | |
450 | // IndexedLog (and indirectly through assignment operator) | |
451 | if (!to_index) return; | |
452 | ||
7c673cae FG |
453 | if (to_index & PGLOG_INDEXED_OBJECTS) |
454 | objects.clear(); | |
455 | if (to_index & PGLOG_INDEXED_CALLER_OPS) | |
456 | caller_ops.clear(); | |
457 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) | |
458 | extra_caller_ops.clear(); | |
c07f9fc5 FG |
459 | if (to_index & PGLOG_INDEXED_DUPS) { |
460 | dup_index.clear(); | |
461 | for (auto& i : dups) { | |
462 | dup_index[i.reqid] = const_cast<pg_log_dup_t*>(&i); | |
463 | } | |
464 | } | |
7c673cae | 465 | |
c07f9fc5 FG |
466 | constexpr __u16 any_log_entry_index = |
467 | PGLOG_INDEXED_OBJECTS | | |
468 | PGLOG_INDEXED_CALLER_OPS | | |
469 | PGLOG_INDEXED_EXTRA_CALLER_OPS; | |
470 | ||
471 | if (to_index & any_log_entry_index) { | |
f67539c2 | 472 | for (auto i = log.begin(); i != log.end(); ++i) { |
c07f9fc5 FG |
473 | if (to_index & PGLOG_INDEXED_OBJECTS) { |
474 | if (i->object_is_indexed()) { | |
475 | objects[i->soid] = const_cast<pg_log_entry_t*>(&(*i)); | |
476 | } | |
7c673cae | 477 | } |
7c673cae | 478 | |
c07f9fc5 FG |
479 | if (to_index & PGLOG_INDEXED_CALLER_OPS) { |
480 | if (i->reqid_is_indexed()) { | |
481 | caller_ops[i->reqid] = const_cast<pg_log_entry_t*>(&(*i)); | |
482 | } | |
7c673cae | 483 | } |
c07f9fc5 FG |
484 | |
485 | if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
486 | for (auto j = i->extra_reqids.begin(); | |
487 | j != i->extra_reqids.end(); | |
488 | ++j) { | |
489 | extra_caller_ops.insert( | |
f67539c2 | 490 | std::make_pair(j->first, const_cast<pg_log_entry_t*>(&(*i)))); |
c07f9fc5 | 491 | } |
7c673cae FG |
492 | } |
493 | } | |
494 | } | |
c07f9fc5 | 495 | |
7c673cae FG |
496 | indexed_data |= to_index; |
497 | } | |
498 | ||
499 | void index_objects() const { | |
500 | index(PGLOG_INDEXED_OBJECTS); | |
501 | } | |
502 | ||
503 | void index_caller_ops() const { | |
504 | index(PGLOG_INDEXED_CALLER_OPS); | |
505 | } | |
506 | ||
507 | void index_extra_caller_ops() const { | |
508 | index(PGLOG_INDEXED_EXTRA_CALLER_OPS); | |
509 | } | |
510 | ||
c07f9fc5 FG |
511 | void index_dups() const { |
512 | index(PGLOG_INDEXED_DUPS); | |
513 | } | |
514 | ||
7c673cae FG |
515 | void index(pg_log_entry_t& e) { |
516 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
517 | if (objects.count(e.soid) == 0 || | |
518 | objects[e.soid]->version < e.version) | |
519 | objects[e.soid] = &e; | |
520 | } | |
521 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
522 | // divergent merge_log indexes new before unindexing old | |
523 | if (e.reqid_is_indexed()) { | |
524 | caller_ops[e.reqid] = &e; | |
525 | } | |
526 | } | |
527 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 528 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
529 | j != e.extra_reqids.end(); |
530 | ++j) { | |
f67539c2 | 531 | extra_caller_ops.insert(std::make_pair(j->first, &e)); |
7c673cae FG |
532 | } |
533 | } | |
534 | } | |
c07f9fc5 | 535 | |
7c673cae FG |
536 | void unindex() { |
537 | objects.clear(); | |
538 | caller_ops.clear(); | |
539 | extra_caller_ops.clear(); | |
c07f9fc5 | 540 | dup_index.clear(); |
7c673cae FG |
541 | indexed_data = 0; |
542 | } | |
c07f9fc5 FG |
543 | |
544 | void unindex(const pg_log_entry_t& e) { | |
7c673cae FG |
545 | // NOTE: this only works if we remove from the _tail_ of the log! |
546 | if (indexed_data & PGLOG_INDEXED_OBJECTS) { | |
11fdf7f2 TL |
547 | auto it = objects.find(e.soid); |
548 | if (it != objects.end() && it->second->version == e.version) | |
549 | objects.erase(it); | |
7c673cae FG |
550 | } |
551 | if (e.reqid_is_indexed()) { | |
552 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
11fdf7f2 | 553 | auto it = caller_ops.find(e.reqid); |
7c673cae | 554 | // divergent merge_log indexes new before unindexing old |
11fdf7f2 TL |
555 | if (it != caller_ops.end() && it->second == &e) |
556 | caller_ops.erase(it); | |
7c673cae FG |
557 | } |
558 | } | |
559 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { | |
31f18b77 | 560 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
561 | j != e.extra_reqids.end(); |
562 | ++j) { | |
f67539c2 | 563 | for (auto k = extra_caller_ops.find(j->first); |
7c673cae FG |
564 | k != extra_caller_ops.end() && k->first == j->first; |
565 | ++k) { | |
566 | if (k->second == &e) { | |
567 | extra_caller_ops.erase(k); | |
568 | break; | |
569 | } | |
570 | } | |
571 | } | |
572 | } | |
573 | } | |
574 | ||
c07f9fc5 | 575 | void index(pg_log_dup_t& e) { |
d2e6a577 | 576 | if (indexed_data & PGLOG_INDEXED_DUPS) { |
c07f9fc5 FG |
577 | dup_index[e.reqid] = &e; |
578 | } | |
579 | } | |
580 | ||
581 | void unindex(const pg_log_dup_t& e) { | |
d2e6a577 | 582 | if (indexed_data & PGLOG_INDEXED_DUPS) { |
c07f9fc5 FG |
583 | auto i = dup_index.find(e.reqid); |
584 | if (i != dup_index.end()) { | |
585 | dup_index.erase(i); | |
586 | } | |
587 | } | |
588 | } | |
589 | ||
7c673cae FG |
590 | // actors |
591 | void add(const pg_log_entry_t& e, bool applied = true) { | |
592 | if (!applied) { | |
11fdf7f2 | 593 | ceph_assert(get_can_rollback_to() == head); |
7c673cae FG |
594 | } |
595 | ||
31f18b77 FG |
596 | // make sure our buffers don't pin bigger buffers |
597 | e.mod_desc.trim_bl(); | |
598 | ||
7c673cae FG |
599 | // add to log |
600 | log.push_back(e); | |
601 | ||
602 | // riter previously pointed to the previous entry | |
603 | if (rollback_info_trimmed_to_riter == log.rbegin()) | |
604 | ++rollback_info_trimmed_to_riter; | |
605 | ||
11fdf7f2 TL |
606 | ceph_assert(e.version > head); |
607 | ceph_assert(head.version == 0 || e.version.version > head.version); | |
7c673cae FG |
608 | head = e.version; |
609 | ||
610 | // to our index | |
611 | if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { | |
612 | objects[e.soid] = &(log.back()); | |
613 | } | |
614 | if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { | |
615 | if (e.reqid_is_indexed()) { | |
616 | caller_ops[e.reqid] = &(log.back()); | |
617 | } | |
618 | } | |
c07f9fc5 | 619 | |
7c673cae | 620 | if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { |
31f18b77 | 621 | for (auto j = e.extra_reqids.begin(); |
7c673cae FG |
622 | j != e.extra_reqids.end(); |
623 | ++j) { | |
f67539c2 | 624 | extra_caller_ops.insert(std::make_pair(j->first, &(log.back()))); |
7c673cae FG |
625 | } |
626 | } | |
627 | ||
628 | if (!applied) { | |
629 | skip_can_rollback_to_to_head(); | |
630 | } | |
c07f9fc5 | 631 | } // add |
7c673cae FG |
632 | |
633 | void trim( | |
634 | CephContext* cct, | |
635 | eversion_t s, | |
f67539c2 TL |
636 | std::set<eversion_t> *trimmed, |
637 | std::set<std::string>* trimmed_dups, | |
181888fb | 638 | eversion_t *write_from_dups); |
7c673cae | 639 | |
f67539c2 | 640 | std::ostream& print(std::ostream& out) const; |
c07f9fc5 | 641 | }; // IndexedLog |
7c673cae FG |
642 | |
643 | ||
644 | protected: | |
645 | //////////////////// data members //////////////////// | |
646 | ||
647 | pg_missing_tracker_t missing; | |
648 | IndexedLog log; | |
649 | ||
650 | eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to | |
651 | eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from | |
652 | eversion_t writeout_from; ///< must write out keys >= writeout_from | |
f67539c2 | 653 | std::set<eversion_t> trimmed; ///< must clear keys in trimmed |
181888fb FG |
654 | eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups |
655 | eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups | |
656 | eversion_t write_from_dups; ///< must write keys >= write_from_dups | |
f67539c2 | 657 | std::set<std::string> trimmed_dups; ///< must clear keys in trimmed_dups |
7c673cae FG |
658 | CephContext *cct; ///< context handed to the constructor; returned by get_cct() |
659 | bool pg_log_debug; | |
660 | /// Log is clean on [dirty_to, dirty_from) | |
661 | bool touched_log; ///< set by undirty(), cleared by mark_log_for_rewrite() | |
eafe8130 | 662 | bool dirty_log; ///< set when roll_forward_to() moved the rollback bounds |
7c673cae | 663 | bool clear_divergent_priors; |
9f95a23c | 664 | bool may_include_deletes_in_missing_dirty = false; |
7c673cae FG |
665 | |
666 | void mark_dirty_to(eversion_t to) { | |
667 | if (to > dirty_to) | |
668 | dirty_to = to; | |
669 | } | |
670 | void mark_dirty_from(eversion_t from) { | |
671 | if (from < dirty_from) | |
672 | dirty_from = from; | |
673 | } | |
674 | void mark_writeout_from(eversion_t from) { | |
675 | if (from < writeout_from) | |
676 | writeout_from = from; | |
677 | } | |
181888fb FG |
678 | void mark_dirty_to_dups(eversion_t to) { |
679 | if (to > dirty_to_dups) | |
680 | dirty_to_dups = to; | |
681 | } | |
682 | void mark_dirty_from_dups(eversion_t from) { | |
683 | if (from < dirty_from_dups) | |
684 | dirty_from_dups = from; | |
685 | } | |
7c673cae | 686 | public: |
9f95a23c TL |
687 | bool needs_write() const { |
688 | return !touched_log || is_dirty(); | |
689 | } | |
690 | ||
7c673cae | 691 | bool is_dirty() const { |
9f95a23c | 692 | return dirty_log || |
7c673cae FG |
693 | (dirty_to != eversion_t()) || |
694 | (dirty_from != eversion_t::max()) || | |
695 | (writeout_from != eversion_t::max()) || | |
696 | !(trimmed.empty()) || | |
c07f9fc5 FG |
697 | !missing.is_clean() || |
698 | !(trimmed_dups.empty()) || | |
181888fb FG |
699 | (dirty_to_dups != eversion_t()) || |
700 | (dirty_from_dups != eversion_t::max()) || | |
701 | (write_from_dups != eversion_t::max()) || | |
9f95a23c | 702 | may_include_deletes_in_missing_dirty; |
7c673cae | 703 | } |
9f95a23c | 704 | |
7c673cae FG |
705 | void mark_log_for_rewrite() { |
706 | mark_dirty_to(eversion_t::max()); | |
707 | mark_dirty_from(eversion_t()); | |
181888fb FG |
708 | mark_dirty_to_dups(eversion_t::max()); |
709 | mark_dirty_from_dups(eversion_t()); | |
7c673cae FG |
710 | touched_log = false; |
711 | } | |
9f95a23c TL |
712 | bool get_may_include_deletes_in_missing_dirty() const { |
713 | return may_include_deletes_in_missing_dirty; | |
c07f9fc5 | 714 | } |
7c673cae FG |
715 | protected: |
716 | ||
717 | /// DEBUG | |
f67539c2 TL |
718 | std::set<std::string> log_keys_debug; |
/// DEBUG helper: remove every key >= lb from *log_keys_debug.
/// A null set pointer is accepted and ignored.
static void clear_after(std::set<std::string> *log_keys_debug, const std::string &lb) {
  if (!log_keys_debug)
    return;
  // range erase replaces the hand-rolled erase-while-iterating loop
  log_keys_debug->erase(log_keys_debug->lower_bound(lb), log_keys_debug->end());
}
/// DEBUG helper: remove every key < ub from *log_keys_debug.
/// A null set pointer is accepted and ignored.
static void clear_up_to(std::set<std::string> *log_keys_debug, const std::string &ub) {
  if (!log_keys_debug)
    return;
  // in a sorted set the keys < ub are exactly [begin, lower_bound(ub))
  log_keys_debug->erase(log_keys_debug->begin(), log_keys_debug->lower_bound(ub));
}
733 | ||
734 | void check(); | |
735 | void undirty() { | |
736 | dirty_to = eversion_t(); | |
737 | dirty_from = eversion_t::max(); | |
738 | touched_log = true; | |
eafe8130 | 739 | dirty_log = false; |
7c673cae | 740 | trimmed.clear(); |
c07f9fc5 | 741 | trimmed_dups.clear(); |
7c673cae FG |
742 | writeout_from = eversion_t::max(); |
743 | check(); | |
744 | missing.flush(); | |
181888fb FG |
745 | dirty_to_dups = eversion_t(); |
746 | dirty_from_dups = eversion_t::max(); | |
747 | write_from_dups = eversion_t::max(); | |
7c673cae FG |
748 | } |
749 | public: | |
c07f9fc5 | 750 | |
7c673cae | 751 | // cppcheck-suppress noExplicitConstructor |
11fdf7f2 | 752 | PGLog(CephContext *cct) : |
7c673cae FG |
753 | dirty_from(eversion_t::max()), |
754 | writeout_from(eversion_t::max()), | |
181888fb FG |
755 | dirty_from_dups(eversion_t::max()), |
756 | write_from_dups(eversion_t::max()), | |
7c673cae FG |
757 | cct(cct), |
758 | pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), | |
759 | touched_log(false), | |
eafe8130 | 760 | dirty_log(false), |
181888fb | 761 | clear_divergent_priors(false) |
c07f9fc5 | 762 | { } |
7c673cae FG |
763 | |
764 | void reset_backfill(); | |
765 | ||
766 | void clear(); | |
767 | ||
f67539c2 | 768 | //////////////////// get or set missing //////////////////// |
7c673cae FG |
769 | |
770 | const pg_missing_tracker_t& get_missing() const { return missing; } | |
11fdf7f2 TL |
771 | |
772 | void missing_add(const hobject_t& oid, eversion_t need, eversion_t have, bool is_delete=false) { | |
773 | missing.add(oid, need, have, is_delete); | |
7c673cae FG |
774 | } |
775 | ||
11fdf7f2 TL |
776 | void missing_add_next_entry(const pg_log_entry_t& e) { |
777 | missing.add_next_event(e); | |
7c673cae FG |
778 | } |
779 | ||
f67539c2 | 780 | //////////////////// get or set log //////////////////// |
7c673cae FG |
781 | |
782 | const IndexedLog &get_log() const { return log; } | |
783 | ||
784 | const eversion_t &get_tail() const { return log.tail; } | |
785 | ||
786 | void set_tail(eversion_t tail) { log.tail = tail; } | |
787 | ||
788 | const eversion_t &get_head() const { return log.head; } | |
789 | ||
790 | void set_head(eversion_t head) { log.head = head; } | |
791 | ||
792 | void set_last_requested(version_t last_requested) { | |
793 | log.last_requested = last_requested; | |
794 | } | |
795 | ||
796 | void index() { log.index(); } | |
797 | ||
798 | void unindex() { log.unindex(); } | |
799 | ||
800 | void add(const pg_log_entry_t& e, bool applied = true) { | |
801 | mark_writeout_from(e.version); | |
802 | log.add(e, applied); | |
803 | } | |
804 | ||
805 | void reset_recovery_pointers() { log.reset_recovery_pointers(); } | |
806 | ||
807 | static void clear_info_log( | |
808 | spg_t pgid, | |
809 | ObjectStore::Transaction *t); | |
810 | ||
811 | void trim( | |
812 | eversion_t trim_to, | |
f64942e4 | 813 | pg_info_t &info, |
11fdf7f2 TL |
814 | bool transaction_applied = true, |
815 | bool async = false); | |
7c673cae FG |
816 | |
817 | void roll_forward_to( | |
818 | eversion_t roll_forward_to, | |
819 | LogEntryHandler *h) { | |
eafe8130 TL |
820 | if (log.roll_forward_to( |
821 | roll_forward_to, | |
822 | h)) | |
823 | dirty_log = true; | |
7c673cae FG |
824 | } |
825 | ||
826 | eversion_t get_can_rollback_to() const { | |
827 | return log.get_can_rollback_to(); | |
828 | } | |
829 | ||
830 | void roll_forward(LogEntryHandler *h) { | |
831 | roll_forward_to( | |
832 | log.head, | |
833 | h); | |
834 | } | |
835 | ||
81eedcae TL |
836 | void skip_rollforward() { |
837 | log.skip_can_rollback_to_to_head(); | |
838 | } | |
839 | ||
f67539c2 | 840 | //////////////////// get or set log & missing //////////////////// |
7c673cae FG |
841 | |
842 | void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { | |
843 | log.trim_rollback_info_to(log.head, h); | |
844 | log.claim_log_and_clear_rollback_info(o); | |
845 | missing.clear(); | |
846 | mark_dirty_to(eversion_t::max()); | |
181888fb | 847 | mark_dirty_to_dups(eversion_t::max()); |
7c673cae FG |
848 | } |
849 | ||
850 | void split_into( | |
851 | pg_t child_pgid, | |
852 | unsigned split_bits, | |
c07f9fc5 | 853 | PGLog *opg_log) { |
7c673cae FG |
854 | log.split_out_child(child_pgid, split_bits, &opg_log->log); |
855 | missing.split_into(child_pgid, split_bits, &(opg_log->missing)); | |
856 | opg_log->mark_dirty_to(eversion_t::max()); | |
181888fb | 857 | opg_log->mark_dirty_to_dups(eversion_t::max()); |
7c673cae | 858 | mark_dirty_to(eversion_t::max()); |
181888fb | 859 | mark_dirty_to_dups(eversion_t::max()); |
9f95a23c TL |
860 | if (missing.may_include_deletes) { |
861 | opg_log->set_missing_may_contain_deletes(); | |
862 | } | |
7c673cae FG |
863 | } |
864 | ||
11fdf7f2 | 865 | void merge_from( |
f67539c2 | 866 | const std::vector<PGLog*>& sources, |
11fdf7f2 TL |
867 | eversion_t last_update) { |
868 | unindex(); | |
869 | missing.clear(); | |
870 | ||
f67539c2 | 871 | std::vector<pg_log_t*> slogs; |
11fdf7f2 TL |
872 | for (auto s : sources) { |
873 | slogs.push_back(&s->log); | |
874 | } | |
875 | log.merge_from(slogs, last_update); | |
876 | ||
877 | index(); | |
878 | ||
879 | mark_log_for_rewrite(); | |
880 | } | |
881 | ||
7c673cae FG |
882 | void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { |
883 | if (missing.is_missing(oid, v)) { | |
884 | missing.got(oid, v); | |
11fdf7f2 | 885 | info.stats.stats.sum.num_objects_missing = missing.num_missing(); |
c07f9fc5 | 886 | |
7c673cae FG |
887 | // raise last_complete? |
888 | if (missing.get_items().empty()) { | |
889 | log.complete_to = log.log.end(); | |
890 | info.last_complete = info.last_update; | |
891 | } | |
11fdf7f2 | 892 | auto oldest_need = missing.get_oldest_need(); |
7c673cae | 893 | while (log.complete_to != log.log.end()) { |
11fdf7f2 | 894 | if (oldest_need <= log.complete_to->version) |
7c673cae FG |
895 | break; |
896 | if (info.last_complete < log.complete_to->version) | |
897 | info.last_complete = log.complete_to->version; | |
898 | ++log.complete_to; | |
899 | } | |
900 | } | |
901 | ||
11fdf7f2 | 902 | ceph_assert(log.get_can_rollback_to() >= v); |
7c673cae FG |
903 | } |
904 | ||
c07f9fc5 | 905 | void reset_complete_to(pg_info_t *info) { |
81eedcae TL |
906 | if (log.log.empty()) // caller is split_into() |
907 | return; | |
7c673cae | 908 | log.complete_to = log.log.begin(); |
11fdf7f2 TL |
909 | ceph_assert(log.complete_to != log.log.end()); |
910 | auto oldest_need = missing.get_oldest_need(); | |
911 | if (oldest_need != eversion_t()) { | |
912 | while (log.complete_to->version < oldest_need) { | |
913 | ++log.complete_to; | |
914 | ceph_assert(log.complete_to != log.log.end()); | |
915 | } | |
c07f9fc5 | 916 | } |
11fdf7f2 TL |
917 | if (!info) |
918 | return; | |
7c673cae | 919 | if (log.complete_to == log.log.begin()) { |
11fdf7f2 | 920 | info->last_complete = eversion_t(); |
7c673cae FG |
921 | } else { |
922 | --log.complete_to; | |
11fdf7f2 | 923 | info->last_complete = log.complete_to->version; |
7c673cae FG |
924 | ++log.complete_to; |
925 | } | |
c07f9fc5 FG |
926 | } |
927 | ||
928 | void activate_not_complete(pg_info_t &info) { | |
929 | reset_complete_to(&info); | |
7c673cae FG |
930 | log.last_requested = 0; |
931 | } | |
932 | ||
933 | void proc_replica_log(pg_info_t &oinfo, | |
934 | const pg_log_t &olog, | |
935 | pg_missing_t& omissing, pg_shard_t from) const; | |
936 | ||
9f95a23c TL |
937 | void set_missing_may_contain_deletes() { |
938 | missing.may_include_deletes = true; | |
939 | may_include_deletes_in_missing_dirty = true; | |
940 | } | |
941 | ||
c07f9fc5 | 942 | void rebuild_missing_set_with_deletes(ObjectStore *store, |
11fdf7f2 | 943 | ObjectStore::CollectionHandle& ch, |
c07f9fc5 FG |
944 | const pg_info_t &info); |
945 | ||
7c673cae FG |
946 | protected: |
947 | static void split_by_object( | |
31f18b77 | 948 | mempool::osd_pglog::list<pg_log_entry_t> &entries, |
f67539c2 | 949 | std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t>> *out_entries) { |
7c673cae | 950 | while (!entries.empty()) { |
31f18b77 | 951 | auto &out_list = (*out_entries)[entries.front().soid]; |
7c673cae FG |
952 | out_list.splice(out_list.end(), entries, entries.begin()); |
953 | } | |
954 | } | |
955 | ||
956 | /** | |
957 | * _merge_object_divergent_entries | |
958 | * | |
959 | * There are 5 distinct cases: | |
960 | * 1) There is a more recent update: in this case we assume we adjusted the | |
961 | * store and missing during merge_log | |
962 | * 2) The first entry in the divergent sequence is a create. This might | |
963 | * either be because the object is a clone or because prior_version is | |
964 | * eversion_t(). In this case the object does not exist and we must | |
965 | * adjust missing and the store to match. | |
966 | * 3) We are currently missing the object. In this case, we adjust the | |
967 | * missing to our prior_version taking care to add a divergent_prior | |
968 | * if necessary | |
969 | * 4) We can rollback all of the entries. In this case, we do so using | |
970 | * the rollbacker and return -- the object does not go into missing. | |
971 | * 5) We cannot rollback at least 1 of the entries. In this case, we | |
972 | * clear the object out of the store and add a missing entry at | |
973 | * prior_version taking care to add a divergent_prior if | |
974 | * necessary. | |
975 | */ | |
976 | template <typename missing_type> | |
977 | static void _merge_object_divergent_entries( | |
978 | const IndexedLog &log, ///< [in] log to merge against | |
979 | const hobject_t &hoid, ///< [in] object we are merging | |
31f18b77 | 980 | const mempool::osd_pglog::list<pg_log_entry_t> &orig_entries, ///< [in] entries for hoid to merge |
7c673cae | 981 | const pg_info_t &info, ///< [in] info for merging entries |
eafe8130 | 982 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input InedexedLog |
c07f9fc5 | 983 | missing_type &missing, ///< [in,out] missing to adjust, use |
7c673cae FG |
984 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object |
985 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
986 | ) { | |
987 | ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid | |
31f18b77 | 988 | << " entries: " << orig_entries << dendl; |
7c673cae FG |
989 | |
990 | if (hoid > info.last_backfill) { | |
991 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" | |
992 | << dendl; | |
993 | return; | |
994 | } | |
995 | ||
996 | // entries is non-empty | |
11fdf7f2 | 997 | ceph_assert(!orig_entries.empty()); |
31f18b77 FG |
998 | // strip out and ignore ERROR entries |
999 | mempool::osd_pglog::list<pg_log_entry_t> entries; | |
7c673cae | 1000 | eversion_t last; |
d2e6a577 | 1001 | bool seen_non_error = false; |
f67539c2 | 1002 | for (auto i = orig_entries.begin(); |
31f18b77 | 1003 | i != orig_entries.end(); |
7c673cae FG |
1004 | ++i) { |
1005 | // all entries are on hoid | |
11fdf7f2 | 1006 | ceph_assert(i->soid == hoid); |
d2e6a577 FG |
1007 | // did not see error entries before this entry and this entry is not error |
1008 | // then this entry is the first non error entry | |
1009 | bool first_non_error = ! seen_non_error && ! i->is_error(); | |
1010 | if (! i->is_error() ) { | |
1011 | // see a non error entry now | |
1012 | seen_non_error = true; | |
1013 | } | |
1014 | ||
1015 | // No need to check the first entry since it prior_version is unavailable | |
f67539c2 | 1016 | // in the std::list |
d2e6a577 FG |
1017 | // No need to check if the prior_version is the minimal version |
1018 | // No need to check the first non-error entry since the leading error | |
1019 | // entries are not its prior version | |
1020 | if (i != orig_entries.begin() && i->prior_version != eversion_t() && | |
1021 | ! first_non_error) { | |
7c673cae | 1022 | // in increasing order of version |
11fdf7f2 | 1023 | ceph_assert(i->version > last); |
31f18b77 | 1024 | // prior_version correct (unless it is an ERROR entry) |
11fdf7f2 | 1025 | ceph_assert(i->prior_version == last || i->is_error()); |
7c673cae | 1026 | } |
31f18b77 FG |
1027 | if (i->is_error()) { |
1028 | ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; | |
1029 | } else { | |
1030 | ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; | |
1031 | entries.push_back(*i); | |
d2e6a577 | 1032 | last = i->version; |
31f18b77 FG |
1033 | } |
1034 | } | |
1035 | if (entries.empty()) { | |
1036 | ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; | |
1037 | return; | |
7c673cae FG |
1038 | } |
1039 | ||
1040 | const eversion_t prior_version = entries.begin()->prior_version; | |
1041 | const eversion_t first_divergent_update = entries.begin()->version; | |
1042 | const eversion_t last_divergent_update = entries.rbegin()->version; | |
1043 | const bool object_not_in_store = | |
1044 | !missing.is_missing(hoid) && | |
1045 | entries.rbegin()->is_delete(); | |
81eedcae TL |
1046 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << " object_not_in_store: " |
1047 | << object_not_in_store << dendl; | |
7c673cae FG |
1048 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid |
1049 | << " prior_version: " << prior_version | |
1050 | << " first_divergent_update: " << first_divergent_update | |
1051 | << " last_divergent_update: " << last_divergent_update | |
1052 | << dendl; | |
1053 | ||
f67539c2 | 1054 | auto objiter = log.objects.find(hoid); |
7c673cae FG |
1055 | if (objiter != log.objects.end() && |
1056 | objiter->second->version >= first_divergent_update) { | |
1057 | /// Case 1) | |
1058 | ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " | |
1059 | << *objiter->second << ", already merged" << dendl; | |
1060 | ||
11fdf7f2 | 1061 | ceph_assert(objiter->second->version > last_divergent_update); |
7c673cae FG |
1062 | |
1063 | // ensure missing has been updated appropriately | |
c07f9fc5 FG |
1064 | if (objiter->second->is_update() || |
1065 | (missing.may_include_deletes && objiter->second->is_delete())) { | |
11fdf7f2 | 1066 | ceph_assert(missing.is_missing(hoid) && |
7c673cae FG |
1067 | missing.get_items().at(hoid).need == objiter->second->version); |
1068 | } else { | |
11fdf7f2 | 1069 | ceph_assert(!missing.is_missing(hoid)); |
7c673cae FG |
1070 | } |
1071 | missing.revise_have(hoid, eversion_t()); | |
9f95a23c | 1072 | missing.mark_fully_dirty(hoid); |
7c673cae FG |
1073 | if (rollbacker) { |
1074 | if (!object_not_in_store) { | |
1075 | rollbacker->remove(hoid); | |
1076 | } | |
1077 | for (auto &&i: entries) { | |
1078 | rollbacker->trim(i); | |
1079 | } | |
1080 | } | |
1081 | return; | |
1082 | } | |
1083 | ||
1084 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1085 | <<" has no more recent entries in log" << dendl; | |
1086 | if (prior_version == eversion_t() || entries.front().is_clone()) { | |
1087 | /// Case 2) | |
1088 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1089 | << " prior_version or op type indicates creation," | |
1090 | << " deleting" | |
1091 | << dendl; | |
1092 | if (missing.is_missing(hoid)) | |
1093 | missing.rm(missing.get_items().find(hoid)); | |
1094 | if (rollbacker) { | |
1095 | if (!object_not_in_store) { | |
1096 | rollbacker->remove(hoid); | |
1097 | } | |
1098 | for (auto &&i: entries) { | |
1099 | rollbacker->trim(i); | |
1100 | } | |
1101 | } | |
1102 | return; | |
1103 | } | |
1104 | ||
1105 | if (missing.is_missing(hoid)) { | |
1106 | /// Case 3) | |
1107 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1108 | << " missing, " << missing.get_items().at(hoid) | |
1109 | << " adjusting" << dendl; | |
1110 | ||
1111 | if (missing.get_items().at(hoid).have == prior_version) { | |
1112 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1113 | << " missing.have is prior_version " << prior_version | |
1114 | << " removing from missing" << dendl; | |
1115 | missing.rm(missing.get_items().find(hoid)); | |
1116 | } else { | |
1117 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1118 | << " missing.have is " << missing.get_items().at(hoid).have | |
1119 | << ", adjusting" << dendl; | |
c07f9fc5 | 1120 | missing.revise_need(hoid, prior_version, false); |
7c673cae FG |
1121 | if (prior_version <= info.log_tail) { |
1122 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1123 | << " prior_version " << prior_version | |
1124 | << " <= info.log_tail " | |
1125 | << info.log_tail << dendl; | |
1126 | } | |
1127 | } | |
1128 | if (rollbacker) { | |
1129 | for (auto &&i: entries) { | |
1130 | rollbacker->trim(i); | |
1131 | } | |
1132 | } | |
1133 | return; | |
1134 | } | |
1135 | ||
1136 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1137 | << " must be rolled back or recovered," | |
1138 | << " attempting to rollback" | |
1139 | << dendl; | |
1140 | bool can_rollback = true; | |
11fdf7f2 TL |
1141 | // We are going to make an important decision based on the |
1142 | // olog_can_rollback_to value we have received, better known it. | |
1143 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1144 | << " olog_can_rollback_to: " | |
1145 | << olog_can_rollback_to << dendl; | |
7c673cae | 1146 | /// Distinguish between 4) and 5) |
f67539c2 | 1147 | for (auto i = entries.rbegin(); i != entries.rend(); ++i) { |
eafe8130 | 1148 | if (!i->can_rollback() || i->version <= olog_can_rollback_to) { |
7c673cae FG |
1149 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " |
1150 | << *i << dendl; | |
1151 | can_rollback = false; | |
1152 | break; | |
1153 | } | |
1154 | } | |
1155 | ||
1156 | if (can_rollback) { | |
1157 | /// Case 4) | |
f67539c2 | 1158 | for (auto i = entries.rbegin(); i != entries.rend(); ++i) { |
eafe8130 | 1159 | ceph_assert(i->can_rollback() && i->version > olog_can_rollback_to); |
7c673cae FG |
1160 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid |
1161 | << " rolling back " << *i << dendl; | |
1162 | if (rollbacker) | |
1163 | rollbacker->rollback(*i); | |
1164 | } | |
1165 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1166 | << " rolled back" << dendl; | |
1167 | return; | |
1168 | } else { | |
1169 | /// Case 5) | |
1170 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " | |
1171 | << "removing and adding to missing" << dendl; | |
1172 | if (rollbacker) { | |
1173 | if (!object_not_in_store) | |
1174 | rollbacker->remove(hoid); | |
1175 | for (auto &&i: entries) { | |
1176 | rollbacker->trim(i); | |
1177 | } | |
1178 | } | |
c07f9fc5 | 1179 | missing.add(hoid, prior_version, eversion_t(), false); |
7c673cae FG |
1180 | if (prior_version <= info.log_tail) { |
1181 | ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid | |
1182 | << " prior_version " << prior_version | |
1183 | << " <= info.log_tail " | |
1184 | << info.log_tail << dendl; | |
1185 | } | |
1186 | } | |
1187 | } | |
1188 | ||
1189 | /// Merge all entries using above | |
1190 | template <typename missing_type> | |
1191 | static void _merge_divergent_entries( | |
1192 | const IndexedLog &log, ///< [in] log to merge against | |
31f18b77 | 1193 | mempool::osd_pglog::list<pg_log_entry_t> &entries, ///< [in] entries to merge |
7c673cae | 1194 | const pg_info_t &oinfo, ///< [in] info for merging entries |
eafe8130 | 1195 | eversion_t olog_can_rollback_to, ///< [in] rollback boundary of input IndexedLog |
7c673cae FG |
1196 | missing_type &omissing, ///< [in,out] missing to adjust, use |
1197 | LogEntryHandler *rollbacker, ///< [in] optional rollbacker object | |
1198 | const DoutPrefixProvider *dpp ///< [in] logging provider | |
1199 | ) { | |
f67539c2 | 1200 | std::map<hobject_t, mempool::osd_pglog::list<pg_log_entry_t> > split; |
7c673cae | 1201 | split_by_object(entries, &split); |
f67539c2 | 1202 | for (auto i = split.begin(); i != split.end(); ++i) { |
7c673cae FG |
1203 | _merge_object_divergent_entries( |
1204 | log, | |
1205 | i->first, | |
1206 | i->second, | |
1207 | oinfo, | |
1208 | olog_can_rollback_to, | |
1209 | omissing, | |
1210 | rollbacker, | |
1211 | dpp); | |
1212 | } | |
1213 | } | |
1214 | ||
1215 | /** | |
1216 | * Exists for use in TestPGLog for simply testing single divergent log | |
1217 | * cases | |
1218 | */ | |
1219 | void merge_old_entry( | |
1220 | ObjectStore::Transaction& t, | |
1221 | const pg_log_entry_t& oe, | |
1222 | const pg_info_t& info, | |
1223 | LogEntryHandler *rollbacker) { | |
31f18b77 | 1224 | mempool::osd_pglog::list<pg_log_entry_t> entries; |
7c673cae FG |
1225 | entries.push_back(oe); |
1226 | _merge_object_divergent_entries( | |
1227 | log, | |
1228 | oe.soid, | |
1229 | entries, | |
1230 | info, | |
1231 | log.get_can_rollback_to(), | |
1232 | missing, | |
1233 | rollbacker, | |
1234 | this); | |
1235 | } | |
c07f9fc5 FG |
1236 | |
1237 | bool merge_log_dups(const pg_log_t& olog); | |
1238 | ||
7c673cae | 1239 | public: |
c07f9fc5 | 1240 | |
7c673cae FG |
1241 | void rewind_divergent_log(eversion_t newhead, |
1242 | pg_info_t &info, | |
1243 | LogEntryHandler *rollbacker, | |
1244 | bool &dirty_info, | |
1245 | bool &dirty_big_info); | |
1246 | ||
1247 | void merge_log(pg_info_t &oinfo, | |
f67539c2 | 1248 | pg_log_t&& olog, |
7c673cae FG |
1249 | pg_shard_t from, |
1250 | pg_info_t &info, LogEntryHandler *rollbacker, | |
1251 | bool &dirty_info, bool &dirty_big_info); | |
1252 | ||
1253 | template <typename missing_type> | |
1254 | static bool append_log_entries_update_missing( | |
1255 | const hobject_t &last_backfill, | |
31f18b77 | 1256 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
1257 | bool maintain_rollback, |
1258 | IndexedLog *log, | |
1259 | missing_type &missing, | |
1260 | LogEntryHandler *rollbacker, | |
1261 | const DoutPrefixProvider *dpp) { | |
1262 | bool invalidate_stats = false; | |
1263 | if (log && !entries.empty()) { | |
11fdf7f2 | 1264 | ceph_assert(log->head < entries.begin()->version); |
7c673cae | 1265 | } |
f67539c2 | 1266 | for (auto p = entries.begin(); p != entries.end(); ++p) { |
7c673cae FG |
1267 | invalidate_stats = invalidate_stats || !p->is_error(); |
1268 | if (log) { | |
1269 | ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; | |
1270 | log->add(*p); | |
1271 | } | |
1272 | if (p->soid <= last_backfill && | |
1273 | !p->is_error()) { | |
c07f9fc5 FG |
1274 | if (missing.may_include_deletes) { |
1275 | missing.add_next_event(*p); | |
1276 | } else { | |
1277 | if (p->is_delete()) { | |
1278 | missing.rm(p->soid, p->version); | |
1279 | } else { | |
1280 | missing.add_next_event(*p); | |
1281 | } | |
1282 | if (rollbacker) { | |
1283 | // hack to match PG::mark_all_unfound_lost | |
1284 | if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { | |
1285 | rollbacker->try_stash(p->soid, p->version.version); | |
1286 | } else if (p->is_delete()) { | |
1287 | rollbacker->remove(p->soid); | |
1288 | } | |
7c673cae FG |
1289 | } |
1290 | } | |
1291 | } | |
1292 | } | |
1293 | return invalidate_stats; | |
1294 | } | |
1295 | bool append_new_log_entries( | |
1296 | const hobject_t &last_backfill, | |
31f18b77 | 1297 | const mempool::osd_pglog::list<pg_log_entry_t> &entries, |
7c673cae FG |
1298 | LogEntryHandler *rollbacker) { |
1299 | bool invalidate_stats = append_log_entries_update_missing( | |
1300 | last_backfill, | |
7c673cae FG |
1301 | entries, |
1302 | true, | |
1303 | &log, | |
1304 | missing, | |
1305 | rollbacker, | |
1306 | this); | |
1307 | if (!entries.empty()) { | |
1308 | mark_writeout_from(entries.begin()->version); | |
c07f9fc5 FG |
1309 | if (entries.begin()->is_lost_delete()) { |
1310 | // hack: since lost deletes queue recovery directly, and don't | |
1311 | // go through activate_not_complete() again, our complete_to | |
1312 | // iterator may still point at log.end(). Reset it to point | |
1313 | // before these new lost_delete entries. This only occurs | |
1314 | // when lost+delete entries are initially added, which is | |
f67539c2 | 1315 | // always in a std::list of solely lost_delete entries, so it is |
c07f9fc5 FG |
1316 | // sufficient to check whether the first entry is a |
1317 | // lost_delete | |
1318 | reset_complete_to(nullptr); | |
1319 | } | |
7c673cae FG |
1320 | } |
1321 | return invalidate_stats; | |
1322 | } | |
1323 | ||
c07f9fc5 FG |
1324 | void write_log_and_missing( |
1325 | ObjectStore::Transaction& t, | |
f67539c2 | 1326 | std::map<std::string,ceph::buffer::list> *km, |
c07f9fc5 FG |
1327 | const coll_t& coll, |
1328 | const ghobject_t &log_oid, | |
1329 | bool require_rollback); | |
7c673cae FG |
1330 | |
1331 | static void write_log_and_missing_wo_missing( | |
1332 | ObjectStore::Transaction& t, | |
f67539c2 | 1333 | std::map<std::string,ceph::buffer::list>* km, |
7c673cae FG |
1334 | pg_log_t &log, |
1335 | const coll_t& coll, | |
f67539c2 | 1336 | const ghobject_t &log_oid, std::map<eversion_t, hobject_t> &divergent_priors, |
181888fb | 1337 | bool require_rollback); |
7c673cae FG |
1338 | |
1339 | static void write_log_and_missing( | |
1340 | ObjectStore::Transaction& t, | |
f67539c2 | 1341 | std::map<std::string,ceph::buffer::list>* km, |
7c673cae FG |
1342 | pg_log_t &log, |
1343 | const coll_t& coll, | |
1344 | const ghobject_t &log_oid, | |
1345 | const pg_missing_tracker_t &missing, | |
c07f9fc5 | 1346 | bool require_rollback, |
c07f9fc5 | 1347 | bool *rebuilt_missing_set_with_deletes); |
7c673cae FG |
1348 | |
1349 | static void _write_log_and_missing_wo_missing( | |
1350 | ObjectStore::Transaction& t, | |
f67539c2 | 1351 | std::map<std::string,ceph::buffer::list>* km, |
7c673cae FG |
1352 | pg_log_t &log, |
1353 | const coll_t& coll, const ghobject_t &log_oid, | |
f67539c2 | 1354 | std::map<eversion_t, hobject_t> &divergent_priors, |
7c673cae FG |
1355 | eversion_t dirty_to, |
1356 | eversion_t dirty_from, | |
1357 | eversion_t writeout_from, | |
7c673cae FG |
1358 | bool dirty_divergent_priors, |
1359 | bool touch_log, | |
1360 | bool require_rollback, | |
181888fb FG |
1361 | eversion_t dirty_to_dups, |
1362 | eversion_t dirty_from_dups, | |
1363 | eversion_t write_from_dups, | |
f67539c2 | 1364 | std::set<std::string> *log_keys_debug |
7c673cae FG |
1365 | ); |
1366 | ||
1367 | static void _write_log_and_missing( | |
1368 | ObjectStore::Transaction& t, | |
f67539c2 | 1369 | std::map<std::string,ceph::buffer::list>* km, |
7c673cae FG |
1370 | pg_log_t &log, |
1371 | const coll_t& coll, const ghobject_t &log_oid, | |
1372 | eversion_t dirty_to, | |
1373 | eversion_t dirty_from, | |
1374 | eversion_t writeout_from, | |
f67539c2 TL |
1375 | std::set<eversion_t> &&trimmed, |
1376 | std::set<std::string> &&trimmed_dups, | |
7c673cae FG |
1377 | const pg_missing_tracker_t &missing, |
1378 | bool touch_log, | |
1379 | bool require_rollback, | |
1380 | bool clear_divergent_priors, | |
181888fb FG |
1381 | eversion_t dirty_to_dups, |
1382 | eversion_t dirty_from_dups, | |
1383 | eversion_t write_from_dups, | |
9f95a23c | 1384 | bool *may_include_deletes_in_missing_dirty, |
f67539c2 | 1385 | std::set<std::string> *log_keys_debug |
7c673cae FG |
1386 | ); |
1387 | ||
1388 | void read_log_and_missing( | |
c07f9fc5 | 1389 | ObjectStore *store, |
11fdf7f2 TL |
1390 | ObjectStore::CollectionHandle& ch, |
1391 | ghobject_t pgmeta_oid, | |
7c673cae | 1392 | const pg_info_t &info, |
f67539c2 | 1393 | std::ostringstream &oss, |
7c673cae FG |
1394 | bool tolerate_divergent_missing_log, |
1395 | bool debug_verify_stored_missing = false | |
1396 | ) { | |
1397 | return read_log_and_missing( | |
11fdf7f2 TL |
1398 | store, ch, pgmeta_oid, info, |
1399 | log, missing, oss, | |
7c673cae FG |
1400 | tolerate_divergent_missing_log, |
1401 | &clear_divergent_priors, | |
1402 | this, | |
c07f9fc5 | 1403 | (pg_log_debug ? &log_keys_debug : nullptr), |
7c673cae FG |
1404 | debug_verify_stored_missing); |
1405 | } | |
1406 | ||
1407 | template <typename missing_type> | |
c07f9fc5 FG |
1408 | static void read_log_and_missing( |
1409 | ObjectStore *store, | |
11fdf7f2 TL |
1410 | ObjectStore::CollectionHandle &ch, |
1411 | ghobject_t pgmeta_oid, | |
7c673cae FG |
1412 | const pg_info_t &info, |
1413 | IndexedLog &log, | |
c07f9fc5 | 1414 | missing_type &missing, |
f67539c2 | 1415 | std::ostringstream &oss, |
7c673cae | 1416 | bool tolerate_divergent_missing_log, |
c07f9fc5 FG |
1417 | bool *clear_divergent_priors = nullptr, |
1418 | const DoutPrefixProvider *dpp = nullptr, | |
f67539c2 | 1419 | std::set<std::string> *log_keys_debug = nullptr, |
7c673cae FG |
1420 | bool debug_verify_stored_missing = false |
1421 | ) { | |
11fdf7f2 TL |
1422 | ldpp_dout(dpp, 20) << "read_log_and_missing coll " << ch->cid |
1423 | << " " << pgmeta_oid << dendl; | |
7c673cae FG |
1424 | |
1425 | // legacy? | |
1426 | struct stat st; | |
11fdf7f2 TL |
1427 | int r = store->stat(ch, pgmeta_oid, &st); |
1428 | ceph_assert(r == 0); | |
1429 | ceph_assert(st.st_size == 0); | |
7c673cae FG |
1430 | |
1431 | // will get overridden below if it had been recorded | |
1432 | eversion_t on_disk_can_rollback_to = info.last_update; | |
1433 | eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); | |
11fdf7f2 TL |
1434 | ObjectMap::ObjectMapIterator p = store->get_omap_iterator(ch, |
1435 | pgmeta_oid); | |
f67539c2 | 1436 | std::map<eversion_t, hobject_t> divergent_priors; |
11fdf7f2 | 1437 | bool must_rebuild = false; |
c07f9fc5 | 1438 | missing.may_include_deletes = false; |
f67539c2 TL |
1439 | std::list<pg_log_entry_t> entries; |
1440 | std::list<pg_log_dup_t> dups; | |
7c673cae | 1441 | if (p) { |
f67539c2 | 1442 | using ceph::decode; |
11fdf7f2 | 1443 | for (p->seek_to_first(); p->valid() ; p->next()) { |
7c673cae FG |
1444 | // non-log pgmeta_oid keys are prefixed with _; skip those |
1445 | if (p->key()[0] == '_') | |
1446 | continue; | |
f67539c2 | 1447 | auto bl = p->value();//Copy ceph::buffer::list before creating iterator |
11fdf7f2 | 1448 | auto bp = bl.cbegin(); |
7c673cae | 1449 | if (p->key() == "divergent_priors") { |
11fdf7f2 | 1450 | decode(divergent_priors, bp); |
7c673cae FG |
1451 | ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() |
1452 | << " divergent_priors" << dendl; | |
35e4c445 | 1453 | must_rebuild = true; |
7c673cae FG |
1454 | debug_verify_stored_missing = false; |
1455 | } else if (p->key() == "can_rollback_to") { | |
11fdf7f2 | 1456 | decode(on_disk_can_rollback_to, bp); |
7c673cae | 1457 | } else if (p->key() == "rollback_info_trimmed_to") { |
11fdf7f2 | 1458 | decode(on_disk_rollback_info_trimmed_to, bp); |
c07f9fc5 FG |
1459 | } else if (p->key() == "may_include_deletes_in_missing") { |
1460 | missing.may_include_deletes = true; | |
f67539c2 | 1461 | } else if (p->key().substr(0, 7) == std::string("missing")) { |
c07f9fc5 FG |
1462 | hobject_t oid; |
1463 | pg_missing_item item; | |
11fdf7f2 TL |
1464 | decode(oid, bp); |
1465 | decode(item, bp); | |
9f95a23c | 1466 | ldpp_dout(dpp, 20) << "read_log_and_missing " << item << dendl; |
c07f9fc5 | 1467 | if (item.is_delete()) { |
11fdf7f2 | 1468 | ceph_assert(missing.may_include_deletes); |
c07f9fc5 | 1469 | } |
9f95a23c | 1470 | missing.add(oid, std::move(item)); |
f67539c2 | 1471 | } else if (p->key().substr(0, 4) == std::string("dup_")) { |
c07f9fc5 | 1472 | pg_log_dup_t dup; |
11fdf7f2 | 1473 | decode(dup, bp); |
c07f9fc5 | 1474 | if (!dups.empty()) { |
11fdf7f2 | 1475 | ceph_assert(dups.back().version < dup.version); |
c07f9fc5 FG |
1476 | } |
1477 | dups.push_back(dup); | |
7c673cae FG |
1478 | } else { |
1479 | pg_log_entry_t e; | |
1480 | e.decode_with_checksum(bp); | |
1481 | ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; | |
1482 | if (!entries.empty()) { | |
1483 | pg_log_entry_t last_e(entries.back()); | |
11fdf7f2 TL |
1484 | ceph_assert(last_e.version.version < e.version.version); |
1485 | ceph_assert(last_e.version.epoch <= e.version.epoch); | |
7c673cae FG |
1486 | } |
1487 | entries.push_back(e); | |
1488 | if (log_keys_debug) | |
1489 | log_keys_debug->insert(e.get_key_name()); | |
1490 | } | |
1491 | } | |
1492 | } | |
1493 | log = IndexedLog( | |
1494 | info.last_update, | |
1495 | info.log_tail, | |
1496 | on_disk_can_rollback_to, | |
1497 | on_disk_rollback_info_trimmed_to, | |
c07f9fc5 FG |
1498 | std::move(entries), |
1499 | std::move(dups)); | |
7c673cae | 1500 | |
35e4c445 | 1501 | if (must_rebuild || debug_verify_stored_missing) { |
7c673cae FG |
1502 | // build missing |
1503 | if (debug_verify_stored_missing || info.last_complete < info.last_update) { | |
c07f9fc5 FG |
1504 | ldpp_dout(dpp, 10) |
1505 | << "read_log_and_missing checking for missing items over interval (" | |
1506 | << info.last_complete | |
1507 | << "," << info.last_update << "]" << dendl; | |
7c673cae | 1508 | |
f67539c2 TL |
1509 | std::set<hobject_t> did; |
1510 | std::set<hobject_t> checked; | |
1511 | std::set<hobject_t> skipped; | |
1512 | for (auto i = log.log.rbegin(); i != log.log.rend(); ++i) { | |
7c673cae FG |
1513 | if (i->soid > info.last_backfill) |
1514 | continue; | |
1515 | if (i->is_error()) | |
1516 | continue; | |
1517 | if (did.count(i->soid)) continue; | |
1518 | did.insert(i->soid); | |
1519 | ||
c07f9fc5 FG |
1520 | if (!missing.may_include_deletes && i->is_delete()) |
1521 | continue; | |
7c673cae | 1522 | |
f67539c2 | 1523 | ceph::buffer::list bv; |
7c673cae | 1524 | int r = store->getattr( |
11fdf7f2 | 1525 | ch, |
7c673cae FG |
1526 | ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), |
1527 | OI_ATTR, | |
1528 | bv); | |
1529 | if (r >= 0) { | |
1530 | object_info_t oi(bv); | |
1531 | if (oi.version < i->version) { | |
1532 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i | |
9f95a23c TL |
1533 | << " (have " << oi.version << ")" |
1534 | << " clean_regions " << i->clean_regions << dendl; | |
1535 | ||
7c673cae FG |
1536 | if (debug_verify_stored_missing) { |
1537 | auto miter = missing.get_items().find(i->soid); | |
11fdf7f2 TL |
1538 | ceph_assert(miter != missing.get_items().end()); |
1539 | ceph_assert(miter->second.need == i->version); | |
c07f9fc5 FG |
1540 | // the 'have' version is reset if an object is deleted, |
1541 | // then created again | |
11fdf7f2 | 1542 | ceph_assert(miter->second.have == oi.version || miter->second.have == eversion_t()); |
7c673cae FG |
1543 | checked.insert(i->soid); |
1544 | } else { | |
c07f9fc5 | 1545 | missing.add(i->soid, i->version, oi.version, i->is_delete()); |
7c673cae FG |
1546 | } |
1547 | } | |
1548 | } else { | |
1549 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
1550 | if (debug_verify_stored_missing) { | |
1551 | auto miter = missing.get_items().find(i->soid); | |
c07f9fc5 | 1552 | if (i->is_delete()) { |
11fdf7f2 | 1553 | ceph_assert(miter == missing.get_items().end() || |
c07f9fc5 FG |
1554 | (miter->second.need == i->version && |
1555 | miter->second.have == eversion_t())); | |
1556 | } else { | |
11fdf7f2 TL |
1557 | ceph_assert(miter != missing.get_items().end()); |
1558 | ceph_assert(miter->second.need == i->version); | |
1559 | ceph_assert(miter->second.have == eversion_t()); | |
c07f9fc5 | 1560 | } |
7c673cae FG |
1561 | checked.insert(i->soid); |
1562 | } else { | |
c07f9fc5 | 1563 | missing.add(i->soid, i->version, eversion_t(), i->is_delete()); |
7c673cae FG |
1564 | } |
1565 | } | |
1566 | } | |
1567 | if (debug_verify_stored_missing) { | |
1568 | for (auto &&i: missing.get_items()) { | |
1569 | if (checked.count(i.first)) | |
1570 | continue; | |
c07f9fc5 | 1571 | if (i.first > info.last_backfill) { |
f67539c2 | 1572 | ldpp_dout(dpp, -1) << __func__ << ": invalid missing std::set entry " |
c07f9fc5 FG |
1573 | << "found before last_backfill: " |
1574 | << i.first << " " << i.second | |
1575 | << " last_backfill = " << info.last_backfill | |
1576 | << dendl; | |
f67539c2 | 1577 | ceph_abort_msg("invalid missing std::set entry found"); |
7c673cae | 1578 | } |
f67539c2 | 1579 | ceph::buffer::list bv; |
7c673cae | 1580 | int r = store->getattr( |
11fdf7f2 | 1581 | ch, |
7c673cae FG |
1582 | ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), |
1583 | OI_ATTR, | |
1584 | bv); | |
1585 | if (r >= 0) { | |
1586 | object_info_t oi(bv); | |
11fdf7f2 | 1587 | ceph_assert(oi.version == i.second.have || eversion_t() == i.second.have); |
7c673cae | 1588 | } else { |
11fdf7f2 | 1589 | ceph_assert(i.second.is_delete() || eversion_t() == i.second.have); |
7c673cae FG |
1590 | } |
1591 | } | |
1592 | } else { | |
11fdf7f2 | 1593 | ceph_assert(must_rebuild); |
f67539c2 | 1594 | for (auto i = divergent_priors.rbegin(); |
7c673cae FG |
1595 | i != divergent_priors.rend(); |
1596 | ++i) { | |
1597 | if (i->first <= info.last_complete) break; | |
1598 | if (i->second > info.last_backfill) | |
1599 | continue; | |
1600 | if (did.count(i->second)) continue; | |
1601 | did.insert(i->second); | |
f67539c2 | 1602 | ceph::buffer::list bv; |
7c673cae | 1603 | int r = store->getattr( |
11fdf7f2 | 1604 | ch, |
7c673cae FG |
1605 | ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), |
1606 | OI_ATTR, | |
1607 | bv); | |
1608 | if (r >= 0) { | |
1609 | object_info_t oi(bv); | |
1610 | /** | |
1611 | * 1) we see this entry in the divergent priors mapping | |
1612 | * 2) we didn't see an entry for this object in the log | |
1613 | * | |
1614 | * From 1 & 2 we know that either the object does not exist | |
1615 | * or it is at the version specified in the divergent_priors | |
1616 | * map since the object would have been deleted atomically | |
1617 | * with the addition of the divergent_priors entry, an older | |
1618 | * version would not have been recovered, and a newer version | |
1619 | * would show up in the log above. | |
1620 | */ | |
f67539c2 | 1621 | /** |
7c673cae FG |
1622 | * Unfortunately the assessment above is incorrect because of |
1623 | * http://tracker.ceph.com/issues/17916 (we were incorrectly | |
f67539c2 | 1624 | * not removing the divergent_priors set from disk state!), |
7c673cae FG |
1625 | * so let's check that. |
1626 | */ | |
1627 | if (oi.version > i->first && tolerate_divergent_missing_log) { | |
1628 | ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i | |
1629 | << ") inconsistent with disk state (" << oi | |
1630 | << "), assuming it is tracker.ceph.com/issues/17916" | |
1631 | << dendl; | |
1632 | } else { | |
11fdf7f2 | 1633 | ceph_assert(oi.version == i->first); |
7c673cae FG |
1634 | } |
1635 | } else { | |
1636 | ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; | |
c07f9fc5 | 1637 | missing.add(i->second, i->first, eversion_t(), false); |
7c673cae FG |
1638 | } |
1639 | } | |
1640 | } | |
1641 | if (clear_divergent_priors) | |
1642 | (*clear_divergent_priors) = true; | |
1643 | } | |
1644 | } | |
1645 | ||
35e4c445 | 1646 | if (!must_rebuild) { |
7c673cae FG |
1647 | if (clear_divergent_priors) |
1648 | (*clear_divergent_priors) = false; | |
1649 | missing.flush(); | |
1650 | } | |
1651 | ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl; | |
c07f9fc5 | 1652 | } // static read_log_and_missing |
9f95a23c TL |
1653 | |
1654 | #ifdef WITH_SEASTAR | |
1655 | seastar::future<> read_log_and_missing_crimson( | |
1656 | crimson::os::FuturizedStore &store, | |
1657 | crimson::os::CollectionRef ch, | |
1658 | const pg_info_t &info, | |
1659 | ghobject_t pgmeta_oid | |
1660 | ) { | |
1661 | return read_log_and_missing_crimson( | |
1662 | store, ch, info, | |
f67539c2 TL |
1663 | log, (pg_log_debug ? &log_keys_debug : nullptr), |
1664 | missing, pgmeta_oid, this); | |
9f95a23c TL |
1665 | } |
1666 | ||
9f95a23c TL |
1667 | static seastar::future<> read_log_and_missing_crimson( |
1668 | crimson::os::FuturizedStore &store, | |
1669 | crimson::os::CollectionRef ch, | |
1670 | const pg_info_t &info, | |
1671 | IndexedLog &log, | |
f67539c2 TL |
1672 | std::set<std::string>* log_keys_debug, |
1673 | pg_missing_tracker_t &missing, | |
9f95a23c | 1674 | ghobject_t pgmeta_oid, |
f67539c2 | 1675 | const DoutPrefixProvider *dpp = nullptr); |
9f95a23c TL |
1676 | |
1677 | #endif | |
1678 | ||
c07f9fc5 | 1679 | }; // struct PGLog |