]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGTransaction.h
update sources to 12.2.7
[ceph.git] / ceph / src / osd / PGTransaction.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14 #ifndef PGTRANSACTION_H
15 #define PGTRANSACTION_H
16
17 #include <map>
18 #include <memory>
19 #include <boost/optional.hpp>
20
21 #include "common/hobject.h"
22 #include "osd/osd_types.h"
23 #include "osd/osd_internal_types.h"
24 #include "common/interval_map.h"
25 #include "common/inline_variant.h"
26
27 /**
28 * This class represents transactions which can be submitted to
29 * a PGBackend. For expediency, there are some constraints on
30 * the operations submitted:
31 * 1) Rename sources may only be referenced prior to the rename
32 * operation to the destination.
33 * 2) The graph formed by edges of source->destination for clones
34 * (Create) and Renames must be acyclic.
35 * 3) clone_range sources must not be modified by the same
36 * transaction
37 */
38 class PGTransaction {
39 public:
40 map<hobject_t, ObjectContextRef> obc_map;
41
42 class ObjectOperation {
43 public:
44 struct Init
45 {
46 struct None {};
47 struct Create {};
48 struct Clone {
49 hobject_t source;
50 };
51 struct Rename {
52 hobject_t source; // must be temp object
53 };
54 };
55 using InitType = boost::variant<
56 Init::None,
57 Init::Create,
58 Init::Clone,
59 Init::Rename>;
60
61 InitType init_type = Init::None();
62 bool delete_first = false;
63
64 /**
65 * is_none() && is_delete() indicates that we are deleting an
66 * object which already exists and not recreating it. delete_first means
67 * that the transaction logically removes the object.
68
69 * There are really 4 cases:
70
71 * 1) We are modifying an existing object (is_none() &&
72 * !is_delete())
73 * a) If it's an append, we just write into the log entry the old size
74 * b) If it's an actual overwrite, we save the old versions of the
75 * extents being overwritten and write those offsets into the log
76 * entry
77 * 2) We are removing and then recreating an object (!is_none() && is_delete())
78 * -- stash
79 * 3) We are removing an object (is_none() && is_delete()) -- stash
80 * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
81 * stash)
82 *
83 * Create, Clone, Rename are the three ways we can recreate it.
84 * ECBackend transaction planning needs this context
85 * to figure out how to perform the transaction.
86 */
87 bool deletes_first() const {
88 return delete_first;
89 }
90 bool is_delete() const {
91 return boost::get<Init::None>(&init_type) != nullptr && delete_first;
92 }
93 bool is_none() const {
94 return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
95 }
96 bool is_fresh_object() const {
97 return boost::get<Init::None>(&init_type) == nullptr;
98 }
99 bool is_rename() const {
100 return boost::get<Init::Rename>(&init_type) != nullptr;
101 }
102 bool has_source(hobject_t *source = nullptr) const {
103 return match(
104 init_type,
105 [&](const Init::Clone &op) -> bool {
106 if (source)
107 *source = op.source;
108 return true;
109 },
110 [&](const Init::Rename &op) -> bool {
111 if (source)
112 *source = op.source;
113 return true;
114 },
115 [&](const Init::None &) -> bool { return false; },
116 [&](const Init::Create &) -> bool { return false; });
117 }
118
119 bool clear_omap = false;
120
121 /**
122 * truncate
123 * <lowest, last> ?
124 *
125 * truncate is represented as a pair because in the event of
126 * multiple truncates within a single transaction we need to
127 * remember the lowest truncate and the final object size
128 * (the last truncate). We also adjust the buffers map
129 * to account for truncates overriding previous writes */
130 boost::optional<pair<uint64_t, uint64_t> > truncate = boost::none;
131
132 std::map<string, boost::optional<bufferlist> > attr_updates;
133
134 enum class OmapUpdateType {Remove, Insert};
135 std::vector<std::pair<OmapUpdateType, bufferlist> > omap_updates;
136
137 boost::optional<bufferlist> omap_header;
138
139 /// (old, new) -- only valid with no truncate or buffer updates
140 boost::optional<pair<set<snapid_t>, set<snapid_t> > > updated_snaps;
141
142 struct alloc_hint_t {
143 uint64_t expected_object_size;
144 uint64_t expected_write_size;
145 uint32_t flags;
146 };
147 boost::optional<alloc_hint_t> alloc_hint;
148
149 struct BufferUpdate {
150 struct Write {
151 bufferlist buffer;
152 uint32_t fadvise_flags;
153 };
154 struct Zero {
155 uint64_t len;
156 };
157 struct CloneRange {
158 hobject_t from;
159 uint64_t offset;
160 uint64_t len;
161 };
162 };
163 using BufferUpdateType = boost::variant<
164 BufferUpdate::Write,
165 BufferUpdate::Zero,
166 BufferUpdate::CloneRange>;
167
168 private:
169 struct SplitMerger {
170 BufferUpdateType split(
171 uint64_t offset,
172 uint64_t len,
173 const BufferUpdateType &bu) const {
174 return match(
175 bu,
176 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
177 bufferlist bl;
178 bl.substr_of(w.buffer, offset, len);
179 return BufferUpdate::Write{bl, w.fadvise_flags};
180 },
181 [&](const BufferUpdate::Zero &) -> BufferUpdateType {
182 return BufferUpdate::Zero{len};
183 },
184 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
185 return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
186 });
187 }
188 uint64_t length(
189 const BufferUpdateType &left) const {
190 return match(
191 left,
192 [&](const BufferUpdate::Write &w) -> uint64_t {
193 return w.buffer.length();
194 },
195 [&](const BufferUpdate::Zero &z) -> uint64_t {
196 return z.len;
197 },
198 [&](const BufferUpdate::CloneRange &c) -> uint64_t {
199 return c.len;
200 });
201 }
202 bool can_merge(
203 const BufferUpdateType &left,
204 const BufferUpdateType &right) const {
205 return match(
206 left,
207 [&](const BufferUpdate::Write &w) -> bool {
208 auto r = boost::get<BufferUpdate::Write>(&right);
209 return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
210 },
211 [&](const BufferUpdate::Zero &) -> bool {
212 auto r = boost::get<BufferUpdate::Zero>(&right);
213 return r != nullptr;
214 },
215 [&](const BufferUpdate::CloneRange &c) -> bool {
216 return false;
217 });
218 }
219 BufferUpdateType merge(
220 BufferUpdateType &&left,
221 BufferUpdateType &&right) const {
222 return match(
223 left,
224 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
225 auto r = boost::get<BufferUpdate::Write>(&right);
226 assert(r && w.fadvise_flags == r->fadvise_flags);
227 bufferlist bl = w.buffer;
228 bl.append(r->buffer);
229 return BufferUpdate::Write{bl, w.fadvise_flags};
230 },
231 [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
232 auto r = boost::get<BufferUpdate::Zero>(&right);
233 assert(r);
234 return BufferUpdate::Zero{z.len + r->len};
235 },
236 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
237 assert(0 == "violates can_merge condition");
238 return left;
239 });
240 }
241 };
242 public:
243 using buffer_update_type = interval_map<
244 uint64_t, BufferUpdateType, SplitMerger>;
245 buffer_update_type buffer_updates;
246
247 friend class PGTransaction;
248 };
249 map<hobject_t, ObjectOperation> op_map;
250 private:
251 ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
252 auto &op = op_map[hoid];
253 assert(!op.is_delete());
254 return op;
255 }
256 ObjectOperation &get_object_op(const hobject_t &hoid) {
257 return op_map[hoid];
258 }
259 public:
260 void add_obc(
261 ObjectContextRef obc) {
262 assert(obc);
263 obc_map[obc->obs.oi.soid] = obc;
264 }
265 /// Sets up state for new object
266 void create(
267 const hobject_t &hoid
268 ) {
269 auto &op = op_map[hoid];
270 assert(op.is_none() || op.is_delete());
271 op.init_type = ObjectOperation::Init::Create();
272 }
273
274 /// Sets up state for target cloned from source
275 void clone(
276 const hobject_t &target, ///< [in] obj to clone to
277 const hobject_t &source ///< [in] obj to clone from
278 ) {
279 auto &op = op_map[target];
280 assert(op.is_none() || op.is_delete());
281 op.init_type = ObjectOperation::Init::Clone{source};
282 }
283
284 /// Sets up state for target renamed from source
285 void rename(
286 const hobject_t &target, ///< [in] source (must be a temp object)
287 const hobject_t &source ///< [in] to, must not exist, be non-temp
288 ) {
289 assert(source.is_temp());
290 assert(!target.is_temp());
291 auto &op = op_map[target];
292 assert(op.is_none() || op.is_delete());
293
294 bool del_first = op.is_delete();
295 auto iter = op_map.find(source);
296 if (iter != op_map.end()) {
297 op = iter->second;
298 op_map.erase(iter);
299 op.delete_first = del_first;
300 }
301
302 op.init_type = ObjectOperation::Init::Rename{source};
303 }
304
305 /// Remove -- must not be called on rename target
306 void remove(
307 const hobject_t &hoid ///< [in] obj to remove
308 ) {
309 auto &op = get_object_op_for_modify(hoid);
310 if (!op.is_fresh_object()) {
311 assert(!op.updated_snaps);
312 op = ObjectOperation();
313 op.delete_first = true;
314 } else {
315 assert(!op.is_rename());
316 op_map.erase(hoid); // make it a noop if it's a fresh object
317 }
318 }
319
320 void update_snaps(
321 const hobject_t &hoid, ///< [in] object for snaps
322 const set<snapid_t> &old_snaps,///< [in] old snaps value
323 const set<snapid_t> &new_snaps ///< [in] new snaps value
324 ) {
325 auto &op = get_object_op(hoid);
326 assert(!op.updated_snaps);
327 assert(op.buffer_updates.empty());
328 assert(!op.truncate);
329 op.updated_snaps = make_pair(
330 old_snaps,
331 new_snaps);
332 }
333
334 /// Clears, truncates
335 void omap_clear(
336 const hobject_t &hoid ///< [in] object to clear omap
337 ) {
338 auto &op = get_object_op_for_modify(hoid);
339 op.clear_omap = true;
340 op.omap_updates.clear();
341 op.omap_header = boost::none;
342 }
343 void truncate(
344 const hobject_t &hoid, ///< [in] object
345 uint64_t off ///< [in] offset to truncate to
346 ) {
347 auto &op = get_object_op_for_modify(hoid);
348 assert(!op.updated_snaps);
349 op.buffer_updates.erase(
350 off,
351 std::numeric_limits<uint64_t>::max() - off);
352 if (!op.truncate || off < op.truncate->first) {
353 op.truncate = std::pair<uint64_t, uint64_t>(off, off);
354 } else {
355 op.truncate->second = off;
356 }
357 }
358
359 /// Attr ops
360 void setattrs(
361 const hobject_t &hoid, ///< [in] object to write
362 map<string, bufferlist> &attrs ///< [in] attrs, may be cleared
363 ) {
364 auto &op = get_object_op_for_modify(hoid);
365 for (auto &&i: attrs) {
366 auto& d = op.attr_updates[i.first];
367 d = i.second;
368 d->rebuild();
369 }
370 }
371 void setattr(
372 const hobject_t &hoid, ///< [in] object to write
373 const string &attrname, ///< [in] attr to write
374 bufferlist &bl ///< [in] val to write, may be claimed
375 ) {
376 auto &op = get_object_op_for_modify(hoid);
377 auto& d = op.attr_updates[attrname];
378 d = bl;
379 d->rebuild();
380 }
381 void rmattr(
382 const hobject_t &hoid, ///< [in] object to write
383 const string &attrname ///< [in] attr to remove
384 ) {
385 auto &op = get_object_op_for_modify(hoid);
386 op.attr_updates[attrname] = boost::none;
387 }
388
389 /// set alloc hint
390 void set_alloc_hint(
391 const hobject_t &hoid, ///< [in] object (must exist)
392 uint64_t expected_object_size, ///< [in]
393 uint64_t expected_write_size,
394 uint32_t flags
395 ) {
396 auto &op = get_object_op_for_modify(hoid);
397 op.alloc_hint = ObjectOperation::alloc_hint_t{
398 expected_object_size, expected_write_size, flags};
399 }
400
401 /// Buffer updates
402 void write(
403 const hobject_t &hoid, ///< [in] object to write
404 uint64_t off, ///< [in] off at which to write
405 uint64_t len, ///< [in] len to write from bl
406 bufferlist &bl, ///< [in] bl to write will be claimed to len
407 uint32_t fadvise_flags = 0 ///< [in] fadvise hint
408 ) {
409 auto &op = get_object_op_for_modify(hoid);
410 assert(!op.updated_snaps);
411 assert(len > 0);
412 assert(len == bl.length());
413 op.buffer_updates.insert(
414 off,
415 len,
416 ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
417 }
418 void clone_range(
419 const hobject_t &from, ///< [in] from
420 const hobject_t &to, ///< [in] to
421 uint64_t fromoff, ///< [in] offset
422 uint64_t len, ///< [in] len
423 uint64_t tooff ///< [in] offset
424 ) {
425 auto &op = get_object_op_for_modify(to);
426 assert(!op.updated_snaps);
427 op.buffer_updates.insert(
428 tooff,
429 len,
430 ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
431 }
432 void zero(
433 const hobject_t &hoid, ///< [in] object
434 uint64_t off, ///< [in] offset to start zeroing at
435 uint64_t len ///< [in] amount to zero
436 ) {
437 auto &op = get_object_op_for_modify(hoid);
438 assert(!op.updated_snaps);
439 op.buffer_updates.insert(
440 off,
441 len,
442 ObjectOperation::BufferUpdate::Zero{len});
443 }
444
445 /// Omap updates
446 void omap_setkeys(
447 const hobject_t &hoid, ///< [in] object to write
448 bufferlist &keys_bl ///< [in] encoded map<string, bufferlist>
449 ) {
450 auto &op = get_object_op_for_modify(hoid);
451 op.omap_updates.emplace_back(
452 make_pair(
453 ObjectOperation::OmapUpdateType::Insert,
454 keys_bl));
455 }
456 void omap_setkeys(
457 const hobject_t &hoid, ///< [in] object to write
458 map<string, bufferlist> &keys ///< [in] omap keys, may be cleared
459 ) {
460 bufferlist bl;
461 ::encode(keys, bl);
462 omap_setkeys(hoid, bl);
463 }
464 void omap_rmkeys(
465 const hobject_t &hoid, ///< [in] object to write
466 bufferlist &keys_bl ///< [in] encode set<string>
467 ) {
468 auto &op = get_object_op_for_modify(hoid);
469 op.omap_updates.emplace_back(
470 make_pair(
471 ObjectOperation::OmapUpdateType::Remove,
472 keys_bl));
473 }
474 void omap_rmkeys(
475 const hobject_t &hoid, ///< [in] object to write
476 set<string> &keys ///< [in] omap keys, may be cleared
477 ) {
478 bufferlist bl;
479 ::encode(keys, bl);
480 omap_rmkeys(hoid, bl);
481 }
482 void omap_setheader(
483 const hobject_t &hoid, ///< [in] object to write
484 bufferlist &header ///< [in] header
485 ) {
486 auto &op = get_object_op_for_modify(hoid);
487 op.omap_header = header;
488 }
489
490 bool empty() const {
491 return op_map.empty();
492 }
493
494 uint64_t get_bytes_written() const {
495 uint64_t ret = 0;
496 for (auto &&i: op_map) {
497 for (auto &&j: i.second.buffer_updates) {
498 ret += j.get_len();
499 }
500 }
501 return ret;
502 }
503
504 void nop(
505 const hobject_t &hoid ///< [in] obj to which we are doing nothing
506 ) {
507 get_object_op_for_modify(hoid);
508 }
509
510 /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
511 * sinks are always called before clone sources
512 *
513 * TODO: add a fast path for the single object case and possibly the single
514 * object clone from source case (make_writeable made a clone).
515 *
516 * This structure only requires that the source->sink graph be acyclic.
517 * This is much more general than is actually required by PrimaryLogPG.
518 * Only 4 flavors of multi-object transactions actually happen:
519 * 1) rename temp -> object for copyfrom
520 * 2) clone head -> clone, modify head for make_writeable on normal head write
521 * 3) clone clone -> head for rollback
522 * 4) 2 + 3
523 *
524 * We can bypass the below logic for single object transactions trivially
525 * (including case 1 above since temp doesn't show up again).
526 * For 2-3, we could add something ad-hoc to ensure that they happen in the
527 * right order, but it actually seems easier to just do the graph construction.
528 */
529 template <typename T>
530 void safe_create_traverse(T &&t) {
531 map<hobject_t, list<hobject_t>> dgraph;
532 list<hobject_t> stack;
533
534 // Populate stack with roots, dgraph with edges
535 for (auto &&opair: op_map) {
536 hobject_t source;
537 if (opair.second.has_source(&source)) {
538 auto &l = dgraph[source];
539 if (l.empty() && !op_map.count(source)) {
540 /* Source oids not in op_map need to be added as roots
541 * (but only once!) */
542 stack.push_back(source);
543 }
544 l.push_back(opair.first);
545 } else {
546 stack.push_back(opair.first);
547 }
548 }
549
550 /* Why don't we need to worry about accessing the same node
551 * twice? dgraph nodes always have in-degree at most 1 because
552 * the inverse graph nodes (source->dest) can have out-degree
553 * at most 1 (only one possible source). We do a post-order
554 * depth-first traversal here to ensure we call f on children
555 * before parents.
556 */
557 while (!stack.empty()) {
558 hobject_t &cur = stack.front();
559 auto diter = dgraph.find(cur);
560 if (diter == dgraph.end()) {
561 /* Leaf: pop and call t() */
562 auto opiter = op_map.find(cur);
563 if (opiter != op_map.end())
564 t(*opiter);
565 stack.pop_front();
566 } else {
567 /* Internal node: push children onto stack, remove edge,
568 * recurse. When this node is encountered again, it'll
569 * be a leaf */
570 assert(!diter->second.empty());
571 stack.splice(stack.begin(), diter->second);
572 dgraph.erase(diter);
573 }
574 }
575 }
576 };
577 using PGTransactionUPtr = std::unique_ptr<PGTransaction>;
578
579 #endif