]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGTransaction.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / osd / PGTransaction.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14 #ifndef PGTRANSACTION_H
15 #define PGTRANSACTION_H
16
#include <cstdint>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <boost/optional.hpp>

#include "common/hobject.h"
#include "osd/osd_types.h"
#include "osd/osd_internal_types.h"
#include "common/interval_map.h"
#include "common/inline_variant.h"
26
27 /**
28 * This class represents transactions which can be submitted to
29 * a PGBackend. For expediency, there are some constraints on
30 * the operations submitted:
31 * 1) Rename sources may only be referenced prior to the rename
32 * operation to the destination.
33 * 2) The graph formed by edges of source->destination for clones
34 * (Create) and Renames must be acyclic.
35 * 3) clone_range sources must not be modified by the same
36 * transaction
37 */
38 class PGTransaction {
39 public:
40 map<hobject_t, ObjectContextRef> obc_map;
41
42 class ObjectOperation {
43 public:
44 struct Init
45 {
46 struct None {};
47 struct Create {};
48 struct Clone {
49 hobject_t source;
50 };
51 struct Rename {
52 hobject_t source; // must be temp object
53 };
54 };
55 using InitType = boost::variant<
56 Init::None,
57 Init::Create,
58 Init::Clone,
59 Init::Rename>;
60
61 InitType init_type = Init::None();
62 bool delete_first = false;
63
64 /**
65 * is_none() && is_delete() indicates that we are deleting an
66 * object which already exists and not recreating it. delete_first means
67 * that the transaction logically removes the object.
68
69 * There are really 4 cases:
70
71 * 1) We are modifying an existing object (is_none() &&
72 * !is_delete())
73 * a) If it's an append, we just write into the log entry the old size
74 * b) If it's an actual overwrite, we save the old versions of the
75 * extents being overwritten and write those offsets into the log
76 * entry
77 * 2) We are removing and then recreating an object (!is_none() && is_delete())
78 * -- stash
79 * 3) We are removing an object (is_none() && is_delete()) -- stash
80 * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
81 * stash)
82 *
83 * Create, Clone, Rename are the three ways we can recreate it.
84 * ECBackend transaction planning needs this context
85 * to figure out how to perform the transaction.
86 */
87 bool deletes_first() const {
88 return delete_first;
89 }
90 bool is_delete() const {
91 return boost::get<Init::None>(&init_type) != nullptr && delete_first;
92 }
93 bool is_none() const {
94 return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
95 }
96 bool is_fresh_object() const {
97 return boost::get<Init::None>(&init_type) == nullptr;
98 }
99 bool is_rename() const {
100 return boost::get<Init::Rename>(&init_type) != nullptr;
101 }
102 bool has_source(hobject_t *source = nullptr) const {
103 return match(
104 init_type,
105 [&](const Init::Clone &op) -> bool {
106 if (source)
107 *source = op.source;
108 return true;
109 },
110 [&](const Init::Rename &op) -> bool {
111 if (source)
112 *source = op.source;
113 return true;
114 },
115 [&](const Init::None &) -> bool { return false; },
116 [&](const Init::Create &) -> bool { return false; });
117 }
118
119 bool clear_omap = false;
120
121 /**
122 * truncate
123 * <lowest, last> ?
124 *
125 * truncate is represented as a pair because in the event of
126 * multiple truncates within a single transaction we need to
127 * remember the lowest truncate and the final object size
128 * (the last truncate). We also adjust the buffers map
129 * to account for truncates overriding previous writes */
130 boost::optional<pair<uint64_t, uint64_t> > truncate = boost::none;
131
132 std::map<string, boost::optional<bufferlist> > attr_updates;
133
134 enum class OmapUpdateType {Remove, Insert};
135 std::vector<std::pair<OmapUpdateType, bufferlist> > omap_updates;
136
137 boost::optional<bufferlist> omap_header;
138
139 /// (old, new) -- only valid with no truncate or buffer updates
140 boost::optional<pair<set<snapid_t>, set<snapid_t> > > updated_snaps;
141
142 struct alloc_hint_t {
143 uint64_t expected_object_size;
144 uint64_t expected_write_size;
145 uint32_t flags;
146 };
147 boost::optional<alloc_hint_t> alloc_hint;
148
149 struct BufferUpdate {
150 struct Write {
151 bufferlist buffer;
152 uint32_t fadvise_flags;
153 };
154 struct Zero {
155 uint64_t len;
156 };
157 struct CloneRange {
158 hobject_t from;
159 uint64_t offset;
160 uint64_t len;
161 };
162 };
163 using BufferUpdateType = boost::variant<
164 BufferUpdate::Write,
165 BufferUpdate::Zero,
166 BufferUpdate::CloneRange>;
167
168 private:
169 struct SplitMerger {
170 BufferUpdateType split(
171 uint64_t offset,
172 uint64_t len,
173 const BufferUpdateType &bu) const {
174 return match(
175 bu,
176 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
177 bufferlist bl;
178 bl.substr_of(w.buffer, offset, len);
179 return BufferUpdate::Write{bl, w.fadvise_flags};
180 },
181 [&](const BufferUpdate::Zero &) -> BufferUpdateType {
182 return BufferUpdate::Zero{len};
183 },
184 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
185 return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
186 });
187 }
188 uint64_t length(
189 const BufferUpdateType &left) const {
190 return match(
191 left,
192 [&](const BufferUpdate::Write &w) -> uint64_t {
193 return w.buffer.length();
194 },
195 [&](const BufferUpdate::Zero &z) -> uint64_t {
196 return z.len;
197 },
198 [&](const BufferUpdate::CloneRange &c) -> uint64_t {
199 return c.len;
200 });
201 }
202 bool can_merge(
203 const BufferUpdateType &left,
204 const BufferUpdateType &right) const {
205 return match(
206 left,
207 [&](const BufferUpdate::Write &w) -> bool {
208 auto r = boost::get<BufferUpdate::Write>(&right);
209 return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
210 },
211 [&](const BufferUpdate::Zero &) -> bool {
212 auto r = boost::get<BufferUpdate::Zero>(&right);
213 return r != nullptr;
214 },
215 [&](const BufferUpdate::CloneRange &c) -> bool {
216 return false;
217 });
218 }
219 BufferUpdateType merge(
220 BufferUpdateType &&left,
221 BufferUpdateType &&right) const {
222 return match(
223 left,
224 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
225 auto r = boost::get<BufferUpdate::Write>(&right);
226 assert(r && w.fadvise_flags == r->fadvise_flags);
227 bufferlist bl = w.buffer;
228 bl.append(r->buffer);
229 return BufferUpdate::Write{bl, w.fadvise_flags};
230 },
231 [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
232 auto r = boost::get<BufferUpdate::Zero>(&right);
233 assert(r);
234 return BufferUpdate::Zero{z.len + r->len};
235 },
236 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
237 assert(0 == "violates can_merge condition");
238 return left;
239 });
240 }
241 };
242 public:
243 using buffer_update_type = interval_map<
244 uint64_t, BufferUpdateType, SplitMerger>;
245 buffer_update_type buffer_updates;
246
247 friend class PGTransaction;
248 };
249 map<hobject_t, ObjectOperation> op_map;
250 private:
251 ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
252 auto &op = op_map[hoid];
253 assert(!op.is_delete());
254 return op;
255 }
256 ObjectOperation &get_object_op(const hobject_t &hoid) {
257 return op_map[hoid];
258 }
259 public:
260 void add_obc(
261 ObjectContextRef obc) {
262 assert(obc);
263 obc_map[obc->obs.oi.soid] = obc;
264 }
265 /// Sets up state for new object
266 void create(
267 const hobject_t &hoid
268 ) {
269 auto &op = op_map[hoid];
270 assert(op.is_none() || op.is_delete());
271 op.init_type = ObjectOperation::Init::Create();
272 }
273
274 /// Sets up state for target cloned from source
275 void clone(
276 const hobject_t &target, ///< [in] obj to clone to
277 const hobject_t &source ///< [in] obj to clone from
278 ) {
279 auto &op = op_map[target];
280 assert(op.is_none() || op.is_delete());
281 op.init_type = ObjectOperation::Init::Clone{source};
282 }
283
284 /// Sets up state for target renamed from source
285 void rename(
286 const hobject_t &target, ///< [in] source (must be a temp object)
287 const hobject_t &source ///< [in] to, must not exist, be non-temp
288 ) {
289 assert(source.is_temp());
290 assert(!target.is_temp());
291 auto &op = op_map[target];
292 assert(op.is_none() || op.is_delete());
293
294 bool del_first = op.is_delete();
295 auto iter = op_map.find(source);
296 if (iter != op_map.end()) {
297 op = iter->second;
298 op_map.erase(iter);
299 op.delete_first = del_first;
300 }
301
302 op.init_type = ObjectOperation::Init::Rename{source};
303 }
304
305 /// Remove -- must not be called on rename target
306 void remove(
307 const hobject_t &hoid ///< [in] obj to remove
308 ) {
309 auto &op = get_object_op_for_modify(hoid);
310 if (!op.is_fresh_object()) {
311 assert(!op.updated_snaps);
312 op = ObjectOperation();
313 op.delete_first = true;
314 } else {
315 assert(!op.is_rename());
316 op_map.erase(hoid); // make it a noop if it's a fresh object
317 }
318 }
319
320 void update_snaps(
321 const hobject_t &hoid, ///< [in] object for snaps
322 const set<snapid_t> &old_snaps,///< [in] old snaps value
323 const set<snapid_t> &new_snaps ///< [in] new snaps value
324 ) {
325 auto &op = get_object_op(hoid);
326 assert(!op.updated_snaps);
327 assert(op.buffer_updates.empty());
328 assert(!op.truncate);
329 op.updated_snaps = make_pair(
330 old_snaps,
331 new_snaps);
332 }
333
334 /// Clears, truncates
335 void omap_clear(
336 const hobject_t &hoid ///< [in] object to clear omap
337 ) {
338 auto &op = get_object_op_for_modify(hoid);
339 op.clear_omap = true;
340 op.omap_updates.clear();
341 op.omap_header = boost::none;
342 }
343 void truncate(
344 const hobject_t &hoid, ///< [in] object
345 uint64_t off ///< [in] offset to truncate to
346 ) {
347 auto &op = get_object_op_for_modify(hoid);
348 assert(!op.updated_snaps);
349 op.buffer_updates.erase(
350 off,
351 std::numeric_limits<uint64_t>::max() - off);
352 if (!op.truncate || off < op.truncate->first) {
353 op.truncate = std::pair<uint64_t, uint64_t>(off, off);
354 } else {
355 op.truncate->second = off;
356 }
357 }
358
359 /// Attr ops
360 void setattrs(
361 const hobject_t &hoid, ///< [in] object to write
362 map<string, bufferlist> &attrs ///< [in] attrs, may be cleared
363 ) {
364 auto &op = get_object_op_for_modify(hoid);
365 for (auto &&i: attrs) {
366 op.attr_updates[i.first] = i.second;
367 }
368 }
369 void setattr(
370 const hobject_t &hoid, ///< [in] object to write
371 const string &attrname, ///< [in] attr to write
372 bufferlist &bl ///< [in] val to write, may be claimed
373 ) {
374 auto &op = get_object_op_for_modify(hoid);
375 op.attr_updates[attrname] = bl;
376 }
377 void rmattr(
378 const hobject_t &hoid, ///< [in] object to write
379 const string &attrname ///< [in] attr to remove
380 ) {
381 auto &op = get_object_op_for_modify(hoid);
382 op.attr_updates[attrname] = boost::none;
383 }
384
385 /// set alloc hint
386 void set_alloc_hint(
387 const hobject_t &hoid, ///< [in] object (must exist)
388 uint64_t expected_object_size, ///< [in]
389 uint64_t expected_write_size,
390 uint32_t flags
391 ) {
392 auto &op = get_object_op_for_modify(hoid);
393 op.alloc_hint = ObjectOperation::alloc_hint_t{
394 expected_object_size, expected_write_size, flags};
395 }
396
397 /// Buffer updates
398 void write(
399 const hobject_t &hoid, ///< [in] object to write
400 uint64_t off, ///< [in] off at which to write
401 uint64_t len, ///< [in] len to write from bl
402 bufferlist &bl, ///< [in] bl to write will be claimed to len
403 uint32_t fadvise_flags = 0 ///< [in] fadvise hint
404 ) {
405 auto &op = get_object_op_for_modify(hoid);
406 assert(!op.updated_snaps);
407 assert(len > 0);
408 assert(len == bl.length());
409 op.buffer_updates.insert(
410 off,
411 len,
412 ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
413 }
414 void clone_range(
415 const hobject_t &from, ///< [in] from
416 const hobject_t &to, ///< [in] to
417 uint64_t fromoff, ///< [in] offset
418 uint64_t len, ///< [in] len
419 uint64_t tooff ///< [in] offset
420 ) {
421 auto &op = get_object_op_for_modify(to);
422 assert(!op.updated_snaps);
423 op.buffer_updates.insert(
424 tooff,
425 len,
426 ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
427 }
428 void zero(
429 const hobject_t &hoid, ///< [in] object
430 uint64_t off, ///< [in] offset to start zeroing at
431 uint64_t len ///< [in] amount to zero
432 ) {
433 auto &op = get_object_op_for_modify(hoid);
434 assert(!op.updated_snaps);
435 op.buffer_updates.insert(
436 off,
437 len,
438 ObjectOperation::BufferUpdate::Zero{len});
439 }
440
441 /// Omap updates
442 void omap_setkeys(
443 const hobject_t &hoid, ///< [in] object to write
444 bufferlist &keys_bl ///< [in] encoded map<string, bufferlist>
445 ) {
446 auto &op = get_object_op_for_modify(hoid);
447 op.omap_updates.emplace_back(
448 make_pair(
449 ObjectOperation::OmapUpdateType::Insert,
450 keys_bl));
451 }
452 void omap_setkeys(
453 const hobject_t &hoid, ///< [in] object to write
454 map<string, bufferlist> &keys ///< [in] omap keys, may be cleared
455 ) {
456 bufferlist bl;
457 ::encode(keys, bl);
458 omap_setkeys(hoid, bl);
459 }
460 void omap_rmkeys(
461 const hobject_t &hoid, ///< [in] object to write
462 bufferlist &keys_bl ///< [in] encode set<string>
463 ) {
464 auto &op = get_object_op_for_modify(hoid);
465 op.omap_updates.emplace_back(
466 make_pair(
467 ObjectOperation::OmapUpdateType::Remove,
468 keys_bl));
469 }
470 void omap_rmkeys(
471 const hobject_t &hoid, ///< [in] object to write
472 set<string> &keys ///< [in] omap keys, may be cleared
473 ) {
474 bufferlist bl;
475 ::encode(keys, bl);
476 omap_rmkeys(hoid, bl);
477 }
478 void omap_setheader(
479 const hobject_t &hoid, ///< [in] object to write
480 bufferlist &header ///< [in] header
481 ) {
482 auto &op = get_object_op_for_modify(hoid);
483 op.omap_header = header;
484 }
485
486 bool empty() const {
487 return op_map.empty();
488 }
489
490 uint64_t get_bytes_written() const {
491 uint64_t ret = 0;
492 for (auto &&i: op_map) {
493 for (auto &&j: i.second.buffer_updates) {
494 ret += j.get_len();
495 }
496 }
497 return ret;
498 }
499
500 void nop(
501 const hobject_t &hoid ///< [in] obj to which we are doing nothing
502 ) {
503 get_object_op_for_modify(hoid);
504 }
505
506 /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
507 * sinks are always called before clone sources
508 *
509 * TODO: add a fast path for the single object case and possibly the single
510 * object clone from source case (make_writeable made a clone).
511 *
512 * This structure only requires that the source->sink graph be acyclic.
513 * This is much more general than is actually required by PrimaryLogPG.
514 * Only 4 flavors of multi-object transactions actually happen:
515 * 1) rename temp -> object for copyfrom
516 * 2) clone head -> clone, modify head for make_writeable on normal head write
517 * 3) clone clone -> head for rollback
518 * 4) 2 + 3
519 *
520 * We can bypass the below logic for single object transactions trivially
521 * (including case 1 above since temp doesn't show up again).
522 * For 2-3, we could add something ad-hoc to ensure that they happen in the
523 * right order, but it actually seems easier to just do the graph construction.
524 */
525 template <typename T>
526 void safe_create_traverse(T &&t) {
527 map<hobject_t, list<hobject_t>> dgraph;
528 list<hobject_t> stack;
529
530 // Populate stack with roots, dgraph with edges
531 for (auto &&opair: op_map) {
532 hobject_t source;
533 if (opair.second.has_source(&source)) {
534 auto &l = dgraph[source];
535 if (l.empty() && !op_map.count(source)) {
536 /* Source oids not in op_map need to be added as roots
537 * (but only once!) */
538 stack.push_back(source);
539 }
540 l.push_back(opair.first);
541 } else {
542 stack.push_back(opair.first);
543 }
544 }
545
546 /* Why don't we need to worry about accessing the same node
547 * twice? dgraph nodes always have in-degree at most 1 because
548 * the inverse graph nodes (source->dest) can have out-degree
549 * at most 1 (only one possible source). We do a post-order
550 * depth-first traversal here to ensure we call f on children
551 * before parents.
552 */
553 while (!stack.empty()) {
554 hobject_t &cur = stack.front();
555 auto diter = dgraph.find(cur);
556 if (diter == dgraph.end()) {
557 /* Leaf: pop and call t() */
558 auto opiter = op_map.find(cur);
559 if (opiter != op_map.end())
560 t(*opiter);
561 stack.pop_front();
562 } else {
563 /* Internal node: push children onto stack, remove edge,
564 * recurse. When this node is encountered again, it'll
565 * be a leaf */
566 assert(!diter->second.empty());
567 stack.splice(stack.begin(), diter->second);
568 dgraph.erase(diter);
569 }
570 }
571 }
572 };
/// Uniquely-owning pointer to a PGTransaction
using PGTransactionUPtr = std::unique_ptr<PGTransaction>;
574
575 #endif