// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2016 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef EXTENT_CACHE_H
#define EXTENT_CACHE_H

#include <map>
#include <list>
#include <vector>
#include <utility>
#include <optional>
#include <memory>  // for std::unique_ptr in release_pin below
#include <iosfwd>  // for std::ostream in print/operator<<
#include <boost/intrusive/set.hpp>
#include <boost/intrusive/list.hpp>
#include "include/interval_set.h"
#include "common/interval_map.h"
#include "include/buffer.h"
#include "common/hobject.h"
/**
 ExtentCache

 The main purpose of this cache is to ensure that we can pipeline
 overlapping partial overwrites.

 To that end we need to ensure that an extent pinned for an operation is
 live until that operation completes. However, a particular extent
 might be pinned by multiple operations (several pipelined writes
 on the same object).

 1) When we complete an operation, we only look at extents owned only
    by that operation.
 2) Per-extent overhead is fixed size.
 3) Per-operation metadata is fixed size.

 This is simple enough to realize with two main structures:
 - extent: contains a pointer to the pin owning it and intrusive list
   pointers to other extents owned by the same pin
 - pin_state: contains the list head for extents owned by it

 This works as long as we only need to remember one "owner" for
 each extent. To make this work, we'll need to leverage some
 invariants guaranteed by higher layers:

 1) Writes on a particular object must be ordered
 2) A particular object will have outstanding reads or writes, but not
    both (note that you can have a read while a write is committed, but
    not applied).

 Our strategy therefore will be to have whichever in-progress op will
 finish "last" be the owner of a particular extent. For now, we won't
 cache reads, so 2) simply means that we can assume that reads and
 recovery operations imply no unstable extents on the object in
 question.

 Write: WaitRead -> WaitCommit -> Complete

 Invariant 1) above actually indicates that we can't have writes
 bypassing the WaitRead state while there are writes waiting on
 reads. Thus, the set of operations pinning a particular extent
 must always complete in order of arrival.

 This suggests that a particular extent may be in only the following
 states:

 0) Empty (not in the map at all)
 1) Write Pending N
    - Some write with reqid <= N is currently fetching the data for
      this extent
    - The extent must persist until Write reqid N completes
    - All ops pinning this extent are writes in the WaitRead state of
      the Write pipeline (there must be an in-progress write, so no
      reads can be in progress).
 2) Write Pinned N:
    - This extent has data corresponding to some reqid M <= N
    - The extent must persist until Write reqid N commits
    - All ops pinning this extent are writes in some Write
      state (all are possible). Reads are not possible
      in this state (or the others) due to 2).

 All of the above suggests that there are 3 things users can
 ask of the cache corresponding to the 3 Write pipeline
 states.
*/
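
/*
 Illustrative usage sketch (editorial addition, not part of the original
 header). It assumes a hypothetical caller holding an object `oid`, extent
 sets `to_write`/`to_read` (to_read must be a subset of to_write), a read
 path `do_read()`, and the written buffers in an extent_map `write_map`;
 the three calls below correspond to the three Write pipeline states:

   ExtentCache cache;
   ExtentCache::write_pin pin;
   cache.open_write_pin(pin);                        // tag this IO

   // WaitRead: pin to_write, learn what the cache cannot supply
   extent_set must_read =
     cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);
   extent_map read_data = do_read(oid, must_read);   // hypothetical read

   // Fetch the rest of to_read (to_read \ must_read) from the cache
   extent_set rest = to_read;
   rest.subtract(must_read);
   extent_map cached = cache.get_remaining_extents_for_rmw(oid, pin, rest);

   // WaitCommit: present the buffers this write produced
   cache.present_rmw_update(oid, pin, write_map);

   // Complete: once the write commits, drop the pin
   cache.release_write_pin(pin);
*/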

/// If someone wants these types, but not ExtentCache, move to another file
struct bl_split_merge {
  bufferlist split(
    uint64_t offset,
    uint64_t length,
    bufferlist &bl) const {
    bufferlist out;
    out.substr_of(bl, offset, length);
    return out;
  }
  bool can_merge(const bufferlist &left, const bufferlist &right) const {
    return true;
  }
  bufferlist merge(bufferlist &&left, bufferlist &&right) const {
    bufferlist bl;
    bl.claim(left);
    bl.claim_append(right);
    return bl;
  }
  uint64_t length(const bufferlist &b) const { return b.length(); }
};
using extent_set = interval_set<uint64_t>;
using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>;

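// Illustrative sketch (editorial addition, not part of the original
// header): with bl_split_merge, extent_map keeps byte-addressed buffers
// coalesced. can_merge() always returns true, so adjacent insertions merge
// into a single interval, and extracting a subrange uses split() to carve
// out just the requested bytes. Assuming interval_map's
// insert(offset, length, value):
//
//   extent_map m;
//   bufferlist a, b;
//   a.append(std::string(4, 'a'));
//   b.append(std::string(4, 'b'));
//   m.insert(0, 4, std::move(a));  // [0, 4)
//   m.insert(4, 4, std::move(b));  // adjacent: merges into one extent [0, 8)
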
class ExtentCache {
  struct object_extent_set;
  struct pin_state;
private:

  struct extent {
    object_extent_set *parent_extent_set = nullptr;
    pin_state *parent_pin_state = nullptr;
    boost::intrusive::set_member_hook<> extent_set_member;
    boost::intrusive::list_member_hook<> pin_list_member;

    uint64_t offset;
    uint64_t length;
    std::optional<bufferlist> bl;

    uint64_t get_length() const {
      return length;
    }

    bool is_pending() const {
      return bl == std::nullopt;
    }

    bool pinned_by_write() const {
      ceph_assert(parent_pin_state);
      return parent_pin_state->is_write();
    }

    uint64_t pin_tid() const {
      ceph_assert(parent_pin_state);
      return parent_pin_state->tid;
    }

    extent(uint64_t offset, bufferlist _bl)
      : offset(offset), length(_bl.length()), bl(_bl) {}

    extent(uint64_t offset, uint64_t length)
      : offset(offset), length(length) {}

    bool operator<(const extent &rhs) const {
      return offset < rhs.offset;
    }
  private:
    // can briefly violate the two link invariant, used in unlink() and move()
    void _link_pin_state(pin_state &pin_state);
    void _unlink_pin_state();
  public:
    void unlink();
    void link(object_extent_set &parent_extent_set, pin_state &pin_state);
    void move(pin_state &to);
  };
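
  // Note (editorial addition, not in the original header): link() attaches
  // an extent to both its object_extent_set and a pin_state; move() relinks
  // it to a different pin. Per the strategy described above, an extent
  // always ends up owned by whichever in-progress write finishes last.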

  struct object_extent_set : boost::intrusive::set_base_hook<> {
    hobject_t oid;
    explicit object_extent_set(const hobject_t &oid) : oid(oid) {}

    using set_member_options = boost::intrusive::member_hook<
      extent,
      boost::intrusive::set_member_hook<>,
      &extent::extent_set_member>;
    using set = boost::intrusive::set<extent, set_member_options>;
    set extent_set;

    bool operator<(const object_extent_set &rhs) const {
      return oid < rhs.oid;
    }

    struct uint_cmp {
      bool operator()(uint64_t lhs, const extent &rhs) const {
        return lhs < rhs.offset;
      }
      bool operator()(const extent &lhs, uint64_t rhs) const {
        return lhs.offset < rhs;
      }
    };
    std::pair<set::iterator, set::iterator> get_containing_range(
      uint64_t offset, uint64_t length);

    void erase(uint64_t offset, uint64_t length);

    struct update_action {
      enum type {
        NONE,
        UPDATE_PIN
      };
      type action = NONE;
      std::optional<bufferlist> bl;
    };
    template <typename F>
    void traverse_update(
      pin_state &pin,
      uint64_t offset,
      uint64_t length,
      F &&f) {
      auto range = get_containing_range(offset, length);

      // Leading gap: nothing cached covers [offset, first extent)
      if (range.first == range.second || range.first->offset > offset) {
        uint64_t extlen = range.first == range.second ?
          length : range.first->offset - offset;

        update_action action;
        f(offset, extlen, nullptr, &action);
        ceph_assert(!action.bl || action.bl->length() == extlen);
        if (action.action == update_action::UPDATE_PIN) {
          extent *ext = action.bl ?
            new extent(offset, *action.bl) :
            new extent(offset, extlen);
          ext->link(*this, pin);
        } else {
          ceph_assert(!action.bl);
        }
      }

      // Walk every cached extent overlapping [offset, offset + length)
      for (auto p = range.first; p != range.second;) {
        extent *ext = &*p;
        ++p;

        // Clip the extent to the requested range
        uint64_t extoff = std::max(ext->offset, offset);
        uint64_t extlen = std::min(
          ext->length - (extoff - ext->offset),
          offset + length - extoff);

        update_action action;
        f(extoff, extlen, ext, &action);
        ceph_assert(!action.bl || action.bl->length() == extlen);
        extent *final_extent = nullptr;
        if (action.action == update_action::NONE) {
          final_extent = ext;
        } else {
          // Re-pin: any head/tail outside the range is split off and
          // stays with the old pin; the clipped middle moves to pin
          pin_state *ps = ext->parent_pin_state;
          ext->unlink();
          if ((ext->offset < offset) &&
              (ext->offset + ext->get_length() > offset)) {
            extent *head = nullptr;
            if (ext->bl) {
              bufferlist bl;
              bl.substr_of(
                *(ext->bl),
                0,
                offset - ext->offset);
              head = new extent(ext->offset, bl);
            } else {
              head = new extent(
                ext->offset, offset - ext->offset);
            }
            head->link(*this, *ps);
          }
          if ((ext->offset + ext->length > offset + length) &&
              (offset + length > ext->offset)) {
            uint64_t nlen =
              (ext->offset + ext->get_length()) - (offset + length);
            extent *tail = nullptr;
            if (ext->bl) {
              bufferlist bl;
              bl.substr_of(
                *(ext->bl),
                ext->get_length() - nlen,
                nlen);
              tail = new extent(offset + length, bl);
            } else {
              tail = new extent(offset + length, nlen);
            }
            tail->link(*this, *ps);
          }
          if (action.action == update_action::UPDATE_PIN) {
            if (ext->bl) {
              bufferlist bl;
              bl.substr_of(
                *(ext->bl),
                extoff - ext->offset,
                extlen);
              final_extent = new ExtentCache::extent(
                extoff,
                bl);
            } else {
              final_extent = new ExtentCache::extent(
                extoff, extlen);
            }
            final_extent->link(*this, pin);
          }
          delete ext;
        }

        if (action.bl) {
          ceph_assert(final_extent);
          ceph_assert(final_extent->length == action.bl->length());
          final_extent->bl = *(action.bl);
        }

        // Gap between this extent and the next one (or the range end)
        uint64_t next_off = p == range.second ?
          offset + length : p->offset;
        if (extoff + extlen < next_off) {
          uint64_t tailoff = extoff + extlen;
          uint64_t taillen = next_off - tailoff;

          update_action action;
          f(tailoff, taillen, nullptr, &action);
          ceph_assert(!action.bl || action.bl->length() == taillen);
          if (action.action == update_action::UPDATE_PIN) {
            extent *ext = action.bl ?
              new extent(tailoff, *action.bl) :
              new extent(tailoff, taillen);
            ext->link(*this, pin);
          } else {
            ceph_assert(!action.bl);
          }
        }
      }
    }
  };
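
  // Illustrative sketch (editorial addition, not part of the original
  // header) of driving traverse_update; the lambda and its policy are
  // hypothetical. The callback runs once per uncached gap (ext == nullptr)
  // and once per overlapping extent within [offset, offset + length):
  //
  //   oes.traverse_update(
  //     pin, offset, length,
  //     [&](uint64_t off, uint64_t len,
  //         extent *ext,
  //         object_extent_set::update_action *action) {
  //       if (!ext) {
  //         // uncached gap: claim it for this pin, data still pending
  //         action->action = object_extent_set::update_action::UPDATE_PIN;
  //       }
  //       // else leave NONE: the extent keeps its current owner
  //     });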
  struct Cmp {
    bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
      return oid < rhs.oid;
    }
    bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
      return lhs.oid < oid;
    }
  };

  object_extent_set &get_or_create(const hobject_t &oid);
  object_extent_set *get_if_exists(const hobject_t &oid);

  void remove_and_destroy_if_empty(object_extent_set &set);
  using cache_set = boost::intrusive::set<object_extent_set>;
  cache_set per_object_caches;

  uint64_t next_write_tid = 1;
  uint64_t next_read_tid = 1;
  struct pin_state {
    uint64_t tid = 0;
    enum pin_type_t {
      NONE,
      WRITE,
    };
    pin_type_t pin_type = NONE;
    bool is_write() const { return pin_type == WRITE; }

    pin_state(const pin_state &other) = delete;
    pin_state &operator=(const pin_state &other) = delete;
    pin_state(pin_state &&other) = delete;
    pin_state() = default;

    using list_member_options = boost::intrusive::member_hook<
      extent,
      boost::intrusive::list_member_hook<>,
      &extent::pin_list_member>;
    using list = boost::intrusive::list<extent, list_member_options>;
    list pin_list;
    ~pin_state() {
      ceph_assert(pin_list.empty());
      ceph_assert(tid == 0);
      ceph_assert(pin_type == NONE);
    }
    void _open(uint64_t in_tid, pin_type_t in_type) {
      ceph_assert(pin_type == NONE);
      ceph_assert(in_tid > 0);
      tid = in_tid;
      pin_type = in_type;
    }
  };

  void release_pin(pin_state &p) {
    for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
      std::unique_ptr<extent> extent(&*iter); // we now own this
      iter++; // unlink will invalidate
      ceph_assert(extent->parent_extent_set);
      auto &eset = *(extent->parent_extent_set);
      extent->unlink();
      remove_and_destroy_if_empty(eset);
    }
    p.tid = 0;
    p.pin_type = pin_state::NONE;
  }

public:
  class write_pin : private pin_state {
    friend class ExtentCache;
  private:
    void open(uint64_t in_tid) {
      _open(in_tid, pin_state::WRITE);
    }
  public:
    write_pin() : pin_state() {}
  };

  void open_write_pin(write_pin &pin) {
    pin.open(next_write_tid++);
  }

  /**
   * Reserves extents required for rmw, and learns
   * which need to be read
   *
   * Pins all extents in to_write. Returns the subset of to_read not
   * currently present in the cache. Caller must obtain those
   * extents before calling get_remaining_extents_for_rmw.
   *
   * Transition table:
   * - Empty -> Write Pending pin.reqid
   * - Write Pending N -> Write Pending pin.reqid
   * - Write Pinned N -> Write Pinned pin.reqid
   *
   * @param oid [in] object undergoing rmw
   * @param pin [in,out] pin to use (opened via open_write_pin)
   * @param to_write [in] extents which will be written
   * @param to_read [in] extents to read prior to write (must be subset
   *        of to_write)
   * @return subset of to_read which isn't already present or pending
   */
  extent_set reserve_extents_for_rmw(
    const hobject_t &oid,
    write_pin &pin,
    const extent_set &to_write,
    const extent_set &to_read);
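
  // Illustrative sketch (editorial addition, not part of the original
  // header): for a hypothetical partial overwrite of bytes [4096, 8192)
  // that requires the surrounding range [0, 16384) to be read and
  // rewritten:
  //
  //   extent_set to_write, to_read;
  //   to_write.insert(0, 16384);  // everything this op will write
  //   to_read.insert(0, 16384);   // must be a subset of to_write
  //   extent_set must_read =
  //     cache.reserve_extents_for_rmw(oid, pin, to_write, to_read);
  //   // must_read: the parts not already present or pending in the cache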

  /**
   * Gets extents required for rmw not returned from
   * reserve_extents_for_rmw
   *
   * Requested extents (to_get) must be the set to_read \ the set
   * returned from reserve_extents_for_rmw. No transition table,
   * all extents at this point must be present and already pinned
   * for this pin by reserve_extents_for_rmw.
   *
   * @param oid [in] object
   * @param pin [in,out] pin associated with this IO
   * @param to_get [in] extents to get (see above for restrictions)
   * @return map of buffers from to_get
   */
  extent_map get_remaining_extents_for_rmw(
    const hobject_t &oid,
    write_pin &pin,
    const extent_set &to_get);

  /**
   * Updates the cache to reflect the rmw write
   *
   * All presented extents must already have been specified in
   * reserve_extents_for_rmw under to_write.
   *
   * Transition table:
   * - Empty -> invalid, must call reserve_extents_for_rmw first
   * - Write Pending N -> Write Pinned N, update buffer
   *   (assert N >= pin.reqid)
   * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
   *
   * @param oid [in] object
   * @param pin [in,out] pin associated with this IO
   * @param extents [in] map of buffers to update
   * @return void
   */
  void present_rmw_update(
    const hobject_t &oid,
    write_pin &pin,
    const extent_map &extents);

  /**
   * Release all buffers pinned by pin
   */
  void release_write_pin(
    write_pin &pin) {
    release_pin(pin);
  }

  std::ostream &print(
    std::ostream &out) const;
};

std::ostream &operator<<(std::ostream &lhs, const ExtentCache &cache);

#endif