1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2016 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef EXTENT_CACHE_H | |
16 | #define EXTENT_CACHE_H | |
17 | ||
18 | #include <map> | |
19 | #include <list> | |
20 | #include <vector> | |
21 | #include <utility> | |
22 | #include <boost/optional.hpp> | |
23 | #include <boost/intrusive/set.hpp> | |
24 | #include <boost/intrusive/list.hpp> | |
25 | #include "include/interval_set.h" | |
26 | #include "common/interval_map.h" | |
27 | #include "include/buffer.h" | |
28 | #include "common/hobject.h" | |
29 | ||
/**
   ExtentCache

   The main purpose of this cache is to ensure that we can pipeline
   overlapping partial overwrites.

   To that end we need to ensure that an extent pinned for an operation is
   live until that operation completes.  However, a particular extent
   might be pinned by multiple operations (several pipelined writes
   on the same object).

   1) When we complete an operation, we only need to look at extents owned
      solely by that operation.
   2) Per-extent overhead is fixed size.
   3) Per-operation metadata is fixed size.

   This is simple enough to realize with two main structures:
   - extent: contains a pointer to the pin owning it and intrusive list
     pointers to other extents owned by the same pin
   - pin_state: contains the list head for extents owned by it

   This works as long as we only need to remember one "owner" for
   each extent.  To make this work, we'll need to leverage some
   invariants guaranteed by higher layers:

   1) Writes on a particular object must be ordered
   2) A particular object will have outstanding reads or writes, but not
      both (note that you can have a read while a write is committed, but
      not applied).

   Our strategy therefore will be to have whichever in-progress op will
   finish "last" be the owner of a particular extent.  For now, we won't
   cache reads, so 2) simply means that we can assume that reads and
   recovery operations imply no unstable extents on the object in
   question.

   Write: WaitRead -> WaitCommit -> Complete

   Invariant 1) above actually indicates that we can't have writes
   bypassing the WaitRead state while there are writes waiting on
   Reads.  Thus, the set of operations pinning a particular extent
   must always complete in order of arrival.

   This suggests that a particular extent may be in only the following
   states:

   0) Empty (not in the map at all)
   1) Write Pending N
      - Some write with reqid <= N is currently fetching the data for
        this extent
      - The extent must persist until Write reqid N completes
      - All ops pinning this extent are writes in the WaitRead state of
        the Write pipeline (there must be an in progress write, so no
        reads can be in progress).
   2) Write Pinned N:
      - This extent has data corresponding to some reqid M <= N
      - The extent must persist until Write reqid N commits
      - All ops pinning this extent are writes in some Write
        state (all are possible).  Reads are not possible
        in this state (or the others) due to 2).

   All of the above suggests that there are 3 things users can
   ask of the cache corresponding to the 3 Write pipeline
   states.
*/
96 | ||
97 | /// If someone wants these types, but not ExtentCache, move to another file | |
98 | struct bl_split_merge { | |
99 | bufferlist split( | |
100 | uint64_t offset, | |
101 | uint64_t length, | |
102 | bufferlist &bl) const { | |
103 | bufferlist out; | |
104 | out.substr_of(bl, offset, length); | |
105 | return out; | |
106 | } | |
107 | bool can_merge(const bufferlist &left, const bufferlist &right) const { | |
108 | return true; | |
109 | } | |
110 | bufferlist merge(bufferlist &&left, bufferlist &&right) const { | |
111 | bufferlist bl; | |
112 | bl.claim(left); | |
113 | bl.claim_append(right); | |
114 | return bl; | |
115 | } | |
116 | uint64_t length(const bufferlist &b) const { return b.length(); } | |
117 | }; | |
118 | using extent_set = interval_set<uint64_t>; | |
119 | using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>; | |
120 | ||
class ExtentCache {
  struct object_extent_set;
  struct pin_state;
private:

  // One contiguous cached (or pending) byte range of a single object.
  // Every live extent is a member of exactly two intrusive containers:
  //  - its object's offset-ordered extent_set (via extent_set_member), and
  //  - the pin_list of the pin_state that owns it (via pin_list_member).
  // The parent_* back-pointers mirror that membership.
  struct extent {
    object_extent_set *parent_extent_set = nullptr;
    pin_state *parent_pin_state = nullptr;
    boost::intrusive::set_member_hook<> extent_set_member;
    boost::intrusive::list_member_hook<> pin_list_member;

    uint64_t offset;
    uint64_t length;
    boost::optional<bufferlist> bl;  // unset while the data is still pending

    uint64_t get_length() const {
      return length;
    }

    // "Write Pending" in the state table above: reserved, data not yet read.
    bool is_pending() const {
      return bl == boost::none;
    }

    bool pinned_by_write() const {
      assert(parent_pin_state);
      return parent_pin_state->is_write();
    }

    // tid of the owning pin; every linked extent has an owner.
    uint64_t pin_tid() const {
      assert(parent_pin_state);
      return parent_pin_state->tid;
    }

    // construct an extent that already carries its data
    extent(uint64_t offset, bufferlist _bl)
      : offset(offset), length(_bl.length()), bl(_bl) {}

    // construct a pending extent (no buffer yet)
    extent(uint64_t offset, uint64_t length)
      : offset(offset), length(length) {}

    // extents within one object never overlap, so offset alone orders them
    bool operator<(const extent &rhs) const {
      return offset < rhs.offset;
    }
  private:
    // can briefly violate the two link invariant, used in unlink() and move()
    void _link_pin_state(pin_state &pin_state);
    void _unlink_pin_state();
  public:
    void unlink();
    void link(object_extent_set &parent_extent_set, pin_state &pin_state);
    void move(pin_state &to);
  };

  // Per-object collection of cached extents, ordered by offset.  Also a node
  // of the cache-wide per_object_caches set (via set_base_hook), keyed by oid.
  struct object_extent_set : boost::intrusive::set_base_hook<> {
    hobject_t oid;
    object_extent_set(const hobject_t &oid) : oid(oid) {}

    using set_member_options = boost::intrusive::member_hook<
      extent,
      boost::intrusive::set_member_hook<>,
      &extent::extent_set_member>;
    using set = boost::intrusive::set<extent, set_member_options>;
    set extent_set;

    bool operator<(const object_extent_set &rhs) const {
      return oid < rhs.oid;
    }

    // heterogeneous comparator: find extents by raw offset without
    // constructing a temporary extent
    struct uint_cmp {
      bool operator()(uint64_t lhs, const extent &rhs) const {
        return lhs < rhs.offset;
      }
      bool operator()(const extent &lhs, uint64_t rhs) const {
        return lhs.offset < rhs;
      }
    };
    /// iterators bounding the extents intersecting [offset, offset+length)
    std::pair<set::iterator, set::iterator> get_containing_range(
      uint64_t offset, uint64_t length);

    void erase(uint64_t offset, uint64_t length);

    // Out-parameter filled in by the traverse_update callback: what to do
    // with the sub-range it was shown, and optionally new data for it.
    struct update_action {
      enum type {
	NONE,        // leave this sub-range as is
	UPDATE_PIN   // (re)pin this sub-range to the traversing pin
      };
      type action = NONE;
      // if set, must be exactly the sub-range's length (asserted below)
      boost::optional<bufferlist> bl;
    };
    // Walk [offset, offset+length) invoking
    //   f(sub_offset, sub_length, extent* /* nullptr for a gap */, &action)
    // once per maximal sub-range that is either a gap or part of a single
    // existing extent, applying the requested action after each call.  An
    // existing extent only partially covered by the range is split: the
    // head/tail pieces outside the range keep their current pin, while the
    // covered piece is re-created and linked to `pin` on UPDATE_PIN.
    template <typename F>
    void traverse_update(
      pin_state &pin,
      uint64_t offset,
      uint64_t length,
      F &&f) {
      auto range = get_containing_range(offset, length);

      // Leading gap before the first intersecting extent (or the whole
      // range, if nothing intersects).
      if (range.first == range.second || range.first->offset > offset) {
	uint64_t extlen = range.first == range.second ?
	  length : range.first->offset - offset;

	update_action action;
	f(offset, extlen, nullptr, &action);
	assert(!action.bl || action.bl->length() == extlen);
	if (action.action == update_action::UPDATE_PIN) {
	  extent *ext = action.bl ?
	    new extent(offset, *action.bl) :
	    new extent(offset, extlen);
	  ext->link(*this, pin);
	} else {
	  assert(!action.bl);
	}
      }

      for (auto p = range.first; p != range.second;) {
	extent *ext = &*p;
	++p;  // advance now; ext may be unlinked and deleted below

	// clip the extent to the traversed range
	uint64_t extoff = MAX(ext->offset, offset);
	uint64_t extlen = MIN(
	  ext->length - (extoff - ext->offset),
	  offset + length - extoff);

	update_action action;
	f(extoff, extlen, ext, &action);
	assert(!action.bl || action.bl->length() == extlen);
	extent *final_extent = nullptr;
	if (action.action == update_action::NONE) {
	  final_extent = ext;
	} else {
	  // Re-pin: carve off any head/tail outside [offset, offset+length)
	  // (those stay with the old pin ps), then rebuild the covered part.
	  pin_state *ps = ext->parent_pin_state;
	  ext->unlink();
	  if ((ext->offset < offset) &&
	      (ext->offset + ext->get_length() > offset)) {
	    // head piece [ext->offset, offset) keeps the old pin
	    extent *head = nullptr;
	    if (ext->bl) {
	      bufferlist bl;
	      bl.substr_of(
		*(ext->bl),
		0,
		offset - ext->offset);
	      head = new extent(ext->offset, bl);
	    } else {
	      head = new extent(
		ext->offset, offset - ext->offset);
	    }
	    head->link(*this, *ps);
	  }
	  if ((ext->offset + ext->length > offset + length) &&
	      (offset + length > ext->offset)) {
	    // tail piece [offset+length, end of ext) keeps the old pin
	    uint64_t nlen =
	      (ext->offset + ext->get_length()) - (offset + length);
	    extent *tail = nullptr;
	    if (ext->bl) {
	      bufferlist bl;
	      bl.substr_of(
		*(ext->bl),
		ext->get_length() - nlen,
		nlen);
	      tail = new extent(offset + length, bl);
	    } else {
	      tail = new extent(offset + length, nlen);
	    }
	    tail->link(*this, *ps);
	  }
	  if (action.action == update_action::UPDATE_PIN) {
	    // covered piece [extoff, extoff+extlen) moves to the new pin
	    if (ext->bl) {
	      bufferlist bl;
	      bl.substr_of(
		*(ext->bl),
		extoff - ext->offset,
		extlen);
	      final_extent = new ExtentCache::extent(
		extoff,
		bl);
	    } else {
	      final_extent = new ExtentCache::extent(
		extoff, extlen);
	    }
	    final_extent->link(*this, pin);
	  }
	  delete ext;
	}

	if (action.bl) {
	  assert(final_extent);
	  assert(final_extent->length == action.bl->length());
	  final_extent->bl = *(action.bl);
	}

	// gap between this extent and the next one (or the end of the range)
	uint64_t next_off = p == range.second ?
	  offset + length : p->offset;
	if (extoff + extlen < next_off) {
	  uint64_t tailoff = extoff + extlen;
	  uint64_t taillen = next_off - tailoff;

	  update_action action;
	  f(tailoff, taillen, nullptr, &action);
	  assert(!action.bl || action.bl->length() == taillen);
	  if (action.action == update_action::UPDATE_PIN) {
	    extent *ext = action.bl ?
	      new extent(tailoff, *action.bl) :
	      new extent(tailoff, taillen);
	    ext->link(*this, pin);
	  } else {
	    assert(!action.bl);
	  }
	}
      }
    }
  };
  // heterogeneous comparator for looking up object_extent_sets by oid
  struct Cmp {
    bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
      return oid < rhs.oid;
    }
    bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
      return lhs.oid < oid;
    }
  };

  object_extent_set &get_or_create(const hobject_t &oid);
  object_extent_set *get_if_exists(const hobject_t &oid);

  void remove_and_destroy_if_empty(object_extent_set &set);
  using cache_set = boost::intrusive::set<object_extent_set>;
  cache_set per_object_caches;

  uint64_t next_write_tid = 1;
  uint64_t next_read_tid = 1;
  // Owner of the set of extents pinned for one in-flight operation.
  // Non-copyable and non-movable: extents hold raw back-pointers to it.
  struct pin_state {
    uint64_t tid = 0;  // 0 means "not open"
    enum pin_type_t {
      NONE,
      WRITE,
    };
    pin_type_t pin_type = NONE;
    bool is_write() const { return pin_type == WRITE; }

    pin_state(const pin_state &other) = delete;
    pin_state &operator=(const pin_state &other) = delete;
    pin_state(pin_state &&other) = delete;
    pin_state() = default;

    using list_member_options = boost::intrusive::member_hook<
      extent,
      boost::intrusive::list_member_hook<>,
      &extent::pin_list_member>;
    using list = boost::intrusive::list<extent, list_member_options>;
    list pin_list;
    // must have been released (release_pin) before destruction
    ~pin_state() {
      assert(pin_list.empty());
      assert(tid == 0);
      assert(pin_type == NONE);
    }
    void _open(uint64_t in_tid, pin_type_t in_type) {
      assert(pin_type == NONE);
      assert(in_tid > 0);
      tid = in_tid;
      pin_type = in_type;
    }
  };

  // Unlink and free every extent owned by p, destroying per-object sets
  // that become empty, then reset p so it can be reused or destroyed.
  void release_pin(pin_state &p) {
    for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
      // NOTE(review): unqualified unique_ptr — presumably brought into scope
      // by the project's common headers; confirm if this header is reused.
      unique_ptr<extent> extent(&*iter); // we now own this
      iter++; // unlink will invalidate
      assert(extent->parent_extent_set);
      auto &eset = *(extent->parent_extent_set);
      extent->unlink();
      remove_and_destroy_if_empty(eset);
    }
    p.tid = 0;
    p.pin_type = pin_state::NONE;
  }

public:
  // Public handle pinning extents on behalf of a single write operation.
  class write_pin : private pin_state {
    friend class ExtentCache;
  private:
    void open(uint64_t in_tid) {
      _open(in_tid, pin_state::WRITE);
    }
  public:
    write_pin() : pin_state() {}
  };

  void open_write_pin(write_pin &pin) {
    pin.open(next_write_tid++);
  }

  /**
   * Reserves extents required for rmw, and learns
   * which need to be read
   *
   * Pins all extents in to_write.  Returns the subset of to_read not
   * currently present in the cache.  Caller must obtain those
   * extents before calling get_remaining_extents_for_rmw.
   *
   * Transition table:
   * - Empty -> Write Pending pin.reqid
   * - Write Pending N -> Write Pending pin.reqid
   * - Write Pinned N -> Write Pinned pin.reqid
   *
   * @param oid [in] object undergoing rmw
   * @param pin [in,out] pin to use (opened via open_write_pin)
   * @param to_write [in] extents which will be written
   * @param to_read [in] extents to read prior to write (must be subset
   *        of to_write)
   * @return subset of to_read which isn't already present or pending
   */
  extent_set reserve_extents_for_rmw(
    const hobject_t &oid,
    write_pin &pin,
    const extent_set &to_write,
    const extent_set &to_read);

  /**
   * Gets extents required for rmw not returned from
   * reserve_extents_for_rmw
   *
   * Requested extents (to_get) must be the set to_read \ the set
   * returned from reserve_extents_for_rmw.  No transition table,
   * all extents at this point must be present and already pinned
   * for this pin by reserve_extents_for_rmw.
   *
   * @param oid [in] object
   * @param pin [in,out] pin associated with this IO
   * @param to_get [in] extents to get (see above for restrictions)
   * @return map of buffers from to_get
   */
  extent_map get_remaining_extents_for_rmw(
    const hobject_t &oid,
    write_pin &pin,
    const extent_set &to_get);

  /**
   * Updates the cache to reflect the rmw write
   *
   * All presented extents must already have been specified in
   * reserve_extents_for_rmw under to_write.
   *
   * Transition table:
   * - Empty -> invalid, must call reserve_extents_for_rmw first
   * - Write Pending N -> Write Pinned N, update buffer
   *   (assert N >= pin.reqid)
   * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
   *
   * @param oid [in] object
   * @param pin [in,out] pin associated with this IO
   * @param extents [in] map of buffers to update
   * @return void
   */
  void present_rmw_update(
    const hobject_t &oid,
    write_pin &pin,
    const extent_map &extents);

  /**
   * Release all buffers pinned by pin
   */
  void release_write_pin(
    write_pin &pin) {
    release_pin(pin);
  }

  ostream &print(
    ostream &out) const;
};
488 | ||
489 | ostream &operator<<(ostream &lhs, const ExtentCache &cache); | |
490 | ||
491 | #endif |