// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
#define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG

#include "common/Timer.h"
#include "common/RWLock.h"
#include "common/WorkQueue.h"
#include "common/AsyncOpTracker.h"
#include "librbd/cache/ImageWriteback.h"
#include "librbd/Utils.h"
#include "librbd/BlockGuard.h"
#include "librbd/cache/Types.h"
#include "librbd/cache/pwl/LogOperation.h"
#include "librbd/cache/pwl/ReadRequest.h"
#include "librbd/cache/pwl/Request.h"
#include "librbd/cache/pwl/LogMap.h"
#include "librbd/cache/pwl/Builder.h"
#include <functional>
#include <list>

class Context;

namespace librbd {

struct ImageCtx;

namespace plugin { template <typename> struct Api; }

namespace cache {
namespace pwl {

class GenericLogEntry;
class GenericWriteLogEntry;
class SyncPointLogEntry;
class WriteLogEntry;
struct WriteLogCacheEntry;

typedef std::list<std::shared_ptr<WriteLogEntry>> WriteLogEntries;
typedef std::list<std::shared_ptr<GenericLogEntry>> GenericLogEntries;
typedef std::list<std::shared_ptr<GenericWriteLogEntry>> GenericWriteLogEntries;
typedef std::vector<std::shared_ptr<GenericLogEntry>> GenericLogEntriesVector;

typedef LogMapEntries<GenericWriteLogEntry> WriteLogMapEntries;
typedef LogMap<GenericWriteLogEntry> WriteLogMap;

/**** Write log entries end ****/

typedef librbd::BlockGuard<GuardedRequest> WriteLogGuard;

class DeferredContexts;
template <typename>
class ImageCacheState;

template <typename T>
class Builder;

template <typename T>
struct C_BlockIORequest;

template <typename T>
struct C_WriteRequest;

using GenericLogOperations = std::list<GenericLogOperationSharedPtr>;


template <typename ImageCtxT>
class AbstractWriteLog {
public:
  typedef io::Extent Extent;
  typedef io::Extents Extents;
  using This = AbstractWriteLog<ImageCtxT>;
  Builder<This> *m_builder;

  AbstractWriteLog(ImageCtxT &image_ctx,
                   librbd::cache::pwl::ImageCacheState<ImageCtxT>* cache_state,
                   Builder<This> *builder,
                   cache::ImageWritebackInterface& image_writeback,
                   plugin::Api<ImageCtxT>& plugin_api);
  virtual ~AbstractWriteLog();
  AbstractWriteLog(const AbstractWriteLog&) = delete;
  AbstractWriteLog &operator=(const AbstractWriteLog&) = delete;

  /// IO methods
  void read(
      Extents&& image_extents, ceph::bufferlist *bl,
      int fadvise_flags, Context *on_finish);
  void write(
      Extents&& image_extents, ceph::bufferlist&& bl,
      int fadvise_flags,
      Context *on_finish);
  void discard(
      uint64_t offset, uint64_t length,
      uint32_t discard_granularity_bytes,
      Context *on_finish);
  void flush(
      io::FlushSource flush_source, Context *on_finish);
  void writesame(
      uint64_t offset, uint64_t length,
      ceph::bufferlist&& bl,
      int fadvise_flags, Context *on_finish);
  void compare_and_write(
      Extents&& image_extents,
      ceph::bufferlist&& cmp_bl, ceph::bufferlist&& bl,
      uint64_t *mismatch_offset, int fadvise_flags,
      Context *on_finish);

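  /* Usage sketch, illustrative only: the caller, the buffer contents and
   * the LambdaContext-style completion below are hypothetical, not part of
   * this interface:
   *
   *   ceph::bufferlist bl;
   *   bl.append(std::string(4096, 'a'));
   *   cache->write({{0, 4096}}, std::move(bl), 0,
   *                new LambdaContext([](int r) {
   *                  // r == 0 once the write has been guarded, allocated
   *                  // and persisted to the write log
   *                }));
   */
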
  /// internal state methods
  void init(Context *on_finish);
  void shut_down(Context *on_finish);
  void invalidate(Context *on_finish);
  void flush(Context *on_finish);

  using C_WriteRequestT = pwl::C_WriteRequest<This>;
  using C_BlockIORequestT = pwl::C_BlockIORequest<This>;
  using C_FlushRequestT = pwl::C_FlushRequest<This>;
  using C_DiscardRequestT = pwl::C_DiscardRequest<This>;
  using C_WriteSameRequestT = pwl::C_WriteSameRequest<This>;

  CephContext *get_context();
  void release_guarded_request(BlockGuardCell *cell);
  void release_write_lanes(C_BlockIORequestT *req);
  virtual bool alloc_resources(C_BlockIORequestT *req) = 0;
  virtual void setup_schedule_append(
      pwl::GenericLogOperationsVector &ops, bool do_early_flush,
      C_BlockIORequestT *req) = 0;
  void schedule_append(pwl::GenericLogOperationsVector &ops,
                       C_BlockIORequestT *req = nullptr);
  void schedule_append(pwl::GenericLogOperationSharedPtr op,
                       C_BlockIORequestT *req = nullptr);
  void flush_new_sync_point(C_FlushRequestT *flush_req,
                            pwl::DeferredContexts &later);

  std::shared_ptr<pwl::SyncPoint> get_current_sync_point() {
    return m_current_sync_point;
  }
  bool get_persist_on_flush() {
    return m_persist_on_flush;
  }
  void inc_last_op_sequence_num() {
    m_perfcounter->inc(l_librbd_pwl_log_ops, 1);
    ++m_last_op_sequence_num;
  }
  uint64_t get_last_op_sequence_num() {
    return m_last_op_sequence_num;
  }
  uint64_t get_current_sync_gen() {
    return m_current_sync_gen;
  }
  unsigned int get_free_lanes() {
    return m_free_lanes;
  }
  uint32_t get_free_log_entries() {
    return m_free_log_entries;
  }
  void add_into_log_map(pwl::GenericWriteLogEntries &log_entries,
                        C_BlockIORequestT *req);
  virtual void complete_user_request(Context *&user_req, int r) = 0;
  virtual void copy_bl_to_buffer(
      WriteRequestResources *resources,
      std::unique_ptr<WriteLogOperationSet> &op_set) {}

private:
  typedef std::list<pwl::C_WriteRequest<This> *> C_WriteRequests;
  typedef std::list<pwl::C_BlockIORequest<This> *> C_BlockIORequests;

  std::atomic<bool> m_initialized = {false};

  uint64_t m_bytes_dirty = 0; /* Total bytes yet to flush to RBD */
  utime_t m_last_alloc_fail;  /* Time of the last entry or buffer allocation failure */

  pwl::WriteLogGuard m_write_log_guard;

  /* Starts at 0 for a new write log. Incremented on every flush. */
  uint64_t m_current_sync_gen = 0;
  /* Starts at 0 on each sync gen increase. Incremented before being
     applied to an operation */
  uint64_t m_last_op_sequence_num = 0;
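  /* Example (illustrative): after the second flush m_current_sync_gen == 2,
   * and the first write in that sync generation gets sequence number 1,
   * since the counter is incremented before being applied. */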

  bool m_persist_on_write_until_flush = true;

  pwl::WriteLogGuard m_flush_guard;
  mutable ceph::mutex m_flush_guard_lock;

  /* Debug counters for the places m_async_op_tracker is used */
  std::atomic<int> m_async_complete_ops = {0};
  std::atomic<int> m_async_null_flush_finish = {0};
  std::atomic<int> m_async_process_work = {0};

  /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */
  mutable ceph::mutex m_deferred_dispatch_lock;

  /* Used in release/detain to make BlockGuard preserve submission order */
  mutable ceph::mutex m_blockguard_lock;

  /* Protected by m_blockguard_lock, together with m_awaiting_barrier in the
   * protected section below */
  bool m_barrier_in_progress = false;
  BlockGuardCell *m_barrier_cell = nullptr;

  bool m_wake_up_enabled = true;

  Contexts m_flush_complete_contexts;

  std::shared_ptr<pwl::SyncPoint> m_current_sync_point = nullptr;
  bool m_persist_on_flush = false; /* If false, persist each write before completion */

  int m_flush_ops_in_flight = 0;
  int m_flush_bytes_in_flight = 0;
  uint64_t m_lowest_flushing_sync_gen = 0;

  /* Writes that have left the block guard, but are waiting for resources */
  C_BlockIORequests m_deferred_ios;
  /* Throttle writes concurrently allocating & replicating */
  unsigned int m_free_lanes = pwl::MAX_CONCURRENT_WRITES;
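  /* Illustrative note: writes take a lane from m_free_lanes while they
   * allocate buffers and replicate; when none are free, newly dispatched
   * writes queue on m_deferred_ios until release_write_lanes() returns
   * lanes to the pool. */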

  SafeTimer *m_timer = nullptr;                /* Used with m_timer_lock */
  mutable ceph::mutex *m_timer_lock = nullptr; /* Used with and by m_timer */
  Context *m_timer_ctx = nullptr;

  ThreadPool m_thread_pool;

  uint32_t m_discard_granularity_bytes;

  BlockGuardCell* detain_guarded_request_helper(pwl::GuardedRequest &req);
  BlockGuardCell* detain_guarded_request_barrier_helper(
      pwl::GuardedRequest &req);
  void detain_guarded_request(C_BlockIORequestT *request,
                              pwl::GuardedRequestFunctionContext *guarded_ctx,
                              bool is_barrier);
  void perf_start(const std::string name);
  void perf_stop();
  void log_perf();
  void periodic_stats();
  void arm_periodic_stats();

  void pwl_init(Context *on_finish, pwl::DeferredContexts &later);
  void update_image_cache_state(Context *on_finish);
  void handle_update_image_cache_state(int r);
  void check_image_cache_state_clean();

  void flush_dirty_entries(Context *on_finish);
  bool can_flush_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);
  bool handle_flushed_sync_point(
      std::shared_ptr<pwl::SyncPointLogEntry> log_entry);
  void sync_point_writer_flushed(
      std::shared_ptr<pwl::SyncPointLogEntry> log_entry);

  void init_flush_new_sync_point(pwl::DeferredContexts &later);
  void new_sync_point(pwl::DeferredContexts &later);
  pwl::C_FlushRequest<AbstractWriteLog<ImageCtxT>>* make_flush_req(
      Context *on_finish);
  void flush_new_sync_point_if_needed(C_FlushRequestT *flush_req,
                                      pwl::DeferredContexts &later);

  void alloc_and_dispatch_io_req(C_BlockIORequestT *write_req);
  void schedule_complete_op_log_entries(pwl::GenericLogOperations &&ops,
                                        const int r);
  void internal_flush(bool invalidate, Context *on_finish);

protected:
  librbd::cache::pwl::ImageCacheState<ImageCtxT>* m_cache_state = nullptr;

  std::atomic<bool> m_shutting_down = {false};
  std::atomic<bool> m_invalidating = {false};

  ImageCtxT &m_image_ctx;

  std::string m_log_pool_name;
  uint64_t m_log_pool_size;

  uint32_t m_total_log_entries = 0;
  uint32_t m_free_log_entries = 0;

  std::atomic<uint64_t> m_bytes_allocated = {0}; /* Total bytes allocated in write buffers */
  uint64_t m_bytes_cached = 0;        /* Total bytes used in write buffers */
  uint64_t m_bytes_allocated_cap = 0;

  std::atomic<bool> m_alloc_failed_since_retire = {false};

  cache::ImageWritebackInterface& m_image_writeback;
  plugin::Api<ImageCtxT>& m_plugin_api;

  /*
   * When m_first_free_entry == m_first_valid_entry, the log is
   * empty. There is always at least one free entry, which can't be
   * used.
   */
  uint64_t m_first_free_entry = 0;  /* Entries from here to m_first_valid_entry-1 are free */
  uint64_t m_first_valid_entry = 0; /* Entries from here to m_first_free_entry-1 are valid */
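  /* Illustrative only: with m_total_log_entries ring slots, the usable free
   * count works out to
   *   (m_first_valid_entry + m_total_log_entries - m_first_free_entry - 1)
   *       % m_total_log_entries
   * The one permanently reserved entry is what lets first_free == first_valid
   * mean "empty" rather than "full". */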

  /* All writes bearing this and all prior sync gen numbers are flushed */
  uint64_t m_flushed_sync_gen = 0;

  AsyncOpTracker m_async_op_tracker;
  /* Debug counters for the places m_async_op_tracker is used */
  std::atomic<int> m_async_flush_ops = {0};
  std::atomic<int> m_async_append_ops = {0};

  /* Acquire locks in the order declared here */

  mutable ceph::mutex m_log_retire_lock;
  /* Hold a read lock on m_entry_reader_lock to add readers to log entry
   * bufs. Hold a write lock to prevent readers from being added (e.g. when
   * removing log entries from the map). No lock required to remove readers. */
  mutable RWLock m_entry_reader_lock;
  /* Hold m_log_append_lock while appending or retiring log entries. */
  mutable ceph::mutex m_log_append_lock;
  /* Used for most synchronization */
  mutable ceph::mutex m_lock;
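  /* Illustrative only: a thread that needs both m_entry_reader_lock and
   * m_lock must take m_entry_reader_lock first, per the declaration order
   * above; acquiring them in the opposite order risks deadlock. */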

  /* Also protected by m_blockguard_lock (see the private section above) */
  pwl::WriteLogGuard::BlockOperations m_awaiting_barrier;

  bool m_wake_up_requested = false;
  bool m_wake_up_scheduled = false;
  bool m_appending = false;
  bool m_dispatching_deferred_ops = false;

  pwl::GenericLogOperations m_ops_to_flush;  /* Write ops needing flush in local log */
  pwl::GenericLogOperations m_ops_to_append; /* Write ops needing event append in local log */

  pwl::WriteLogMap m_blocks_to_log_entries;

  /* New entries are at the back. Oldest at the front */
  pwl::GenericLogEntries m_log_entries;
  pwl::GenericLogEntries m_dirty_log_entries;

  PerfCounters *m_perfcounter = nullptr;

  unsigned int m_unpublished_reserves = 0;

  ContextWQ m_work_queue;

  void wake_up();

  void update_entries(
      std::shared_ptr<pwl::GenericLogEntry> *log_entry,
      pwl::WriteLogCacheEntry *cache_entry,
      std::map<uint64_t, bool> &missing_sync_points,
      std::map<uint64_t,
               std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
      uint64_t entry_index);
  void update_sync_points(
      std::map<uint64_t, bool> &missing_sync_points,
      std::map<uint64_t,
               std::shared_ptr<pwl::SyncPointLogEntry>> &sync_point_entries,
      pwl::DeferredContexts &later);
  virtual void inc_allocated_cached_bytes(
      std::shared_ptr<pwl::GenericLogEntry> log_entry) = 0;
  Context *construct_flush_entry(
      const std::shared_ptr<pwl::GenericLogEntry> log_entry, bool invalidating);
  void detain_flush_guard_request(std::shared_ptr<GenericLogEntry> log_entry,
                                  GuardedRequestFunctionContext *guarded_ctx);
  void process_writeback_dirty_entries();
  bool can_retire_entry(const std::shared_ptr<pwl::GenericLogEntry> log_entry);

  void dispatch_deferred_writes(void);
  void complete_op_log_entries(pwl::GenericLogOperations &&ops, const int r);

  bool check_allocation(
      C_BlockIORequestT *req, uint64_t bytes_cached, uint64_t bytes_dirtied,
      uint64_t bytes_allocated, uint32_t num_lanes, uint32_t num_log_entries,
      uint32_t num_unpublished_reserves);
  void append_scheduled(
      pwl::GenericLogOperations &ops, bool &ops_remain, bool &appending,
      bool isRWL = false);

  virtual void process_work() = 0;
  virtual void append_scheduled_ops(void) = 0;
  virtual void schedule_append_ops(pwl::GenericLogOperations &ops,
                                   C_BlockIORequestT *req) = 0;
  virtual void remove_pool_file() = 0;
  virtual bool initialize_pool(Context *on_finish,
                               pwl::DeferredContexts &later) = 0;
  virtual void collect_read_extents(
      uint64_t read_buffer_offset, LogMapEntry<GenericWriteLogEntry> map_entry,
      std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
      std::vector<bufferlist*> &bls_to_read, uint64_t entry_hit_length,
      Extent hit_extent, pwl::C_ReadRequest *read_ctx) = 0;
  virtual void complete_read(
      std::vector<std::shared_ptr<GenericWriteLogEntry>> &log_entries_to_read,
      std::vector<bufferlist*> &bls_to_read, Context *ctx) = 0;
  virtual void write_data_to_buffer(
      std::shared_ptr<pwl::WriteLogEntry> ws_entry,
      pwl::WriteLogCacheEntry *cache_entry) {}
  virtual void release_ram(
      const std::shared_ptr<pwl::GenericLogEntry> log_entry) {}
  virtual void alloc_op_log_entries(pwl::GenericLogOperations &ops) {}
  virtual bool retire_entries(const unsigned long int frees_per_tx) {
    return false;
  }
  virtual void schedule_flush_and_append(
      pwl::GenericLogOperationsVector &ops) {}
  virtual void persist_last_flushed_sync_gen() {}
  virtual void reserve_cache(C_BlockIORequestT *req, bool &alloc_succeeds,
                             bool &no_space) {}
  virtual void construct_flush_entries(pwl::GenericLogEntries entries_to_flush,
                                       DeferredContexts &post_unlock,
                                       bool has_write_entry) = 0;
  virtual uint64_t get_max_extent() {
    return 0;
  }
  void update_image_cache_state(void);
};

} // namespace pwl
} // namespace cache
} // namespace librbd

extern template class librbd::cache::pwl::AbstractWriteLog<librbd::ImageCtx>;

#endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG