// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
4 #ifndef CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
5 #define CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG
7 #include "common/Timer.h"
8 #include "common/RWLock.h"
9 #include "common/WorkQueue.h"
10 #include "common/AsyncOpTracker.h"
11 #include "librbd/cache/ImageWriteback.h"
12 #include "librbd/Utils.h"
13 #include "librbd/BlockGuard.h"
14 #include "librbd/cache/Types.h"
15 #include "librbd/cache/pwl/LogOperation.h"
16 #include "librbd/cache/pwl/ReadRequest.h"
17 #include "librbd/cache/pwl/Request.h"
18 #include "librbd/cache/pwl/LogMap.h"
19 #include "librbd/cache/pwl/Builder.h"
// Forward declaration of the plugin API shim; only a pointer/reference to
// plugin::Api<ImageCtxT> is needed in this header, so the full definition
// is not included here.
namespace plugin { template <typename> struct Api; }
// Log-entry types; forward-declared because this header only refers to them
// through std::shared_ptr (see the alias declarations that follow).
class GenericLogEntry;
class GenericWriteLogEntry;
class SyncPointLogEntry;
// WriteLogEntry is required by the WriteLogEntries alias below; its forward
// declaration was lost in this header's garbled extraction, so it is
// restored here.
class WriteLogEntry;
struct WriteLogCacheEntry;
40 typedef std::list
<std::shared_ptr
<WriteLogEntry
>> WriteLogEntries
;
41 typedef std::list
<std::shared_ptr
<GenericLogEntry
>> GenericLogEntries
;
42 typedef std::list
<std::shared_ptr
<GenericWriteLogEntry
>> GenericWriteLogEntries
;
43 typedef std::vector
<std::shared_ptr
<GenericLogEntry
>> GenericLogEntriesVector
;
45 typedef LogMapEntries
<GenericWriteLogEntry
> WriteLogMapEntries
;
46 typedef LogMap
<GenericWriteLogEntry
> WriteLogMap
;
/**** Write log entries end ****/
50 typedef librbd::BlockGuard
<GuardedRequest
> WriteLogGuard
;
52 class DeferredContexts
;
54 class ImageCacheState
;
60 struct C_BlockIORequest
;
63 struct C_WriteRequest
;
65 using GenericLogOperations
= std::list
<GenericLogOperationSharedPtr
>;
68 template <typename ImageCtxT
>
69 class AbstractWriteLog
{
71 typedef io::Extent Extent
;
72 typedef io::Extents Extents
;
73 using This
= AbstractWriteLog
<ImageCtxT
>;
74 Builder
<This
> *m_builder
;
76 AbstractWriteLog(ImageCtxT
&image_ctx
,
77 librbd::cache::pwl::ImageCacheState
<ImageCtxT
>* cache_state
,
78 Builder
<This
> *builder
,
79 cache::ImageWritebackInterface
& image_writeback
,
80 plugin::Api
<ImageCtxT
>& plugin_api
);
81 virtual ~AbstractWriteLog();
82 AbstractWriteLog(const AbstractWriteLog
&) = delete;
83 AbstractWriteLog
&operator=(const AbstractWriteLog
&) = delete;
87 Extents
&& image_extents
, ceph::bufferlist
*bl
,
88 int fadvise_flags
, Context
*on_finish
);
90 Extents
&& image_extents
, ceph::bufferlist
&& bl
,
94 uint64_t offset
, uint64_t length
,
95 uint32_t discard_granularity_bytes
,
98 io::FlushSource flush_source
, Context
*on_finish
);
100 uint64_t offset
, uint64_t length
,
101 ceph::bufferlist
&& bl
,
102 int fadvise_flags
, Context
*on_finish
);
103 void compare_and_write(
104 Extents
&& image_extents
,
105 ceph::bufferlist
&& cmp_bl
, ceph::bufferlist
&& bl
,
106 uint64_t *mismatch_offset
,int fadvise_flags
,
109 /// internal state methods
110 void init(Context
*on_finish
);
111 void shut_down(Context
*on_finish
);
112 void invalidate(Context
*on_finish
);
113 void flush(Context
*on_finish
);
115 using C_WriteRequestT
= pwl::C_WriteRequest
<This
>;
116 using C_BlockIORequestT
= pwl::C_BlockIORequest
<This
>;
117 using C_FlushRequestT
= pwl::C_FlushRequest
<This
>;
118 using C_DiscardRequestT
= pwl::C_DiscardRequest
<This
>;
119 using C_WriteSameRequestT
= pwl::C_WriteSameRequest
<This
>;
121 CephContext
* get_context();
122 void release_guarded_request(BlockGuardCell
*cell
);
123 void release_write_lanes(C_BlockIORequestT
*req
);
124 virtual bool alloc_resources(C_BlockIORequestT
*req
) = 0;
125 virtual void setup_schedule_append(
126 pwl::GenericLogOperationsVector
&ops
, bool do_early_flush
,
127 C_BlockIORequestT
*req
) = 0;
128 void schedule_append(pwl::GenericLogOperationsVector
&ops
, C_BlockIORequestT
*req
= nullptr);
129 void schedule_append(pwl::GenericLogOperationSharedPtr op
, C_BlockIORequestT
*req
= nullptr);
130 void flush_new_sync_point(C_FlushRequestT
*flush_req
,
131 pwl::DeferredContexts
&later
);
133 std::shared_ptr
<pwl::SyncPoint
> get_current_sync_point() {
134 return m_current_sync_point
;
136 bool get_persist_on_flush() {
137 return m_persist_on_flush
;
139 void inc_last_op_sequence_num() {
140 m_perfcounter
->inc(l_librbd_pwl_log_ops
, 1);
141 ++m_last_op_sequence_num
;
143 uint64_t get_last_op_sequence_num() {
144 return m_last_op_sequence_num
;
146 uint64_t get_current_sync_gen() {
147 return m_current_sync_gen
;
149 unsigned int get_free_lanes() {
152 uint32_t get_free_log_entries() {
153 return m_free_log_entries
;
155 void add_into_log_map(pwl::GenericWriteLogEntries
&log_entries
,
156 C_BlockIORequestT
*req
);
157 virtual void complete_user_request(Context
*&user_req
, int r
) = 0;
158 virtual void copy_bl_to_buffer(
159 WriteRequestResources
*resources
,
160 std::unique_ptr
<WriteLogOperationSet
> &op_set
) {}
163 typedef std::list
<pwl::C_WriteRequest
<This
> *> C_WriteRequests
;
164 typedef std::list
<pwl::C_BlockIORequest
<This
> *> C_BlockIORequests
;
166 std::atomic
<bool> m_initialized
= {false};
168 uint64_t m_bytes_dirty
= 0; /* Total bytes yet to flush to RBD */
169 utime_t m_last_alloc_fail
; /* Entry or buffer allocation fail seen */
171 pwl::WriteLogGuard m_write_log_guard
;
173 /* Starts at 0 for a new write log. Incremented on every flush. */
174 uint64_t m_current_sync_gen
= 0;
175 /* Starts at 0 on each sync gen increase. Incremented before applied
177 uint64_t m_last_op_sequence_num
= 0;
179 bool m_persist_on_write_until_flush
= true;
181 pwl::WriteLogGuard m_flush_guard
;
182 mutable ceph::mutex m_flush_guard_lock
;
184 /* Debug counters for the places m_async_op_tracker is used */
185 std::atomic
<int> m_async_complete_ops
= {0};
186 std::atomic
<int> m_async_null_flush_finish
= {0};
187 std::atomic
<int> m_async_process_work
= {0};
189 /* Hold m_deferred_dispatch_lock while consuming from m_deferred_ios. */
190 mutable ceph::mutex m_deferred_dispatch_lock
;
192 /* Used in release/detain to make BlockGuard preserve submission order */
193 mutable ceph::mutex m_blockguard_lock
;
195 /* Use m_blockguard_lock for the following 3 things */
196 bool m_barrier_in_progress
= false;
197 BlockGuardCell
*m_barrier_cell
= nullptr;
199 bool m_wake_up_enabled
= true;
201 Contexts m_flush_complete_contexts
;
203 std::shared_ptr
<pwl::SyncPoint
> m_current_sync_point
= nullptr;
204 bool m_persist_on_flush
= false; //If false, persist each write before completion
206 int m_flush_ops_in_flight
= 0;
207 int m_flush_bytes_in_flight
= 0;
208 uint64_t m_lowest_flushing_sync_gen
= 0;
210 /* Writes that have left the block guard, but are waiting for resources */
211 C_BlockIORequests m_deferred_ios
;
212 /* Throttle writes concurrently allocating & replicating */
213 unsigned int m_free_lanes
= pwl::MAX_CONCURRENT_WRITES
;
215 SafeTimer
*m_timer
= nullptr; /* Used with m_timer_lock */
216 mutable ceph::mutex
*m_timer_lock
= nullptr; /* Used with and by m_timer */
217 Context
*m_timer_ctx
= nullptr;
219 ThreadPool m_thread_pool
;
221 uint32_t m_discard_granularity_bytes
;
223 BlockGuardCell
* detain_guarded_request_helper(pwl::GuardedRequest
&req
);
224 BlockGuardCell
* detain_guarded_request_barrier_helper(
225 pwl::GuardedRequest
&req
);
226 void detain_guarded_request(C_BlockIORequestT
*request
,
227 pwl::GuardedRequestFunctionContext
*guarded_ctx
,
229 void perf_start(const std::string name
);
232 void periodic_stats();
233 void arm_periodic_stats();
235 void pwl_init(Context
*on_finish
, pwl::DeferredContexts
&later
);
236 void update_image_cache_state(Context
*on_finish
);
237 void handle_update_image_cache_state(int r
);
238 void check_image_cache_state_clean();
240 void flush_dirty_entries(Context
*on_finish
);
241 bool can_flush_entry(const std::shared_ptr
<pwl::GenericLogEntry
> log_entry
);
242 bool handle_flushed_sync_point(
243 std::shared_ptr
<pwl::SyncPointLogEntry
> log_entry
);
244 void sync_point_writer_flushed(
245 std::shared_ptr
<pwl::SyncPointLogEntry
> log_entry
);
247 void init_flush_new_sync_point(pwl::DeferredContexts
&later
);
248 void new_sync_point(pwl::DeferredContexts
&later
);
249 pwl::C_FlushRequest
<AbstractWriteLog
<ImageCtxT
>>* make_flush_req(
251 void flush_new_sync_point_if_needed(C_FlushRequestT
*flush_req
,
252 pwl::DeferredContexts
&later
);
254 void alloc_and_dispatch_io_req(C_BlockIORequestT
*write_req
);
255 void schedule_complete_op_log_entries(pwl::GenericLogOperations
&&ops
,
257 void internal_flush(bool invalidate
, Context
*on_finish
);
260 librbd::cache::pwl::ImageCacheState
<ImageCtxT
>* m_cache_state
= nullptr;
262 std::atomic
<bool> m_shutting_down
= {false};
263 std::atomic
<bool> m_invalidating
= {false};
265 ImageCtxT
&m_image_ctx
;
267 std::string m_log_pool_name
;
268 uint64_t m_log_pool_size
;
270 uint32_t m_total_log_entries
= 0;
271 uint32_t m_free_log_entries
= 0;
273 std::atomic
<uint64_t> m_bytes_allocated
= {0}; /* Total bytes allocated in write buffers */
274 uint64_t m_bytes_cached
= 0; /* Total bytes used in write buffers */
275 uint64_t m_bytes_allocated_cap
= 0;
277 std::atomic
<bool> m_alloc_failed_since_retire
= {false};
279 cache::ImageWritebackInterface
& m_image_writeback
;
280 plugin::Api
<ImageCtxT
>& m_plugin_api
;
283 * When m_first_free_entry == m_first_valid_entry, the log is
284 * empty. There is always at least one free entry, which can't be
287 uint64_t m_first_free_entry
= 0; /* Entries from here to m_first_valid_entry-1 are free */
288 uint64_t m_first_valid_entry
= 0; /* Entries from here to m_first_free_entry-1 are valid */
290 /* All writes bearing this and all prior sync gen numbers are flushed */
291 uint64_t m_flushed_sync_gen
= 0;
293 AsyncOpTracker m_async_op_tracker
;
294 /* Debug counters for the places m_async_op_tracker is used */
295 std::atomic
<int> m_async_flush_ops
= {0};
296 std::atomic
<int> m_async_append_ops
= {0};
298 /* Acquire locks in order declared here */
300 mutable ceph::mutex m_log_retire_lock
;
301 /* Hold a read lock on m_entry_reader_lock to add readers to log entry
302 * bufs. Hold a write lock to prevent readers from being added (e.g. when
303 * removing log entrys from the map). No lock required to remove readers. */
304 mutable RWLock m_entry_reader_lock
;
305 /* Hold m_log_append_lock while appending or retiring log entries. */
306 mutable ceph::mutex m_log_append_lock
;
307 /* Used for most synchronization */
308 mutable ceph::mutex m_lock
;
310 /* Use m_blockguard_lock for the following 3 things */
311 pwl::WriteLogGuard::BlockOperations m_awaiting_barrier
;
313 bool m_wake_up_requested
= false;
314 bool m_wake_up_scheduled
= false;
315 bool m_appending
= false;
316 bool m_dispatching_deferred_ops
= false;
318 pwl::GenericLogOperations m_ops_to_flush
; /* Write ops needing flush in local log */
319 pwl::GenericLogOperations m_ops_to_append
; /* Write ops needing event append in local log */
321 pwl::WriteLogMap m_blocks_to_log_entries
;
323 /* New entries are at the back. Oldest at the front */
324 pwl::GenericLogEntries m_log_entries
;
325 pwl::GenericLogEntries m_dirty_log_entries
;
327 PerfCounters
*m_perfcounter
= nullptr;
329 unsigned int m_unpublished_reserves
= 0;
331 ContextWQ m_work_queue
;
336 std::shared_ptr
<pwl::GenericLogEntry
> *log_entry
,
337 pwl::WriteLogCacheEntry
*cache_entry
,
338 std::map
<uint64_t, bool> &missing_sync_points
,
340 std::shared_ptr
<pwl::SyncPointLogEntry
>> &sync_point_entries
,
341 uint64_t entry_index
);
342 void update_sync_points(
343 std::map
<uint64_t, bool> &missing_sync_points
,
345 std::shared_ptr
<pwl::SyncPointLogEntry
>> &sync_point_entries
,
346 pwl::DeferredContexts
&later
);
347 virtual void inc_allocated_cached_bytes(
348 std::shared_ptr
<pwl::GenericLogEntry
> log_entry
) = 0;
349 Context
*construct_flush_entry(
350 const std::shared_ptr
<pwl::GenericLogEntry
> log_entry
, bool invalidating
);
351 void detain_flush_guard_request(std::shared_ptr
<GenericLogEntry
> log_entry
,
352 GuardedRequestFunctionContext
*guarded_ctx
);
353 void process_writeback_dirty_entries();
354 bool can_retire_entry(const std::shared_ptr
<pwl::GenericLogEntry
> log_entry
);
356 void dispatch_deferred_writes(void);
357 void complete_op_log_entries(pwl::GenericLogOperations
&&ops
, const int r
);
359 bool check_allocation(
360 C_BlockIORequestT
*req
, uint64_t bytes_cached
, uint64_t bytes_dirtied
,
361 uint64_t bytes_allocated
, uint32_t num_lanes
, uint32_t num_log_entries
,
362 uint32_t num_unpublished_reserves
);
363 void append_scheduled(
364 pwl::GenericLogOperations
&ops
, bool &ops_remain
, bool &appending
,
367 virtual void process_work() = 0;
368 virtual void append_scheduled_ops(void) = 0;
369 virtual void schedule_append_ops(pwl::GenericLogOperations
&ops
, C_BlockIORequestT
*req
) = 0;
370 virtual void remove_pool_file() = 0;
371 virtual bool initialize_pool(Context
*on_finish
,
372 pwl::DeferredContexts
&later
) = 0;
373 virtual void collect_read_extents(
374 uint64_t read_buffer_offset
, LogMapEntry
<GenericWriteLogEntry
> map_entry
,
375 std::vector
<std::shared_ptr
<GenericWriteLogEntry
>> &log_entries_to_read
,
376 std::vector
<bufferlist
*> &bls_to_read
, uint64_t entry_hit_length
,
377 Extent hit_extent
, pwl::C_ReadRequest
*read_ctx
) = 0;
378 virtual void complete_read(
379 std::vector
<std::shared_ptr
<GenericWriteLogEntry
>> &log_entries_to_read
,
380 std::vector
<bufferlist
*> &bls_to_read
, Context
*ctx
) = 0;
381 virtual void write_data_to_buffer(
382 std::shared_ptr
<pwl::WriteLogEntry
> ws_entry
,
383 pwl::WriteLogCacheEntry
*cache_entry
) {}
384 virtual void release_ram(
385 const std::shared_ptr
<pwl::GenericLogEntry
> log_entry
) {}
386 virtual void alloc_op_log_entries(pwl::GenericLogOperations
&ops
) {}
387 virtual bool retire_entries(const unsigned long int frees_per_tx
) {
390 virtual void schedule_flush_and_append(
391 pwl::GenericLogOperationsVector
&ops
) {}
392 virtual void persist_last_flushed_sync_gen() {}
393 virtual void reserve_cache(C_BlockIORequestT
*req
, bool &alloc_succeeds
,
395 virtual void construct_flush_entries(pwl::GenericLogEntries entries_to_flush
,
396 DeferredContexts
&post_unlock
,
397 bool has_write_entry
) = 0;
398 virtual uint64_t get_max_extent() {
401 void update_image_cache_state(void);
406 } // namespace librbd
408 extern template class librbd::cache::pwl::AbstractWriteLog
<librbd::ImageCtx
>;
410 #endif // CEPH_LIBRBD_CACHE_PARENT_WRITE_LOG