[ceph.git] / ceph / src / rocksdb / utilities / persistent_cache / persistent_cache_tier.h

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
#pragma once

#ifndef ROCKSDB_LITE

#include <limits>
#include <list>
#include <map>
#include <string>
#include <vector>

#include "monitoring/histogram.h"
#include "rocksdb/env.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/status.h"

// Persistent Cache
//
// Persistent cache is a tiered key-value cache that can use a persistent
// medium. It is a generic design and can leverage any storage medium --
// disk/SSD/NVM/RAM. The code has been kept generic but significant
// benchmark/design/development time has been spent to make sure the cache
// performs appropriately for the respective storage medium.
// The file defines
// PersistentCacheTier    : Implementation that handles individual cache tier
// PersistentTieredCache  : Implementation that handles all tiers as a logical
//                          unit
//
// PersistentTieredCache architecture:
// +--------------------------+ PersistentCacheTier that handles multiple tiers
// | +----------------+       |
// | | RAM            | PersistentCacheTier that handles RAM (VolatileCacheImpl)
// | +----------------+       |
// |   | next                 |
// |   v                      |
// | +----------------+       |
// | | NVM            | PersistentCacheTier implementation that handles NVM
// | +----------------+ (BlockCacheImpl)
// |   | next                 |
// |   V                      |
// | +----------------+       |
// | | LE-SSD         | PersistentCacheTier implementation that handles LE-SSD
// | +----------------+ (BlockCacheImpl)
// |   |                      |
// |   V                      |
// |  null                    |
// +--------------------------+
//               |
//               V
//              null
namespace rocksdb {

// Persistent Cache Config
//
// This struct captures all the options that are used to configure persistent
// cache. Some of the terminologies used in naming the options are
//
// dispatch size :
// This is the size in which IO is dispatched to the device
//
// write buffer size :
// This is the size of an individual write buffer. Write buffers are
// grouped to form a buffered file.
//
// cache size :
// This is the logical maximum for the cache size
//
// qdepth :
// This is the max number of IOs that can be issued to the device in parallel
//
// pipelining :
// The writer code path follows pipelined architecture, which means the
// operations are handed off from one stage to another
//
// pipelining backlog size :
// With the pipelined architecture, there can always be backlogging of ops in
// pipeline queues. This is the maximum backlog size after which ops are dropped
// from queue
struct PersistentCacheConfig {
  // Build a configuration from the mandatory settings.
  //
  // _env               : Env abstraction used for system level operations
  // _path              : path where the cache blocks are persisted
  // _cache_size        : logical cache size
  // _log               : log handle for logging messages
  // _write_buffer_size : value used for both write_buffer_size and
  //                      writer_dispatch_size (default 1MB)
  //
  // Every other option keeps its in-class default. Nothing is validated here;
  // call ValidateSettings() before using the configuration.
  explicit PersistentCacheConfig(
      Env* const _env, const std::string& _path, const uint64_t _cache_size,
      const std::shared_ptr<Logger>& _log,
      const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/)
      : env(_env),
        path(_path),
        log(_log),
        cache_size(_cache_size),
        write_buffer_size(_write_buffer_size),
        writer_dispatch_size(_write_buffer_size) {}

  //
  // Validate the settings. Our intention is to catch erroneous settings ahead
  // of time instead of violating invariants or causing dead locks.
  //
  // Returns Status::OK() when the settings are consistent and
  // Status::InvalidArgument() otherwise.
  //
  Status ValidateSettings() const {
    // (1) check pre-conditions for variables
    if (!env || path.empty()) {
      return Status::InvalidArgument("empty or null args");
    }

    // (2) check size related invariants
    // - write buffer size cannot be 0; this is tested first because
    //   write_buffer_count() divides by it
    // - cache size cannot be less than cache file size
    // - individual write buffer size must be smaller than cache file size
    if (!write_buffer_size || cache_size < cache_file_size ||
        write_buffer_size >= cache_file_size) {
      return Status::InvalidArgument("invalid cache size");
    }

    // - total write buffer size cannot be less than 2X cache file size
    // Both sides are computed in 64 bits so that a cache file size of 2GiB or
    // more cannot wrap around in 32-bit arithmetic.
    const uint64_t total_write_buffer_size =
        static_cast<uint64_t>(write_buffer_size) * write_buffer_count();
    const uint64_t min_write_buffer_size =
        2 * static_cast<uint64_t>(cache_file_size);
    if (total_write_buffer_size < min_write_buffer_size) {
      return Status::InvalidArgument("invalid cache size");
    }

    // (3) check writer settings
    // - Queue depth cannot be 0
    // - writer_dispatch_size cannot be 0; it is the divisor of the alignment
    //   check below
    // - writer_dispatch_size cannot be greater than write_buffer_size
    // - write_buffer_size must be a multiple of writer_dispatch_size
    if (!writer_qdepth || !writer_dispatch_size ||
        writer_dispatch_size > write_buffer_size ||
        write_buffer_size % writer_dispatch_size) {
      return Status::InvalidArgument("invalid writer settings");
    }

    return Status::OK();
  }

  //
  // Env abstraction to use for system level operations
  //
  Env* env;

  //
  // Path for the block cache where blocks are persisted
  //
  std::string path;

  //
  // Log handle for logging messages
  //
  std::shared_ptr<Logger> log;

  //
  // Enable direct IO for reading
  //
  bool enable_direct_reads = true;

  //
  // Enable direct IO for writing
  //
  bool enable_direct_writes = false;

  //
  // Logical cache size
  //
  uint64_t cache_size = std::numeric_limits<uint64_t>::max();

  // cache-file-size
  //
  // Cache consists of multiples of small files. This parameter defines the
  // size of an individual cache file
  //
  // default: 100M
  uint32_t cache_file_size = 100ULL * 1024 * 1024;

  // writer-qdepth
  //
  // The writers can issue IO to the devices in parallel. This parameter
  // controls the max number of IOs that can be issued in parallel to the block
  // device
  //
  // default: 1
  uint32_t writer_qdepth = 1;

  // pipeline-writes
  //
  // The writes optionally follow a pipelined architecture. This helps
  // avoid regression in the eviction code path of the primary tier. This
  // parameter defines if pipelining is enabled or disabled
  //
  // default: true
  bool pipeline_writes = true;

  // max-write-pipeline-backlog-size
  //
  // Max pipeline buffer size. This is the maximum backlog we can accumulate
  // while waiting for writes. After the limit, new ops will be dropped.
  //
  // Default: 1GiB
  uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;

  // write-buffer-size
  //
  // This is the size in which buffer slabs are allocated.
  //
  // Default: 1M
  uint32_t write_buffer_size = 1ULL * 1024 * 1024;

  // write-buffer-count
  //
  // This is the total number of buffer slabs. This is calculated as a factor of
  // file size in order to avoid dead lock:
  //   (writer_qdepth + 1.2) * cache_file_size / write_buffer_size
  // evaluated in floating point and truncated to size_t.
  //
  // Requires write_buffer_size != 0 (asserted).
  size_t write_buffer_count() const {
    assert(write_buffer_size);
    return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
                               write_buffer_size);
  }

  // writer-dispatch-size
  //
  // The writer thread will dispatch the IO at the specified IO size
  //
  // default: 1M
  uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;

  // is_compressed
  //
  // This option determines if the cache will run in compressed mode or
  // uncompressed mode
  //
  // default: true
  bool is_compressed = true;

  // Declared only; the definition is not part of this header.
  PersistentCacheConfig MakePersistentCacheConfig(
      const std::string& path, const uint64_t size,
      const std::shared_ptr<Logger>& log);

  // Declared only; the definition is not part of this header.
  std::string ToString() const;
};

// Persistent Cache Tier
//
// This is a logical abstraction that defines a tier of the persistent cache.
// Tiers can be stacked over one another. PersistentCache provides the basic
// definition for accessing/storing in the cache. PersistentCacheTier extends
// the interface to enable management and stacking of tiers.
class PersistentCacheTier : public PersistentCache {
 public:
  typedef std::shared_ptr<PersistentCacheTier> Tier;

  virtual ~PersistentCacheTier() {}

  // Open the persistent cache tier
  virtual Status Open();

  // Close the persistent cache tier
  virtual Status Close();

  // Reserve space up to 'size' bytes
  virtual bool Reserve(const size_t size);

  // Erase a key from the cache
  virtual bool Erase(const Slice& key);

  // Print stats to string recursively
  virtual std::string PrintStats();

  virtual PersistentCache::StatsType Stats() override;

  // Insert to page cache
  virtual Status Insert(const Slice& page_key, const char* data,
                        const size_t size) override = 0;

  // Lookup page cache by page identifier
  virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                        size_t* size) override = 0;

  // Does it store compressed data ?
  virtual bool IsCompressed() override = 0;

  virtual std::string GetPrintableOptions() const override = 0;

  // Return a reference to next tier
  virtual Tier& next_tier() { return next_tier_; }

  // Set the value for next tier
  virtual void set_next_tier(const Tier& tier) {
    assert(!next_tier_);
    next_tier_ = tier;
  }

  virtual void TEST_Flush() {
    if (next_tier_) {
      next_tier_->TEST_Flush();
    }
  }

 private:
  Tier next_tier_;  // next tier
};

// PersistentTieredCache
//
// Abstraction that helps you construct tiers of persistent caches as a
// unified cache. The tier(s) of cache will act as a single tier for ease of
// management and support PersistentCache methods for accessing data.
class PersistentTieredCache : public PersistentCacheTier {
 public:
  virtual ~PersistentTieredCache();

  Status Open() override;
  Status Close() override;
  bool Erase(const Slice& key) override;
  std::string PrintStats() override;
  PersistentCache::StatsType Stats() override;
  Status Insert(const Slice& page_key, const char* data,
                const size_t size) override;
  Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
                size_t* size) override;
  bool IsCompressed() override;

  std::string GetPrintableOptions() const override {
    return "PersistentTieredCache";
  }

  void AddTier(const Tier& tier);

  Tier& next_tier() override {
    auto it = tiers_.end();
    return (*it)->next_tier();
  }

  void set_next_tier(const Tier& tier) override {
    auto it = tiers_.end();
    (*it)->set_next_tier(tier);
  }

  void TEST_Flush() override {
    assert(!tiers_.empty());
    tiers_.front()->TEST_Flush();
    PersistentCacheTier::TEST_Flush();
  }

 protected:
  std::list<Tier> tiers_;  // list of tiers top-down
};

}  // namespace rocksdb

#endif
Commit	Line	Data
7c673cae	1	// Copyright (c) 2013, Facebook, Inc. All rights reserved.
11fdf7f2 TL	2	// This source code is licensed under both the GPLv2 (found in the
	3	// COPYING file in the root directory) and Apache 2.0 License
	4	// (found in the LICENSE.Apache file in the root directory).
7c673cae FG	5	//
	6	#pragma once
	7
	8	#ifndef ROCKSDB_LITE
	9
	10	#include <limits>
	11	#include <list>
	12	#include <map>
	13	#include <string>
	14	#include <vector>
	15
	16	#include "monitoring/histogram.h"
	17	#include "rocksdb/env.h"
	18	#include "rocksdb/persistent_cache.h"
	19	#include "rocksdb/status.h"
	20
	21	// Persistent Cache
	22	//
	23	// Persistent cache is tiered key-value cache that can use persistent medium. It
	24	// is a generic design and can leverage any storage medium -- disk/SSD/NVM/RAM.
	25	// The code has been kept generic but significant benchmark/design/development
	26	// time has been spent to make sure the cache performs appropriately for
	27	// respective storage medium.
	28	// The file defines
	29	// PersistentCacheTier : Implementation that handles individual cache tier
	30	// PersistentTieresCache : Implementation that handles all tiers as a logical
	31	// unit
	32	//
	33	// PersistentTieredCache architecture:
	34	// +--------------------------+ PersistentCacheTier that handles multiple tiers
	35	// \| +----------------+ \|
	36	// \| \| RAM \| PersistentCacheTier that handles RAM (VolatileCacheImpl)
	37	// \| +----------------+ \|
	38	// \| \| next \|
	39	// \| v \|
	40	// \| +----------------+ \|
	41	// \| \| NVM \| PersistentCacheTier implementation that handles NVM
	42	// \| +----------------+ (BlockCacheImpl)
	43	// \| \| next \|
	44	// \| V \|
	45	// \| +----------------+ \|
	46	// \| \| LE-SSD \| PersistentCacheTier implementation that handles LE-SSD
	47	// \| +----------------+ (BlockCacheImpl)
	48	// \| \| \|
	49	// \| V \|
	50	// \| null \|
	51	// +--------------------------+
	52	// \|
	53	// V
	54	// null
	55	namespace rocksdb {
	56
	57	// Persistent Cache Config
	58	//
	59	// This struct captures all the options that are used to configure persistent
	60	// cache. Some of the terminologies used in naming the options are
	61	//
	62	// dispatch size :
	63	// This is the size in which IO is dispatched to the device
	64	//
	65	// write buffer size :
	66	// This is the size of an individual write buffer size. Write buffers are
	67	// grouped to form buffered file.
	68	//
69	// cache size :
70	// This is the logical maximum for the cache size
71	//
72	// qdepth :
73	// This is the max number of IOs that can issues to the device in parallel
74	//
75	// pepeling :
76	// The writer code path follows pipelined architecture, which means the
77	// operations are handed off from one stage to another
78	//
79	// pipelining backlog size :
80	// With the pipelined architecture, there can always be backlogging of ops in
81	// pipeline queues. This is the maximum backlog size after which ops are dropped
82	// from queue
83	struct PersistentCacheConfig {
84	explicit PersistentCacheConfig(
85	Env* const _env, const std::string& _path, const uint64_t _cache_size,
86	const std::shared_ptr<Logger>& _log,
87	const uint32_t _write_buffer_size = 1 * 1024 * 1024 /1MB/) {
88	env = _env;
89	path = _path;
90	log = _log;
91	cache_size = _cache_size;
92	writer_dispatch_size = write_buffer_size = _write_buffer_size;
93	}
94
95	//
96	// Validate the settings. Our intentions are to catch erroneous settings ahead
97	// of time instead going violating invariants or causing dead locks.
98	//
99	Status ValidateSettings() const {
100	// (1) check pre-conditions for variables
101	if (!env \|\| path.empty()) {
102	return Status::InvalidArgument("empty or null args");
103	}
104
105	// (2) assert size related invariants
106	// - cache size cannot be less than cache file size
107	// - individual write buffer size cannot be greater than cache file size
108	// - total write buffer size cannot be less than 2X cache file size
109	if (cache_size < cache_file_size \|\| write_buffer_size >= cache_file_size \|\|
110	write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
111	return Status::InvalidArgument("invalid cache size");
112	}
113
114	// (2) check writer settings
115	// - Queue depth cannot be 0
116	// - writer_dispatch_size cannot be greater than writer_buffer_size
117	// - dispatch size and buffer size need to be aligned
118	if (!writer_qdepth \|\| writer_dispatch_size > write_buffer_size \|\|
119	write_buffer_size % writer_dispatch_size) {
120	return Status::InvalidArgument("invalid writer settings");
121	}
122
123	return Status::OK();
124	}
125
126	//
127	// Env abstraction to use for systmer level operations
128	//
129	Env* env;
130
131	//
132	// Path for the block cache where blocks are persisted
133	//
134	std::string path;
135
136	//
137	// Log handle for logging messages
138	//
139	std::shared_ptr<Logger> log;
140
141	//
142	// Enable direct IO for reading
143	//
144	bool enable_direct_reads = true;
145
146	//
147	// Enable direct IO for writing
148	//
149	bool enable_direct_writes = false;
150
151	//
152	// Logical cache size
153	//
154	uint64_t cache_size = std::numeric_limits<uint64_t>::max();
155
156	// cache-file-size
157	//
158	// Cache consists of multiples of small files. This parameter defines the
159	// size of an individual cache file
160	//
161	// default: 1M
162	uint32_t cache_file_size = 100ULL * 1024 * 1024;
163
164	// writer-qdepth
165	//
166	// The writers can issues IO to the devices in parallel. This parameter
167	// controls the max number if IOs that can issues in parallel to the block
168	// device
169	//
170	// default :1
171	uint32_t writer_qdepth = 1;
172
173	// pipeline-writes
174	//
175	// The write optionally follow pipelined architecture. This helps
176	// avoid regression in the eviction code path of the primary tier. This
177	// parameter defines if pipelining is enabled or disabled
178	//
179	// default: true
180	bool pipeline_writes = true;
181
182	// max-write-pipeline-backlog-size
183	//
184	// Max pipeline buffer size. This is the maximum backlog we can accumulate
185	// while waiting for writes. After the limit, new ops will be dropped.
186	//
187	// Default: 1GiB
188	uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
189
190	// write-buffer-size
191	//
192	// This is the size in which buffer slabs are allocated.
193	//
194	// Default: 1M
195	uint32_t write_buffer_size = 1ULL * 1024 * 1024;
196
197	// write-buffer-count
198	//
199	// This is the total number of buffer slabs. This is calculated as a factor of
200	// file size in order to avoid dead lock.
201	size_t write_buffer_count() const {
202	assert(write_buffer_size);
203	return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
204	write_buffer_size);
205	}
206
207	// writer-dispatch-size
208	//
209	// The writer thread will dispatch the IO at the specified IO size
210	//
211	// default: 1M
212	uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
213
214	// is_compressed
215	//
216	// This option determines if the cache will run in compressed mode or
217	// uncompressed mode
218	bool is_compressed = true;
219
220	PersistentCacheConfig MakePersistentCacheConfig(
221	const std::string& path, const uint64_t size,
222	const std::shared_ptr<Logger>& log);
223
224	std::string ToString() const;
225	};
226
227	// Persistent Cache Tier
228	//
229	// This a logical abstraction that defines a tier of the persistent cache. Tiers
230	// can be stacked over one another. PersistentCahe provides the basic definition
231	// for accessing/storing in the cache. PersistentCacheTier extends the interface
232	// to enable management and stacking of tiers.
233	class PersistentCacheTier : public PersistentCache {
234	public:
235	typedef std::shared_ptr<PersistentCacheTier> Tier;
236
237	virtual ~PersistentCacheTier() {}
238
239	// Open the persistent cache tier
240	virtual Status Open();
241
242	// Close the persistent cache tier
243	virtual Status Close();
244
245	// Reserve space up to 'size' bytes
246	virtual bool Reserve(const size_t size);
247
248	// Erase a key from the cache
249	virtual bool Erase(const Slice& key);
250
251	// Print stats to string recursively
252	virtual std::string PrintStats();
253
11fdf7f2	254	virtual PersistentCache::StatsType Stats() override;
7c673cae FG	255
	256	// Insert to page cache
	257	virtual Status Insert(const Slice& page_key, const char* data,
11fdf7f2	258	const size_t size) override = 0;
7c673cae FG	259
	260	// Lookup page cache by page identifier
	261	virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
11fdf7f2	262	size_t* size) override = 0;
7c673cae FG	263
7c673cae FG	264	// Does it store compressed data ?
11fdf7f2	265	virtual bool IsCompressed() override = 0;
7c673cae	266
11fdf7f2	267	virtual std::string GetPrintableOptions() const override = 0;
7c673cae FG	268
	269	// Return a reference to next tier
	270	virtual Tier& next_tier() { return next_tier_; }
	271
	272	// Set the value for next tier
	273	virtual void set_next_tier(const Tier& tier) {
	274	assert(!next_tier_);
	275	next_tier_ = tier;
	276	}
	277
	278	virtual void TEST_Flush() {
	279	if (next_tier_) {
	280	next_tier_->TEST_Flush();
	281	}
	282	}
	283
	284	private:
	285	Tier next_tier_; // next tier
	286	};
	287
	288	// PersistentTieredCache
	289	//
	290	// Abstraction that helps you construct a tiers of persistent caches as a
	291	// unified cache. The tier(s) of cache will act a single tier for management
	292	// ease and support PersistentCache methods for accessing data.
	293	class PersistentTieredCache : public PersistentCacheTier {
	294	public:
	295	virtual ~PersistentTieredCache();
	296
	297	Status Open() override;
	298	Status Close() override;
	299	bool Erase(const Slice& key) override;
	300	std::string PrintStats() override;
	301	PersistentCache::StatsType Stats() override;
	302	Status Insert(const Slice& page_key, const char* data,
	303	const size_t size) override;
	304	Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
	305	size_t* size) override;
	306	bool IsCompressed() override;
	307
	308	std::string GetPrintableOptions() const override {
	309	return "PersistentTieredCache";
	310	}
	311
	312	void AddTier(const Tier& tier);
	313
	314	Tier& next_tier() override {
	315	auto it = tiers_.end();
	316	return (*it)->next_tier();
	317	}
	318
	319	void set_next_tier(const Tier& tier) override {
	320	auto it = tiers_.end();
	321	(*it)->set_next_tier(tier);
	322	}
	323
	324	void TEST_Flush() override {
	325	assert(!tiers_.empty());
	326	tiers_.front()->TEST_Flush();
	327	PersistentCacheTier::TEST_Flush();
	328	}
	329
	330	protected:
	331	std::list<Tier> tiers_; // list of tiers top-down
332	};
333
334	} // namespace rocksdb
335
336	#endif