]> git.proxmox.com Git - ceph.git/blame - ceph/src/kv/MemDB.cc
buildsys: change download over to reef release
[ceph.git] / ceph / src / kv / MemDB.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * In-memory crash non-safe keyvalue db
5 * Author: Ramesh Chander, Ramesh.Chander@sandisk.com
6 */
7
8#include "include/compat.h"
20effc67 9#include <filesystem>
7c673cae
FG
10#include <set>
11#include <map>
12#include <string>
13#include <memory>
14#include <errno.h>
15#include <unistd.h>
16#include <sys/types.h>
17#include <sys/stat.h>
18
19#include "common/perf_counters.h"
20#include "common/debug.h"
21#include "include/str_list.h"
22#include "include/str_map.h"
23#include "KeyValueDB.h"
24#include "MemDB.h"
25
11fdf7f2 26#include "include/ceph_assert.h"
7c673cae
FG
27#include "common/debug.h"
28#include "common/errno.h"
11fdf7f2
TL
29#include "include/buffer.h"
30#include "include/buffer_raw.h"
7c673cae
FG
31#include "include/compat.h"
32
33#define dout_context g_ceph_context
34#define dout_subsys ceph_subsys_memdb
35#undef dout_prefix
36#define dout_prefix *_dout << "memdb: "
37#define dtrace dout(30)
38#define dwarn dout(0)
39#define dinfo dout(0)
40
20effc67
TL
41namespace fs = std::filesystem;
42
f67539c2
TL
43using std::cerr;
44using std::ostream;
45using std::string;
46using std::vector;
47
48using ceph::bufferlist;
49using ceph::bufferptr;
50using ceph::decode;
51using ceph::encode;
52
7c673cae
FG
53static void split_key(const string& raw_key, string *prefix, string *key)
54{
55 size_t pos = raw_key.find(KEY_DELIM, 0);
11fdf7f2 56 ceph_assert(pos != std::string::npos);
7c673cae
FG
57 *prefix = raw_key.substr(0, pos);
58 *key = raw_key.substr(pos + 1, raw_key.length());
59}
60
61static string make_key(const string &prefix, const string &value)
62{
63 string out = prefix;
64 out.push_back(KEY_DELIM);
65 out.append(value);
66 return out;
67}
68
69void MemDB::_encode(mdb_iter_t iter, bufferlist &bl)
70{
11fdf7f2
TL
71 encode(iter->first, bl);
72 encode(iter->second, bl);
7c673cae
FG
73}
74
75std::string MemDB::_get_data_fn()
76{
77 string fn = m_db_path + "/" + "MemDB.db";
78 return fn;
79}
80
81void MemDB::_save()
82{
83 std::lock_guard<std::mutex> l(m_lock);
84 dout(10) << __func__ << " Saving MemDB to file: "<< _get_data_fn().c_str() << dendl;
85 int mode = 0644;
86 int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(),
91327a77 87 O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode));
7c673cae
FG
88 if (fd < 0) {
89 int err = errno;
90 cerr << "write_file(" << _get_data_fn().c_str() << "): failed to open file: "
91 << cpp_strerror(err) << std::endl;
92 return;
93 }
94 bufferlist bl;
95 mdb_iter_t iter = m_map.begin();
96 while (iter != m_map.end()) {
97 dout(10) << __func__ << " Key:"<< iter->first << dendl;
98 _encode(iter, bl);
99 ++iter;
100 }
101 bl.write_fd(fd);
102
103 VOID_TEMP_FAILURE_RETRY(::close(fd));
104}
105
106int MemDB::_load()
107{
108 std::lock_guard<std::mutex> l(m_lock);
109 dout(10) << __func__ << " Reading MemDB from file: "<< _get_data_fn().c_str() << dendl;
110 /*
111 * Open file and read it in single shot.
112 */
91327a77 113 int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(), O_RDONLY|O_CLOEXEC));
7c673cae
FG
114 if (fd < 0) {
115 int err = errno;
116 cerr << "can't open " << _get_data_fn().c_str() << ": "
117 << cpp_strerror(err) << std::endl;
118 return -err;
119 }
120
121 struct stat st;
122 memset(&st, 0, sizeof(st));
123 if (::fstat(fd, &st) < 0) {
124 int err = errno;
125 cerr << "can't stat file " << _get_data_fn().c_str() << ": "
126 << cpp_strerror(err) << std::endl;
127 VOID_TEMP_FAILURE_RETRY(::close(fd));
128 return -err;
129 }
130
131 ssize_t file_size = st.st_size;
132 ssize_t bytes_done = 0;
133 while (bytes_done < file_size) {
134 string key;
135 bufferptr datap;
136
f67539c2
TL
137 bytes_done += ceph::decode_file(fd, key);
138 bytes_done += ceph::decode_file(fd, datap);
7c673cae
FG
139
140 dout(10) << __func__ << " Key:"<< key << dendl;
141 m_map[key] = datap;
142 m_total_bytes += datap.length();
143 }
144 VOID_TEMP_FAILURE_RETRY(::close(fd));
145 return 0;
146}
147
148int MemDB::_init(bool create)
149{
f67539c2 150 int r = 0;
7c673cae
FG
151 dout(1) << __func__ << dendl;
152 if (create) {
f67539c2 153 if (fs::exists(m_db_path)) {
11fdf7f2 154 r = 0; // ignore EEXIST
f67539c2
TL
155 } else {
156 std::error_code ec;
157 if (!fs::create_directory(m_db_path, ec)) {
158 derr << __func__ << " mkdir failed: " << ec.message() << dendl;
159 return -ec.value();
160 }
161 fs::permissions(m_db_path, fs::perms::owner_all);
7c673cae
FG
162 }
163 } else {
164 r = _load();
165 }
166
11fdf7f2
TL
167 PerfCountersBuilder plb(g_ceph_context, "memdb", l_memdb_first, l_memdb_last);
168 plb.add_u64_counter(l_memdb_gets, "get", "Gets");
169 plb.add_u64_counter(l_memdb_txns, "submit_transaction", "Submit transactions");
170 plb.add_time_avg(l_memdb_get_latency, "get_latency", "Get latency");
171 plb.add_time_avg(l_memdb_submit_latency, "submit_latency", "Submit Latency");
172 logger = plb.create_perf_counters();
173 m_cct->get_perfcounters_collection()->add(logger);
174
7c673cae
FG
175 return r;
176}
177
178int MemDB::set_merge_operator(
179 const string& prefix,
180 std::shared_ptr<KeyValueDB::MergeOperator> mop)
181{
182 merge_ops.push_back(std::make_pair(prefix, mop));
183 return 0;
184}
185
186int MemDB::do_open(ostream &out, bool create)
187{
188 m_total_bytes = 0;
189 m_allocated_bytes = 1;
190
191 return _init(create);
192}
193
f67539c2 194int MemDB::open(ostream &out, const std::string& cfs) {
11fdf7f2
TL
195 if (!cfs.empty()) {
196 ceph_abort_msg("Not implemented");
197 }
198 return do_open(out, false);
199}
200
f67539c2 201int MemDB::create_and_open(ostream &out, const std::string& cfs) {
11fdf7f2
TL
202 if (!cfs.empty()) {
203 ceph_abort_msg("Not implemented");
204 }
205 return do_open(out, true);
206}
207
7c673cae
FG
208MemDB::~MemDB()
209{
210 close();
211 dout(10) << __func__ << " Destroying MemDB instance: "<< dendl;
212}
213
214void MemDB::close()
215{
216 /*
217 * Save whatever in memory btree.
218 */
219 _save();
11fdf7f2
TL
220 if (logger)
221 m_cct->get_perfcounters_collection()->remove(logger);
7c673cae
FG
222}
223
224int MemDB::submit_transaction(KeyValueDB::Transaction t)
225{
11fdf7f2
TL
226 utime_t start = ceph_clock_now();
227
7c673cae
FG
228 MDBTransactionImpl* mt = static_cast<MDBTransactionImpl*>(t.get());
229
230 dtrace << __func__ << " " << mt->get_ops().size() << dendl;
231 for(auto& op : mt->get_ops()) {
232 if(op.first == MDBTransactionImpl::WRITE) {
233 ms_op_t set_op = op.second;
234 _setkey(set_op);
235 } else if (op.first == MDBTransactionImpl::MERGE) {
236 ms_op_t merge_op = op.second;
237 _merge(merge_op);
238 } else {
239 ms_op_t rm_op = op.second;
11fdf7f2 240 ceph_assert(op.first == MDBTransactionImpl::DELETE);
7c673cae
FG
241 _rmkey(rm_op);
242 }
243 }
244
11fdf7f2
TL
245 utime_t lat = ceph_clock_now() - start;
246 logger->inc(l_memdb_txns);
247 logger->tinc(l_memdb_submit_latency, lat);
248
7c673cae
FG
249 return 0;
250}
251
252int MemDB::submit_transaction_sync(KeyValueDB::Transaction tsync)
253{
254 dtrace << __func__ << " " << dendl;
255 submit_transaction(tsync);
256 return 0;
257}
258
259int MemDB::transaction_rollback(KeyValueDB::Transaction t)
260{
261 MDBTransactionImpl* mt = static_cast<MDBTransactionImpl*>(t.get());
262 mt->clear();
263 return 0;
264}
265
266void MemDB::MDBTransactionImpl::set(
267 const string &prefix, const string &k, const bufferlist &to_set_bl)
268{
269 dtrace << __func__ << " " << prefix << " " << k << dendl;
270 ops.push_back(make_pair(WRITE, std::make_pair(std::make_pair(prefix, k),
271 to_set_bl)));
272}
273
274void MemDB::MDBTransactionImpl::rmkey(const string &prefix,
275 const string &k)
276{
277 dtrace << __func__ << " " << prefix << " " << k << dendl;
278 ops.push_back(make_pair(DELETE,
279 std::make_pair(std::make_pair(prefix, k),
280 bufferlist())));
281}
282
283void MemDB::MDBTransactionImpl::rmkeys_by_prefix(const string &prefix)
284{
285 KeyValueDB::Iterator it = m_db->get_iterator(prefix);
286 for (it->seek_to_first(); it->valid(); it->next()) {
287 rmkey(prefix, it->key());
288 }
289}
290
291void MemDB::MDBTransactionImpl::rm_range_keys(const string &prefix, const string &start, const string &end)
292{
293 KeyValueDB::Iterator it = m_db->get_iterator(prefix);
294 it->lower_bound(start);
295 while (it->valid()) {
296 if (it->key() >= end) {
297 break;
298 }
299 rmkey(prefix, it->key());
300 it->next();
301 }
302}
303
304void MemDB::MDBTransactionImpl::merge(
305 const std::string &prefix, const std::string &key, const bufferlist &value)
306{
307
308 dtrace << __func__ << " " << prefix << " " << key << dendl;
309 ops.push_back(make_pair(MERGE, make_pair(std::make_pair(prefix, key), value)));
310 return;
311}
312
313int MemDB::_setkey(ms_op_t &op)
314{
315 std::lock_guard<std::mutex> l(m_lock);
316 std::string key = make_key(op.first.first, op.first.second);
317 bufferlist bl = op.second;
318
319 m_total_bytes += bl.length();
320
321 bufferlist bl_old;
322 if (_get(op.first.first, op.first.second, &bl_old)) {
323 /*
324 * delete and free existing key.
325 */
11fdf7f2 326 ceph_assert(m_total_bytes >= bl_old.length());
7c673cae
FG
327 m_total_bytes -= bl_old.length();
328 m_map.erase(key);
329 }
330
331 m_map[key] = bufferptr((char *) bl.c_str(), bl.length());
332 iterator_seq_no++;
333 return 0;
334}
335
336int MemDB::_rmkey(ms_op_t &op)
337{
338 std::lock_guard<std::mutex> l(m_lock);
339 std::string key = make_key(op.first.first, op.first.second);
340
341 bufferlist bl_old;
342 if (_get(op.first.first, op.first.second, &bl_old)) {
11fdf7f2 343 ceph_assert(m_total_bytes >= bl_old.length());
7c673cae
FG
344 m_total_bytes -= bl_old.length();
345 }
346 iterator_seq_no++;
347 /*
348 * Erase will call the destructor for bufferptr.
349 */
350 return m_map.erase(key);
351}
352
11fdf7f2 353std::shared_ptr<KeyValueDB::MergeOperator> MemDB::_find_merge_op(const std::string &prefix)
7c673cae
FG
354{
355 for (const auto& i : merge_ops) {
356 if (i.first == prefix) {
357 return i.second;
358 }
359 }
360
361 dtrace << __func__ << " No merge op for " << prefix << dendl;
362 return NULL;
363}
364
365
366int MemDB::_merge(ms_op_t &op)
367{
368 std::lock_guard<std::mutex> l(m_lock);
369 std::string prefix = op.first.first;
370 std::string key = make_key(op.first.first, op.first.second);
371 bufferlist bl = op.second;
372 int64_t bytes_adjusted = bl.length();
373
374 /*
375 * find the operator for this prefix
376 */
377 std::shared_ptr<MergeOperator> mop = _find_merge_op(prefix);
11fdf7f2 378 ceph_assert(mop);
7c673cae
FG
379
380 /*
381 * call the merge operator with value and non value
382 */
383 bufferlist bl_old;
384 if (_get(op.first.first, op.first.second, &bl_old) == false) {
385 std::string new_val;
386 /*
387 * Merge non existent.
388 */
389 mop->merge_nonexistent(bl.c_str(), bl.length(), &new_val);
390 m_map[key] = bufferptr(new_val.c_str(), new_val.length());
391 } else {
392 /*
393 * Merge existing.
394 */
395 std::string new_val;
396 mop->merge(bl_old.c_str(), bl_old.length(), bl.c_str(), bl.length(), &new_val);
397 m_map[key] = bufferptr(new_val.c_str(), new_val.length());
398 bytes_adjusted -= bl_old.length();
399 bl_old.clear();
400 }
401
11fdf7f2 402 ceph_assert((int64_t)m_total_bytes + bytes_adjusted >= 0);
7c673cae
FG
403 m_total_bytes += bytes_adjusted;
404 iterator_seq_no++;
405 return 0;
406}
407
408/*
409 * Caller take btree lock.
410 */
411bool MemDB::_get(const string &prefix, const string &k, bufferlist *out)
412{
413 string key = make_key(prefix, k);
414
415 mdb_iter_t iter = m_map.find(key);
416 if (iter == m_map.end()) {
417 return false;
418 }
419
420 out->push_back((m_map[key].clone()));
421 return true;
422}
423
424bool MemDB::_get_locked(const string &prefix, const string &k, bufferlist *out)
425{
426 std::lock_guard<std::mutex> l(m_lock);
427 return _get(prefix, k, out);
428}
429
430
431int MemDB::get(const string &prefix, const std::string& key,
432 bufferlist *out)
433{
11fdf7f2
TL
434 utime_t start = ceph_clock_now();
435 int ret;
436
7c673cae 437 if (_get_locked(prefix, key, out)) {
11fdf7f2
TL
438 ret = 0;
439 } else {
440 ret = -ENOENT;
7c673cae 441 }
11fdf7f2
TL
442
443 utime_t lat = ceph_clock_now() - start;
444 logger->inc(l_memdb_gets);
445 logger->tinc(l_memdb_get_latency, lat);
446
447 return ret;
7c673cae
FG
448}
449
450int MemDB::get(const string &prefix, const std::set<string> &keys,
451 std::map<string, bufferlist> *out)
452{
11fdf7f2
TL
453 utime_t start = ceph_clock_now();
454
7c673cae
FG
455 for (const auto& i : keys) {
456 bufferlist bl;
457 if (_get_locked(prefix, i, &bl))
458 out->insert(make_pair(i, bl));
459 }
460
11fdf7f2
TL
461 utime_t lat = ceph_clock_now() - start;
462 logger->inc(l_memdb_gets);
463 logger->tinc(l_memdb_get_latency, lat);
464
7c673cae
FG
465 return 0;
466}
467
468void MemDB::MDBWholeSpaceIteratorImpl::fill_current()
469{
470 bufferlist bl;
11fdf7f2 471 bl.push_back(m_iter->second.clone());
7c673cae
FG
472 m_key_value = std::make_pair(m_iter->first, bl);
473}
474
475bool MemDB::MDBWholeSpaceIteratorImpl::valid()
476{
477 if (m_key_value.first.empty()) {
478 return false;
479 }
480 return true;
481}
482
483bool MemDB::MDBWholeSpaceIteratorImpl::iterator_validate() {
484
485 if (this_seq_no != *global_seq_no) {
486 auto key = m_key_value.first;
11fdf7f2 487 ceph_assert(!key.empty());
7c673cae
FG
488
489 bool restart_iter = false;
490 if (!m_using_btree) {
491 /*
492 * Map is modified and marker key does not exists,
493 * restart the iterator from next key.
494 */
495 if (m_map_p->find(key) == m_map_p->end()) {
496 restart_iter = true;
497 }
498 } else {
499 restart_iter = true;
500 }
501
502 if (restart_iter) {
503 m_iter = m_map_p->lower_bound(key);
504 if (m_iter == m_map_p->end()) {
505 return false;
506 }
507 }
508
509 /*
510 * This iter is valid now.
511 */
512 this_seq_no = *global_seq_no;
513 }
514
515 return true;
516}
517
518void
519MemDB::MDBWholeSpaceIteratorImpl::free_last()
520{
521 m_key_value.first.clear();
522 m_key_value.second.clear();
523}
524
525string MemDB::MDBWholeSpaceIteratorImpl::key()
526{
527 dtrace << __func__ << " " << m_key_value.first << dendl;
528 string prefix, key;
529 split_key(m_key_value.first, &prefix, &key);
530 return key;
531}
532
f67539c2 533std::pair<string,string> MemDB::MDBWholeSpaceIteratorImpl::raw_key()
7c673cae
FG
534{
535 string prefix, key;
536 split_key(m_key_value.first, &prefix, &key);
f67539c2 537 return { prefix, key };
7c673cae
FG
538}
539
540bool MemDB::MDBWholeSpaceIteratorImpl::raw_key_is_prefixed(
541 const string &prefix)
542{
543 string p, k;
544 split_key(m_key_value.first, &p, &k);
545 return (p == prefix);
546}
547
548bufferlist MemDB::MDBWholeSpaceIteratorImpl::value()
549{
550 dtrace << __func__ << " " << m_key_value << dendl;
551 return m_key_value.second;
552}
553
554int MemDB::MDBWholeSpaceIteratorImpl::next()
555{
556 std::lock_guard<std::mutex> l(*m_map_lock_p);
557 if (!iterator_validate()) {
558 free_last();
559 return -1;
560 }
561 free_last();
562 ++m_iter;
563 if (m_iter != m_map_p->end()) {
564 fill_current();
565 return 0;
566 } else {
567 return -1;
568 }
569}
570
571int MemDB::MDBWholeSpaceIteratorImpl:: prev()
572{
573 std::lock_guard<std::mutex> l(*m_map_lock_p);
574 if (!iterator_validate()) {
575 free_last();
576 return -1;
577 }
578 free_last();
579 if (m_iter != m_map_p->begin()) {
580 --m_iter;
581 fill_current();
582 return 0;
583 } else {
584 return -1;
585 }
586}
587
588/*
589 * First key >= to given key, if key is null then first key in btree.
590 */
591int MemDB::MDBWholeSpaceIteratorImpl::seek_to_first(const std::string &k)
592{
593 std::lock_guard<std::mutex> l(*m_map_lock_p);
594 free_last();
595 if (k.empty()) {
596 m_iter = m_map_p->begin();
597 } else {
598 m_iter = m_map_p->lower_bound(k);
599 }
600
601 if (m_iter == m_map_p->end()) {
602 return -1;
603 }
604 fill_current();
605 return 0;
606}
607
608int MemDB::MDBWholeSpaceIteratorImpl::seek_to_last(const std::string &k)
609{
610 std::lock_guard<std::mutex> l(*m_map_lock_p);
611 free_last();
612 if (k.empty()) {
613 m_iter = m_map_p->end();
614 --m_iter;
615 } else {
616 m_iter = m_map_p->lower_bound(k);
617 }
618
619 if (m_iter == m_map_p->end()) {
620 return -1;
621 }
622 fill_current();
623 return 0;
624}
625
626MemDB::MDBWholeSpaceIteratorImpl::~MDBWholeSpaceIteratorImpl()
627{
628 free_last();
629}
630
631int MemDB::MDBWholeSpaceIteratorImpl::upper_bound(const std::string &prefix,
632 const std::string &after) {
633
634 std::lock_guard<std::mutex> l(*m_map_lock_p);
635
636 dtrace << "upper_bound " << prefix.c_str() << after.c_str() << dendl;
637 string k = make_key(prefix, after);
638 m_iter = m_map_p->upper_bound(k);
639 if (m_iter != m_map_p->end()) {
640 fill_current();
641 return 0;
642 }
643 return -1;
644}
645
646int MemDB::MDBWholeSpaceIteratorImpl::lower_bound(const std::string &prefix,
647 const std::string &to) {
648 std::lock_guard<std::mutex> l(*m_map_lock_p);
649 dtrace << "lower_bound " << prefix.c_str() << to.c_str() << dendl;
650 string k = make_key(prefix, to);
651 m_iter = m_map_p->lower_bound(k);
652 if (m_iter != m_map_p->end()) {
653 fill_current();
654 return 0;
655 }
656 return -1;
657}