]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * In-memory, non-crash-safe key/value DB | |
5 | * Author: Ramesh Chander, Ramesh.Chander@sandisk.com | |
6 | */ | |
7 | ||
8 | #include "include/compat.h" | |
9 | #include <set> | |
10 | #include <map> | |
11 | #include <string> | |
12 | #include <memory> | |
13 | #include <errno.h> | |
14 | #include <unistd.h> | |
15 | #include <sys/types.h> | |
16 | #include <sys/stat.h> | |
17 | ||
18 | #include "common/perf_counters.h" | |
19 | #include "common/debug.h" | |
20 | #include "include/str_list.h" | |
21 | #include "include/str_map.h" | |
22 | #include "KeyValueDB.h" | |
23 | #include "MemDB.h" | |
24 | ||
25 | #include "include/assert.h" | |
26 | #include "common/debug.h" | |
27 | #include "common/errno.h" | |
28 | #include "include/compat.h" | |
29 | ||
30 | #define dout_context g_ceph_context | |
31 | #define dout_subsys ceph_subsys_memdb | |
32 | #undef dout_prefix | |
33 | #define dout_prefix *_dout << "memdb: " | |
34 | #define dtrace dout(30) | |
35 | #define dwarn dout(0) | |
36 | #define dinfo dout(0) | |
37 | ||
38 | static void split_key(const string& raw_key, string *prefix, string *key) | |
39 | { | |
40 | size_t pos = raw_key.find(KEY_DELIM, 0); | |
41 | assert(pos != std::string::npos); | |
42 | *prefix = raw_key.substr(0, pos); | |
43 | *key = raw_key.substr(pos + 1, raw_key.length()); | |
44 | } | |
45 | ||
46 | static string make_key(const string &prefix, const string &value) | |
47 | { | |
48 | string out = prefix; | |
49 | out.push_back(KEY_DELIM); | |
50 | out.append(value); | |
51 | return out; | |
52 | } | |
53 | ||
54 | void MemDB::_encode(mdb_iter_t iter, bufferlist &bl) | |
55 | { | |
56 | ::encode(iter->first, bl); | |
57 | ::encode(iter->second, bl); | |
58 | } | |
59 | ||
60 | std::string MemDB::_get_data_fn() | |
61 | { | |
62 | string fn = m_db_path + "/" + "MemDB.db"; | |
63 | return fn; | |
64 | } | |
65 | ||
66 | void MemDB::_save() | |
67 | { | |
68 | std::lock_guard<std::mutex> l(m_lock); | |
69 | dout(10) << __func__ << " Saving MemDB to file: "<< _get_data_fn().c_str() << dendl; | |
70 | int mode = 0644; | |
71 | int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(), | |
91327a77 | 72 | O_WRONLY|O_CREAT|O_TRUNC|O_CLOEXEC, mode)); |
7c673cae FG |
73 | if (fd < 0) { |
74 | int err = errno; | |
75 | cerr << "write_file(" << _get_data_fn().c_str() << "): failed to open file: " | |
76 | << cpp_strerror(err) << std::endl; | |
77 | return; | |
78 | } | |
79 | bufferlist bl; | |
80 | mdb_iter_t iter = m_map.begin(); | |
81 | while (iter != m_map.end()) { | |
82 | dout(10) << __func__ << " Key:"<< iter->first << dendl; | |
83 | _encode(iter, bl); | |
84 | ++iter; | |
85 | } | |
86 | bl.write_fd(fd); | |
87 | ||
88 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
89 | } | |
90 | ||
91 | int MemDB::_load() | |
92 | { | |
93 | std::lock_guard<std::mutex> l(m_lock); | |
94 | dout(10) << __func__ << " Reading MemDB from file: "<< _get_data_fn().c_str() << dendl; | |
95 | /* | |
96 | * Open file and read it in single shot. | |
97 | */ | |
91327a77 | 98 | int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(), O_RDONLY|O_CLOEXEC)); |
7c673cae FG |
99 | if (fd < 0) { |
100 | int err = errno; | |
101 | cerr << "can't open " << _get_data_fn().c_str() << ": " | |
102 | << cpp_strerror(err) << std::endl; | |
103 | return -err; | |
104 | } | |
105 | ||
106 | struct stat st; | |
107 | memset(&st, 0, sizeof(st)); | |
108 | if (::fstat(fd, &st) < 0) { | |
109 | int err = errno; | |
110 | cerr << "can't stat file " << _get_data_fn().c_str() << ": " | |
111 | << cpp_strerror(err) << std::endl; | |
112 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
113 | return -err; | |
114 | } | |
115 | ||
116 | ssize_t file_size = st.st_size; | |
117 | ssize_t bytes_done = 0; | |
118 | while (bytes_done < file_size) { | |
119 | string key; | |
120 | bufferptr datap; | |
121 | ||
122 | bytes_done += ::decode_file(fd, key); | |
123 | bytes_done += ::decode_file(fd, datap); | |
124 | ||
125 | dout(10) << __func__ << " Key:"<< key << dendl; | |
126 | m_map[key] = datap; | |
127 | m_total_bytes += datap.length(); | |
128 | } | |
129 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
130 | return 0; | |
131 | } | |
132 | ||
133 | int MemDB::_init(bool create) | |
134 | { | |
135 | int r; | |
136 | dout(1) << __func__ << dendl; | |
137 | if (create) { | |
138 | r = ::mkdir(m_db_path.c_str(), 0700); | |
139 | if (r < 0) { | |
140 | r = -errno; | |
141 | if (r != -EEXIST) { | |
142 | derr << __func__ << " mkdir failed: " << cpp_strerror(r) << dendl; | |
143 | return r; | |
144 | } | |
145 | return 0; // ignore EEXIST | |
146 | } | |
147 | } else { | |
148 | r = _load(); | |
149 | } | |
150 | ||
151 | return r; | |
152 | } | |
153 | ||
154 | int MemDB::set_merge_operator( | |
155 | const string& prefix, | |
156 | std::shared_ptr<KeyValueDB::MergeOperator> mop) | |
157 | { | |
158 | merge_ops.push_back(std::make_pair(prefix, mop)); | |
159 | return 0; | |
160 | } | |
161 | ||
162 | int MemDB::do_open(ostream &out, bool create) | |
163 | { | |
164 | m_total_bytes = 0; | |
165 | m_allocated_bytes = 1; | |
166 | ||
167 | return _init(create); | |
168 | } | |
169 | ||
170 | MemDB::~MemDB() | |
171 | { | |
172 | close(); | |
173 | dout(10) << __func__ << " Destroying MemDB instance: "<< dendl; | |
174 | } | |
175 | ||
176 | void MemDB::close() | |
177 | { | |
178 | /* | |
179 | * Save whatever in memory btree. | |
180 | */ | |
181 | _save(); | |
182 | } | |
183 | ||
184 | int MemDB::submit_transaction(KeyValueDB::Transaction t) | |
185 | { | |
186 | MDBTransactionImpl* mt = static_cast<MDBTransactionImpl*>(t.get()); | |
187 | ||
188 | dtrace << __func__ << " " << mt->get_ops().size() << dendl; | |
189 | for(auto& op : mt->get_ops()) { | |
190 | if(op.first == MDBTransactionImpl::WRITE) { | |
191 | ms_op_t set_op = op.second; | |
192 | _setkey(set_op); | |
193 | } else if (op.first == MDBTransactionImpl::MERGE) { | |
194 | ms_op_t merge_op = op.second; | |
195 | _merge(merge_op); | |
196 | } else { | |
197 | ms_op_t rm_op = op.second; | |
198 | assert(op.first == MDBTransactionImpl::DELETE); | |
199 | _rmkey(rm_op); | |
200 | } | |
201 | } | |
202 | ||
203 | return 0; | |
204 | } | |
205 | ||
206 | int MemDB::submit_transaction_sync(KeyValueDB::Transaction tsync) | |
207 | { | |
208 | dtrace << __func__ << " " << dendl; | |
209 | submit_transaction(tsync); | |
210 | return 0; | |
211 | } | |
212 | ||
213 | int MemDB::transaction_rollback(KeyValueDB::Transaction t) | |
214 | { | |
215 | MDBTransactionImpl* mt = static_cast<MDBTransactionImpl*>(t.get()); | |
216 | mt->clear(); | |
217 | return 0; | |
218 | } | |
219 | ||
220 | void MemDB::MDBTransactionImpl::set( | |
221 | const string &prefix, const string &k, const bufferlist &to_set_bl) | |
222 | { | |
223 | dtrace << __func__ << " " << prefix << " " << k << dendl; | |
224 | ops.push_back(make_pair(WRITE, std::make_pair(std::make_pair(prefix, k), | |
225 | to_set_bl))); | |
226 | } | |
227 | ||
228 | void MemDB::MDBTransactionImpl::rmkey(const string &prefix, | |
229 | const string &k) | |
230 | { | |
231 | dtrace << __func__ << " " << prefix << " " << k << dendl; | |
232 | ops.push_back(make_pair(DELETE, | |
233 | std::make_pair(std::make_pair(prefix, k), | |
234 | bufferlist()))); | |
235 | } | |
236 | ||
237 | void MemDB::MDBTransactionImpl::rmkeys_by_prefix(const string &prefix) | |
238 | { | |
239 | KeyValueDB::Iterator it = m_db->get_iterator(prefix); | |
240 | for (it->seek_to_first(); it->valid(); it->next()) { | |
241 | rmkey(prefix, it->key()); | |
242 | } | |
243 | } | |
244 | ||
245 | void MemDB::MDBTransactionImpl::rm_range_keys(const string &prefix, const string &start, const string &end) | |
246 | { | |
247 | KeyValueDB::Iterator it = m_db->get_iterator(prefix); | |
248 | it->lower_bound(start); | |
249 | while (it->valid()) { | |
250 | if (it->key() >= end) { | |
251 | break; | |
252 | } | |
253 | rmkey(prefix, it->key()); | |
254 | it->next(); | |
255 | } | |
256 | } | |
257 | ||
258 | void MemDB::MDBTransactionImpl::merge( | |
259 | const std::string &prefix, const std::string &key, const bufferlist &value) | |
260 | { | |
261 | ||
262 | dtrace << __func__ << " " << prefix << " " << key << dendl; | |
263 | ops.push_back(make_pair(MERGE, make_pair(std::make_pair(prefix, key), value))); | |
264 | return; | |
265 | } | |
266 | ||
267 | int MemDB::_setkey(ms_op_t &op) | |
268 | { | |
269 | std::lock_guard<std::mutex> l(m_lock); | |
270 | std::string key = make_key(op.first.first, op.first.second); | |
271 | bufferlist bl = op.second; | |
272 | ||
273 | m_total_bytes += bl.length(); | |
274 | ||
275 | bufferlist bl_old; | |
276 | if (_get(op.first.first, op.first.second, &bl_old)) { | |
277 | /* | |
278 | * delete and free existing key. | |
279 | */ | |
280 | assert(m_total_bytes >= bl_old.length()); | |
281 | m_total_bytes -= bl_old.length(); | |
282 | m_map.erase(key); | |
283 | } | |
284 | ||
285 | m_map[key] = bufferptr((char *) bl.c_str(), bl.length()); | |
286 | iterator_seq_no++; | |
287 | return 0; | |
288 | } | |
289 | ||
290 | int MemDB::_rmkey(ms_op_t &op) | |
291 | { | |
292 | std::lock_guard<std::mutex> l(m_lock); | |
293 | std::string key = make_key(op.first.first, op.first.second); | |
294 | ||
295 | bufferlist bl_old; | |
296 | if (_get(op.first.first, op.first.second, &bl_old)) { | |
297 | assert(m_total_bytes >= bl_old.length()); | |
298 | m_total_bytes -= bl_old.length(); | |
299 | } | |
300 | iterator_seq_no++; | |
301 | /* | |
302 | * Erase will call the destructor for bufferptr. | |
303 | */ | |
304 | return m_map.erase(key); | |
305 | } | |
306 | ||
307 | std::shared_ptr<KeyValueDB::MergeOperator> MemDB::_find_merge_op(std::string prefix) | |
308 | { | |
309 | for (const auto& i : merge_ops) { | |
310 | if (i.first == prefix) { | |
311 | return i.second; | |
312 | } | |
313 | } | |
314 | ||
315 | dtrace << __func__ << " No merge op for " << prefix << dendl; | |
316 | return NULL; | |
317 | } | |
318 | ||
319 | ||
320 | int MemDB::_merge(ms_op_t &op) | |
321 | { | |
322 | std::lock_guard<std::mutex> l(m_lock); | |
323 | std::string prefix = op.first.first; | |
324 | std::string key = make_key(op.first.first, op.first.second); | |
325 | bufferlist bl = op.second; | |
326 | int64_t bytes_adjusted = bl.length(); | |
327 | ||
328 | /* | |
329 | * find the operator for this prefix | |
330 | */ | |
331 | std::shared_ptr<MergeOperator> mop = _find_merge_op(prefix); | |
332 | assert(mop); | |
333 | ||
334 | /* | |
335 | * call the merge operator with value and non value | |
336 | */ | |
337 | bufferlist bl_old; | |
338 | if (_get(op.first.first, op.first.second, &bl_old) == false) { | |
339 | std::string new_val; | |
340 | /* | |
341 | * Merge non existent. | |
342 | */ | |
343 | mop->merge_nonexistent(bl.c_str(), bl.length(), &new_val); | |
344 | m_map[key] = bufferptr(new_val.c_str(), new_val.length()); | |
345 | } else { | |
346 | /* | |
347 | * Merge existing. | |
348 | */ | |
349 | std::string new_val; | |
350 | mop->merge(bl_old.c_str(), bl_old.length(), bl.c_str(), bl.length(), &new_val); | |
351 | m_map[key] = bufferptr(new_val.c_str(), new_val.length()); | |
352 | bytes_adjusted -= bl_old.length(); | |
353 | bl_old.clear(); | |
354 | } | |
355 | ||
356 | assert((int64_t)m_total_bytes + bytes_adjusted >= 0); | |
357 | m_total_bytes += bytes_adjusted; | |
358 | iterator_seq_no++; | |
359 | return 0; | |
360 | } | |
361 | ||
362 | /* | |
363 | * Caller take btree lock. | |
364 | */ | |
365 | bool MemDB::_get(const string &prefix, const string &k, bufferlist *out) | |
366 | { | |
367 | string key = make_key(prefix, k); | |
368 | ||
369 | mdb_iter_t iter = m_map.find(key); | |
370 | if (iter == m_map.end()) { | |
371 | return false; | |
372 | } | |
373 | ||
374 | out->push_back((m_map[key].clone())); | |
375 | return true; | |
376 | } | |
377 | ||
378 | bool MemDB::_get_locked(const string &prefix, const string &k, bufferlist *out) | |
379 | { | |
380 | std::lock_guard<std::mutex> l(m_lock); | |
381 | return _get(prefix, k, out); | |
382 | } | |
383 | ||
384 | ||
385 | int MemDB::get(const string &prefix, const std::string& key, | |
386 | bufferlist *out) | |
387 | { | |
388 | if (_get_locked(prefix, key, out)) { | |
389 | return 0; | |
390 | } | |
391 | return -ENOENT; | |
392 | } | |
393 | ||
394 | int MemDB::get(const string &prefix, const std::set<string> &keys, | |
395 | std::map<string, bufferlist> *out) | |
396 | { | |
397 | for (const auto& i : keys) { | |
398 | bufferlist bl; | |
399 | if (_get_locked(prefix, i, &bl)) | |
400 | out->insert(make_pair(i, bl)); | |
401 | } | |
402 | ||
403 | return 0; | |
404 | } | |
405 | ||
406 | void MemDB::MDBWholeSpaceIteratorImpl::fill_current() | |
407 | { | |
408 | bufferlist bl; | |
409 | bl.append(m_iter->second.clone()); | |
410 | m_key_value = std::make_pair(m_iter->first, bl); | |
411 | } | |
412 | ||
413 | bool MemDB::MDBWholeSpaceIteratorImpl::valid() | |
414 | { | |
415 | if (m_key_value.first.empty()) { | |
416 | return false; | |
417 | } | |
418 | return true; | |
419 | } | |
420 | ||
421 | bool MemDB::MDBWholeSpaceIteratorImpl::iterator_validate() { | |
422 | ||
423 | if (this_seq_no != *global_seq_no) { | |
424 | auto key = m_key_value.first; | |
425 | assert(!key.empty()); | |
426 | ||
427 | bool restart_iter = false; | |
428 | if (!m_using_btree) { | |
429 | /* | |
430 | * Map is modified and marker key does not exists, | |
431 | * restart the iterator from next key. | |
432 | */ | |
433 | if (m_map_p->find(key) == m_map_p->end()) { | |
434 | restart_iter = true; | |
435 | } | |
436 | } else { | |
437 | restart_iter = true; | |
438 | } | |
439 | ||
440 | if (restart_iter) { | |
441 | m_iter = m_map_p->lower_bound(key); | |
442 | if (m_iter == m_map_p->end()) { | |
443 | return false; | |
444 | } | |
445 | } | |
446 | ||
447 | /* | |
448 | * This iter is valid now. | |
449 | */ | |
450 | this_seq_no = *global_seq_no; | |
451 | } | |
452 | ||
453 | return true; | |
454 | } | |
455 | ||
456 | void | |
457 | MemDB::MDBWholeSpaceIteratorImpl::free_last() | |
458 | { | |
459 | m_key_value.first.clear(); | |
460 | m_key_value.second.clear(); | |
461 | } | |
462 | ||
463 | string MemDB::MDBWholeSpaceIteratorImpl::key() | |
464 | { | |
465 | dtrace << __func__ << " " << m_key_value.first << dendl; | |
466 | string prefix, key; | |
467 | split_key(m_key_value.first, &prefix, &key); | |
468 | return key; | |
469 | } | |
470 | ||
471 | pair<string,string> MemDB::MDBWholeSpaceIteratorImpl::raw_key() | |
472 | { | |
473 | string prefix, key; | |
474 | split_key(m_key_value.first, &prefix, &key); | |
475 | return make_pair(prefix, key); | |
476 | } | |
477 | ||
478 | bool MemDB::MDBWholeSpaceIteratorImpl::raw_key_is_prefixed( | |
479 | const string &prefix) | |
480 | { | |
481 | string p, k; | |
482 | split_key(m_key_value.first, &p, &k); | |
483 | return (p == prefix); | |
484 | } | |
485 | ||
486 | bufferlist MemDB::MDBWholeSpaceIteratorImpl::value() | |
487 | { | |
488 | dtrace << __func__ << " " << m_key_value << dendl; | |
489 | return m_key_value.second; | |
490 | } | |
491 | ||
492 | int MemDB::MDBWholeSpaceIteratorImpl::next() | |
493 | { | |
494 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
495 | if (!iterator_validate()) { | |
496 | free_last(); | |
497 | return -1; | |
498 | } | |
499 | free_last(); | |
500 | ++m_iter; | |
501 | if (m_iter != m_map_p->end()) { | |
502 | fill_current(); | |
503 | return 0; | |
504 | } else { | |
505 | return -1; | |
506 | } | |
507 | } | |
508 | ||
509 | int MemDB::MDBWholeSpaceIteratorImpl:: prev() | |
510 | { | |
511 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
512 | if (!iterator_validate()) { | |
513 | free_last(); | |
514 | return -1; | |
515 | } | |
516 | free_last(); | |
517 | if (m_iter != m_map_p->begin()) { | |
518 | --m_iter; | |
519 | fill_current(); | |
520 | return 0; | |
521 | } else { | |
522 | return -1; | |
523 | } | |
524 | } | |
525 | ||
526 | /* | |
527 | * First key >= to given key, if key is null then first key in btree. | |
528 | */ | |
529 | int MemDB::MDBWholeSpaceIteratorImpl::seek_to_first(const std::string &k) | |
530 | { | |
531 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
532 | free_last(); | |
533 | if (k.empty()) { | |
534 | m_iter = m_map_p->begin(); | |
535 | } else { | |
536 | m_iter = m_map_p->lower_bound(k); | |
537 | } | |
538 | ||
539 | if (m_iter == m_map_p->end()) { | |
540 | return -1; | |
541 | } | |
542 | fill_current(); | |
543 | return 0; | |
544 | } | |
545 | ||
546 | int MemDB::MDBWholeSpaceIteratorImpl::seek_to_last(const std::string &k) | |
547 | { | |
548 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
549 | free_last(); | |
550 | if (k.empty()) { | |
551 | m_iter = m_map_p->end(); | |
552 | --m_iter; | |
553 | } else { | |
554 | m_iter = m_map_p->lower_bound(k); | |
555 | } | |
556 | ||
557 | if (m_iter == m_map_p->end()) { | |
558 | return -1; | |
559 | } | |
560 | fill_current(); | |
561 | return 0; | |
562 | } | |
563 | ||
564 | MemDB::MDBWholeSpaceIteratorImpl::~MDBWholeSpaceIteratorImpl() | |
565 | { | |
566 | free_last(); | |
567 | } | |
568 | ||
569 | int MemDB::MDBWholeSpaceIteratorImpl::upper_bound(const std::string &prefix, | |
570 | const std::string &after) { | |
571 | ||
572 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
573 | ||
574 | dtrace << "upper_bound " << prefix.c_str() << after.c_str() << dendl; | |
575 | string k = make_key(prefix, after); | |
576 | m_iter = m_map_p->upper_bound(k); | |
577 | if (m_iter != m_map_p->end()) { | |
578 | fill_current(); | |
579 | return 0; | |
580 | } | |
581 | return -1; | |
582 | } | |
583 | ||
584 | int MemDB::MDBWholeSpaceIteratorImpl::lower_bound(const std::string &prefix, | |
585 | const std::string &to) { | |
586 | std::lock_guard<std::mutex> l(*m_map_lock_p); | |
587 | dtrace << "lower_bound " << prefix.c_str() << to.c_str() << dendl; | |
588 | string k = make_key(prefix, to); | |
589 | m_iter = m_map_p->lower_bound(k); | |
590 | if (m_iter != m_map_p->end()) { | |
591 | fill_current(); | |
592 | return 0; | |
593 | } | |
594 | return -1; | |
595 | } |