// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2021 Red Hat, Inc.
 *
 * This is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License version 2.1, as published by
 * the Free Software Foundation. See file COPYING.
 *
 */

#include <boost/smart_ptr/intrusive_ptr.hpp>
#include <fmt/format.h>

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <regex>
#include <sstream>
#include <string_view>

#include <limits.h>
#include <string.h>

#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1

#include "include/ceph_assert.h"
#include "include/rados/librados.hpp"

#include "common/Clock.h"
#include "common/Formatter.h"
#include "common/ceph_argparse.h"
#include "common/ceph_mutex.h"
#include "common/common_init.h"
#include "common/config.h"
#include "common/debug.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/version.h"

#include "include/libcephsqlite.h"
#include "SimpleRADOSStriper.h"

#define dout_subsys ceph_subsys_cephsqlite
#undef dout_prefix
#define dout_prefix *_dout << "cephsqlite: " << __func__ << ": "
#define d(cct,cluster,lvl) ldout((cct), (lvl)) << "(client." << cluster->get_instance_id() << ") "
#define dv(lvl) d(cct,cluster,(lvl))
#define df(lvl) d(f->io.cct,f->io.cluster,(lvl)) << f->loc << " "

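/* Perf counter indices for the "libcephsqlite_vfs" logger registered in
 * cephsqlite_appdata::setup_perf(); one time-average counter per VFS operation.
 */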
enum {
  P_FIRST = 0xf0000,
  P_OP_OPEN,
  P_OP_DELETE,
  P_OP_ACCESS,
  P_OP_FULLPATHNAME,
  P_OP_CURRENTTIME,
  P_OPF_CLOSE,
  P_OPF_READ,
  P_OPF_WRITE,
  P_OPF_TRUNCATE,
  P_OPF_SYNC,
  P_OPF_FILESIZE,
  P_OPF_LOCK,
  P_OPF_UNLOCK,
  P_OPF_CHECKRESERVEDLOCK,
  P_OPF_FILECONTROL,
  P_OPF_SECTORSIZE,
  P_OPF_DEVICECHARACTERISTICS,
  P_LAST,
};

using cctptr = boost::intrusive_ptr<CephContext>;
using rsptr = std::shared_ptr<librados::Rados>;

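/* Per-VFS application data: the shared CephContext, the RADOS handle and the
 * perf counters. The cluster connection is created lazily and guarded by
 * cluster_mutex so that a blocklisted handle can be replaced via reconnect().
 */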
struct cephsqlite_appdata {
  ~cephsqlite_appdata() {
    {
      std::scoped_lock lock(cluster_mutex);
      _disconnect();
    }
    if (logger) {
      cct->get_perfcounters_collection()->remove(logger.get());
    }
    if (striper_logger) {
      cct->get_perfcounters_collection()->remove(striper_logger.get());
    }
  }
  int setup_perf() {
    ceph_assert(cct);
    PerfCountersBuilder plb(cct.get(), "libcephsqlite_vfs", P_FIRST, P_LAST);
    plb.add_time_avg(P_OP_OPEN, "op_open", "Time average of Open operations");
    plb.add_time_avg(P_OP_DELETE, "op_delete", "Time average of Delete operations");
    plb.add_time_avg(P_OP_ACCESS, "op_access", "Time average of Access operations");
    plb.add_time_avg(P_OP_FULLPATHNAME, "op_fullpathname", "Time average of FullPathname operations");
    plb.add_time_avg(P_OP_CURRENTTIME, "op_currenttime", "Time average of Currenttime operations");
    plb.add_time_avg(P_OPF_CLOSE, "opf_close", "Time average of Close file operations");
    plb.add_time_avg(P_OPF_READ, "opf_read", "Time average of Read file operations");
    plb.add_time_avg(P_OPF_WRITE, "opf_write", "Time average of Write file operations");
    plb.add_time_avg(P_OPF_TRUNCATE, "opf_truncate", "Time average of Truncate file operations");
    plb.add_time_avg(P_OPF_SYNC, "opf_sync", "Time average of Sync file operations");
    plb.add_time_avg(P_OPF_FILESIZE, "opf_filesize", "Time average of FileSize file operations");
    plb.add_time_avg(P_OPF_LOCK, "opf_lock", "Time average of Lock file operations");
    plb.add_time_avg(P_OPF_UNLOCK, "opf_unlock", "Time average of Unlock file operations");
    plb.add_time_avg(P_OPF_CHECKRESERVEDLOCK, "opf_checkreservedlock", "Time average of CheckReservedLock file operations");
    plb.add_time_avg(P_OPF_FILECONTROL, "opf_filecontrol", "Time average of FileControl file operations");
    plb.add_time_avg(P_OPF_SECTORSIZE, "opf_sectorsize", "Time average of SectorSize file operations");
    plb.add_time_avg(P_OPF_DEVICECHARACTERISTICS, "opf_devicecharacteristics", "Time average of DeviceCharacteristics file operations");
    logger.reset(plb.create_perf_counters());
    if (int rc = SimpleRADOSStriper::config_logger(cct.get(), "libcephsqlite_striper", &striper_logger); rc < 0) {
      return rc;
    }
    cct->get_perfcounters_collection()->add(logger.get());
    cct->get_perfcounters_collection()->add(striper_logger.get());
    return 0;
  }

  std::pair<cctptr, rsptr> get_cluster() {
    std::scoped_lock lock(cluster_mutex);
    if (!cct) {
      if (int rc = _open(nullptr); rc < 0) {
        ceph_abort("could not open connection to ceph");
      }
    }
    return {cct, cluster};
  }
  int connect() {
    std::scoped_lock lock(cluster_mutex);
    return _connect();
  }
  int reconnect() {
    std::scoped_lock lock(cluster_mutex);
    _disconnect();
    return _connect();
  }
  int maybe_reconnect(rsptr _cluster) {
    std::scoped_lock lock(cluster_mutex);
    if (!cluster || cluster == _cluster) {
      ldout(cct, 10) << "reconnecting to RADOS" << dendl;
      _disconnect();
      return _connect();
    } else {
      ldout(cct, 10) << "already reconnected" << dendl;
      return 0;
    }
  }
  int open(CephContext* _cct) {
    std::scoped_lock lock(cluster_mutex);
    return _open(_cct);
  }

  std::unique_ptr<PerfCounters> logger;
  std::shared_ptr<PerfCounters> striper_logger;

private:
  int _open(CephContext* _cct) {
    if (!_cct) {
      std::vector<const char*> env_args;
      env_to_vec(env_args, "CEPH_ARGS");
      std::string cluster, conf_file_list; // unused
      CephInitParameters iparams = ceph_argparse_early_args(env_args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
      cct = cctptr(common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0), false);
      cct->_conf.parse_config_files(nullptr, &std::cerr, 0);
      cct->_conf.parse_env(cct->get_module_type()); // environment variables override
      cct->_conf.apply_changes(nullptr);
      common_init_finish(cct.get());
    } else {
      cct = cctptr(_cct);
    }

    if (int rc = setup_perf(); rc < 0) {
      return rc;
    }

    if (int rc = _connect(); rc < 0) {
      return rc;
    }

    return 0;
  }
  void _disconnect() {
    if (cluster) {
      cluster.reset();
    }
  }
  int _connect() {
    ceph_assert(cct);
    auto _cluster = rsptr(new librados::Rados());
    ldout(cct, 5) << "initializing RADOS handle as " << cct->_conf->name << dendl;
    if (int rc = _cluster->init_with_context(cct.get()); rc < 0) {
      lderr(cct) << "cannot initialize RADOS: " << cpp_strerror(rc) << dendl;
      return rc;
    }
    if (int rc = _cluster->connect(); rc < 0) {
      lderr(cct) << "cannot connect: " << cpp_strerror(rc) << dendl;
      return rc;
    }
    auto s = _cluster->get_addrs();
    ldout(cct, 5) << "completed connection to RADOS with address " << s << dendl;
    cluster = std::move(_cluster);
    return 0;
  }

  ceph::mutex cluster_mutex = ceph::make_mutex("libcephsqlite");
  cctptr cct;
  rsptr cluster;
};

struct cephsqlite_fileloc {
  std::string pool;
  std::string radosns;
  std::string name;
};

struct cephsqlite_fileio {
  cctptr cct;
  rsptr cluster; // anchor for ioctx
  librados::IoCtx ioctx;
  std::unique_ptr<SimpleRADOSStriper> rs;
};

std::ostream& operator<<(std::ostream &out, const cephsqlite_fileloc& fileloc) {
  return out
    << "["
    << fileloc.pool
    << ":"
    << fileloc.radosns
    << "/"
    << fileloc.name
    << "]"
    ;
}

struct cephsqlite_file {
  sqlite3_file base;
  struct sqlite3_vfs* vfs = nullptr;
  int flags = 0;
  // There are 5 lock states: https://sqlite.org/c3ref/c_lock_exclusive.html
  int lock = 0;
  struct cephsqlite_fileloc loc{};
  struct cephsqlite_fileio io{};
};


#define getdata(vfs) (*((cephsqlite_appdata*)((vfs)->pAppData)))

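/* SQLite's five locking states are collapsed onto the striper's single
 * exclusive lock: any transition above SQLITE_LOCK_NONE takes the RADOS lock
 * and it is only dropped when returning to SQLITE_LOCK_NONE. A -EBLOCKLISTED
 * error triggers a reconnect of the shared RADOS handle.
 */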
static int Lock(sqlite3_file *file, int ilock)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << std::hex << ilock << dendl;

  auto& lock = f->lock;
  ceph_assert(!f->io.rs->is_locked() || lock > SQLITE_LOCK_NONE);
  ceph_assert(lock <= ilock);
  if (!f->io.rs->is_locked() && ilock > SQLITE_LOCK_NONE) {
    if (int rc = f->io.rs->lock(0); rc < 0) {
      df(5) << "failed: " << rc << dendl;
      if (rc == -EBLOCKLISTED) {
        getdata(f->vfs).maybe_reconnect(f->io.cluster);
      }
      return SQLITE_IOERR;
    }
  }

  lock = ilock;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_LOCK, end-start);
  return SQLITE_OK;
}

static int Unlock(sqlite3_file *file, int ilock)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << std::hex << ilock << dendl;

  auto& lock = f->lock;
  ceph_assert(lock == SQLITE_LOCK_NONE || (lock > SQLITE_LOCK_NONE && f->io.rs->is_locked()));
  ceph_assert(lock >= ilock);
  if (ilock <= SQLITE_LOCK_NONE && SQLITE_LOCK_NONE < lock) {
    if (int rc = f->io.rs->unlock(); rc < 0) {
      df(5) << "failed: " << rc << dendl;
      if (rc == -EBLOCKLISTED) {
        getdata(f->vfs).maybe_reconnect(f->io.cluster);
      }
      return SQLITE_IOERR;
    }
  }

  lock = ilock;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_UNLOCK, end-start);
  return SQLITE_OK;
}

static int CheckReservedLock(sqlite3_file *file, int *result)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  *result = 0;

  auto& lock = f->lock;
  if (lock > SQLITE_LOCK_SHARED) {
    *result = 1;
  }

  df(10);
  f->io.rs->print_lockers(*_dout);
  *_dout << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_CHECKRESERVEDLOCK, end-start);
  return SQLITE_OK;
}

static int Close(sqlite3_file *file)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  f->~cephsqlite_file();
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_CLOSE, end-start);
  return SQLITE_OK;
}

static int Read(sqlite3_file *file, void *buf, int len, sqlite_int64 off)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << buf << " " << off << "~" << len << dendl;

  if (int rc = f->io.rs->read(buf, len, off); rc < 0) {
    df(5) << "read failed: " << cpp_strerror(rc) << dendl;
    if (rc == -EBLOCKLISTED) {
      getdata(f->vfs).maybe_reconnect(f->io.cluster);
    }
    return SQLITE_IOERR_READ;
  } else {
    df(5) << "= " << rc << dendl;
    auto end = ceph::coarse_mono_clock::now();
    getdata(f->vfs).logger->tinc(P_OPF_READ, end-start);
    if (rc < len) {
      memset((unsigned char*)buf+rc, 0, len-rc);
      return SQLITE_IOERR_SHORT_READ;
    } else {
      return SQLITE_OK;
    }
  }
}

static int Write(sqlite3_file *file, const void *buf, int len, sqlite_int64 off)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << off << "~" << len << dendl;

  if (int rc = f->io.rs->write(buf, len, off); rc < 0) {
    df(5) << "write failed: " << cpp_strerror(rc) << dendl;
    if (rc == -EBLOCKLISTED) {
      getdata(f->vfs).maybe_reconnect(f->io.cluster);
    }
    return SQLITE_IOERR_WRITE;
  } else {
    df(5) << "= " << rc << dendl;
    auto end = ceph::coarse_mono_clock::now();
    getdata(f->vfs).logger->tinc(P_OPF_WRITE, end-start);
    return SQLITE_OK;
  }
}

static int Truncate(sqlite3_file *file, sqlite_int64 size)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << size << dendl;

  if (int rc = f->io.rs->truncate(size); rc < 0) {
    df(5) << "truncate failed: " << cpp_strerror(rc) << dendl;
    if (rc == -EBLOCKLISTED) {
      getdata(f->vfs).maybe_reconnect(f->io.cluster);
    }
    return SQLITE_IOERR;
  }

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_TRUNCATE, end-start);
  return SQLITE_OK;
}

static int Sync(sqlite3_file *file, int flags)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << flags << dendl;

  if (int rc = f->io.rs->flush(); rc < 0) {
    df(5) << "failed: " << cpp_strerror(rc) << dendl;
    if (rc == -EBLOCKLISTED) {
      getdata(f->vfs).maybe_reconnect(f->io.cluster);
    }
    return SQLITE_IOERR;
  }

  df(5) << " = 0" << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_SYNC, end-start);
  return SQLITE_OK;
}


static int FileSize(sqlite3_file *file, sqlite_int64 *osize)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;

  uint64_t size = 0;
  if (int rc = f->io.rs->stat(&size); rc < 0) {
    df(5) << "stat failed: " << cpp_strerror(rc) << dendl;
    if (rc == -EBLOCKLISTED) {
      getdata(f->vfs).maybe_reconnect(f->io.cluster);
    }
    return SQLITE_NOTFOUND;
  }

  *osize = (sqlite_int64)size;

  df(5) << "= " << size << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_FILESIZE, end-start);
  return SQLITE_OK;
}

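/* Parse a database path of the form "<pool>:[namespace]/<name>" or
 * "*<pool-id>:[namespace]/<name>"; any leading slashes are ignored.
 */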
static bool parsepath(std::string_view path, struct cephsqlite_fileloc* fileloc)
{
  static const std::regex re1{"^/*(\\*[[:digit:]]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"};
  static const std::regex re2{"^/*([[:alnum:]\\-_.]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"};

  std::cmatch cm;
  if (!std::regex_match(path.data(), cm, re1)) {
    if (!std::regex_match(path.data(), cm, re2)) {
      return false;
    }
  }
  fileloc->pool = cm[1];
  fileloc->radosns = cm[2];
  fileloc->name = cm[3];

  return true;
}

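/* Build the I/O context for a file location: resolve the pool (by name, or by
 * numeric id when prefixed with '*'), apply the RADOS namespace and construct
 * the SimpleRADOSStriper with the configured lock timeouts.
 */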
static int makestriper(sqlite3_vfs* vfs, cctptr cct, rsptr cluster, const cephsqlite_fileloc& loc, cephsqlite_fileio* io)
{
  bool gotmap = false;

  d(cct,cluster,10) << loc << dendl;

enoent_retry:
  if (loc.pool[0] == '*') {
    std::string err;
    int64_t id = strict_strtoll(loc.pool.c_str()+1, 10, &err);
    ceph_assert(err.empty());
    if (int rc = cluster->ioctx_create2(id, io->ioctx); rc < 0) {
      if (rc == -ENOENT && !gotmap) {
        cluster->wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl;
      return rc;
    }
  } else {
    if (int rc = cluster->ioctx_create(loc.pool.c_str(), io->ioctx); rc < 0) {
      if (rc == -ENOENT && !gotmap) {
        cluster->wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl;
      return rc;
    }
  }

  if (!loc.radosns.empty())
    io->ioctx.set_namespace(loc.radosns);

  io->rs = std::make_unique<SimpleRADOSStriper>(io->ioctx, loc.name);
  io->rs->set_logger(getdata(vfs).striper_logger);
  io->rs->set_lock_timeout(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_timeout"));
  io->rs->set_lock_interval(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_interval"));
  io->rs->set_blocklist_the_dead(cct->_conf.get_val<bool>("cephsqlite_blocklist_dead_locker"));
  io->cluster = std::move(cluster);
  io->cct = cct;

  return 0;
}

static int SectorSize(sqlite3_file* sf)
{
  static const int size = 65536;
  auto start = ceph::coarse_mono_clock::now();
  auto f = (cephsqlite_file*)sf;
  df(5) << " = " << size << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_SECTORSIZE, end-start);
  return size;
}

static int FileControl(sqlite3_file* sf, int op, void *arg)
{
  auto f = (cephsqlite_file*)sf;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << op << ", " << arg << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_FILECONTROL, end-start);
  return SQLITE_NOTFOUND;
}

static int DeviceCharacteristics(sqlite3_file* sf)
{
  auto f = (cephsqlite_file*)sf;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  static const int c = 0
    |SQLITE_IOCAP_ATOMIC
    |SQLITE_IOCAP_POWERSAFE_OVERWRITE
    |SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN
    |SQLITE_IOCAP_SAFE_APPEND
    ;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_DEVICECHARACTERISTICS, end-start);
  return c;
}

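/* xOpen: parse the path, set up the striper and (for SQLITE_OPEN_CREATE)
 * create the backing object. Temporary and in-memory databases are not
 * supported by this VFS.
 */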
static int Open(sqlite3_vfs *vfs, const char *name, sqlite3_file *file,
                int flags, int *oflags)
{
  static const sqlite3_io_methods io = {
    1, /* iVersion */
    Close, /* xClose */
    Read, /* xRead */
    Write, /* xWrite */
    Truncate, /* xTruncate */
    Sync, /* xSync */
    FileSize, /* xFileSize */
    Lock, /* xLock */
    Unlock, /* xUnlock */
    CheckReservedLock, /* xCheckReservedLock */
    FileControl, /* xFileControl */
    SectorSize, /* xSectorSize */
    DeviceCharacteristics /* xDeviceCharacteristics */
  };

  auto start = ceph::coarse_mono_clock::now();
  bool gotmap = false;
  auto [cct, cluster] = getdata(vfs).get_cluster();

  /* we are not going to create temporary files */
  if (name == NULL) {
    dv(-1) << " cannot open temporary database" << dendl;
    return SQLITE_CANTOPEN;
  }
  auto path = std::string_view(name);
  if (path == ":memory:") {
    dv(-1) << " cannot open temporary database" << dendl;
    return SQLITE_IOERR;
  }

  dv(5) << path << " flags=" << std::hex << flags << dendl;

  auto f = new (file)cephsqlite_file();
  f->vfs = vfs;
  if (!parsepath(path, &f->loc)) {
    ceph_assert(0); /* xFullPathname validates! */
  }
  f->flags = flags;

enoent_retry:
  if (int rc = makestriper(vfs, cct, cluster, f->loc, &f->io); rc < 0) {
    f->~cephsqlite_file();
    dv(-1) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (flags & SQLITE_OPEN_CREATE) {
    dv(10) << "OPEN_CREATE" << dendl;
    if (int rc = f->io.rs->create(); rc < 0 && rc != -EEXIST) {
      if (rc == -ENOENT && !gotmap) {
        /* we may have an out of date OSDMap which cancels the op in the
         * Objecter. Try to get a new one and retry. This is mostly noticeable
         * in testing when pools are getting created/deleted left and right.
         */
        dv(5) << "retrying create after getting latest OSDMap" << dendl;
        cluster->wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      dv(5) << "file cannot be created: " << cpp_strerror(rc) << dendl;
      return SQLITE_IOERR;
    }
  }

  if (int rc = f->io.rs->open(); rc < 0) {
    if (rc == -ENOENT && !gotmap) {
      /* See comment above for create case. */
      dv(5) << "retrying open after getting latest OSDMap" << dendl;
      cluster->wait_for_latest_osdmap();
      gotmap = true;
      goto enoent_retry;
    }
    dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl;
    return rc;
  }

  if (oflags) {
    *oflags = flags;
  }
  f->base.pMethods = &io;
  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_OPEN, end-start);
  return SQLITE_OK;
}

/*
** Delete the file identified by argument path. If the dsync parameter
** is non-zero, then ensure the file-system modification to delete the
** file has been synced to disk before returning.
*/
static int Delete(sqlite3_vfs* vfs, const char* path, int dsync)
{
  auto start = ceph::coarse_mono_clock::now();
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(5) << "'" << path << "', " << dsync << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }

  cephsqlite_fileio io;
  if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) {
    dv(-1) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->lock(0); rc < 0) {
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->remove(); rc < 0) {
    dv(5) << "= " << rc << dendl;
    return SQLITE_IOERR_DELETE;
  }

  /* No need to unlock */
  dv(5) << "= 0" << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_DELETE, end-start);

  return SQLITE_OK;
}

/*
** Query the file-system to see if the named file exists, is readable or
** is both readable and writable.
*/
static int Access(sqlite3_vfs* vfs, const char* path, int flags, int* result)
{
  auto start = ceph::coarse_mono_clock::now();
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(5) << path << " " << std::hex << flags << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }

  cephsqlite_fileio io;
  if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) {
    dv(-1) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->open(); rc < 0) {
    if (rc == -ENOENT) {
      *result = 0;
      return SQLITE_OK;
    } else {
      dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl;
      *result = 0;
      return SQLITE_IOERR;
    }
  }

  uint64_t size = 0;
  if (int rc = io.rs->stat(&size); rc < 0) {
    dv(5) << "= " << rc << " (" << cpp_strerror(rc) << ")" << dendl;
    *result = 0;
  } else {
    dv(5) << "= 0" << dendl;
    *result = 1;
  }

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_ACCESS, end-start);
  return SQLITE_OK;
}

/* This method is only called once for each database. It provides a chance to
 * reformat the path into a canonical format.
 */
static int FullPathname(sqlite3_vfs* vfs, const char* ipath, int opathlen, char* opath)
{
  auto start = ceph::coarse_mono_clock::now();
  auto path = std::string_view(ipath);
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(5) << "1: " << path << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }
  dv(5) << " parsed " << fileloc << dendl;

  auto p = fmt::format("{}:{}/{}", fileloc.pool, fileloc.radosns, fileloc.name);
  if (p.size() >= (size_t)opathlen) {
    dv(5) << "path too long!" << dendl;
    return SQLITE_CANTOPEN;
  }
  strcpy(opath, p.c_str());
  dv(5) << " output " << p << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_FULLPATHNAME, end-start);
  return SQLITE_OK;
}

static int CurrentTime(sqlite3_vfs* vfs, sqlite3_int64* time)
{
  auto start = ceph::coarse_mono_clock::now();
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(5) << time << dendl;

  auto t = ceph_clock_now();
  *time = t.to_msec() + 2440587.5*86400000; /* julian days since 1970 converted to ms */

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_CURRENTTIME, end-start);
  return SQLITE_OK;
}

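/* Supply an already-initialized CephContext (instead of bootstrapping one from
 * CEPH_ARGS/ceph.conf) and connect the VFS to RADOS. On success, *ident (if
 * non-null) receives a strdup()'d copy of the instance address string.
 */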
LIBCEPHSQLITE_API int cephsqlite_setcct(CephContext* _cct, char** ident)
{
  ldout(_cct, 1) << "cct: " << _cct << dendl;

  if (sqlite3_api == nullptr) {
    lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl;
    return -EINVAL;
  }

  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl;
    return -EINVAL;
  }

  auto& appd = getdata(vfs);
  if (int rc = appd.open(_cct); rc < 0) {
    return rc;
  }

  auto [cct, cluster] = appd.get_cluster();

  auto s = cluster->get_addrs();
  if (ident) {
    *ident = strdup(s.c_str());
  }

  ldout(cct, 1) << "complete" << dendl;

  return 0;
}

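/* SQL scalar functions registered on every connection by autoreg():
 * ceph_perf() dumps the VFS and striper perf counters as JSON and
 * ceph_status() reports the RADOS instance id and address.
 */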
static void f_perf(sqlite3_context* ctx, int argc, sqlite3_value** argv)
{
  auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx);
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(10) << dendl;
  auto&& appd = getdata(vfs);
  JSONFormatter f(false);
  f.open_object_section("ceph_perf");
  appd.logger->dump_formatted(&f, false, false);
  appd.striper_logger->dump_formatted(&f, false, false);
  f.close_section();
  {
    CachedStackStringStream css;
    f.flush(*css);
    auto sv = css->strv();
    dv(20) << " = " << sv << dendl;
    sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT);
  }
}

static void f_status(sqlite3_context* ctx, int argc, sqlite3_value** argv)
{
  auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx);
  auto [cct, cluster] = getdata(vfs).get_cluster();
  dv(10) << dendl;
  JSONFormatter f(false);
  f.open_object_section("ceph_status");
  f.dump_int("id", cluster->get_instance_id());
  f.dump_string("addr", cluster->get_addrs());
  f.close_section();
  {
    CachedStackStringStream css;
    f.flush(*css);
    auto sv = css->strv();
    dv(20) << " = " << sv << dendl;
    sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT);
  }
}

static int autoreg(sqlite3* db, char** err, const struct sqlite3_api_routines* thunk)
{
  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    ceph_abort("ceph vfs not found");
  }

  if (int rc = sqlite3_create_function(db, "ceph_perf", 0, SQLITE_UTF8, vfs, f_perf, nullptr, nullptr); rc) {
    return rc;
  }

  if (int rc = sqlite3_create_function(db, "ceph_status", 0, SQLITE_UTF8, vfs, f_status, nullptr, nullptr); rc) {
    return rc;
  }

  return SQLITE_OK;
}

/* You may wonder why we have an atexit handler. After all, atexit/exit creates
 * a mess for multithreaded programs. Well, sqlite3 does not have an API for
 * orderly removal of extensions. And, in fact, any API we might make
 * unofficially (such as "sqlite3_cephsqlite_fini") would potentially race with
 * other threads interacting with sqlite3 + the "ceph" VFS. There is a method
 * for removing a VFS but it's not called by sqlite3 in any error scenario and
 * there is no mechanism within sqlite3 to tell a VFS to unregister itself.
 *
 * This all would be mostly okay if /bin/sqlite3 did not call exit(3), but it
 * does. (This occurs only for the sqlite3 binary, not when used as a library.)
 * exit(3) calls destructors on all static-duration structures for the program.
 * This breaks any outstanding threads created by the librados handle in all
 * sorts of fantastic ways from C++ exceptions to memory faults. In general,
 * Ceph libraries are not tolerant of exit(3) (_exit(3) is okay!). Applications
 * must clean up after themselves or _exit(3).
 *
 * So, we have an atexit handler for libcephsqlite. This simply shuts down the
 * RADOS handle. We can be assured that this occurs before any ceph library
 * static-duration structures are destructed due to ordering guarantees by
 * exit(3). Generally, we only see this called when the VFS is used by
 * /bin/sqlite3 and only during sqlite3 error scenarios (like I/O errors
 * arising from blocklisting).
 */

static void cephsqlite_atexit()
{
  if (auto vfs = sqlite3_vfs_find("ceph"); vfs) {
    if (vfs->pAppData) {
      auto&& appd = getdata(vfs);
      delete &appd;
      vfs->pAppData = nullptr;
    }
  }
}

LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api)
{
  SQLITE_EXTENSION_INIT2(api);

  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    vfs = (sqlite3_vfs*) calloc(1, sizeof(sqlite3_vfs));
    auto appd = new cephsqlite_appdata;
    vfs->iVersion = 2;
    vfs->szOsFile = sizeof(struct cephsqlite_file);
    vfs->mxPathname = 4096;
    vfs->zName = "ceph";
    vfs->pAppData = appd;
    vfs->xOpen = Open;
    vfs->xDelete = Delete;
    vfs->xAccess = Access;
    vfs->xFullPathname = FullPathname;
    vfs->xCurrentTimeInt64 = CurrentTime;
    if (int rc = sqlite3_vfs_register(vfs, 0); rc) {
      delete appd;
      free(vfs);
      return rc;
    }
  }

  if (int rc = std::atexit(cephsqlite_atexit); rc) {
    return SQLITE_INTERNAL;
  }

  if (int rc = sqlite3_auto_extension((void(*)(void))autoreg); rc) {
    return rc;
  }
  if (int rc = autoreg(db, err, api); rc) {
    return rc;
  }

  return SQLITE_OK_LOAD_PERMANENTLY;
}
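
/* Illustrative usage sketch (not part of this translation unit): once the
 * extension is loaded, a database is addressed as "pool:namespace/name"
 * (namespace may be empty) through the "ceph" VFS, typically via a URI
 * filename. The pool, namespace and database names below are placeholders;
 * see the libcephsqlite documentation for the authoritative invocation.
 *
 *   sqlite3* db = nullptr;
 *   int rc = sqlite3_open_v2("file:///mypool:mynamespace/mydb.db?vfs=ceph", &db,
 *                            SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
 *                            SQLITE_OPEN_URI, nullptr);
 *   // SELECT ceph_status(); -- reports the RADOS instance id and address
 */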