// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2021 Red Hat, Inc.
 *
 * This is free software; you can redistribute it and/or modify it under the
 * terms of the GNU Lesser General Public License version 2.1, as published by
 * the Free Software Foundation. See file COPYING.
 *
 */

#include <boost/smart_ptr/intrusive_ptr.hpp>
#include <fmt/format.h>

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <regex>
#include <sstream>
#include <string_view>

#include <limits.h>
#include <string.h>

#include <sqlite3ext.h>
SQLITE_EXTENSION_INIT1

#include "include/ceph_assert.h"
#include "include/rados/librados.hpp"

#include "common/Clock.h"
#include "common/Formatter.h"
#include "common/ceph_argparse.h"
#include "common/ceph_mutex.h"
#include "common/common_init.h"
#include "common/config.h"
#include "common/debug.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/version.h"

#include "include/libcephsqlite.h"
#include "SimpleRADOSStriper.h"

#define dout_subsys ceph_subsys_cephsqlite
#undef dout_prefix
#define dout_prefix *_dout << "cephsqlite: " << __func__ << ": "
#define d(vfs,lvl) ldout(getcct(vfs), (lvl)) << "(client." << getdata(vfs).cluster.get_instance_id() << ") "
#define dv(lvl) d(vfs,(lvl))
#define df(lvl) d(f->vfs,(lvl)) << f->loc << " "

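// Perf counter indices for the "libcephsqlite_vfs" logger: one time-average
// counter per VFS operation (P_OP_*) and per file operation (P_OPF_*). See
// cephsqlite_appdata::setup_perf() below.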
enum {
  P_FIRST = 0xf0000,
  P_OP_OPEN,
  P_OP_DELETE,
  P_OP_ACCESS,
  P_OP_FULLPATHNAME,
  P_OP_CURRENTTIME,
  P_OPF_CLOSE,
  P_OPF_READ,
  P_OPF_WRITE,
  P_OPF_TRUNCATE,
  P_OPF_SYNC,
  P_OPF_FILESIZE,
  P_OPF_LOCK,
  P_OPF_UNLOCK,
  P_OPF_CHECKRESERVEDLOCK,
  P_OPF_FILECONTROL,
  P_OPF_SECTORSIZE,
  P_OPF_DEVICECHARACTERISTICS,
  P_LAST,
};

struct cephsqlite_appdata {
  ~cephsqlite_appdata() {
    if (logger) {
      cct->get_perfcounters_collection()->remove(logger.get());
    }
    if (striper_logger) {
      cct->get_perfcounters_collection()->remove(striper_logger.get());
    }
    cluster.shutdown();
  }
  int setup_perf() {
    ceph_assert(cct);
    PerfCountersBuilder plb(cct.get(), "libcephsqlite_vfs", P_FIRST, P_LAST);
    plb.add_time_avg(P_OP_OPEN, "op_open", "Time average of Open operations");
    plb.add_time_avg(P_OP_DELETE, "op_delete", "Time average of Delete operations");
    plb.add_time_avg(P_OP_ACCESS, "op_access", "Time average of Access operations");
    plb.add_time_avg(P_OP_FULLPATHNAME, "op_fullpathname", "Time average of FullPathname operations");
    plb.add_time_avg(P_OP_CURRENTTIME, "op_currenttime", "Time average of Currenttime operations");
    plb.add_time_avg(P_OPF_CLOSE, "opf_close", "Time average of Close file operations");
    plb.add_time_avg(P_OPF_READ, "opf_read", "Time average of Read file operations");
    plb.add_time_avg(P_OPF_WRITE, "opf_write", "Time average of Write file operations");
    plb.add_time_avg(P_OPF_TRUNCATE, "opf_truncate", "Time average of Truncate file operations");
    plb.add_time_avg(P_OPF_SYNC, "opf_sync", "Time average of Sync file operations");
    plb.add_time_avg(P_OPF_FILESIZE, "opf_filesize", "Time average of FileSize file operations");
    plb.add_time_avg(P_OPF_LOCK, "opf_lock", "Time average of Lock file operations");
    plb.add_time_avg(P_OPF_UNLOCK, "opf_unlock", "Time average of Unlock file operations");
    plb.add_time_avg(P_OPF_CHECKRESERVEDLOCK, "opf_checkreservedlock", "Time average of CheckReservedLock file operations");
    plb.add_time_avg(P_OPF_FILECONTROL, "opf_filecontrol", "Time average of FileControl file operations");
    plb.add_time_avg(P_OPF_SECTORSIZE, "opf_sectorsize", "Time average of SectorSize file operations");
    plb.add_time_avg(P_OPF_DEVICECHARACTERISTICS, "opf_devicecharacteristics", "Time average of DeviceCharacteristics file operations");
    logger.reset(plb.create_perf_counters());
    if (int rc = SimpleRADOSStriper::config_logger(cct.get(), "libcephsqlite_striper", &striper_logger); rc < 0) {
      return rc;
    }
    cct->get_perfcounters_collection()->add(logger.get());
    cct->get_perfcounters_collection()->add(striper_logger.get());
    return 0;
  }
  int init_cluster() {
    ceph_assert(cct);
    ldout(cct, 5) << "initializing RADOS handle as " << cct->_conf->name << dendl;
    if (int rc = cluster.init_with_context(cct.get()); rc < 0) {
      lderr(cct) << "cannot initialize RADOS: " << cpp_strerror(rc) << dendl;
      return rc;
    }
    if (int rc = cluster.connect(); rc < 0) {
      lderr(cct) << "cannot connect: " << cpp_strerror(rc) << dendl;
      return rc;
    }
    auto s = cluster.get_addrs();
    ldout(cct, 5) << "completed connection to RADOS with address " << s << dendl;
    return 0;
  }

  boost::intrusive_ptr<CephContext> cct;
  std::unique_ptr<PerfCounters> logger;
  std::shared_ptr<PerfCounters> striper_logger;
  librados::Rados cluster;
};

struct cephsqlite_fileloc {
  std::string pool;
  std::string radosns;
  std::string name;
};

struct cephsqlite_fileio {
  librados::IoCtx ioctx;
  std::unique_ptr<SimpleRADOSStriper> rs;
};

std::ostream& operator<<(std::ostream &out, const cephsqlite_fileloc& fileloc) {
  return out
    << "["
    << fileloc.pool
    << ":"
    << fileloc.radosns
    << "/"
    << fileloc.name
    << "]"
    ;
}

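// Per-open-file state. SQLite allocates szOsFile bytes for every open file and
// Open() placement-constructs a cephsqlite_file into that space; Close() runs
// the destructor explicitly rather than deleting.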
struct cephsqlite_file {
  sqlite3_file base;
  struct sqlite3_vfs* vfs = nullptr;
  int flags = 0;
  // There are 5 lock states: https://sqlite.org/c3ref/c_lock_exclusive.html
  int lock = 0;
  struct cephsqlite_fileloc loc{};
  struct cephsqlite_fileio io{};
};


#define getdata(vfs) (*((cephsqlite_appdata*)((vfs)->pAppData)))

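// Return the CephContext for this VFS, lazily bootstrapping a client context,
// perf counters, and the RADOS cluster handle on first use (unless the
// application already supplied a context via cephsqlite_setcct()).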
static CephContext* getcct(sqlite3_vfs* vfs)
{
  auto&& appd = getdata(vfs);
  auto& cct = appd.cct;
  if (cct) {
    return cct.get();
  }

  /* bootstrap cct */
  std::vector<const char*> env_args;
  env_to_vec(env_args, "CEPH_ARGS");
  std::string cluster, conf_file_list; // unused
  CephInitParameters iparams = ceph_argparse_early_args(env_args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
  cct = boost::intrusive_ptr<CephContext>(common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0), false);
  cct->_conf.parse_config_files(nullptr, &std::cerr, 0);
  cct->_conf.parse_env(cct->get_module_type()); // environment variables override
  cct->_conf.apply_changes(nullptr);
  common_init_finish(cct.get());

  if (int rc = appd.setup_perf(); rc < 0) {
    ceph_abort("cannot setup perf counters");
  }

  if (int rc = appd.init_cluster(); rc < 0) {
    ceph_abort("cannot setup RADOS cluster handle");
  }

  return cct.get();
}

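// SQLite's five lock levels (NONE..EXCLUSIVE) are collapsed onto the single
// exclusive RADOS lock held by the striper: any transition above
// SQLITE_LOCK_NONE acquires it, and only a return to SQLITE_LOCK_NONE in
// Unlock() releases it, effectively serializing access across clients.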
static int Lock(sqlite3_file *file, int ilock)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << std::hex << ilock << dendl;

  auto& lock = f->lock;
  ceph_assert(!f->io.rs->is_locked() || lock > SQLITE_LOCK_NONE);
  ceph_assert(lock <= ilock);
  if (!f->io.rs->is_locked() && ilock > SQLITE_LOCK_NONE) {
    if (int rc = f->io.rs->lock(0); rc < 0) {
      df(5) << "failed: " << rc << dendl;
      return SQLITE_IOERR;
    }
  }

  lock = ilock;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_LOCK, end-start);
  return SQLITE_OK;
}

static int Unlock(sqlite3_file *file, int ilock)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << std::hex << ilock << dendl;

  auto& lock = f->lock;
  ceph_assert(lock == SQLITE_LOCK_NONE || (lock > SQLITE_LOCK_NONE && f->io.rs->is_locked()));
  ceph_assert(lock >= ilock);
  if (ilock <= SQLITE_LOCK_NONE && SQLITE_LOCK_NONE < lock) {
    if (int rc = f->io.rs->unlock(); rc < 0) {
      df(5) << "failed: " << rc << dendl;
      return SQLITE_IOERR;
    }
  }

  lock = ilock;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_UNLOCK, end-start);
  return SQLITE_OK;
}

static int CheckReservedLock(sqlite3_file *file, int *result)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  *result = 0;

  auto& lock = f->lock;
  if (lock > SQLITE_LOCK_SHARED) {
    *result = 1;
  }

  df(10);
  f->io.rs->print_lockers(*_dout);
  *_dout << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_CHECKRESERVEDLOCK, end-start);
  return SQLITE_OK;
}

static int Close(sqlite3_file *file)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  f->~cephsqlite_file();
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_CLOSE, end-start);
  return SQLITE_OK;
}

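// SQLite requires xRead to zero-fill the unread tail of the buffer and return
// SQLITE_IOERR_SHORT_READ when fewer than the requested bytes exist.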
static int Read(sqlite3_file *file, void *buf, int len, sqlite_int64 off)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << buf << " " << off << "~" << len << dendl;

  if (int rc = f->io.rs->read(buf, len, off); rc < 0) {
    df(5) << "read failed: " << cpp_strerror(rc) << dendl;
    return SQLITE_IOERR_READ;
  } else {
    df(5) << "= " << rc << dendl;
    auto end = ceph::coarse_mono_clock::now();
    getdata(f->vfs).logger->tinc(P_OPF_READ, end-start);
    if (rc < len) {
      memset(buf, 0, len-rc);
      return SQLITE_IOERR_SHORT_READ;
    } else {
      return SQLITE_OK;
    }
  }
}

static int Write(sqlite3_file *file, const void *buf, int len, sqlite_int64 off)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << off << "~" << len << dendl;

  if (int rc = f->io.rs->write(buf, len, off); rc < 0) {
    df(5) << "write failed: " << cpp_strerror(rc) << dendl;
    return SQLITE_IOERR_WRITE;
  } else {
    df(5) << "= " << rc << dendl;
    auto end = ceph::coarse_mono_clock::now();
    getdata(f->vfs).logger->tinc(P_OPF_WRITE, end-start);
    return SQLITE_OK;
  }

}

static int Truncate(sqlite3_file *file, sqlite_int64 size)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << size << dendl;

  if (int rc = f->io.rs->truncate(size); rc < 0) {
    df(5) << "truncate failed: " << cpp_strerror(rc) << dendl;
    return SQLITE_IOERR;
  }

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_TRUNCATE, end-start);
  return SQLITE_OK;
}

static int Sync(sqlite3_file *file, int flags)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << flags << dendl;

  if (int rc = f->io.rs->flush(); rc < 0) {
    df(5) << "failed: " << cpp_strerror(rc) << dendl;
    return SQLITE_IOERR;
  }

  df(5) << " = 0" << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_SYNC, end-start);
  return SQLITE_OK;
}


static int FileSize(sqlite3_file *file, sqlite_int64 *osize)
{
  auto f = (cephsqlite_file*)file;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;

  uint64_t size = 0;
  if (int rc = f->io.rs->stat(&size); rc < 0) {
    df(5) << "stat failed: " << cpp_strerror(rc) << dendl;
    return SQLITE_NOTFOUND;
  }

  *osize = (sqlite_int64)size;

  df(5) << "= " << size << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_FILESIZE, end-start);
  return SQLITE_OK;
}


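// Parse a database path of the form [/]pool:[namespace]/name, where pool is
// either a pool name (re2) or "*<id>" for a numeric pool id (re1) and the
// RADOS namespace may be empty, e.g. "mypool:/a.db" or "*2:ns/a.db".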
static bool parsepath(std::string_view path, struct cephsqlite_fileloc* fileloc)
{
  static const std::regex re1{"^/*(\\*[[:digit:]]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"};
  static const std::regex re2{"^/*([[:alnum:]\\-_.]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"};

  std::cmatch cm;
  if (!std::regex_match(path.data(), cm, re1)) {
    if (!std::regex_match(path.data(), cm, re2)) {
      return false;
    }
  }
  fileloc->pool = cm[1];
  fileloc->radosns = cm[2];
  fileloc->name = cm[3];

  return true;
}

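// Resolve the pool (by name or by "*<id>" numeric id, refreshing the OSDMap
// once on ENOENT), select the RADOS namespace, and construct a
// SimpleRADOSStriper for the object, applying the cephsqlite_* lock options.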
static int makestriper(sqlite3_vfs* vfs, const cephsqlite_fileloc& loc, cephsqlite_fileio* io)
{
  auto&& appd = getdata(vfs);
  auto& cct = appd.cct;
  auto& cluster = appd.cluster;
  bool gotmap = false;

  dv(10) << loc << dendl;

enoent_retry:
  if (loc.pool[0] == '*') {
    std::string err;
    int64_t id = strict_strtoll(loc.pool.c_str()+1, 10, &err);
    ceph_assert(err.empty());
    if (int rc = cluster.ioctx_create2(id, io->ioctx); rc < 0) {
      if (rc == -ENOENT && !gotmap) {
        cluster.wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      dv(10) << "cannot create ioctx: " << cpp_strerror(rc) << dendl;
      return rc;
    }
  } else {
    if (int rc = cluster.ioctx_create(loc.pool.c_str(), io->ioctx); rc < 0) {
      if (rc == -ENOENT && !gotmap) {
        cluster.wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      dv(10) << "cannot create ioctx: " << cpp_strerror(rc) << dendl;
      return rc;
    }
  }

  if (!loc.radosns.empty())
    io->ioctx.set_namespace(loc.radosns);

  io->rs = std::make_unique<SimpleRADOSStriper>(io->ioctx, loc.name);
  io->rs->set_logger(appd.striper_logger);
  io->rs->set_lock_timeout(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_timeout"));
  io->rs->set_lock_interval(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_interval"));
  io->rs->set_blocklist_the_dead(cct->_conf.get_val<bool>("cephsqlite_blocklist_dead_locker"));

  return 0;
}

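// Advertise a 64 KiB "sector"; together with the capability bits reported by
// DeviceCharacteristics() below, this shapes SQLite's assumptions about what
// is written atomically and safely through this VFS.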
static int SectorSize(sqlite3_file* sf)
{
  static const int size = 65536;
  auto start = ceph::coarse_mono_clock::now();
  auto f = (cephsqlite_file*)sf;
  df(5) << " = " << size << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_SECTORSIZE, end-start);
  return size;
}

static int FileControl(sqlite3_file* sf, int op, void *arg)
{
  auto f = (cephsqlite_file*)sf;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << op << ", " << arg << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_FILECONTROL, end-start);
  return SQLITE_NOTFOUND;
}

static int DeviceCharacteristics(sqlite3_file* sf)
{
  auto f = (cephsqlite_file*)sf;
  auto start = ceph::coarse_mono_clock::now();
  df(5) << dendl;
  static const int c = 0
    |SQLITE_IOCAP_ATOMIC
    |SQLITE_IOCAP_POWERSAFE_OVERWRITE
    |SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN
    |SQLITE_IOCAP_SAFE_APPEND
    ;
  auto end = ceph::coarse_mono_clock::now();
  getdata(f->vfs).logger->tinc(P_OPF_DEVICECHARACTERISTICS, end-start);
  return c;
}

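// Open a database object through this VFS. An illustrative sketch (assuming
// the extension has been loaded and the "ceph" VFS registered):
//
//   sqlite3* db = nullptr;
//   sqlite3_open_v2("mypool:/mydb.db", &db,
//                   SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, "ceph");
//
// SQLite canonicalizes the name via FullPathname() before calling Open().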
static int Open(sqlite3_vfs *vfs, const char *name, sqlite3_file *file,
                int flags, int *oflags)
{
  static const sqlite3_io_methods io = {
    1,                        /* iVersion */
    Close,                    /* xClose */
    Read,                     /* xRead */
    Write,                    /* xWrite */
    Truncate,                 /* xTruncate */
    Sync,                     /* xSync */
    FileSize,                 /* xFileSize */
    Lock,                     /* xLock */
    Unlock,                   /* xUnlock */
    CheckReservedLock,        /* xCheckReservedLock */
    FileControl,              /* xFileControl */
    SectorSize,               /* xSectorSize */
    DeviceCharacteristics     /* xDeviceCharacteristics */
  };

  auto start = ceph::coarse_mono_clock::now();
  bool gotmap = false;
  auto& cluster = getdata(vfs).cluster;

  /* we are not going to create temporary files */
  if (name == NULL) {
    dv(-1) << " cannot open temporary database" << dendl;
    return SQLITE_CANTOPEN;
  }
  auto path = std::string_view(name);
  if (path == ":memory:") {
    dv(-1) << " cannot open temporary database" << dendl;
    return SQLITE_IOERR;
  }

  dv(5) << path << " flags=" << std::hex << flags << dendl;

  auto f = new (file)cephsqlite_file();
  f->vfs = vfs;
  if (!parsepath(path, &f->loc)) {
    ceph_assert(0); /* xFullPathname validates! */
  }
  f->flags = flags;

enoent_retry:
  if (int rc = makestriper(vfs, f->loc, &f->io); rc < 0) {
    f->~cephsqlite_file();
    dv(5) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (flags & SQLITE_OPEN_CREATE) {
    dv(10) << "OPEN_CREATE" << dendl;
    if (int rc = f->io.rs->create(); rc < 0 && rc != -EEXIST) {
      if (rc == -ENOENT && !gotmap) {
        /* we may have an out of date OSDMap which cancels the op in the
         * Objecter. Try to get a new one and retry. This is mostly noticeable
         * in testing when pools are getting created/deleted left and right.
         */
        dv(5) << "retrying create after getting latest OSDMap" << dendl;
        cluster.wait_for_latest_osdmap();
        gotmap = true;
        goto enoent_retry;
      }
      dv(5) << "file cannot be created: " << cpp_strerror(rc) << dendl;
      return SQLITE_IOERR;
    }
  }

  if (int rc = f->io.rs->open(); rc < 0) {
    if (rc == -ENOENT && !gotmap) {
      /* See comment above for create case. */
      dv(5) << "retrying open after getting latest OSDMap" << dendl;
      cluster.wait_for_latest_osdmap();
      gotmap = true;
      goto enoent_retry;
    }
    dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl;
    return rc;
  }

  if (oflags) {
    *oflags = flags;
  }
  f->base.pMethods = &io;
  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_OPEN, end-start);
  return SQLITE_OK;
}

/*
** Delete the file identified by argument path. If the dsync parameter
** is non-zero, then ensure the file-system modification to delete the
** file has been synced to disk before returning.
*/
static int Delete(sqlite3_vfs* vfs, const char* path, int dsync)
{
  auto start = ceph::coarse_mono_clock::now();
  dv(5) << "'" << path << "', " << dsync << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }

  cephsqlite_fileio io;
  if (int rc = makestriper(vfs, fileloc, &io); rc < 0) {
    dv(5) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->lock(0); rc < 0) {
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->remove(); rc < 0) {
    dv(5) << "= " << rc << dendl;
    return SQLITE_IOERR_DELETE;
  }

  /* No need to unlock */
  dv(5) << "= 0" << dendl;
  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_DELETE, end-start);

  return SQLITE_OK;
}

/*
** Query the file-system to see if the named file exists, is readable or
** is both readable and writable.
*/
static int Access(sqlite3_vfs* vfs, const char* path, int flags, int* result)
{
  auto start = ceph::coarse_mono_clock::now();
  dv(5) << path << " " << std::hex << flags << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }

  cephsqlite_fileio io;
  if (int rc = makestriper(vfs, fileloc, &io); rc < 0) {
    dv(5) << "cannot open striper" << dendl;
    return SQLITE_IOERR;
  }

  if (int rc = io.rs->open(); rc < 0) {
    if (rc == -ENOENT) {
      *result = 0;
      return SQLITE_OK;
    } else {
      dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl;
      *result = 0;
      return SQLITE_IOERR;
    }
  }

  uint64_t size = 0;
  if (int rc = io.rs->stat(&size); rc < 0) {
    dv(5) << "= " << rc << " (" << cpp_strerror(rc) << ")" << dendl;
    *result = 0;
  } else {
    dv(5) << "= 0" << dendl;
    *result = 1;
  }

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_ACCESS, end-start);
  return SQLITE_OK;
}

/* This method is only called once for each database. It provides a chance to
 * reformat the path into a canonical format.
 */
static int FullPathname(sqlite3_vfs* vfs, const char* ipath, int opathlen, char* opath)
{
  auto start = ceph::coarse_mono_clock::now();
  auto path = std::string_view(ipath);

  dv(5) << "1: " << path << dendl;

  cephsqlite_fileloc fileloc;
  if (!parsepath(path, &fileloc)) {
    dv(5) << "path does not parse!" << dendl;
    return SQLITE_NOTFOUND;
  }
  dv(5) << " parsed " << fileloc << dendl;

  auto p = fmt::format("{}:{}/{}", fileloc.pool, fileloc.radosns, fileloc.name);
  if (p.size() >= (size_t)opathlen) {
    dv(5) << "path too long!" << dendl;
    return SQLITE_CANTOPEN;
  }
  strcpy(opath, p.c_str());
  dv(5) << " output " << p << dendl;

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_FULLPATHNAME, end-start);
  return SQLITE_OK;
}

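// xCurrentTimeInt64 must report milliseconds since the julian epoch;
// 2440587.5 is the julian day number of the Unix epoch (1970-01-01 UTC),
// hence the offset added to the wall-clock time below.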
static int CurrentTime(sqlite3_vfs* vfs, sqlite3_int64* time)
{
  auto start = ceph::coarse_mono_clock::now();
  dv(5) << time << dendl;

  auto t = ceph_clock_now();
  *time = t.to_msec() + 2440587.5*86400000; /* julian days since 1970 converted to ms */

  auto end = ceph::coarse_mono_clock::now();
  getdata(vfs).logger->tinc(P_OP_CURRENTTIME, end-start);
  return SQLITE_OK;
}

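// Allow an embedding application to supply its own CephContext instead of the
// one getcct() would bootstrap from CEPH_ARGS and the config files. On
// success the cluster address string is returned through ident as a strdup'd
// string.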
LIBCEPHSQLITE_API int cephsqlite_setcct(CephContext* cct, char** ident)
{
  ldout(cct, 1) << "cct: " << cct << dendl;

  if (sqlite3_api == nullptr) {
    lderr(cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl;
    return -EINVAL;
  }

  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    lderr(cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl;
    return -EINVAL;
  }

  auto& appd = getdata(vfs);
  appd.cct = cct;
  if (int rc = appd.setup_perf(); rc < 0) {
    appd.cct = nullptr;
    return rc;
  }
  if (int rc = appd.init_cluster(); rc < 0) {
    appd.cct = nullptr;
    return rc;
  }

  auto s = appd.cluster.get_addrs();
  if (ident) {
    *ident = strdup(s.c_str());
  }

  ldout(cct, 1) << "complete" << dendl;

  return 0;
}

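// SQL scalar functions registered by autoreg() below: ceph_perf() returns the
// perf counters and ceph_status() the cluster instance id and address, both
// as JSON text. For example: SELECT ceph_status();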
static void f_perf(sqlite3_context* ctx, int argc, sqlite3_value** argv)
{
  auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx);
  dv(10) << dendl;
  auto&& appd = getdata(vfs);
  JSONFormatter f(false);
  f.open_object_section("ceph_perf");
  appd.logger->dump_formatted(&f, false, false);
  appd.striper_logger->dump_formatted(&f, false, false);
  f.close_section();
  {
    CachedStackStringStream css;
    f.flush(*css);
    auto sv = css->strv();
    dv(20) << " = " << sv << dendl;
    sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT);
  }
}

static void f_status(sqlite3_context* ctx, int argc, sqlite3_value** argv)
{
  auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx);
  dv(10) << dendl;
  auto&& appd = getdata(vfs);
  JSONFormatter f(false);
  f.open_object_section("ceph_status");
  f.dump_int("id", appd.cluster.get_instance_id());
  f.dump_string("addr", appd.cluster.get_addrs());
  f.close_section();
  {
    CachedStackStringStream css;
    f.flush(*css);
    auto sv = css->strv();
    dv(20) << " = " << sv << dendl;
    sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT);
  }
}

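// Registered as an auto-extension in sqlite3_cephsqlite_init() so that every
// new database connection automatically gets the ceph_perf() and
// ceph_status() functions.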
static int autoreg(sqlite3* db, char** err, const struct sqlite3_api_routines* thunk)
{
  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    ceph_abort("ceph vfs not found");
  }

  if (int rc = sqlite3_create_function(db, "ceph_perf", 0, SQLITE_UTF8, vfs, f_perf, nullptr, nullptr); rc) {
    return rc;
  }

  if (int rc = sqlite3_create_function(db, "ceph_status", 0, SQLITE_UTF8, vfs, f_status, nullptr, nullptr); rc) {
    return rc;
  }

  return SQLITE_OK;
}

/* You may wonder why we have an atexit handler. After all, atexit/exit creates
 * a mess for multithreaded programs. Well, sqlite3 does not have an API for
 * orderly removal of extensions. And, in fact, any API we might make
 * unofficially (such as "sqlite3_cephsqlite_fini") would potentially race with
 * other threads interacting with sqlite3 + the "ceph" VFS. There is a method
 * for removing a VFS but it's not called by sqlite3 in any error scenario and
 * there is no mechanism within sqlite3 to tell a VFS to unregister itself.
 *
 * This all would be mostly okay if /bin/sqlite3 did not call exit(3), but it
 * does. (This occurs only for the sqlite3 binary, not when used as a library.)
 * exit(3) calls destructors on all static-duration structures for the program.
 * This breaks any outstanding threads created by the librados handle in all
 * sorts of fantastic ways from C++ exceptions to memory faults. In general,
 * Ceph libraries are not tolerant of exit(3) (_exit(3) is okay!). Applications
 * must clean up after themselves or _exit(3).
 *
 * So, we have an atexit handler for libcephsqlite. This simply shuts down the
 * RADOS handle. We can be assured that this occurs before any ceph library
 * static-duration structures are destructed due to ordering guarantees by
 * exit(3). Generally, we only see this called when the VFS is used by
 * /bin/sqlite3 and only during sqlite3 error scenarios (like I/O errors
 * arising from blocklisting).
 */

static void cephsqlite_atexit()
{
  if (auto vfs = sqlite3_vfs_find("ceph"); vfs) {
    if (vfs->pAppData) {
      auto&& appd = getdata(vfs);
      delete &appd;
      vfs->pAppData = nullptr;
    }
  }
}

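// Extension entry point: register the "ceph" VFS (not as the default VFS),
// install the atexit handler, and arrange for autoreg() to run on every new
// connection. Returning SQLITE_OK_LOAD_PERMANENTLY keeps the shared object
// loaded for the life of the process.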
LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api)
{
  SQLITE_EXTENSION_INIT2(api);

  auto vfs = sqlite3_vfs_find("ceph");
  if (!vfs) {
    vfs = (sqlite3_vfs*) calloc(1, sizeof(sqlite3_vfs));
    auto appd = new cephsqlite_appdata;
    vfs->iVersion = 2;
    vfs->szOsFile = sizeof(struct cephsqlite_file);
    vfs->mxPathname = 4096;
    vfs->zName = "ceph";
    vfs->pAppData = appd;
    vfs->xOpen = Open;
    vfs->xDelete = Delete;
    vfs->xAccess = Access;
    vfs->xFullPathname = FullPathname;
    vfs->xCurrentTimeInt64 = CurrentTime;
    if (int rc = sqlite3_vfs_register(vfs, 0); rc) {
      delete appd;
      free(vfs);
      return rc;
    }
  }

  if (int rc = std::atexit(cephsqlite_atexit); rc) {
    return SQLITE_INTERNAL;
  }

  if (int rc = sqlite3_auto_extension((void(*)(void))autoreg); rc) {
    return rc;
  }
  if (int rc = autoreg(db, err, api); rc) {
    return rc;
  }

  return SQLITE_OK_LOAD_PERMANENTLY;
}