]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | /* | |
5 | * Ceph - scalable distributed file system | |
6 | * | |
7 | * Copyright (C) 2021 Red Hat, Inc. | |
8 | * | |
9 | * This is free software; you can redistribute it and/or modify it under the | |
10 | * terms of the GNU Lesser General Public License version 2.1, as published by | |
11 | * the Free Software Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <boost/smart_ptr/intrusive_ptr.hpp> | |
16 | #include <fmt/format.h> | |
17 | ||
18 | #include <fcntl.h> | |
19 | #include <stdio.h> | |
20 | #include <sys/stat.h> | |
21 | #include <sys/types.h> | |
22 | #include <unistd.h> | |
23 | ||
20effc67 | 24 | #include <cstdlib> |
f67539c2 TL |
25 | #include <iomanip> |
26 | #include <iostream> | |
27 | #include <regex> | |
28 | #include <sstream> | |
29 | #include <string_view> | |
30 | ||
31 | #include <limits.h> | |
32 | #include <string.h> | |
33 | ||
34 | #include <sqlite3ext.h> | |
35 | SQLITE_EXTENSION_INIT1 | |
36 | ||
37 | #include "include/ceph_assert.h" | |
38 | #include "include/rados/librados.hpp" | |
39 | ||
40 | #include "common/Clock.h" | |
41 | #include "common/Formatter.h" | |
42 | #include "common/ceph_argparse.h" | |
43 | #include "common/ceph_mutex.h" | |
44 | #include "common/common_init.h" | |
45 | #include "common/config.h" | |
46 | #include "common/debug.h" | |
47 | #include "common/errno.h" | |
48 | #include "common/perf_counters.h" | |
49 | #include "common/version.h" | |
50 | ||
51 | #include "include/libcephsqlite.h" | |
52 | #include "SimpleRADOSStriper.h" | |
53 | ||
// Debug-output plumbing for the cephsqlite subsystem.
#define dout_subsys ceph_subsys_cephsqlite
#undef dout_prefix
#define dout_prefix *_dout << "cephsqlite: " << __func__ << ": "
// d: log with the RADOS instance id; dv/df bind to locals named
// cct/cluster (dv) or a cephsqlite_file* named f (df) in the caller.
#define d(cct,cluster,lvl) ldout((cct), (lvl)) << "(client." << cluster->get_instance_id() << ") "
#define dv(lvl) d(cct,cluster,(lvl))
#define df(lvl) d(f->io.cct,f->io.cluster,(lvl)) << f->loc << " "
f67539c2 TL |
60 | |
/* Perf-counter indices for the "ceph" VFS; the [P_FIRST, P_LAST) range is
 * registered with PerfCountersBuilder in cephsqlite_appdata::setup_perf().
 * P_OP_* are vfs-level operations, P_OPF_* are per-file operations.
 */
enum {
  P_FIRST = 0xf0000,
  P_OP_OPEN,
  P_OP_DELETE,
  P_OP_ACCESS,
  P_OP_FULLPATHNAME,
  P_OP_CURRENTTIME,
  P_OPF_CLOSE,
  P_OPF_READ,
  P_OPF_WRITE,
  P_OPF_TRUNCATE,
  P_OPF_SYNC,
  P_OPF_FILESIZE,
  P_OPF_LOCK,
  P_OPF_UNLOCK,
  P_OPF_CHECKRESERVEDLOCK,
  P_OPF_FILECONTROL,
  P_OPF_SECTORSIZE,
  P_OPF_DEVICECHARACTERISTICS,
  P_LAST,
};
82 | ||
aee94f69 TL |
83 | using cctptr = boost::intrusive_ptr<CephContext>; |
84 | using rsptr = std::shared_ptr<librados::Rados>; | |
85 | ||
f67539c2 TL |
86 | struct cephsqlite_appdata { |
87 | ~cephsqlite_appdata() { | |
aee94f69 TL |
88 | { |
89 | std::scoped_lock lock(cluster_mutex); | |
90 | _disconnect(); | |
91 | } | |
f67539c2 TL |
92 | if (logger) { |
93 | cct->get_perfcounters_collection()->remove(logger.get()); | |
94 | } | |
95 | if (striper_logger) { | |
96 | cct->get_perfcounters_collection()->remove(striper_logger.get()); | |
97 | } | |
98 | } | |
99 | int setup_perf() { | |
100 | ceph_assert(cct); | |
101 | PerfCountersBuilder plb(cct.get(), "libcephsqlite_vfs", P_FIRST, P_LAST); | |
102 | plb.add_time_avg(P_OP_OPEN, "op_open", "Time average of Open operations"); | |
103 | plb.add_time_avg(P_OP_DELETE, "op_delete", "Time average of Delete operations"); | |
104 | plb.add_time_avg(P_OP_ACCESS, "op_access", "Time average of Access operations"); | |
105 | plb.add_time_avg(P_OP_FULLPATHNAME, "op_fullpathname", "Time average of FullPathname operations"); | |
106 | plb.add_time_avg(P_OP_CURRENTTIME, "op_currenttime", "Time average of Currenttime operations"); | |
107 | plb.add_time_avg(P_OPF_CLOSE, "opf_close", "Time average of Close file operations"); | |
108 | plb.add_time_avg(P_OPF_READ, "opf_read", "Time average of Read file operations"); | |
109 | plb.add_time_avg(P_OPF_WRITE, "opf_write", "Time average of Write file operations"); | |
110 | plb.add_time_avg(P_OPF_TRUNCATE, "opf_truncate", "Time average of Truncate file operations"); | |
111 | plb.add_time_avg(P_OPF_SYNC, "opf_sync", "Time average of Sync file operations"); | |
112 | plb.add_time_avg(P_OPF_FILESIZE, "opf_filesize", "Time average of FileSize file operations"); | |
113 | plb.add_time_avg(P_OPF_LOCK, "opf_lock", "Time average of Lock file operations"); | |
114 | plb.add_time_avg(P_OPF_UNLOCK, "opf_unlock", "Time average of Unlock file operations"); | |
115 | plb.add_time_avg(P_OPF_CHECKRESERVEDLOCK, "opf_checkreservedlock", "Time average of CheckReservedLock file operations"); | |
116 | plb.add_time_avg(P_OPF_FILECONTROL, "opf_filecontrol", "Time average of FileControl file operations"); | |
117 | plb.add_time_avg(P_OPF_SECTORSIZE, "opf_sectorsize", "Time average of SectorSize file operations"); | |
118 | plb.add_time_avg(P_OPF_DEVICECHARACTERISTICS, "opf_devicecharacteristics", "Time average of DeviceCharacteristics file operations"); | |
119 | logger.reset(plb.create_perf_counters()); | |
120 | if (int rc = SimpleRADOSStriper::config_logger(cct.get(), "libcephsqlite_striper", &striper_logger); rc < 0) { | |
121 | return rc; | |
122 | } | |
123 | cct->get_perfcounters_collection()->add(logger.get()); | |
124 | cct->get_perfcounters_collection()->add(striper_logger.get()); | |
125 | return 0; | |
126 | } | |
aee94f69 TL |
127 | |
128 | std::pair<cctptr, rsptr> get_cluster() { | |
129 | std::scoped_lock lock(cluster_mutex); | |
130 | if (!cct) { | |
131 | if (int rc = _open(nullptr); rc < 0) { | |
132 | ceph_abort("could not open connection to ceph"); | |
133 | } | |
134 | } | |
135 | return {cct, cluster}; | |
136 | } | |
137 | int connect() { | |
138 | std::scoped_lock lock(cluster_mutex); | |
139 | return _connect(); | |
140 | } | |
141 | int reconnect() { | |
142 | std::scoped_lock lock(cluster_mutex); | |
143 | _disconnect(); | |
144 | return _connect(); | |
145 | } | |
146 | int maybe_reconnect(rsptr _cluster) { | |
147 | std::scoped_lock lock(cluster_mutex); | |
148 | if (!cluster || cluster == _cluster) { | |
149 | ldout(cct, 10) << "reconnecting to RADOS" << dendl; | |
150 | _disconnect(); | |
151 | return _connect(); | |
152 | } else { | |
153 | ldout(cct, 10) << "already reconnected" << dendl; | |
154 | return 0; | |
155 | } | |
156 | } | |
157 | int open(CephContext* _cct) { | |
158 | std::scoped_lock lock(cluster_mutex); | |
159 | return _open(_cct); | |
160 | } | |
161 | ||
162 | std::unique_ptr<PerfCounters> logger; | |
163 | std::shared_ptr<PerfCounters> striper_logger; | |
164 | ||
165 | private: | |
166 | int _open(CephContext* _cct) { | |
167 | if (!_cct) { | |
168 | std::vector<const char*> env_args; | |
169 | env_to_vec(env_args, "CEPH_ARGS"); | |
170 | std::string cluster, conf_file_list; // unused | |
171 | CephInitParameters iparams = ceph_argparse_early_args(env_args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list); | |
172 | cct = cctptr(common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0), false); | |
173 | cct->_conf.parse_config_files(nullptr, &std::cerr, 0); | |
174 | cct->_conf.parse_env(cct->get_module_type()); // environment variables override | |
175 | cct->_conf.apply_changes(nullptr); | |
176 | common_init_finish(cct.get()); | |
177 | } else { | |
178 | cct = cctptr(_cct); | |
179 | } | |
180 | ||
181 | if (int rc = setup_perf(); rc < 0) { | |
182 | return rc; | |
183 | } | |
184 | ||
185 | if (int rc = _connect(); rc < 0) { | |
186 | return rc; | |
187 | } | |
188 | ||
189 | return 0; | |
190 | } | |
191 | void _disconnect() { | |
192 | if (cluster) { | |
193 | cluster.reset(); | |
194 | } | |
195 | } | |
196 | int _connect() { | |
f67539c2 | 197 | ceph_assert(cct); |
aee94f69 | 198 | auto _cluster = rsptr(new librados::Rados()); |
f67539c2 | 199 | ldout(cct, 5) << "initializing RADOS handle as " << cct->_conf->name << dendl; |
aee94f69 | 200 | if (int rc = _cluster->init_with_context(cct.get()); rc < 0) { |
f67539c2 TL |
201 | lderr(cct) << "cannot initialize RADOS: " << cpp_strerror(rc) << dendl; |
202 | return rc; | |
203 | } | |
aee94f69 | 204 | if (int rc = _cluster->connect(); rc < 0) { |
f67539c2 TL |
205 | lderr(cct) << "cannot connect: " << cpp_strerror(rc) << dendl; |
206 | return rc; | |
207 | } | |
aee94f69 | 208 | auto s = _cluster->get_addrs(); |
f67539c2 | 209 | ldout(cct, 5) << "completed connection to RADOS with address " << s << dendl; |
aee94f69 | 210 | cluster = std::move(_cluster); |
f67539c2 TL |
211 | return 0; |
212 | } | |
213 | ||
aee94f69 TL |
214 | ceph::mutex cluster_mutex = ceph::make_mutex("libcephsqlite");; |
215 | cctptr cct; | |
216 | rsptr cluster; | |
f67539c2 TL |
217 | }; |
218 | ||
// Parsed location of a database: "<pool>:<namespace>/<name>".
struct cephsqlite_fileloc {
  std::string pool;    // pool by name, or "*<id>" for a numeric pool id
  std::string radosns; // RADOS namespace (may be empty)
  std::string name;    // object name prefix of the striped file
};
224 | ||
225 | struct cephsqlite_fileio { | |
aee94f69 TL |
226 | cctptr cct; |
227 | rsptr cluster; // anchor for ioctx | |
f67539c2 TL |
228 | librados::IoCtx ioctx; |
229 | std::unique_ptr<SimpleRADOSStriper> rs; | |
230 | }; | |
231 | ||
232 | std::ostream& operator<<(std::ostream &out, const cephsqlite_fileloc& fileloc) { | |
233 | return out | |
234 | << "[" | |
235 | << fileloc.pool | |
236 | << ":" | |
237 | << fileloc.radosns | |
238 | << "/" | |
239 | << fileloc.name | |
240 | << "]" | |
241 | ; | |
242 | } | |
243 | ||
244 | struct cephsqlite_file { | |
245 | sqlite3_file base; | |
246 | struct sqlite3_vfs* vfs = nullptr; | |
247 | int flags = 0; | |
248 | // There are 5 lock states: https://sqlite.org/c3ref/c_lock_exclusive.html | |
249 | int lock = 0; | |
250 | struct cephsqlite_fileloc loc{}; | |
251 | struct cephsqlite_fileio io{}; | |
252 | }; | |
253 | ||
254 | ||
// Fetch the cephsqlite_appdata hanging off a sqlite3_vfs.
#define getdata(vfs) (*((cephsqlite_appdata*)((vfs)->pAppData)))
256 | ||
f67539c2 TL |
257 | static int Lock(sqlite3_file *file, int ilock) |
258 | { | |
259 | auto f = (cephsqlite_file*)file; | |
260 | auto start = ceph::coarse_mono_clock::now(); | |
261 | df(5) << std::hex << ilock << dendl; | |
262 | ||
263 | auto& lock = f->lock; | |
264 | ceph_assert(!f->io.rs->is_locked() || lock > SQLITE_LOCK_NONE); | |
265 | ceph_assert(lock <= ilock); | |
266 | if (!f->io.rs->is_locked() && ilock > SQLITE_LOCK_NONE) { | |
267 | if (int rc = f->io.rs->lock(0); rc < 0) { | |
268 | df(5) << "failed: " << rc << dendl; | |
aee94f69 TL |
269 | if (rc == -EBLOCKLISTED) { |
270 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
271 | } | |
f67539c2 TL |
272 | return SQLITE_IOERR; |
273 | } | |
274 | } | |
275 | ||
276 | lock = ilock; | |
277 | auto end = ceph::coarse_mono_clock::now(); | |
278 | getdata(f->vfs).logger->tinc(P_OPF_LOCK, end-start); | |
279 | return SQLITE_OK; | |
280 | } | |
281 | ||
282 | static int Unlock(sqlite3_file *file, int ilock) | |
283 | { | |
284 | auto f = (cephsqlite_file*)file; | |
285 | auto start = ceph::coarse_mono_clock::now(); | |
286 | df(5) << std::hex << ilock << dendl; | |
287 | ||
288 | auto& lock = f->lock; | |
289 | ceph_assert(lock == SQLITE_LOCK_NONE || (lock > SQLITE_LOCK_NONE && f->io.rs->is_locked())); | |
290 | ceph_assert(lock >= ilock); | |
291 | if (ilock <= SQLITE_LOCK_NONE && SQLITE_LOCK_NONE < lock) { | |
292 | if (int rc = f->io.rs->unlock(); rc < 0) { | |
293 | df(5) << "failed: " << rc << dendl; | |
aee94f69 TL |
294 | if (rc == -EBLOCKLISTED) { |
295 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
296 | } | |
f67539c2 TL |
297 | return SQLITE_IOERR; |
298 | } | |
299 | } | |
300 | ||
301 | lock = ilock; | |
302 | auto end = ceph::coarse_mono_clock::now(); | |
303 | getdata(f->vfs).logger->tinc(P_OPF_UNLOCK, end-start); | |
304 | return SQLITE_OK; | |
305 | } | |
306 | ||
307 | static int CheckReservedLock(sqlite3_file *file, int *result) | |
308 | { | |
309 | auto f = (cephsqlite_file*)file; | |
310 | auto start = ceph::coarse_mono_clock::now(); | |
311 | df(5) << dendl; | |
20effc67 | 312 | *result = 0; |
f67539c2 TL |
313 | |
314 | auto& lock = f->lock; | |
315 | if (lock > SQLITE_LOCK_SHARED) { | |
316 | *result = 1; | |
317 | } | |
318 | ||
319 | df(10); | |
320 | f->io.rs->print_lockers(*_dout); | |
321 | *_dout << dendl; | |
322 | ||
f67539c2 TL |
323 | auto end = ceph::coarse_mono_clock::now(); |
324 | getdata(f->vfs).logger->tinc(P_OPF_CHECKRESERVEDLOCK, end-start); | |
325 | return SQLITE_OK; | |
326 | } | |
327 | ||
328 | static int Close(sqlite3_file *file) | |
329 | { | |
330 | auto f = (cephsqlite_file*)file; | |
331 | auto start = ceph::coarse_mono_clock::now(); | |
332 | df(5) << dendl; | |
333 | f->~cephsqlite_file(); | |
334 | auto end = ceph::coarse_mono_clock::now(); | |
335 | getdata(f->vfs).logger->tinc(P_OPF_CLOSE, end-start); | |
336 | return SQLITE_OK; | |
337 | } | |
338 | ||
339 | static int Read(sqlite3_file *file, void *buf, int len, sqlite_int64 off) | |
340 | { | |
341 | auto f = (cephsqlite_file*)file; | |
342 | auto start = ceph::coarse_mono_clock::now(); | |
343 | df(5) << buf << " " << off << "~" << len << dendl; | |
344 | ||
345 | if (int rc = f->io.rs->read(buf, len, off); rc < 0) { | |
346 | df(5) << "read failed: " << cpp_strerror(rc) << dendl; | |
aee94f69 TL |
347 | if (rc == -EBLOCKLISTED) { |
348 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
349 | } | |
f67539c2 TL |
350 | return SQLITE_IOERR_READ; |
351 | } else { | |
352 | df(5) << "= " << rc << dendl; | |
353 | auto end = ceph::coarse_mono_clock::now(); | |
354 | getdata(f->vfs).logger->tinc(P_OPF_READ, end-start); | |
355 | if (rc < len) { | |
aee94f69 | 356 | memset((unsigned char*)buf+rc, 0, len-rc); |
f67539c2 TL |
357 | return SQLITE_IOERR_SHORT_READ; |
358 | } else { | |
359 | return SQLITE_OK; | |
360 | } | |
361 | } | |
362 | } | |
363 | ||
364 | static int Write(sqlite3_file *file, const void *buf, int len, sqlite_int64 off) | |
365 | { | |
366 | auto f = (cephsqlite_file*)file; | |
367 | auto start = ceph::coarse_mono_clock::now(); | |
368 | df(5) << off << "~" << len << dendl; | |
369 | ||
370 | if (int rc = f->io.rs->write(buf, len, off); rc < 0) { | |
371 | df(5) << "write failed: " << cpp_strerror(rc) << dendl; | |
aee94f69 TL |
372 | if (rc == -EBLOCKLISTED) { |
373 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
374 | } | |
f67539c2 TL |
375 | return SQLITE_IOERR_WRITE; |
376 | } else { | |
377 | df(5) << "= " << rc << dendl; | |
378 | auto end = ceph::coarse_mono_clock::now(); | |
379 | getdata(f->vfs).logger->tinc(P_OPF_WRITE, end-start); | |
380 | return SQLITE_OK; | |
381 | } | |
382 | ||
383 | } | |
384 | ||
385 | static int Truncate(sqlite3_file *file, sqlite_int64 size) | |
386 | { | |
387 | auto f = (cephsqlite_file*)file; | |
388 | auto start = ceph::coarse_mono_clock::now(); | |
389 | df(5) << size << dendl; | |
390 | ||
391 | if (int rc = f->io.rs->truncate(size); rc < 0) { | |
392 | df(5) << "truncate failed: " << cpp_strerror(rc) << dendl; | |
aee94f69 TL |
393 | if (rc == -EBLOCKLISTED) { |
394 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
395 | } | |
f67539c2 TL |
396 | return SQLITE_IOERR; |
397 | } | |
398 | ||
399 | auto end = ceph::coarse_mono_clock::now(); | |
400 | getdata(f->vfs).logger->tinc(P_OPF_TRUNCATE, end-start); | |
401 | return SQLITE_OK; | |
402 | } | |
403 | ||
404 | static int Sync(sqlite3_file *file, int flags) | |
405 | { | |
406 | auto f = (cephsqlite_file*)file; | |
407 | auto start = ceph::coarse_mono_clock::now(); | |
408 | df(5) << flags << dendl; | |
409 | ||
410 | if (int rc = f->io.rs->flush(); rc < 0) { | |
411 | df(5) << "failed: " << cpp_strerror(rc) << dendl; | |
aee94f69 TL |
412 | if (rc == -EBLOCKLISTED) { |
413 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
414 | } | |
f67539c2 TL |
415 | return SQLITE_IOERR; |
416 | } | |
417 | ||
418 | df(5) << " = 0" << dendl; | |
419 | ||
420 | auto end = ceph::coarse_mono_clock::now(); | |
421 | getdata(f->vfs).logger->tinc(P_OPF_SYNC, end-start); | |
422 | return SQLITE_OK; | |
423 | } | |
424 | ||
425 | ||
426 | static int FileSize(sqlite3_file *file, sqlite_int64 *osize) | |
427 | { | |
428 | auto f = (cephsqlite_file*)file; | |
429 | auto start = ceph::coarse_mono_clock::now(); | |
430 | df(5) << dendl; | |
431 | ||
432 | uint64_t size = 0; | |
433 | if (int rc = f->io.rs->stat(&size); rc < 0) { | |
434 | df(5) << "stat failed: " << cpp_strerror(rc) << dendl; | |
aee94f69 TL |
435 | if (rc == -EBLOCKLISTED) { |
436 | getdata(f->vfs).maybe_reconnect(f->io.cluster); | |
437 | } | |
f67539c2 TL |
438 | return SQLITE_NOTFOUND; |
439 | } | |
440 | ||
441 | *osize = (sqlite_int64)size; | |
442 | ||
443 | df(5) << "= " << size << dendl; | |
444 | ||
445 | auto end = ceph::coarse_mono_clock::now(); | |
446 | getdata(f->vfs).logger->tinc(P_OPF_FILESIZE, end-start); | |
447 | return SQLITE_OK; | |
448 | } | |
449 | ||
450 | ||
451 | static bool parsepath(std::string_view path, struct cephsqlite_fileloc* fileloc) | |
452 | { | |
0948533f TL |
453 | static const std::regex re1{"^/*(\\*[[:digit:]]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"}; |
454 | static const std::regex re2{"^/*([[:alnum:]\\-_.]+):([[:alnum:]\\-_.]*)/([[:alnum:]\\-._]+)$"}; | |
f67539c2 TL |
455 | |
456 | std::cmatch cm; | |
457 | if (!std::regex_match(path.data(), cm, re1)) { | |
458 | if (!std::regex_match(path.data(), cm, re2)) { | |
459 | return false; | |
460 | } | |
461 | } | |
462 | fileloc->pool = cm[1]; | |
463 | fileloc->radosns = cm[2]; | |
464 | fileloc->name = cm[3]; | |
465 | ||
466 | return true; | |
467 | } | |
468 | ||
aee94f69 | 469 | static int makestriper(sqlite3_vfs* vfs, cctptr cct, rsptr cluster, const cephsqlite_fileloc& loc, cephsqlite_fileio* io) |
f67539c2 | 470 | { |
f67539c2 TL |
471 | bool gotmap = false; |
472 | ||
aee94f69 | 473 | d(cct,cluster,10) << loc << dendl; |
f67539c2 TL |
474 | |
475 | enoent_retry: | |
476 | if (loc.pool[0] == '*') { | |
477 | std::string err; | |
478 | int64_t id = strict_strtoll(loc.pool.c_str()+1, 10, &err); | |
479 | ceph_assert(err.empty()); | |
aee94f69 | 480 | if (int rc = cluster->ioctx_create2(id, io->ioctx); rc < 0) { |
f67539c2 | 481 | if (rc == -ENOENT && !gotmap) { |
aee94f69 | 482 | cluster->wait_for_latest_osdmap(); |
f67539c2 TL |
483 | gotmap = true; |
484 | goto enoent_retry; | |
485 | } | |
aee94f69 | 486 | d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl; |
f67539c2 TL |
487 | return rc; |
488 | } | |
489 | } else { | |
aee94f69 | 490 | if (int rc = cluster->ioctx_create(loc.pool.c_str(), io->ioctx); rc < 0) { |
f67539c2 | 491 | if (rc == -ENOENT && !gotmap) { |
aee94f69 | 492 | cluster->wait_for_latest_osdmap(); |
f67539c2 TL |
493 | gotmap = true; |
494 | goto enoent_retry; | |
495 | } | |
aee94f69 | 496 | d(cct,cluster,1) << "cannot create ioctx: " << cpp_strerror(rc) << dendl; |
f67539c2 TL |
497 | return rc; |
498 | } | |
499 | } | |
500 | ||
501 | if (!loc.radosns.empty()) | |
502 | io->ioctx.set_namespace(loc.radosns); | |
503 | ||
504 | io->rs = std::make_unique<SimpleRADOSStriper>(io->ioctx, loc.name); | |
aee94f69 | 505 | io->rs->set_logger(getdata(vfs).striper_logger); |
f67539c2 TL |
506 | io->rs->set_lock_timeout(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_timeout")); |
507 | io->rs->set_lock_interval(cct->_conf.get_val<std::chrono::milliseconds>("cephsqlite_lock_renewal_interval")); | |
508 | io->rs->set_blocklist_the_dead(cct->_conf.get_val<bool>("cephsqlite_blocklist_dead_locker")); | |
aee94f69 TL |
509 | io->cluster = std::move(cluster); |
510 | io->cct = cct; | |
f67539c2 TL |
511 | |
512 | return 0; | |
513 | } | |
514 | ||
515 | static int SectorSize(sqlite3_file* sf) | |
516 | { | |
517 | static const int size = 65536; | |
518 | auto start = ceph::coarse_mono_clock::now(); | |
519 | auto f = (cephsqlite_file*)sf; | |
520 | df(5) << " = " << size << dendl; | |
521 | auto end = ceph::coarse_mono_clock::now(); | |
522 | getdata(f->vfs).logger->tinc(P_OPF_SECTORSIZE, end-start); | |
523 | return size; | |
524 | } | |
525 | ||
526 | static int FileControl(sqlite3_file* sf, int op, void *arg) | |
527 | { | |
528 | auto f = (cephsqlite_file*)sf; | |
529 | auto start = ceph::coarse_mono_clock::now(); | |
530 | df(5) << op << ", " << arg << dendl; | |
531 | auto end = ceph::coarse_mono_clock::now(); | |
532 | getdata(f->vfs).logger->tinc(P_OPF_FILECONTROL, end-start); | |
533 | return SQLITE_NOTFOUND; | |
534 | } | |
535 | ||
536 | static int DeviceCharacteristics(sqlite3_file* sf) | |
537 | { | |
538 | auto f = (cephsqlite_file*)sf; | |
539 | auto start = ceph::coarse_mono_clock::now(); | |
540 | df(5) << dendl; | |
541 | static const int c = 0 | |
542 | |SQLITE_IOCAP_ATOMIC | |
543 | |SQLITE_IOCAP_POWERSAFE_OVERWRITE | |
544 | |SQLITE_IOCAP_UNDELETABLE_WHEN_OPEN | |
545 | |SQLITE_IOCAP_SAFE_APPEND | |
546 | ; | |
547 | auto end = ceph::coarse_mono_clock::now(); | |
548 | getdata(f->vfs).logger->tinc(P_OPF_DEVICECHARACTERISTICS, end-start); | |
549 | return c; | |
550 | } | |
551 | ||
552 | static int Open(sqlite3_vfs *vfs, const char *name, sqlite3_file *file, | |
553 | int flags, int *oflags) | |
554 | { | |
555 | static const sqlite3_io_methods io = { | |
556 | 1, /* iVersion */ | |
557 | Close, /* xClose */ | |
558 | Read, /* xRead */ | |
559 | Write, /* xWrite */ | |
560 | Truncate, /* xTruncate */ | |
561 | Sync, /* xSync */ | |
562 | FileSize, /* xFileSize */ | |
563 | Lock, /* xLock */ | |
564 | Unlock, /* xUnlock */ | |
565 | CheckReservedLock, /* xCheckReservedLock */ | |
566 | FileControl, /* xFileControl */ | |
567 | SectorSize, /* xSectorSize */ | |
568 | DeviceCharacteristics /* xDeviceCharacteristics */ | |
569 | }; | |
570 | ||
571 | auto start = ceph::coarse_mono_clock::now(); | |
572 | bool gotmap = false; | |
aee94f69 | 573 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
574 | |
575 | /* we are not going to create temporary files */ | |
576 | if (name == NULL) { | |
577 | dv(-1) << " cannot open temporary database" << dendl; | |
578 | return SQLITE_CANTOPEN; | |
579 | } | |
580 | auto path = std::string_view(name); | |
20effc67 | 581 | if (path == ":memory:") { |
f67539c2 TL |
582 | dv(-1) << " cannot open temporary database" << dendl; |
583 | return SQLITE_IOERR; | |
584 | } | |
585 | ||
586 | dv(5) << path << " flags=" << std::hex << flags << dendl; | |
587 | ||
588 | auto f = new (file)cephsqlite_file(); | |
589 | f->vfs = vfs; | |
590 | if (!parsepath(path, &f->loc)) { | |
591 | ceph_assert(0); /* xFullPathname validates! */ | |
592 | } | |
593 | f->flags = flags; | |
594 | ||
595 | enoent_retry: | |
aee94f69 | 596 | if (int rc = makestriper(vfs, cct, cluster, f->loc, &f->io); rc < 0) { |
f67539c2 | 597 | f->~cephsqlite_file(); |
aee94f69 | 598 | dv(-1) << "cannot open striper" << dendl; |
f67539c2 TL |
599 | return SQLITE_IOERR; |
600 | } | |
601 | ||
602 | if (flags & SQLITE_OPEN_CREATE) { | |
603 | dv(10) << "OPEN_CREATE" << dendl; | |
604 | if (int rc = f->io.rs->create(); rc < 0 && rc != -EEXIST) { | |
605 | if (rc == -ENOENT && !gotmap) { | |
606 | /* we may have an out of date OSDMap which cancels the op in the | |
607 | * Objecter. Try to get a new one and retry. This is mostly noticable | |
608 | * in testing when pools are getting created/deleted left and right. | |
609 | */ | |
610 | dv(5) << "retrying create after getting latest OSDMap" << dendl; | |
aee94f69 | 611 | cluster->wait_for_latest_osdmap(); |
f67539c2 TL |
612 | gotmap = true; |
613 | goto enoent_retry; | |
614 | } | |
615 | dv(5) << "file cannot be created: " << cpp_strerror(rc) << dendl; | |
616 | return SQLITE_IOERR; | |
617 | } | |
618 | } | |
619 | ||
620 | if (int rc = f->io.rs->open(); rc < 0) { | |
621 | if (rc == -ENOENT && !gotmap) { | |
622 | /* See comment above for create case. */ | |
623 | dv(5) << "retrying open after getting latest OSDMap" << dendl; | |
aee94f69 | 624 | cluster->wait_for_latest_osdmap(); |
f67539c2 TL |
625 | gotmap = true; |
626 | goto enoent_retry; | |
627 | } | |
628 | dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl; | |
629 | return rc; | |
630 | } | |
631 | ||
632 | if (oflags) { | |
633 | *oflags = flags; | |
634 | } | |
635 | f->base.pMethods = &io; | |
636 | auto end = ceph::coarse_mono_clock::now(); | |
637 | getdata(vfs).logger->tinc(P_OP_OPEN, end-start); | |
638 | return SQLITE_OK; | |
639 | } | |
640 | ||
641 | /* | |
642 | ** Delete the file identified by argument path. If the dsync parameter | |
643 | ** is non-zero, then ensure the file-system modification to delete the | |
644 | ** file has been synced to disk before returning. | |
645 | */ | |
646 | static int Delete(sqlite3_vfs* vfs, const char* path, int dsync) | |
647 | { | |
648 | auto start = ceph::coarse_mono_clock::now(); | |
aee94f69 | 649 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
650 | dv(5) << "'" << path << "', " << dsync << dendl; |
651 | ||
652 | cephsqlite_fileloc fileloc; | |
653 | if (!parsepath(path, &fileloc)) { | |
654 | dv(5) << "path does not parse!" << dendl; | |
655 | return SQLITE_NOTFOUND; | |
656 | } | |
657 | ||
658 | cephsqlite_fileio io; | |
aee94f69 TL |
659 | if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) { |
660 | dv(-1) << "cannot open striper" << dendl; | |
f67539c2 TL |
661 | return SQLITE_IOERR; |
662 | } | |
663 | ||
664 | if (int rc = io.rs->lock(0); rc < 0) { | |
665 | return SQLITE_IOERR; | |
666 | } | |
667 | ||
668 | if (int rc = io.rs->remove(); rc < 0) { | |
669 | dv(5) << "= " << rc << dendl; | |
670 | return SQLITE_IOERR_DELETE; | |
671 | } | |
672 | ||
673 | /* No need to unlock */ | |
674 | dv(5) << "= 0" << dendl; | |
675 | auto end = ceph::coarse_mono_clock::now(); | |
676 | getdata(vfs).logger->tinc(P_OP_DELETE, end-start); | |
677 | ||
678 | return SQLITE_OK; | |
679 | } | |
680 | ||
681 | /* | |
682 | ** Query the file-system to see if the named file exists, is readable or | |
683 | ** is both readable and writable. | |
684 | */ | |
685 | static int Access(sqlite3_vfs* vfs, const char* path, int flags, int* result) | |
686 | { | |
687 | auto start = ceph::coarse_mono_clock::now(); | |
aee94f69 | 688 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
689 | dv(5) << path << " " << std::hex << flags << dendl; |
690 | ||
691 | cephsqlite_fileloc fileloc; | |
692 | if (!parsepath(path, &fileloc)) { | |
693 | dv(5) << "path does not parse!" << dendl; | |
694 | return SQLITE_NOTFOUND; | |
695 | } | |
696 | ||
697 | cephsqlite_fileio io; | |
aee94f69 TL |
698 | if (int rc = makestriper(vfs, cct, cluster, fileloc, &io); rc < 0) { |
699 | dv(-1) << "cannot open striper" << dendl; | |
f67539c2 TL |
700 | return SQLITE_IOERR; |
701 | } | |
702 | ||
703 | if (int rc = io.rs->open(); rc < 0) { | |
704 | if (rc == -ENOENT) { | |
705 | *result = 0; | |
706 | return SQLITE_OK; | |
707 | } else { | |
708 | dv(10) << "cannot open striper: " << cpp_strerror(rc) << dendl; | |
709 | *result = 0; | |
710 | return SQLITE_IOERR; | |
711 | } | |
712 | } | |
713 | ||
714 | uint64_t size = 0; | |
715 | if (int rc = io.rs->stat(&size); rc < 0) { | |
716 | dv(5) << "= " << rc << " (" << cpp_strerror(rc) << ")" << dendl; | |
717 | *result = 0; | |
718 | } else { | |
719 | dv(5) << "= 0" << dendl; | |
720 | *result = 1; | |
721 | } | |
722 | ||
723 | auto end = ceph::coarse_mono_clock::now(); | |
724 | getdata(vfs).logger->tinc(P_OP_ACCESS, end-start); | |
725 | return SQLITE_OK; | |
726 | } | |
727 | ||
728 | /* This method is only called once for each database. It provides a chance to | |
729 | * reformat the path into a canonical format. | |
730 | */ | |
731 | static int FullPathname(sqlite3_vfs* vfs, const char* ipath, int opathlen, char* opath) | |
732 | { | |
733 | auto start = ceph::coarse_mono_clock::now(); | |
734 | auto path = std::string_view(ipath); | |
aee94f69 | 735 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
736 | dv(5) << "1: " << path << dendl; |
737 | ||
738 | cephsqlite_fileloc fileloc; | |
739 | if (!parsepath(path, &fileloc)) { | |
740 | dv(5) << "path does not parse!" << dendl; | |
741 | return SQLITE_NOTFOUND; | |
742 | } | |
743 | dv(5) << " parsed " << fileloc << dendl; | |
744 | ||
745 | auto p = fmt::format("{}:{}/{}", fileloc.pool, fileloc.radosns, fileloc.name); | |
746 | if (p.size() >= (size_t)opathlen) { | |
747 | dv(5) << "path too long!" << dendl; | |
748 | return SQLITE_CANTOPEN; | |
749 | } | |
750 | strcpy(opath, p.c_str()); | |
751 | dv(5) << " output " << p << dendl; | |
752 | ||
753 | auto end = ceph::coarse_mono_clock::now(); | |
754 | getdata(vfs).logger->tinc(P_OP_FULLPATHNAME, end-start); | |
755 | return SQLITE_OK; | |
756 | } | |
757 | ||
758 | static int CurrentTime(sqlite3_vfs* vfs, sqlite3_int64* time) | |
759 | { | |
760 | auto start = ceph::coarse_mono_clock::now(); | |
aee94f69 | 761 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
762 | dv(5) << time << dendl; |
763 | ||
764 | auto t = ceph_clock_now(); | |
765 | *time = t.to_msec() + 2440587.5*86400000; /* julian days since 1970 converted to ms */ | |
766 | ||
767 | auto end = ceph::coarse_mono_clock::now(); | |
768 | getdata(vfs).logger->tinc(P_OP_CURRENTTIME, end-start); | |
769 | return SQLITE_OK; | |
770 | } | |
771 | ||
aee94f69 | 772 | LIBCEPHSQLITE_API int cephsqlite_setcct(CephContext* _cct, char** ident) |
f67539c2 | 773 | { |
aee94f69 | 774 | ldout(_cct, 1) << "cct: " << _cct << dendl; |
f67539c2 TL |
775 | |
776 | if (sqlite3_api == nullptr) { | |
aee94f69 | 777 | lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl; |
f67539c2 TL |
778 | return -EINVAL; |
779 | } | |
780 | ||
781 | auto vfs = sqlite3_vfs_find("ceph"); | |
782 | if (!vfs) { | |
aee94f69 | 783 | lderr(_cct) << "API violation: must have sqlite3 init libcephsqlite" << dendl; |
f67539c2 TL |
784 | return -EINVAL; |
785 | } | |
786 | ||
787 | auto& appd = getdata(vfs); | |
aee94f69 | 788 | if (int rc = appd.open(_cct); rc < 0) { |
f67539c2 TL |
789 | return rc; |
790 | } | |
791 | ||
aee94f69 TL |
792 | auto [cct, cluster] = appd.get_cluster(); |
793 | ||
794 | auto s = cluster->get_addrs(); | |
f67539c2 TL |
795 | if (ident) { |
796 | *ident = strdup(s.c_str()); | |
797 | } | |
798 | ||
799 | ldout(cct, 1) << "complete" << dendl; | |
800 | ||
801 | return 0; | |
802 | } | |
803 | ||
804 | static void f_perf(sqlite3_context* ctx, int argc, sqlite3_value** argv) | |
805 | { | |
806 | auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx); | |
aee94f69 | 807 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 TL |
808 | dv(10) << dendl; |
809 | auto&& appd = getdata(vfs); | |
810 | JSONFormatter f(false); | |
811 | f.open_object_section("ceph_perf"); | |
1e59de90 TL |
812 | appd.logger->dump_formatted(&f, false, false); |
813 | appd.striper_logger->dump_formatted(&f, false, false); | |
f67539c2 TL |
814 | f.close_section(); |
815 | { | |
816 | CachedStackStringStream css; | |
817 | f.flush(*css); | |
818 | auto sv = css->strv(); | |
819 | dv(20) << " = " << sv << dendl; | |
820 | sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT); | |
821 | } | |
822 | } | |
823 | ||
824 | static void f_status(sqlite3_context* ctx, int argc, sqlite3_value** argv) | |
825 | { | |
826 | auto vfs = (sqlite3_vfs*)sqlite3_user_data(ctx); | |
aee94f69 | 827 | auto [cct, cluster] = getdata(vfs).get_cluster(); |
f67539c2 | 828 | dv(10) << dendl; |
f67539c2 TL |
829 | JSONFormatter f(false); |
830 | f.open_object_section("ceph_status"); | |
aee94f69 TL |
831 | f.dump_int("id", cluster->get_instance_id()); |
832 | f.dump_string("addr", cluster->get_addrs()); | |
f67539c2 TL |
833 | f.close_section(); |
834 | { | |
835 | CachedStackStringStream css; | |
836 | f.flush(*css); | |
837 | auto sv = css->strv(); | |
838 | dv(20) << " = " << sv << dendl; | |
839 | sqlite3_result_text(ctx, sv.data(), sv.size(), SQLITE_TRANSIENT); | |
840 | } | |
841 | } | |
842 | ||
843 | static int autoreg(sqlite3* db, char** err, const struct sqlite3_api_routines* thunk) | |
844 | { | |
845 | auto vfs = sqlite3_vfs_find("ceph"); | |
846 | if (!vfs) { | |
847 | ceph_abort("ceph vfs not found"); | |
848 | } | |
849 | ||
850 | if (int rc = sqlite3_create_function(db, "ceph_perf", 0, SQLITE_UTF8, vfs, f_perf, nullptr, nullptr); rc) { | |
851 | return rc; | |
852 | } | |
853 | ||
854 | if (int rc = sqlite3_create_function(db, "ceph_status", 0, SQLITE_UTF8, vfs, f_status, nullptr, nullptr); rc) { | |
855 | return rc; | |
856 | } | |
857 | ||
858 | return SQLITE_OK; | |
859 | } | |
860 | ||
20effc67 TL |
861 | /* You may wonder why we have an atexit handler? After all, atexit/exit creates |
862 | * a mess for multithreaded programs. Well, sqlite3 does not have an API for | |
863 | * orderly removal of extensions. And, in fact, any API we might make | |
864 | * unofficially (such as "sqlite3_cephsqlite_fini") would potentially race with | |
865 | * other threads interacting with sqlite3 + the "ceph" VFS. There is a method | |
866 | * for removing a VFS but it's not called by sqlite3 in any error scenario and | |
867 | * there is no mechanism within sqlite3 to tell a VFS to unregister itself. | |
868 | * | |
869 | * This all would be mostly okay if /bin/sqlite3 did not call exit(3), but it | |
870 | * does. (This occurs only for the sqlite3 binary, not when used as a library.) | |
871 | * exit(3) calls destructors on all static-duration structures for the program. | |
872 | * This breaks any outstanding threads created by the librados handle in all | |
873 | * sorts of fantastic ways from C++ exceptions to memory faults. In general, | |
874 | * Ceph libraries are not tolerant of exit(3) (_exit(3) is okay!). Applications | |
875 | * must clean up after themselves or _exit(3). | |
876 | * | |
877 | * So, we have an atexit handler for libcephsqlite. This simply shuts down the | |
878 | * RADOS handle. We can be assured that this occurs before any ceph library | |
879 | * static-duration structures are destructed due to ordering guarantees by | |
880 | * exit(3). Generally, we only see this called when the VFS is used by | |
881 | * /bin/sqlite3 and only during sqlite3 error scenarios (like I/O errors | |
 883 | * arising from blocklisting). |
883 | */ | |
884 | ||
885 | static void cephsqlite_atexit() | |
886 | { | |
887 | if (auto vfs = sqlite3_vfs_find("ceph"); vfs) { | |
888 | if (vfs->pAppData) { | |
889 | auto&& appd = getdata(vfs); | |
890 | delete &appd; | |
891 | vfs->pAppData = nullptr; | |
892 | } | |
893 | } | |
894 | } | |
895 | ||
f67539c2 TL |
896 | LIBCEPHSQLITE_API int sqlite3_cephsqlite_init(sqlite3* db, char** err, const sqlite3_api_routines* api) |
897 | { | |
898 | SQLITE_EXTENSION_INIT2(api); | |
899 | ||
900 | auto vfs = sqlite3_vfs_find("ceph"); | |
901 | if (!vfs) { | |
20effc67 | 902 | vfs = (sqlite3_vfs*) calloc(1, sizeof(sqlite3_vfs)); |
f67539c2 | 903 | auto appd = new cephsqlite_appdata; |
f67539c2 TL |
904 | vfs->iVersion = 2; |
905 | vfs->szOsFile = sizeof(struct cephsqlite_file); | |
906 | vfs->mxPathname = 4096; | |
907 | vfs->zName = "ceph"; | |
908 | vfs->pAppData = appd; | |
909 | vfs->xOpen = Open; | |
910 | vfs->xDelete = Delete; | |
911 | vfs->xAccess = Access; | |
912 | vfs->xFullPathname = FullPathname; | |
913 | vfs->xCurrentTimeInt64 = CurrentTime; | |
20effc67 TL |
914 | if (int rc = sqlite3_vfs_register(vfs, 0); rc) { |
915 | delete appd; | |
916 | free(vfs); | |
917 | return rc; | |
918 | } | |
919 | } | |
920 | ||
921 | if (int rc = std::atexit(cephsqlite_atexit); rc) { | |
922 | return SQLITE_INTERNAL; | |
f67539c2 TL |
923 | } |
924 | ||
925 | if (int rc = sqlite3_auto_extension((void(*)(void))autoreg); rc) { | |
926 | return rc; | |
927 | } | |
928 | if (int rc = autoreg(db, err, api); rc) { | |
929 | return rc; | |
930 | } | |
931 | ||
932 | return SQLITE_OK_LOAD_PERMANENTLY; | |
933 | } |