]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | ||
4 | /* | |
5 | * Ceph - scalable distributed file system | |
6 | * | |
7 | * Copyright (C) 2021 Red Hat, Inc. | |
8 | * | |
9 | * This is free software; you can redistribute it and/or modify it under the | |
10 | * terms of the GNU Lesser General Public License version 2.1, as published by | |
11 | * the Free Software Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <boost/smart_ptr/intrusive_ptr.hpp> | |
16 | ||
17 | #include <fcntl.h> | |
18 | #include <stdio.h> | |
19 | #include <sys/stat.h> | |
20 | #include <sys/types.h> | |
21 | #include <unistd.h> | |
22 | ||
23 | #include <iomanip> | |
24 | #include <iostream> | |
25 | #include <regex> | |
26 | #include <sstream> | |
27 | #include <string_view> | |
28 | ||
29 | #include <limits.h> | |
30 | #include <string.h> | |
31 | ||
32 | #include "include/ceph_assert.h" | |
33 | #include "include/rados/librados.hpp" | |
34 | ||
35 | #include "cls/lock/cls_lock_client.h" | |
36 | ||
37 | #include "common/ceph_argparse.h" | |
38 | #include "common/ceph_mutex.h" | |
39 | #include "common/common_init.h" | |
40 | #include "common/config.h" | |
41 | #include "common/debug.h" | |
42 | #include "common/errno.h" | |
43 | #include "common/version.h" | |
44 | ||
45 | #include "SimpleRADOSStriper.h" | |
46 | ||
47 | using ceph::bufferlist; | |
48 | ||
522d829b | 49 | #define dout_subsys ceph_subsys_cephsqlite |
f67539c2 TL |
50 | #undef dout_prefix |
51 | #define dout_prefix *_dout << "client." << ioctx.get_instance_id() << ": SimpleRADOSStriper: " << __func__ << ": " << oid << ": " | |
52 | #define d(lvl) ldout((CephContext*)ioctx.cct(), (lvl)) | |
53 | ||
/* Perf counter indices used by SimpleRADOSStriper.
 *
 * P_FIRST and P_LAST are the range bounds handed to PerfCountersBuilder in
 * config_logger(); every real counter is numbered between them.
 */
enum {
  P_FIRST = 0xe0000,   /* lower bound of the index range */
  P_UPDATE_METADATA,   /* "update_metadata" */
  P_UPDATE_ALLOCATED,  /* "update_allocated" */
  P_UPDATE_SIZE,       /* "update_size" */
  P_UPDATE_VERSION,    /* "update_version" */
  P_SHRINK,            /* "shrink" */
  P_SHRINK_BYTES,      /* "shrink_bytes" */
  P_LOCK,              /* "lock" */
  P_UNLOCK,            /* "unlock" */
  P_LAST,              /* upper bound of the index range */
};
66 | ||
67 | int SimpleRADOSStriper::config_logger(CephContext* cct, std::string_view name, std::shared_ptr<PerfCounters>* l) | |
68 | { | |
69 | PerfCountersBuilder plb(cct, name.data(), P_FIRST, P_LAST); | |
70 | plb.add_u64_counter(P_UPDATE_METADATA, "update_metadata", "Number of metadata updates"); | |
71 | plb.add_u64_counter(P_UPDATE_ALLOCATED, "update_allocated", "Number of allocated updates"); | |
72 | plb.add_u64_counter(P_UPDATE_SIZE, "update_size", "Number of size updates"); | |
73 | plb.add_u64_counter(P_UPDATE_VERSION, "update_version", "Number of version updates"); | |
74 | plb.add_u64_counter(P_SHRINK, "shrink", "Number of allocation shrinks"); | |
75 | plb.add_u64_counter(P_SHRINK_BYTES, "shrink_bytes", "Bytes shrunk"); | |
76 | plb.add_u64_counter(P_LOCK, "lock", "Number of locks"); | |
77 | plb.add_u64_counter(P_UNLOCK, "unlock", "Number of unlocks"); | |
78 | l->reset(plb.create_perf_counters()); | |
79 | return 0; | |
80 | } | |
81 | ||
82 | SimpleRADOSStriper::~SimpleRADOSStriper() | |
83 | { | |
84 | if (lock_keeper.joinable()) { | |
85 | shutdown = true; | |
86 | lock_keeper_cvar.notify_all(); | |
87 | lock_keeper.join(); | |
88 | } | |
89 | ||
90 | if (ioctx.is_valid()) { | |
91 | d(5) << dendl; | |
92 | ||
93 | if (is_locked()) { | |
94 | unlock(); | |
95 | } | |
96 | } | |
97 | } | |
98 | ||
99 | SimpleRADOSStriper::extent SimpleRADOSStriper::get_next_extent(uint64_t off, size_t len) const | |
100 | { | |
101 | extent e; | |
102 | { | |
103 | uint64_t stripe = (off>>object_size); | |
104 | CachedStackStringStream css; | |
105 | *css << oid; | |
106 | *css << "."; | |
107 | *css << std::setw(16) << std::setfill('0') << std::hex << stripe; | |
108 | e.soid = css->str(); | |
109 | } | |
110 | e.off = off & ((1<<object_size)-1); | |
111 | e.len = std::min<size_t>(len, (1<<object_size)-e.off); | |
112 | return e; | |
113 | } | |
114 | ||
115 | int SimpleRADOSStriper::remove() | |
116 | { | |
117 | d(5) << dendl; | |
118 | ||
119 | if (blocklisted.load()) { | |
120 | return -EBLOCKLISTED; | |
121 | } | |
122 | ||
123 | if (int rc = wait_for_aios(true); rc < 0) { | |
124 | aios_failure = 0; | |
125 | return rc; | |
126 | } | |
127 | ||
128 | if (int rc = set_metadata(0, true); rc < 0) { | |
129 | return rc; | |
130 | } | |
131 | ||
132 | auto ext = get_first_extent(); | |
133 | if (int rc = ioctx.remove(ext.soid); rc < 0) { | |
134 | d(5) << " remove failed: " << cpp_strerror(rc) << dendl; | |
135 | return rc; | |
136 | } | |
137 | ||
138 | locked = false; | |
139 | ||
140 | return 0; | |
141 | } | |
142 | ||
143 | int SimpleRADOSStriper::truncate(uint64_t size) | |
144 | { | |
145 | d(5) << size << dendl; | |
146 | ||
147 | if (blocklisted.load()) { | |
148 | return -EBLOCKLISTED; | |
149 | } | |
150 | ||
151 | /* TODO: (not currently used by SQLite) handle growth + sparse */ | |
152 | if (int rc = set_metadata(size, true); rc < 0) { | |
153 | return rc; | |
154 | } | |
155 | ||
156 | return 0; | |
157 | } | |
158 | ||
159 | int SimpleRADOSStriper::wait_for_aios(bool block) | |
160 | { | |
161 | while (!aios.empty()) { | |
162 | auto& aiocp = aios.front(); | |
163 | int rc; | |
164 | if (block) { | |
165 | rc = aiocp->wait_for_complete(); | |
166 | } else { | |
167 | if (aiocp->is_complete()) { | |
168 | rc = aiocp->get_return_value(); | |
169 | } else { | |
170 | return 0; | |
171 | } | |
172 | } | |
173 | if (rc) { | |
174 | d(5) << " aio failed: " << cpp_strerror(rc) << dendl; | |
175 | if (aios_failure == 0) { | |
176 | aios_failure = rc; | |
177 | } | |
178 | } | |
179 | aios.pop(); | |
180 | } | |
181 | return aios_failure; | |
182 | } | |
183 | ||
184 | int SimpleRADOSStriper::flush() | |
185 | { | |
186 | d(5) << dendl; | |
187 | ||
188 | if (blocklisted.load()) { | |
189 | return -EBLOCKLISTED; | |
190 | } | |
191 | ||
192 | if (size_dirty) { | |
193 | if (int rc = set_metadata(size, true); rc < 0) { | |
194 | return rc; | |
195 | } | |
196 | } | |
197 | ||
198 | if (int rc = wait_for_aios(true); rc < 0) { | |
199 | aios_failure = 0; | |
200 | return rc; | |
201 | } | |
202 | ||
203 | return 0; | |
204 | } | |
205 | ||
206 | int SimpleRADOSStriper::stat(uint64_t* s) | |
207 | { | |
208 | d(5) << dendl; | |
209 | ||
210 | if (blocklisted.load()) { | |
211 | return -EBLOCKLISTED; | |
212 | } | |
213 | ||
214 | *s = size; | |
215 | return 0; | |
216 | } | |
217 | ||
218 | int SimpleRADOSStriper::create() | |
219 | { | |
220 | d(5) << dendl; | |
221 | ||
222 | if (blocklisted.load()) { | |
223 | return -EBLOCKLISTED; | |
224 | } | |
225 | ||
226 | auto ext = get_first_extent(); | |
227 | auto op = librados::ObjectWriteOperation(); | |
228 | /* exclusive create ensures we do none of these setxattrs happen if it fails */ | |
229 | op.create(1); | |
230 | op.setxattr(XATTR_VERSION, uint2bl(0)); | |
231 | op.setxattr(XATTR_EXCL, bufferlist()); | |
232 | op.setxattr(XATTR_SIZE, uint2bl(0)); | |
233 | op.setxattr(XATTR_ALLOCATED, uint2bl(0)); | |
234 | op.setxattr(XATTR_LAYOUT_STRIPE_UNIT, uint2bl(1)); | |
235 | op.setxattr(XATTR_LAYOUT_STRIPE_COUNT, uint2bl(1)); | |
236 | op.setxattr(XATTR_LAYOUT_OBJECT_SIZE, uint2bl(1<<object_size)); | |
237 | if (int rc = ioctx.operate(ext.soid, &op); rc < 0) { | |
238 | return rc; /* including EEXIST */ | |
239 | } | |
240 | return 0; | |
241 | } | |
242 | ||
243 | int SimpleRADOSStriper::open() | |
244 | { | |
245 | d(5) << oid << dendl; | |
246 | ||
247 | if (blocklisted.load()) { | |
248 | return -EBLOCKLISTED; | |
249 | } | |
250 | ||
251 | auto ext = get_first_extent(); | |
252 | auto op = librados::ObjectReadOperation(); | |
253 | bufferlist bl_excl, bl_size, bl_alloc, bl_version, pbl; | |
254 | int prval_excl, prval_size, prval_alloc, prval_version; | |
255 | op.getxattr(XATTR_EXCL, &bl_excl, &prval_excl); | |
256 | op.getxattr(XATTR_SIZE, &bl_size, &prval_size); | |
257 | op.getxattr(XATTR_ALLOCATED, &bl_alloc, &prval_alloc); | |
258 | op.getxattr(XATTR_VERSION, &bl_version, &prval_version); | |
259 | if (int rc = ioctx.operate(ext.soid, &op, &pbl); rc < 0) { | |
260 | d(5) << " getxattr failed: " << cpp_strerror(rc) << dendl; | |
261 | return rc; | |
262 | } | |
263 | exclusive_holder = bl_excl.to_str(); | |
264 | { | |
265 | auto sstr = bl_size.to_str(); | |
266 | std::string err; | |
267 | size = strict_strtoll(sstr.c_str(), 10, &err); | |
268 | ceph_assert(err.empty()); | |
269 | } | |
270 | { | |
271 | auto sstr = bl_alloc.to_str(); | |
272 | std::string err; | |
273 | allocated = strict_strtoll(sstr.c_str(), 10, &err); | |
274 | ceph_assert(err.empty()); | |
275 | } | |
276 | { | |
277 | auto sstr = bl_version.to_str(); | |
278 | std::string err; | |
279 | version = strict_strtoll(sstr.c_str(), 10, &err); | |
280 | ceph_assert(err.empty()); | |
281 | } | |
282 | d(15) << " size: " << size << " allocated: " << allocated << " version: " << version << dendl; | |
283 | return 0; | |
284 | } | |
285 | ||
286 | int SimpleRADOSStriper::shrink_alloc(uint64_t a) | |
287 | { | |
288 | d(5) << dendl; | |
289 | std::vector<aiocompletionptr> removes; | |
290 | ||
291 | ceph_assert(a <= allocated); | |
292 | uint64_t prune = std::max<uint64_t>(a, (1u << object_size)); /* never delete first extent here */ | |
293 | uint64_t len = allocated - prune; | |
294 | const uint64_t bytes_removed = len; | |
295 | uint64_t offset = prune; | |
296 | while (len > 0) { | |
297 | auto ext = get_next_extent(offset, len); | |
298 | auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); | |
299 | if (int rc = ioctx.aio_remove(ext.soid, aiocp.get()); rc < 0) { | |
300 | d(5) << " aio_remove failed: " << cpp_strerror(rc) << dendl; | |
301 | return rc; | |
302 | } | |
303 | removes.emplace_back(std::move(aiocp)); | |
304 | len -= ext.len; | |
305 | offset += ext.len; | |
306 | } | |
307 | ||
308 | for (auto& aiocp : removes) { | |
309 | if (int rc = aiocp->wait_for_complete(); rc < 0 && rc != -ENOENT) { | |
310 | d(5) << " aio_remove failed: " << cpp_strerror(rc) << dendl; | |
311 | return rc; | |
312 | } | |
313 | } | |
314 | ||
315 | auto ext = get_first_extent(); | |
316 | auto op = librados::ObjectWriteOperation(); | |
317 | auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); | |
318 | op.setxattr(XATTR_ALLOCATED, uint2bl(a)); | |
319 | d(15) << " updating allocated to " << a << dendl; | |
320 | op.setxattr(XATTR_VERSION, uint2bl(version+1)); | |
321 | d(15) << " updating version to " << (version+1) << dendl; | |
322 | if (int rc = ioctx.aio_operate(ext.soid, aiocp.get(), &op); rc < 0) { | |
323 | d(5) << " update failed: " << cpp_strerror(rc) << dendl; | |
324 | return rc; | |
325 | } | |
326 | /* we need to wait so we don't have dangling extents */ | |
327 | d(10) << " waiting for allocated update" << dendl; | |
328 | if (int rc = aiocp->wait_for_complete(); rc < 0) { | |
329 | d(1) << " update failure: " << cpp_strerror(rc) << dendl; | |
330 | return rc; | |
331 | } | |
332 | if (logger) { | |
333 | logger->inc(P_UPDATE_METADATA); | |
334 | logger->inc(P_UPDATE_ALLOCATED); | |
335 | logger->inc(P_UPDATE_VERSION); | |
336 | logger->inc(P_SHRINK); | |
337 | logger->inc(P_SHRINK_BYTES, bytes_removed); | |
338 | } | |
339 | ||
340 | version += 1; | |
341 | allocated = a; | |
342 | return 0; | |
343 | } | |
344 | ||
345 | int SimpleRADOSStriper::maybe_shrink_alloc() | |
346 | { | |
347 | d(15) << dendl; | |
348 | ||
349 | if (size == 0) { | |
350 | if (allocated > 0) { | |
351 | d(10) << "allocation shrink to 0" << dendl; | |
352 | return shrink_alloc(0); | |
353 | } else { | |
354 | return 0; | |
355 | } | |
356 | } | |
357 | ||
358 | uint64_t mask = (1<<object_size)-1; | |
359 | uint64_t new_allocated = min_growth + ((size + mask) & ~mask); /* round up base 2 */ | |
360 | if (allocated > new_allocated && ((allocated-new_allocated) > min_growth)) { | |
361 | d(10) << "allocation shrink to " << new_allocated << dendl; | |
362 | return shrink_alloc(new_allocated); | |
363 | } | |
364 | ||
365 | return 0; | |
366 | } | |
367 | ||
368 | bufferlist SimpleRADOSStriper::str2bl(std::string_view sv) | |
369 | { | |
370 | bufferlist bl; | |
371 | bl.append(sv); | |
372 | return bl; | |
373 | } | |
374 | ||
375 | bufferlist SimpleRADOSStriper::uint2bl(uint64_t v) | |
376 | { | |
377 | CachedStackStringStream css; | |
378 | *css << std::dec << std::setw(16) << std::setfill('0') << v; | |
379 | bufferlist bl; | |
380 | bl.append(css->strv()); | |
381 | return bl; | |
382 | } | |
383 | ||
384 | int SimpleRADOSStriper::set_metadata(uint64_t new_size, bool update_size) | |
385 | { | |
386 | d(10) << " new_size: " << new_size | |
387 | << " update_size: " << update_size | |
388 | << " allocated: " << allocated | |
389 | << " size: " << size | |
390 | << " version: " << version | |
391 | << dendl; | |
392 | ||
393 | bool do_op = false; | |
394 | auto new_allocated = allocated; | |
395 | auto ext = get_first_extent(); | |
396 | auto op = librados::ObjectWriteOperation(); | |
397 | if (new_size > allocated) { | |
398 | uint64_t mask = (1<<object_size)-1; | |
399 | new_allocated = min_growth + ((size + mask) & ~mask); /* round up base 2 */ | |
400 | op.setxattr(XATTR_ALLOCATED, uint2bl(new_allocated)); | |
401 | do_op = true; | |
402 | if (logger) logger->inc(P_UPDATE_ALLOCATED); | |
403 | d(15) << " updating allocated to " << new_allocated << dendl; | |
404 | } | |
405 | if (update_size) { | |
406 | op.setxattr(XATTR_SIZE, uint2bl(new_size)); | |
407 | do_op = true; | |
408 | if (logger) logger->inc(P_UPDATE_SIZE); | |
409 | d(15) << " updating size to " << new_size << dendl; | |
410 | } | |
411 | if (do_op) { | |
412 | if (logger) logger->inc(P_UPDATE_METADATA); | |
413 | if (logger) logger->inc(P_UPDATE_VERSION); | |
414 | op.setxattr(XATTR_VERSION, uint2bl(version+1)); | |
415 | d(15) << " updating version to " << (version+1) << dendl; | |
416 | auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); | |
417 | if (int rc = ioctx.aio_operate(ext.soid, aiocp.get(), &op); rc < 0) { | |
418 | d(1) << " update failure: " << cpp_strerror(rc) << dendl; | |
419 | return rc; | |
420 | } | |
421 | version += 1; | |
422 | if (allocated != new_allocated) { | |
423 | /* we need to wait so we don't have dangling extents */ | |
424 | d(10) << "waiting for allocated update" << dendl; | |
425 | if (int rc = aiocp->wait_for_complete(); rc < 0) { | |
426 | d(1) << " update failure: " << cpp_strerror(rc) << dendl; | |
427 | return rc; | |
428 | } | |
429 | aiocp.reset(); | |
430 | allocated = new_allocated; | |
431 | } | |
432 | if (aiocp) { | |
433 | aios.emplace(std::move(aiocp)); | |
434 | } | |
435 | if (update_size) { | |
436 | size = new_size; | |
437 | size_dirty = false; | |
438 | return maybe_shrink_alloc(); | |
439 | } | |
440 | } | |
441 | return 0; | |
442 | } | |
443 | ||
444 | ssize_t SimpleRADOSStriper::write(const void* data, size_t len, uint64_t off) | |
445 | { | |
446 | d(5) << off << "~" << len << dendl; | |
447 | ||
448 | if (blocklisted.load()) { | |
449 | return -EBLOCKLISTED; | |
450 | } | |
451 | ||
452 | if (allocated < (len+off)) { | |
453 | if (int rc = set_metadata(len+off, false); rc < 0) { | |
454 | return rc; | |
455 | } | |
456 | } | |
457 | ||
458 | size_t w = 0; | |
459 | while ((len-w) > 0) { | |
460 | auto ext = get_next_extent(off+w, len-w); | |
461 | auto aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); | |
462 | bufferlist bl; | |
463 | bl.append((const char*)data+w, ext.len); | |
464 | if (int rc = ioctx.aio_write(ext.soid, aiocp.get(), bl, ext.len, ext.off); rc < 0) { | |
465 | break; | |
466 | } | |
467 | aios.emplace(std::move(aiocp)); | |
468 | w += ext.len; | |
469 | } | |
470 | ||
471 | wait_for_aios(false); // clean up finished completions | |
472 | ||
473 | if (size < (len+off)) { | |
474 | size = len+off; | |
475 | size_dirty = true; | |
476 | d(10) << " dirty size: " << size << dendl; | |
477 | } | |
478 | ||
479 | return (ssize_t)w; | |
480 | } | |
481 | ||
482 | ssize_t SimpleRADOSStriper::read(void* data, size_t len, uint64_t off) | |
483 | { | |
484 | d(5) << off << "~" << len << dendl; | |
485 | ||
486 | if (blocklisted.load()) { | |
487 | return -EBLOCKLISTED; | |
488 | } | |
489 | ||
490 | size_t r = 0; | |
491 | std::vector<std::pair<bufferlist, aiocompletionptr>> reads; | |
492 | while ((len-r) > 0) { | |
493 | auto ext = get_next_extent(off+r, len-r); | |
494 | auto& [bl, aiocp] = reads.emplace_back(); | |
495 | aiocp = aiocompletionptr(librados::Rados::aio_create_completion()); | |
496 | if (int rc = ioctx.aio_read(ext.soid, aiocp.get(), &bl, ext.len, ext.off); rc < 0) { | |
497 | d(1) << " read failure: " << cpp_strerror(rc) << dendl; | |
498 | return rc; | |
499 | } | |
500 | r += ext.len; | |
501 | } | |
502 | ||
503 | r = 0; | |
504 | for (auto& [bl, aiocp] : reads) { | |
505 | if (int rc = aiocp->wait_for_complete(); rc < 0) { | |
506 | d(1) << " read failure: " << cpp_strerror(rc) << dendl; | |
507 | return rc; | |
508 | } | |
509 | bl.begin().copy(bl.length(), ((char*)data)+r); | |
510 | r += bl.length(); | |
511 | } | |
512 | ceph_assert(r <= len); | |
513 | ||
514 | return r; | |
515 | } | |
516 | ||
517 | int SimpleRADOSStriper::print_lockers(std::ostream& out) | |
518 | { | |
519 | int exclusive; | |
520 | std::string tag; | |
521 | std::list<librados::locker_t> lockers; | |
522 | auto ext = get_first_extent(); | |
523 | if (int rc = ioctx.list_lockers(ext.soid, biglock, &exclusive, &tag, &lockers); rc < 0) { | |
524 | d(1) << " list_lockers failure: " << cpp_strerror(rc) << dendl; | |
525 | return rc; | |
526 | } | |
527 | if (lockers.empty()) { | |
528 | out << " lockers none"; | |
529 | } else { | |
530 | out << " lockers exclusive=" << exclusive << " tag=" << tag << " lockers=["; | |
531 | bool first = true; | |
532 | for (const auto& l : lockers) { | |
533 | if (!first) out << ","; | |
534 | out << l.client << ":" << l.cookie << ":" << l.address; | |
535 | } | |
536 | out << "]"; | |
537 | } | |
538 | return 0; | |
539 | } | |
540 | ||
541 | /* Do lock renewal in a separate thread: while it's unlikely sqlite chews on | |
542 | * something for multiple seconds without calling into the VFS (where we could | |
543 | * initiate a lock renewal), it's not impossible with complex queries. Also, we | |
544 | * want to allow "PRAGMA locking_mode = exclusive" where the application may | |
545 | * not use the sqlite3 database connection for an indeterminate amount of time. | |
546 | */ | |
547 | void SimpleRADOSStriper::lock_keeper_main(void) | |
548 | { | |
549 | d(20) << dendl; | |
550 | const auto ext = get_first_extent(); | |
551 | while (!shutdown) { | |
552 | d(20) << "tick" << dendl; | |
553 | std::unique_lock lock(lock_keeper_mutex); | |
554 | auto now = clock::now(); | |
555 | auto since = now-last_renewal; | |
556 | ||
557 | if (since >= lock_keeper_interval && locked) { | |
558 | d(10) << "renewing lock" << dendl; | |
559 | auto tv = ceph::to_timeval(lock_keeper_timeout); | |
560 | int rc = ioctx.lock_exclusive(ext.soid, biglock, cookie.to_string(), lockdesc, &tv, LIBRADOS_LOCK_FLAG_MUST_RENEW); | |
561 | if (rc) { | |
562 | /* If lock renewal fails, we cannot continue the application. Return | |
563 | * -EBLOCKLISTED for all calls into the striper for this instance, even | |
564 | * if we're not actually blocklisted. | |
565 | */ | |
566 | d(-1) << "lock renewal failed: " << cpp_strerror(rc) << dendl; | |
567 | blocklisted = true; | |
568 | break; | |
569 | } | |
570 | last_renewal = clock::now(); | |
571 | } | |
572 | ||
573 | lock_keeper_cvar.wait_for(lock, lock_keeper_interval); | |
574 | } | |
575 | } | |
576 | ||
577 | int SimpleRADOSStriper::recover_lock() | |
578 | { | |
579 | d(5) << "attempting to recover lock" << dendl; | |
580 | ||
581 | std::string addrs; | |
582 | const auto ext = get_first_extent(); | |
583 | ||
584 | { | |
585 | auto tv = ceph::to_timeval(lock_keeper_timeout); | |
586 | if (int rc = ioctx.lock_exclusive(ext.soid, biglock, cookie.to_string(), lockdesc, &tv, 0); rc < 0) { | |
587 | return rc; | |
588 | } | |
589 | locked = true; | |
590 | last_renewal = clock::now(); | |
591 | } | |
592 | ||
593 | d(5) << "acquired lock, fetching last owner" << dendl; | |
594 | ||
595 | { | |
596 | bufferlist bl_excl; | |
597 | if (int rc = ioctx.getxattr(ext.soid, XATTR_EXCL, bl_excl); rc < 0) { | |
598 | if (rc == -ENOENT) { | |
599 | /* someone removed it? ok... */ | |
600 | goto setowner; | |
601 | } else { | |
602 | d(-1) << "could not recover exclusive locker" << dendl; | |
603 | locked = false; /* it will drop eventually */ | |
604 | return -EIO; | |
605 | } | |
606 | } | |
607 | addrs = bl_excl.to_str(); | |
608 | } | |
609 | ||
610 | if (addrs.empty()) { | |
611 | d(5) << "someone else cleaned up" << dendl; | |
612 | goto setowner; | |
613 | } else { | |
614 | d(5) << "exclusive lock holder was " << addrs << dendl; | |
615 | } | |
616 | ||
617 | if (blocklist_the_dead) { | |
618 | entity_addrvec_t addrv; | |
619 | addrv.parse(addrs.c_str()); | |
620 | auto R = librados::Rados(ioctx); | |
20effc67 | 621 | std::string_view b = "blocklist"; |
f67539c2 TL |
622 | retry: |
623 | for (auto& a : addrv.v) { | |
624 | CachedStackStringStream css; | |
625 | *css << "{\"prefix\":\"osd " << b << "\", \"" << b << "op\":\"add\","; | |
626 | *css << "\"addr\":\""; | |
627 | *css << a; | |
628 | *css << "\"}"; | |
629 | std::vector<std::string> cmd = {css->str()}; | |
630 | d(5) << "sending blocklist command: " << cmd << dendl; | |
631 | std::string out; | |
632 | if (int rc = R.mon_command(css->str(), bufferlist(), nullptr, &out); rc < 0) { | |
20effc67 TL |
633 | if (rc == -EINVAL && b == "blocklist") { |
634 | b = "blacklist"; | |
f67539c2 TL |
635 | goto retry; |
636 | } | |
637 | d(-1) << "Cannot proceed with recovery because I have failed to blocklist the old client: " << cpp_strerror(rc) << ", out = " << out << dendl; | |
638 | locked = false; /* it will drop eventually */ | |
639 | return -EIO; | |
640 | } | |
641 | } | |
642 | /* Ensure our osd_op requests have the latest epoch. */ | |
643 | R.wait_for_latest_osdmap(); | |
644 | } | |
645 | ||
646 | setowner: | |
647 | d(5) << "setting new owner to myself, " << myaddrs << dendl; | |
648 | { | |
649 | auto myaddrbl = str2bl(myaddrs); | |
650 | if (int rc = ioctx.setxattr(ext.soid, XATTR_EXCL, myaddrbl); rc < 0) { | |
651 | d(-1) << "could not set lock owner" << dendl; | |
652 | locked = false; /* it will drop eventually */ | |
653 | return -EIO; | |
654 | } | |
655 | } | |
656 | return 0; | |
657 | } | |
658 | ||
659 | int SimpleRADOSStriper::lock(uint64_t timeoutms) | |
660 | { | |
661 | /* XXX: timeoutms is unused */ | |
662 | d(5) << "timeout=" << timeoutms << dendl; | |
663 | ||
664 | if (blocklisted.load()) { | |
665 | return -EBLOCKLISTED; | |
666 | } | |
667 | ||
668 | std::scoped_lock lock(lock_keeper_mutex); | |
669 | ||
670 | ceph_assert(!is_locked()); | |
671 | ||
672 | /* We're going to be very lazy here in implementation: only exclusive locks | |
673 | * are allowed. That even ensures a single reader. | |
674 | */ | |
675 | uint64_t slept = 0; | |
676 | ||
677 | auto ext = get_first_extent(); | |
678 | while (true) { | |
679 | /* The general fast path in one compound operation: obtain the lock, | |
680 | * confirm the past locker cleaned up after themselves (set XATTR_EXCL to | |
681 | * ""), then finally set XATTR_EXCL to our address vector as the new | |
682 | * exclusive locker. | |
683 | */ | |
684 | ||
685 | auto op = librados::ObjectWriteOperation(); | |
686 | auto tv = ceph::to_timeval(lock_keeper_timeout); | |
687 | utime_t duration; | |
688 | duration.set_from_timeval(&tv); | |
689 | rados::cls::lock::lock(&op, biglock, ClsLockType::EXCLUSIVE, cookie.to_string(), "", lockdesc, duration, 0); | |
690 | op.cmpxattr(XATTR_EXCL, LIBRADOS_CMPXATTR_OP_EQ, bufferlist()); | |
691 | op.setxattr(XATTR_EXCL, str2bl(myaddrs)); | |
692 | int rc = ioctx.operate(ext.soid, &op); | |
693 | if (rc == 0) { | |
694 | locked = true; | |
695 | last_renewal = clock::now(); | |
696 | break; | |
697 | } else if (rc == -EBUSY) { | |
698 | if ((slept % 500000) == 0) { | |
699 | d(-1) << "waiting for locks: "; | |
700 | print_lockers(*_dout); | |
701 | *_dout << dendl; | |
702 | } | |
703 | usleep(5000); | |
704 | slept += 5000; | |
705 | continue; | |
706 | } else if (rc == -ECANCELED) { | |
707 | /* CMPXATTR failed, a locker didn't cleanup. Try to recover! */ | |
708 | if (rc = recover_lock(); rc < 0) { | |
709 | if (rc == -EBUSY) { | |
710 | continue; /* try again */ | |
711 | } | |
712 | return rc; | |
713 | } | |
714 | break; | |
715 | } else { | |
716 | d(-1) << " lock failed: " << cpp_strerror(rc) << dendl; | |
717 | return rc; | |
718 | } | |
719 | } | |
720 | ||
721 | if (!lock_keeper.joinable()) { | |
722 | lock_keeper = std::thread(&SimpleRADOSStriper::lock_keeper_main, this); | |
723 | } | |
724 | ||
725 | if (int rc = open(); rc < 0) { | |
726 | d(5) << " open failed: " << cpp_strerror(rc) << dendl; | |
727 | return rc; | |
728 | } | |
729 | ||
730 | d(5) << " = 0" << dendl; | |
731 | if (logger) { | |
732 | logger->inc(P_LOCK); | |
733 | } | |
734 | ||
735 | return 0; | |
736 | } | |
737 | ||
738 | int SimpleRADOSStriper::unlock() | |
739 | { | |
740 | d(5) << dendl; | |
741 | ||
742 | if (blocklisted.load()) { | |
743 | return -EBLOCKLISTED; | |
744 | } | |
745 | ||
746 | std::scoped_lock lock(lock_keeper_mutex); | |
747 | ||
748 | ceph_assert(is_locked()); | |
749 | ||
750 | /* wait for flush of metadata */ | |
751 | if (int rc = flush(); rc < 0) { | |
752 | return rc; | |
753 | } | |
754 | ||
755 | const auto ext = get_first_extent(); | |
756 | auto op = librados::ObjectWriteOperation(); | |
757 | op.cmpxattr(XATTR_EXCL, LIBRADOS_CMPXATTR_OP_EQ, str2bl(myaddrs)); | |
758 | op.setxattr(XATTR_EXCL, bufferlist()); | |
759 | rados::cls::lock::unlock(&op, biglock, cookie.to_string()); | |
760 | if (int rc = ioctx.operate(ext.soid, &op); rc < 0) { | |
761 | d(-1) << " unlock failed: " << cpp_strerror(rc) << dendl; | |
762 | return rc; | |
763 | } | |
764 | locked = false; | |
765 | ||
766 | d(5) << " = 0" << dendl; | |
767 | if (logger) { | |
768 | logger->inc(P_UNLOCK); | |
769 | } | |
770 | ||
771 | return 0; | |
772 | } |