]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_reshard.cc
68352f3e56ad446aa13374ff8c829d35c50e1c85
[ceph.git] / ceph / src / rgw / rgw_reshard.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab ft=cpp
3
4 #include <limits>
5 #include <sstream>
6
7 #include "rgw_zone.h"
8 #include "rgw_bucket.h"
9 #include "rgw_reshard.h"
10 #include "rgw_sal.h"
11 #include "rgw_sal_rados.h"
12 #include "cls/rgw/cls_rgw_client.h"
13 #include "cls/lock/cls_lock_client.h"
14 #include "common/errno.h"
15 #include "common/ceph_json.h"
16
17 #include "common/dout.h"
18
19 #include "services/svc_zone.h"
20 #include "services/svc_sys_obj.h"
21 #include "services/svc_tier_rados.h"
22
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_rgw
25
// String constants used by the resharding code. Within this file,
// reshard_lock_name names the lock wrapped by RGWBucketReshardLock and
// bucket_instance_lock_name names RGWReshard's instance lock.
// NOTE(review): reshard_oid_prefix is presumably the prefix of the reshard
// log shard object names -- confirm against get_logshard_oid().
const std::string reshard_oid_prefix("reshard.");
const std::string reshard_lock_name("reshard_process");
const std::string bucket_instance_lock_name("bucket_instance_lock");
29
30 /* All primes up to 2000 used to attempt to make dynamic sharding use
31 * a prime numbers of shards. Note: this list also includes 1 for when
32 * 1 shard is the most appropriate, even though 1 is not prime.
33 */
34 const std::initializer_list<uint16_t> RGWBucketReshard::reshard_primes = {
35 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
36 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
37 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211,
38 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283,
39 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379,
40 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461,
41 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563,
42 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643,
43 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739,
44 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829,
45 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, 937,
46 941, 947, 953, 967, 971, 977, 983, 991, 997, 1009, 1013, 1019, 1021,
47 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093,
48 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181,
49 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, 1259,
50 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
51 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433,
52 1439, 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493,
53 1499, 1511, 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579,
54 1583, 1597, 1601, 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657,
55 1663, 1667, 1669, 1693, 1697, 1699, 1709, 1721, 1723, 1733, 1741,
56 1747, 1753, 1759, 1777, 1783, 1787, 1789, 1801, 1811, 1823, 1831,
57 1847, 1861, 1867, 1871, 1873, 1877, 1879, 1889, 1901, 1907, 1913,
58 1931, 1933, 1949, 1951, 1973, 1979, 1987, 1993, 1997, 1999
59 };
60
61 class BucketReshardShard {
62 rgw::sal::RGWRadosStore *store;
63 const RGWBucketInfo& bucket_info;
64 int num_shard;
65 const rgw::bucket_index_layout_generation& idx_layout;
66 RGWRados::BucketShard bs;
67 vector<rgw_cls_bi_entry> entries;
68 map<RGWObjCategory, rgw_bucket_category_stats> stats;
69 deque<librados::AioCompletion *>& aio_completions;
70 uint64_t max_aio_completions;
71 uint64_t reshard_shard_batch_size;
72
73 int wait_next_completion() {
74 librados::AioCompletion *c = aio_completions.front();
75 aio_completions.pop_front();
76
77 c->wait_for_complete();
78
79 int ret = c->get_return_value();
80 c->release();
81
82 if (ret < 0) {
83 derr << "ERROR: reshard rados operation failed: " << cpp_strerror(-ret) << dendl;
84 return ret;
85 }
86
87 return 0;
88 }
89
90 int get_completion(librados::AioCompletion **c) {
91 if (aio_completions.size() >= max_aio_completions) {
92 int ret = wait_next_completion();
93 if (ret < 0) {
94 return ret;
95 }
96 }
97
98 *c = librados::Rados::aio_create_completion(nullptr, nullptr);
99 aio_completions.push_back(*c);
100
101 return 0;
102 }
103
104 public:
105 BucketReshardShard(rgw::sal::RGWRadosStore *_store, const RGWBucketInfo& _bucket_info,
106 int _num_shard, const rgw::bucket_index_layout_generation& _idx_layout,
107 deque<librados::AioCompletion *>& _completions) :
108 store(_store), bucket_info(_bucket_info), idx_layout(_idx_layout), bs(store->getRados()),
109 aio_completions(_completions)
110 {
111 num_shard = (idx_layout.layout.normal.num_shards > 0 ? _num_shard : -1);
112
113 bs.init(bucket_info.bucket, num_shard, idx_layout, nullptr /* no RGWBucketInfo */);
114
115 max_aio_completions =
116 store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_max_aio");
117 reshard_shard_batch_size =
118 store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_batch_size");
119 }
120
121 int get_num_shard() {
122 return num_shard;
123 }
124
125 int add_entry(rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
126 const rgw_bucket_category_stats& entry_stats) {
127 entries.push_back(entry);
128 if (account) {
129 rgw_bucket_category_stats& target = stats[category];
130 target.num_entries += entry_stats.num_entries;
131 target.total_size += entry_stats.total_size;
132 target.total_size_rounded += entry_stats.total_size_rounded;
133 target.actual_size += entry_stats.actual_size;
134 }
135 if (entries.size() >= reshard_shard_batch_size) {
136 int ret = flush();
137 if (ret < 0) {
138 return ret;
139 }
140 }
141
142 return 0;
143 }
144
145 int flush() {
146 if (entries.size() == 0) {
147 return 0;
148 }
149
150 librados::ObjectWriteOperation op;
151 for (auto& entry : entries) {
152 store->getRados()->bi_put(op, bs, entry);
153 }
154 cls_rgw_bucket_update_stats(op, false, stats);
155
156 librados::AioCompletion *c;
157 int ret = get_completion(&c);
158 if (ret < 0) {
159 return ret;
160 }
161 ret = bs.bucket_obj.aio_operate(c, &op);
162 if (ret < 0) {
163 derr << "ERROR: failed to store entries in target bucket shard (bs=" << bs.bucket << "/" << bs.shard_id << ") error=" << cpp_strerror(-ret) << dendl;
164 return ret;
165 }
166 entries.clear();
167 stats.clear();
168 return 0;
169 }
170
171 int wait_all_aio() {
172 int ret = 0;
173 while (!aio_completions.empty()) {
174 int r = wait_next_completion();
175 if (r < 0) {
176 ret = r;
177 }
178 }
179 return ret;
180 }
181 }; // class BucketReshardShard
182
183
184 class BucketReshardManager {
185 rgw::sal::RGWRadosStore *store;
186 const RGWBucketInfo& target_bucket_info;
187 deque<librados::AioCompletion *> completions;
188 int num_target_shards;
189 vector<BucketReshardShard *> target_shards;
190
191 public:
192 BucketReshardManager(rgw::sal::RGWRadosStore *_store,
193 const RGWBucketInfo& _target_bucket_info,
194 int _num_target_shards) :
195 store(_store), target_bucket_info(_target_bucket_info),
196 num_target_shards(_num_target_shards)
197 {
198 const auto& idx_layout = target_bucket_info.layout.current_index;
199 target_shards.resize(num_target_shards);
200 for (int i = 0; i < num_target_shards; ++i) {
201 target_shards[i] = new BucketReshardShard(store, target_bucket_info, i, idx_layout, completions);
202 }
203 }
204
205 ~BucketReshardManager() {
206 for (auto& shard : target_shards) {
207 int ret = shard->wait_all_aio();
208 if (ret < 0) {
209 ldout(store->ctx(), 20) << __func__ <<
210 ": shard->wait_all_aio() returned ret=" << ret << dendl;
211 }
212 }
213 }
214
215 int add_entry(int shard_index,
216 rgw_cls_bi_entry& entry, bool account, RGWObjCategory category,
217 const rgw_bucket_category_stats& entry_stats) {
218 int ret = target_shards[shard_index]->add_entry(entry, account, category,
219 entry_stats);
220 if (ret < 0) {
221 derr << "ERROR: target_shards.add_entry(" << entry.idx <<
222 ") returned error: " << cpp_strerror(-ret) << dendl;
223 return ret;
224 }
225
226 return 0;
227 }
228
229 int finish() {
230 int ret = 0;
231 for (auto& shard : target_shards) {
232 int r = shard->flush();
233 if (r < 0) {
234 derr << "ERROR: target_shards[" << shard->get_num_shard() << "].flush() returned error: " << cpp_strerror(-r) << dendl;
235 ret = r;
236 }
237 }
238 for (auto& shard : target_shards) {
239 int r = shard->wait_all_aio();
240 if (r < 0) {
241 derr << "ERROR: target_shards[" << shard->get_num_shard() << "].wait_all_aio() returned error: " << cpp_strerror(-r) << dendl;
242 ret = r;
243 }
244 delete shard;
245 }
246 target_shards.clear();
247 return ret;
248 }
249 }; // class BucketReshardManager
250
251 RGWBucketReshard::RGWBucketReshard(rgw::sal::RGWRadosStore *_store,
252 const RGWBucketInfo& _bucket_info,
253 const map<string, bufferlist>& _bucket_attrs,
254 RGWBucketReshardLock* _outer_reshard_lock) :
255 store(_store), bucket_info(_bucket_info), bucket_attrs(_bucket_attrs),
256 reshard_lock(store, bucket_info, true),
257 outer_reshard_lock(_outer_reshard_lock)
258 { }
259
260 int RGWBucketReshard::set_resharding_status(rgw::sal::RGWRadosStore* store,
261 const RGWBucketInfo& bucket_info,
262 const string& new_instance_id,
263 int32_t num_shards,
264 cls_rgw_reshard_status status)
265 {
266 if (new_instance_id.empty()) {
267 ldout(store->ctx(), 0) << __func__ << " missing new bucket instance id" << dendl;
268 return -EINVAL;
269 }
270
271 cls_rgw_bucket_instance_entry instance_entry;
272 instance_entry.set_status(new_instance_id, num_shards, status);
273
274 int ret = store->getRados()->bucket_set_reshard(bucket_info, instance_entry);
275 if (ret < 0) {
276 ldout(store->ctx(), 0) << "RGWReshard::" << __func__ << " ERROR: error setting bucket resharding flag on bucket index: "
277 << cpp_strerror(-ret) << dendl;
278 return ret;
279 }
280 return 0;
281 }
282
283 // reshard lock assumes lock is held
284 int RGWBucketReshard::clear_resharding(rgw::sal::RGWRadosStore* store,
285 const RGWBucketInfo& bucket_info)
286 {
287 int ret = clear_index_shard_reshard_status(store, bucket_info);
288 if (ret < 0) {
289 ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
290 " ERROR: error clearing reshard status from index shard " <<
291 cpp_strerror(-ret) << dendl;
292 return ret;
293 }
294
295 cls_rgw_bucket_instance_entry instance_entry;
296 ret = store->getRados()->bucket_set_reshard(bucket_info, instance_entry);
297 if (ret < 0) {
298 ldout(store->ctx(), 0) << "RGWReshard::" << __func__ <<
299 " ERROR: error setting bucket resharding flag on bucket index: " <<
300 cpp_strerror(-ret) << dendl;
301 return ret;
302 }
303
304 return 0;
305 }
306
307 int RGWBucketReshard::clear_index_shard_reshard_status(rgw::sal::RGWRadosStore* store,
308 const RGWBucketInfo& bucket_info)
309 {
310 uint32_t num_shards = bucket_info.layout.current_index.layout.normal.num_shards;
311
312 if (num_shards < std::numeric_limits<uint32_t>::max()) {
313 int ret = set_resharding_status(store, bucket_info,
314 bucket_info.bucket.bucket_id,
315 (num_shards < 1 ? 1 : num_shards),
316 cls_rgw_reshard_status::NOT_RESHARDING);
317 if (ret < 0) {
318 ldout(store->ctx(), 0) << "RGWBucketReshard::" << __func__ <<
319 " ERROR: error clearing reshard status from index shard " <<
320 cpp_strerror(-ret) << dendl;
321 return ret;
322 }
323 }
324
325 return 0;
326 }
327
328 static int create_new_bucket_instance(rgw::sal::RGWRadosStore *store,
329 int new_num_shards,
330 const RGWBucketInfo& bucket_info,
331 map<string, bufferlist>& attrs,
332 RGWBucketInfo& new_bucket_info)
333 {
334 new_bucket_info = bucket_info;
335
336 store->getRados()->create_bucket_id(&new_bucket_info.bucket.bucket_id);
337
338 new_bucket_info.layout.current_index.layout.normal.num_shards = new_num_shards;
339 new_bucket_info.objv_tracker.clear();
340
341 new_bucket_info.new_bucket_instance_id.clear();
342 new_bucket_info.reshard_status = cls_rgw_reshard_status::NOT_RESHARDING;
343
344 int ret = store->svc()->bi->init_index(new_bucket_info);
345 if (ret < 0) {
346 cerr << "ERROR: failed to init new bucket indexes: " << cpp_strerror(-ret) << std::endl;
347 return ret;
348 }
349
350 ret = store->getRados()->put_bucket_instance_info(new_bucket_info, true, real_time(), &attrs);
351 if (ret < 0) {
352 cerr << "ERROR: failed to store new bucket instance info: " << cpp_strerror(-ret) << std::endl;
353 return ret;
354 }
355
356 return 0;
357 }
358
359 int RGWBucketReshard::create_new_bucket_instance(int new_num_shards,
360 RGWBucketInfo& new_bucket_info)
361 {
362 return ::create_new_bucket_instance(store, new_num_shards,
363 bucket_info, bucket_attrs, new_bucket_info);
364 }
365
366 int RGWBucketReshard::cancel()
367 {
368 int ret = reshard_lock.lock();
369 if (ret < 0) {
370 return ret;
371 }
372
373 ret = clear_resharding();
374
375 reshard_lock.unlock();
376 return ret;
377 }
378
379 class BucketInfoReshardUpdate
380 {
381 rgw::sal::RGWRadosStore *store;
382 RGWBucketInfo& bucket_info;
383 std::map<string, bufferlist> bucket_attrs;
384
385 bool in_progress{false};
386
387 int set_status(cls_rgw_reshard_status s) {
388 bucket_info.reshard_status = s;
389 int ret = store->getRados()->put_bucket_instance_info(bucket_info, false, real_time(), &bucket_attrs);
390 if (ret < 0) {
391 ldout(store->ctx(), 0) << "ERROR: failed to write bucket info, ret=" << ret << dendl;
392 return ret;
393 }
394 return 0;
395 }
396
397 public:
398 BucketInfoReshardUpdate(rgw::sal::RGWRadosStore *_store,
399 RGWBucketInfo& _bucket_info,
400 map<string, bufferlist>& _bucket_attrs,
401 const string& new_bucket_id) :
402 store(_store),
403 bucket_info(_bucket_info),
404 bucket_attrs(_bucket_attrs)
405 {
406 bucket_info.new_bucket_instance_id = new_bucket_id;
407 }
408
409 ~BucketInfoReshardUpdate() {
410 if (in_progress) {
411 // resharding must not have ended correctly, clean up
412 int ret =
413 RGWBucketReshard::clear_index_shard_reshard_status(store, bucket_info);
414 if (ret < 0) {
415 lderr(store->ctx()) << "Error: " << __func__ <<
416 " clear_index_shard_status returned " << ret << dendl;
417 }
418 bucket_info.new_bucket_instance_id.clear();
419
420 // clears new_bucket_instance as well
421 set_status(cls_rgw_reshard_status::NOT_RESHARDING);
422 }
423 }
424
425 int start() {
426 int ret = set_status(cls_rgw_reshard_status::IN_PROGRESS);
427 if (ret < 0) {
428 return ret;
429 }
430 in_progress = true;
431 return 0;
432 }
433
434 int complete() {
435 int ret = set_status(cls_rgw_reshard_status::DONE);
436 if (ret < 0) {
437 return ret;
438 }
439 in_progress = false;
440 return 0;
441 }
442 };
443
444
445 RGWBucketReshardLock::RGWBucketReshardLock(rgw::sal::RGWRadosStore* _store,
446 const std::string& reshard_lock_oid,
447 bool _ephemeral) :
448 store(_store),
449 lock_oid(reshard_lock_oid),
450 ephemeral(_ephemeral),
451 internal_lock(reshard_lock_name)
452 {
453 const int lock_dur_secs = store->ctx()->_conf.get_val<uint64_t>(
454 "rgw_reshard_bucket_lock_duration");
455 duration = std::chrono::seconds(lock_dur_secs);
456
457 #define COOKIE_LEN 16
458 char cookie_buf[COOKIE_LEN + 1];
459 gen_rand_alphanumeric(store->ctx(), cookie_buf, sizeof(cookie_buf) - 1);
460 cookie_buf[COOKIE_LEN] = '\0';
461
462 internal_lock.set_cookie(cookie_buf);
463 internal_lock.set_duration(duration);
464 }
465
466 int RGWBucketReshardLock::lock() {
467 internal_lock.set_must_renew(false);
468 int ret;
469 if (ephemeral) {
470 ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
471 lock_oid);
472 } else {
473 ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
474 }
475 if (ret < 0) {
476 ldout(store->ctx(), 0) << "RGWReshardLock::" << __func__ <<
477 " failed to acquire lock on " << lock_oid << " ret=" << ret << dendl;
478 return ret;
479 }
480 reset_time(Clock::now());
481
482 return 0;
483 }
484
485 void RGWBucketReshardLock::unlock() {
486 int ret = internal_lock.unlock(&store->getRados()->reshard_pool_ctx, lock_oid);
487 if (ret < 0) {
488 ldout(store->ctx(), 0) << "WARNING: RGWBucketReshardLock::" << __func__ <<
489 " failed to drop lock on " << lock_oid << " ret=" << ret << dendl;
490 }
491 }
492
493 int RGWBucketReshardLock::renew(const Clock::time_point& now) {
494 internal_lock.set_must_renew(true);
495 int ret;
496 if (ephemeral) {
497 ret = internal_lock.lock_exclusive_ephemeral(&store->getRados()->reshard_pool_ctx,
498 lock_oid);
499 } else {
500 ret = internal_lock.lock_exclusive(&store->getRados()->reshard_pool_ctx, lock_oid);
501 }
502 if (ret < 0) { /* expired or already locked by another processor */
503 std::stringstream error_s;
504 if (-ENOENT == ret) {
505 error_s << "ENOENT (lock expired or never initially locked)";
506 } else {
507 error_s << ret << " (" << cpp_strerror(-ret) << ")";
508 }
509 ldout(store->ctx(), 5) << __func__ << "(): failed to renew lock on " <<
510 lock_oid << " with error " << error_s.str() << dendl;
511 return ret;
512 }
513 internal_lock.set_must_renew(false);
514
515 reset_time(now);
516 ldout(store->ctx(), 20) << __func__ << "(): successfully renewed lock on " <<
517 lock_oid << dendl;
518
519 return 0;
520 }
521
522
523 int RGWBucketReshard::do_reshard(int num_shards,
524 RGWBucketInfo& new_bucket_info,
525 int max_entries,
526 bool verbose,
527 ostream *out,
528 Formatter *formatter)
529 {
530 if (out) {
531 const rgw_bucket& bucket = bucket_info.bucket;
532 (*out) << "tenant: " << bucket.tenant << std::endl;
533 (*out) << "bucket name: " << bucket.name << std::endl;
534 (*out) << "old bucket instance id: " << bucket.bucket_id <<
535 std::endl;
536 (*out) << "new bucket instance id: " << new_bucket_info.bucket.bucket_id <<
537 std::endl;
538 }
539
540 /* update bucket info -- in progress*/
541 list<rgw_cls_bi_entry> entries;
542
543 if (max_entries < 0) {
544 ldout(store->ctx(), 0) << __func__ <<
545 ": can't reshard, negative max_entries" << dendl;
546 return -EINVAL;
547 }
548
549 // NB: destructor cleans up sharding state if reshard does not
550 // complete successfully
551 BucketInfoReshardUpdate bucket_info_updater(store, bucket_info, bucket_attrs, new_bucket_info.bucket.bucket_id);
552
553 int ret = bucket_info_updater.start();
554 if (ret < 0) {
555 ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl;
556 return ret;
557 }
558
559 int num_target_shards = (new_bucket_info.layout.current_index.layout.normal.num_shards > 0 ? new_bucket_info.layout.current_index.layout.normal.num_shards : 1);
560
561 BucketReshardManager target_shards_mgr(store, new_bucket_info, num_target_shards);
562
563 bool verbose_json_out = verbose && (formatter != nullptr) && (out != nullptr);
564
565 if (verbose_json_out) {
566 formatter->open_array_section("entries");
567 }
568
569 uint64_t total_entries = 0;
570
571 if (!verbose_json_out && out) {
572 (*out) << "total entries:";
573 }
574
575 const int num_source_shards =
576 (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
577 string marker;
578 for (int i = 0; i < num_source_shards; ++i) {
579 bool is_truncated = true;
580 marker.clear();
581 while (is_truncated) {
582 entries.clear();
583 ret = store->getRados()->bi_list(bucket_info, i, string(), marker, max_entries, &entries, &is_truncated);
584 if (ret < 0 && ret != -ENOENT) {
585 derr << "ERROR: bi_list(): " << cpp_strerror(-ret) << dendl;
586 return ret;
587 }
588
589 for (auto iter = entries.begin(); iter != entries.end(); ++iter) {
590 rgw_cls_bi_entry& entry = *iter;
591 if (verbose_json_out) {
592 formatter->open_object_section("entry");
593
594 encode_json("shard_id", i, formatter);
595 encode_json("num_entry", total_entries, formatter);
596 encode_json("entry", entry, formatter);
597 }
598 total_entries++;
599
600 marker = entry.idx;
601
602 int target_shard_id;
603 cls_rgw_obj_key cls_key;
604 RGWObjCategory category;
605 rgw_bucket_category_stats stats;
606 bool account = entry.get_info(&cls_key, &category, &stats);
607 rgw_obj_key key(cls_key);
608 rgw_obj obj(new_bucket_info.bucket, key);
609 RGWMPObj mp;
610 if (key.ns == RGW_OBJ_NS_MULTIPART && mp.from_meta(key.name)) {
611 // place the multipart .meta object on the same shard as its head object
612 obj.index_hash_source = mp.get_key();
613 }
614 int ret = store->getRados()->get_target_shard_id(new_bucket_info.layout.current_index.layout.normal, obj.get_hash_object(), &target_shard_id);
615 if (ret < 0) {
616 lderr(store->ctx()) << "ERROR: get_target_shard_id() returned ret=" << ret << dendl;
617 return ret;
618 }
619
620 int shard_index = (target_shard_id > 0 ? target_shard_id : 0);
621
622 ret = target_shards_mgr.add_entry(shard_index, entry, account,
623 category, stats);
624 if (ret < 0) {
625 return ret;
626 }
627
628 Clock::time_point now = Clock::now();
629 if (reshard_lock.should_renew(now)) {
630 // assume outer locks have timespans at least the size of ours, so
631 // can call inside conditional
632 if (outer_reshard_lock) {
633 ret = outer_reshard_lock->renew(now);
634 if (ret < 0) {
635 return ret;
636 }
637 }
638 ret = reshard_lock.renew(now);
639 if (ret < 0) {
640 lderr(store->ctx()) << "Error renewing bucket lock: " << ret << dendl;
641 return ret;
642 }
643 }
644 if (verbose_json_out) {
645 formatter->close_section();
646 formatter->flush(*out);
647 } else if (out && !(total_entries % 1000)) {
648 (*out) << " " << total_entries;
649 }
650 } // entries loop
651 }
652 }
653
654 if (verbose_json_out) {
655 formatter->close_section();
656 formatter->flush(*out);
657 } else if (out) {
658 (*out) << " " << total_entries << std::endl;
659 }
660
661 ret = target_shards_mgr.finish();
662 if (ret < 0) {
663 lderr(store->ctx()) << "ERROR: failed to reshard" << dendl;
664 return -EIO;
665 }
666
667 ret = store->ctl()->bucket->link_bucket(new_bucket_info.owner, new_bucket_info.bucket, bucket_info.creation_time, null_yield);
668 if (ret < 0) {
669 lderr(store->ctx()) << "failed to link new bucket instance (bucket_id=" << new_bucket_info.bucket.bucket_id << ": " << cpp_strerror(-ret) << ")" << dendl;
670 return ret;
671 }
672
673 ret = bucket_info_updater.complete();
674 if (ret < 0) {
675 ldout(store->ctx(), 0) << __func__ << ": failed to update bucket info ret=" << ret << dendl;
676 /* don't error out, reshard process succeeded */
677 }
678
679 return 0;
680 // NB: some error clean-up is done by ~BucketInfoReshardUpdate
681 } // RGWBucketReshard::do_reshard
682
683 int RGWBucketReshard::get_status(list<cls_rgw_bucket_instance_entry> *status)
684 {
685 return store->svc()->bi_rados->get_reshard_status(bucket_info, status);
686 }
687
688
689 int RGWBucketReshard::execute(int num_shards, int max_op_entries,
690 bool verbose, ostream *out, Formatter *formatter,
691 RGWReshard* reshard_log)
692 {
693 int ret = reshard_lock.lock();
694 if (ret < 0) {
695 return ret;
696 }
697
698 RGWBucketInfo new_bucket_info;
699 ret = create_new_bucket_instance(num_shards, new_bucket_info);
700 if (ret < 0) {
701 // shard state is uncertain, but this will attempt to remove them anyway
702 goto error_out;
703 }
704
705 if (reshard_log) {
706 ret = reshard_log->update(bucket_info, new_bucket_info);
707 if (ret < 0) {
708 goto error_out;
709 }
710 }
711
712 // set resharding status of current bucket_info & shards with
713 // information about planned resharding
714 ret = set_resharding_status(new_bucket_info.bucket.bucket_id,
715 num_shards, cls_rgw_reshard_status::IN_PROGRESS);
716 if (ret < 0) {
717 goto error_out;
718 }
719
720 ret = do_reshard(num_shards,
721 new_bucket_info,
722 max_op_entries,
723 verbose, out, formatter);
724 if (ret < 0) {
725 goto error_out;
726 }
727
728 // at this point we've done the main work; we'll make a best-effort
729 // to clean-up but will not indicate any errors encountered
730
731 reshard_lock.unlock();
732
733 // resharding successful, so remove old bucket index shards; use
734 // best effort and don't report out an error; the lock isn't needed
735 // at this point since all we're using a best effor to to remove old
736 // shard objects
737 ret = store->svc()->bi->clean_index(bucket_info);
738 if (ret < 0) {
739 lderr(store->ctx()) << "Error: " << __func__ <<
740 " failed to clean up old shards; " <<
741 "RGWRados::clean_bucket_index returned " << ret << dendl;
742 }
743
744 ret = store->ctl()->bucket->remove_bucket_instance_info(bucket_info.bucket,
745 bucket_info, null_yield);
746 if (ret < 0) {
747 lderr(store->ctx()) << "Error: " << __func__ <<
748 " failed to clean old bucket info object \"" <<
749 bucket_info.bucket.get_key() <<
750 "\"created after successful resharding with error " << ret << dendl;
751 }
752
753 ldout(store->ctx(), 1) << __func__ <<
754 " INFO: reshard of bucket \"" << bucket_info.bucket.name << "\" from \"" <<
755 bucket_info.bucket.get_key() << "\" to \"" <<
756 new_bucket_info.bucket.get_key() << "\" completed successfully" << dendl;
757
758 return 0;
759
760 error_out:
761
762 reshard_lock.unlock();
763
764 // since the real problem is the issue that led to this error code
765 // path, we won't touch ret and instead use another variable to
766 // temporarily error codes
767 int ret2 = store->svc()->bi->clean_index(new_bucket_info);
768 if (ret2 < 0) {
769 lderr(store->ctx()) << "Error: " << __func__ <<
770 " failed to clean up shards from failed incomplete resharding; " <<
771 "RGWRados::clean_bucket_index returned " << ret2 << dendl;
772 }
773
774 ret2 = store->ctl()->bucket->remove_bucket_instance_info(new_bucket_info.bucket,
775 new_bucket_info,
776 null_yield);
777 if (ret2 < 0) {
778 lderr(store->ctx()) << "Error: " << __func__ <<
779 " failed to clean bucket info object \"" <<
780 new_bucket_info.bucket.get_key() <<
781 "\"created during incomplete resharding with error " << ret2 << dendl;
782 }
783
784 return ret;
785 } // execute
786
787
788 RGWReshard::RGWReshard(rgw::sal::RGWRadosStore* _store, bool _verbose, ostream *_out,
789 Formatter *_formatter) :
790 store(_store), instance_lock(bucket_instance_lock_name),
791 verbose(_verbose), out(_out), formatter(_formatter)
792 {
793 num_logshards = store->ctx()->_conf.get_val<uint64_t>("rgw_reshard_num_logs");
794 }
795
796 string RGWReshard::get_logshard_key(const string& tenant,
797 const string& bucket_name)
798 {
799 return tenant + ":" + bucket_name;
800 }
801
802 #define MAX_RESHARD_LOGSHARDS_PRIME 7877
803
804 void RGWReshard::get_bucket_logshard_oid(const string& tenant, const string& bucket_name, string *oid)
805 {
806 string key = get_logshard_key(tenant, bucket_name);
807
808 uint32_t sid = ceph_str_hash_linux(key.c_str(), key.size());
809 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
810 sid = sid2 % MAX_RESHARD_LOGSHARDS_PRIME % num_logshards;
811
812 get_logshard_oid(int(sid), oid);
813 }
814
815 int RGWReshard::add(cls_rgw_reshard_entry& entry)
816 {
817 if (!store->svc()->zone->can_reshard()) {
818 ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
819 return 0;
820 }
821
822 string logshard_oid;
823
824 get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
825
826 librados::ObjectWriteOperation op;
827 cls_rgw_reshard_add(op, entry);
828
829 int ret = rgw_rados_operate(store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
830 if (ret < 0) {
831 lderr(store->ctx()) << "ERROR: failed to add entry to reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
832 return ret;
833 }
834 return 0;
835 }
836
837 int RGWReshard::update(const RGWBucketInfo& bucket_info, const RGWBucketInfo& new_bucket_info)
838 {
839 cls_rgw_reshard_entry entry;
840 entry.bucket_name = bucket_info.bucket.name;
841 entry.bucket_id = bucket_info.bucket.bucket_id;
842 entry.tenant = bucket_info.owner.tenant;
843
844 int ret = get(entry);
845 if (ret < 0) {
846 return ret;
847 }
848
849 entry.new_instance_id = new_bucket_info.bucket.name + ":" + new_bucket_info.bucket.bucket_id;
850
851 ret = add(entry);
852 if (ret < 0) {
853 ldout(store->ctx(), 0) << __func__ << ":Error in updating entry bucket " << entry.bucket_name << ": " <<
854 cpp_strerror(-ret) << dendl;
855 }
856
857 return ret;
858 }
859
860
861 int RGWReshard::list(int logshard_num, string& marker, uint32_t max, std::list<cls_rgw_reshard_entry>& entries, bool *is_truncated)
862 {
863 string logshard_oid;
864
865 get_logshard_oid(logshard_num, &logshard_oid);
866
867 int ret = cls_rgw_reshard_list(store->getRados()->reshard_pool_ctx, logshard_oid, marker, max, entries, is_truncated);
868
869 if (ret < 0) {
870 lderr(store->ctx()) << "ERROR: failed to list reshard log entries, oid=" << logshard_oid << " "
871 << "marker=" << marker << " " << cpp_strerror(ret) << dendl;
872 if (ret == -ENOENT) {
873 *is_truncated = false;
874 ret = 0;
875 } else {
876 if (ret == -EACCES) {
877 lderr(store->ctx()) << "access denied to pool " << store->svc()->zone->get_zone_params().reshard_pool
878 << ". Fix the pool access permissions of your client" << dendl;
879 }
880 }
881 }
882
883 return ret;
884 }
885
886 int RGWReshard::get(cls_rgw_reshard_entry& entry)
887 {
888 string logshard_oid;
889
890 get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
891
892 int ret = cls_rgw_reshard_get(store->getRados()->reshard_pool_ctx, logshard_oid, entry);
893 if (ret < 0) {
894 if (ret != -ENOENT) {
895 lderr(store->ctx()) << "ERROR: failed to get entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant <<
896 " bucket=" << entry.bucket_name << dendl;
897 }
898 return ret;
899 }
900
901 return 0;
902 }
903
904 int RGWReshard::remove(cls_rgw_reshard_entry& entry)
905 {
906 string logshard_oid;
907
908 get_bucket_logshard_oid(entry.tenant, entry.bucket_name, &logshard_oid);
909
910 librados::ObjectWriteOperation op;
911 cls_rgw_reshard_remove(op, entry);
912
913 int ret = rgw_rados_operate(store->getRados()->reshard_pool_ctx, logshard_oid, &op, null_yield);
914 if (ret < 0) {
915 lderr(store->ctx()) << "ERROR: failed to remove entry from reshard log, oid=" << logshard_oid << " tenant=" << entry.tenant << " bucket=" << entry.bucket_name << dendl;
916 return ret;
917 }
918
919 return ret;
920 }
921
922 int RGWReshard::clear_bucket_resharding(const string& bucket_instance_oid, cls_rgw_reshard_entry& entry)
923 {
924 int ret = cls_rgw_clear_bucket_resharding(store->getRados()->reshard_pool_ctx, bucket_instance_oid);
925 if (ret < 0) {
926 lderr(store->ctx()) << "ERROR: failed to clear bucket resharding, bucket_instance_oid=" << bucket_instance_oid << dendl;
927 return ret;
928 }
929
930 return 0;
931 }
932
933 int RGWReshardWait::wait(optional_yield y)
934 {
935 std::unique_lock lock(mutex);
936
937 if (going_down) {
938 return -ECANCELED;
939 }
940
941 if (y) {
942 auto& context = y.get_io_context();
943 auto& yield = y.get_yield_context();
944
945 Waiter waiter(context);
946 waiters.push_back(waiter);
947 lock.unlock();
948
949 waiter.timer.expires_after(duration);
950
951 boost::system::error_code ec;
952 waiter.timer.async_wait(yield[ec]);
953
954 lock.lock();
955 waiters.erase(waiters.iterator_to(waiter));
956 return -ec.value();
957 }
958
959 cond.wait_for(lock, duration);
960
961 if (going_down) {
962 return -ECANCELED;
963 }
964
965 return 0;
966 }
967
968 void RGWReshardWait::stop()
969 {
970 std::scoped_lock lock(mutex);
971 going_down = true;
972 cond.notify_all();
973 for (auto& waiter : waiters) {
974 // unblock any waiters with ECANCELED
975 waiter.timer.cancel();
976 }
977 }
978
979 int RGWReshard::process_single_logshard(int logshard_num)
980 {
981 string marker;
982 bool truncated = true;
983
984 CephContext *cct = store->ctx();
985 constexpr uint32_t max_entries = 1000;
986
987 string logshard_oid;
988 get_logshard_oid(logshard_num, &logshard_oid);
989
990 RGWBucketReshardLock logshard_lock(store, logshard_oid, false);
991
992 int ret = logshard_lock.lock();
993 if (ret < 0) {
994 ldout(store->ctx(), 5) << __func__ << "(): failed to acquire lock on " <<
995 logshard_oid << ", ret = " << ret <<dendl;
996 return ret;
997 }
998
999 do {
1000 std::list<cls_rgw_reshard_entry> entries;
1001 ret = list(logshard_num, marker, max_entries, entries, &truncated);
1002 if (ret < 0) {
1003 ldout(cct, 10) << "cannot list all reshards in logshard oid=" <<
1004 logshard_oid << dendl;
1005 continue;
1006 }
1007
1008 for(auto& entry: entries) { // logshard entries
1009 if(entry.new_instance_id.empty()) {
1010
1011 ldout(store->ctx(), 20) << __func__ << " resharding " <<
1012 entry.bucket_name << dendl;
1013
1014 rgw_bucket bucket;
1015 RGWBucketInfo bucket_info;
1016 map<string, bufferlist> attrs;
1017
1018 ret = store->getRados()->get_bucket_info(store->svc(),
1019 entry.tenant, entry.bucket_name,
1020 bucket_info, nullptr,
1021 null_yield, &attrs);
1022 if (ret < 0 || bucket_info.bucket.bucket_id != entry.bucket_id) {
1023 if (ret < 0) {
1024 ldout(cct, 0) << __func__ <<
1025 ": Error in get_bucket_info for bucket " << entry.bucket_name <<
1026 ": " << cpp_strerror(-ret) << dendl;
1027 if (ret != -ENOENT) {
1028 // any error other than ENOENT will abort
1029 return ret;
1030 }
1031 } else {
1032 ldout(cct,0) << __func__ <<
1033 ": Bucket: " << entry.bucket_name <<
1034 " already resharded by someone, skipping " << dendl;
1035 }
1036
1037 // we've encountered a reshard queue entry for an apparently
1038 // non-existent bucket; let's try to recover by cleaning up
1039 ldout(cct, 0) << __func__ <<
1040 ": removing reshard queue entry for a resharded or non-existent bucket" <<
1041 entry.bucket_name << dendl;
1042
1043 ret = remove(entry);
1044 if (ret < 0) {
1045 ldout(cct, 0) << __func__ <<
1046 ": Error removing non-existent bucket " <<
1047 entry.bucket_name << " from resharding queue: " <<
1048 cpp_strerror(-ret) << dendl;
1049 return ret;
1050 }
1051
1052 // we cleaned up, move on to the next entry
1053 goto finished_entry;
1054 }
1055
1056 RGWBucketReshard br(store, bucket_info, attrs, nullptr);
1057 ret = br.execute(entry.new_num_shards, max_entries, false, nullptr,
1058 nullptr, this);
1059 if (ret < 0) {
1060 ldout(store->ctx(), 0) << __func__ <<
1061 ": Error during resharding bucket " << entry.bucket_name << ":" <<
1062 cpp_strerror(-ret)<< dendl;
1063 return ret;
1064 }
1065
1066 ldout(store->ctx(), 20) << __func__ <<
1067 " removing reshard queue entry for bucket " << entry.bucket_name <<
1068 dendl;
1069
1070 ret = remove(entry);
1071 if (ret < 0) {
1072 ldout(cct, 0) << __func__ << ": Error removing bucket " <<
1073 entry.bucket_name << " from resharding queue: " <<
1074 cpp_strerror(-ret) << dendl;
1075 return ret;
1076 }
1077 } // if new instance id is empty
1078
1079 finished_entry:
1080
1081 Clock::time_point now = Clock::now();
1082 if (logshard_lock.should_renew(now)) {
1083 ret = logshard_lock.renew(now);
1084 if (ret < 0) {
1085 return ret;
1086 }
1087 }
1088
1089 entry.get_key(&marker);
1090 } // entry for loop
1091 } while (truncated);
1092
1093 logshard_lock.unlock();
1094 return 0;
1095 }
1096
1097
1098 void RGWReshard::get_logshard_oid(int shard_num, string *logshard)
1099 {
1100 char buf[32];
1101 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
1102
1103 string objname(reshard_oid_prefix);
1104 *logshard = objname + buf;
1105 }
1106
1107 int RGWReshard::process_all_logshards()
1108 {
1109 if (!store->svc()->zone->can_reshard()) {
1110 ldout(store->ctx(), 20) << __func__ << " Resharding is disabled" << dendl;
1111 return 0;
1112 }
1113 int ret = 0;
1114
1115 for (int i = 0; i < num_logshards; i++) {
1116 string logshard;
1117 get_logshard_oid(i, &logshard);
1118
1119 ldout(store->ctx(), 20) << "processing logshard = " << logshard << dendl;
1120
1121 ret = process_single_logshard(i);
1122
1123 ldout(store->ctx(), 20) << "finish processing logshard = " << logshard << " , ret = " << ret << dendl;
1124 }
1125
1126 return 0;
1127 }
1128
1129 bool RGWReshard::going_down()
1130 {
1131 return down_flag;
1132 }
1133
1134 void RGWReshard::start_processor()
1135 {
1136 worker = new ReshardWorker(store->ctx(), this);
1137 worker->create("rgw_reshard");
1138 }
1139
1140 void RGWReshard::stop_processor()
1141 {
1142 down_flag = true;
1143 if (worker) {
1144 worker->stop();
1145 worker->join();
1146 }
1147 delete worker;
1148 worker = nullptr;
1149 }
1150
1151 void *RGWReshard::ReshardWorker::entry() {
1152 do {
1153 utime_t start = ceph_clock_now();
1154 reshard->process_all_logshards();
1155
1156 if (reshard->going_down())
1157 break;
1158
1159 utime_t end = ceph_clock_now();
1160 end -= start;
1161 int secs = cct->_conf.get_val<uint64_t>("rgw_reshard_thread_interval");
1162
1163 if (secs <= end.sec())
1164 continue; // next round
1165
1166 secs -= end.sec();
1167
1168 std::unique_lock locker{lock};
1169 cond.wait_for(locker, std::chrono::seconds(secs));
1170 } while (!reshard->going_down());
1171
1172 return NULL;
1173 }
1174
1175 void RGWReshard::ReshardWorker::stop()
1176 {
1177 std::lock_guard l{lock};
1178 cond.notify_all();
1179 }