]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
3f6c32c8d48673f5075ae84fcde2f2b5f88532d6
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab ft=cpp
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <sstream>
9
10 #include <boost/algorithm/string.hpp>
11 #include <string_view>
12
13 #include <boost/container/flat_set.hpp>
14 #include <boost/format.hpp>
15 #include <boost/optional.hpp>
16 #include <boost/utility/in_place_factory.hpp>
17
18 #include "common/ceph_json.h"
19
20 #include "common/errno.h"
21 #include "common/Formatter.h"
22 #include "common/Throttle.h"
23
24 #include "rgw_sal.h"
25 #include "rgw_zone.h"
26 #include "rgw_cache.h"
27 #include "rgw_acl.h"
28 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
29 #include "rgw_aio_throttle.h"
30 #include "rgw_bucket.h"
31 #include "rgw_rest_conn.h"
32 #include "rgw_cr_rados.h"
33 #include "rgw_cr_rest.h"
34 #include "rgw_putobj_processor.h"
35
36 #include "cls/rgw/cls_rgw_ops.h"
37 #include "cls/rgw/cls_rgw_client.h"
38 #include "cls/rgw/cls_rgw_const.h"
39 #include "cls/refcount/cls_refcount_client.h"
40 #include "cls/version/cls_version_client.h"
41 #include "osd/osd_types.h"
42
43 #include "rgw_tools.h"
44 #include "rgw_coroutine.h"
45 #include "rgw_compression.h"
46 #include "rgw_worker.h"
47
48 #undef fork // fails to compile RGWPeriod::fork() below
49
50 #include "common/Clock.h"
51
52 using namespace librados;
53
54 #include <string>
55 #include <iostream>
56 #include <vector>
57 #include <atomic>
58 #include <list>
59 #include <map>
60 #include "include/random.h"
61
62 #include "rgw_gc.h"
63 #include "rgw_lc.h"
64
65 #include "rgw_object_expirer_core.h"
66 #include "rgw_sync.h"
67 #include "rgw_sync_counters.h"
68 #include "rgw_sync_trace.h"
69 #include "rgw_trim_datalog.h"
70 #include "rgw_trim_mdlog.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
81 #include "services/svc_bucket.h"
82 #include "services/svc_mdlog.h"
83 #include "services/svc_datalog_rados.h"
84
85 #include "compressor/Compressor.h"
86
87 #ifdef WITH_LTTNG
88 #define TRACEPOINT_DEFINE
89 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
90 #include "tracing/rgw_rados.h"
91 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92 #undef TRACEPOINT_DEFINE
93 #else
94 #define tracepoint(...)
95 #endif
96
97 #define dout_context g_ceph_context
98 #define dout_subsys ceph_subsys_rgw
99
100
101 static string shadow_ns = "shadow";
102 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
103 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
104
105 static RGWObjCategory main_category = RGWObjCategory::Main;
106 #define RGW_USAGE_OBJ_PREFIX "usage."
107
108 #define dout_subsys ceph_subsys_rgw
109
110
111 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
112 const rgw_placement_rule& head_placement_rule,
113 const rgw_obj& obj, rgw_pool *pool)
114 {
115 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
116 RGWZonePlacementInfo placement;
117 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
118 return false;
119 }
120
121 if (!obj.in_extra_data) {
122 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
123 } else {
124 *pool = placement.get_data_extra_pool();
125 }
126 }
127
128 return true;
129 }
130
131 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
132 const rgw_placement_rule& head_placement_rule,
133 const rgw_obj& obj, rgw_raw_obj *raw_obj)
134 {
135 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
136
137 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
138 }
139
140 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
141 {
142 if (!is_raw) {
143 rgw_raw_obj r;
144 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
145 return r;
146 }
147 return raw_obj;
148 }
149
150 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
151 {
152 if (!is_raw) {
153 rgw_raw_obj r;
154 store->obj_to_raw(placement_rule, obj, &r);
155 return r;
156 }
157 return raw_obj;
158 }
159
160 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
161 {
162 obj_version *check_objv = version_for_check();
163
164 if (check_objv) {
165 cls_version_check(*op, *check_objv, VER_COND_EQ);
166 }
167
168 cls_version_read(*op, &read_version);
169 }
170
171 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
172 {
173 obj_version *check_objv = version_for_check();
174 obj_version *modify_version = version_for_write();
175
176 if (check_objv) {
177 cls_version_check(*op, *check_objv, VER_COND_EQ);
178 }
179
180 if (modify_version) {
181 cls_version_set(*op, *modify_version);
182 } else {
183 cls_version_inc(*op);
184 }
185 }
186
// Default state; members rely on their in-class initializers.
RGWObjState::RGWObjState() {
}

RGWObjState::~RGWObjState() {
}
192
// Copy constructor: member-by-member copy of the cached object state.
// NOTE(review): the bufferlist members (obj_tag, tail_tag, data) are copied
// only when non-empty — presumably to avoid touching default-initialized
// buffers; confirm before relying on empty-vs-unset distinction.
RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
  is_atomic = rhs.is_atomic;
  has_attrs = rhs.has_attrs;
  exists = rhs.exists;
  size = rhs.size;
  accounted_size = rhs.accounted_size;
  mtime = rhs.mtime;
  epoch = rhs.epoch;
  if (rhs.obj_tag.length()) {
    obj_tag = rhs.obj_tag;
  }
  if (rhs.tail_tag.length()) {
    tail_tag = rhs.tail_tag;
  }
  write_tag = rhs.write_tag;
  fake_tag = rhs.fake_tag;
  manifest = rhs.manifest;
  shadow_obj = rhs.shadow_obj;
  has_data = rhs.has_data;
  if (rhs.data.length()) {
    data = rhs.data;
  }
  prefetch_data = rhs.prefetch_data;
  keep_tail = rhs.keep_tail;
  is_olh = rhs.is_olh;
  objv_tracker = rhs.objv_tracker;
  pg_ver = rhs.pg_ver;
}
221
// Look up (or lazily create) the cached state for 'obj'.
// Fast path: shared lock + find. Miss path: drop the shared lock and retake
// the lock exclusively so operator[] may default-construct the entry.
// Returning a bare pointer into the map is safe because std::map elements
// are not invalidated by later insertions/erasures of *other* keys.
// NOTE(review): two threads can race between unlock_shared() and lock();
// both then reach operator[] for the same key and get the same entry, so the
// race is benign.
RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
  RGWObjState *result;
  typename std::map<rgw_obj, RGWObjState>::iterator iter;
  lock.lock_shared();
  assert (!obj.empty());
  iter = objs_state.find(obj);
  if (iter != objs_state.end()) {
    result = &iter->second;
    lock.unlock_shared();
  } else {
    lock.unlock_shared();
    lock.lock();
    result = &objs_state[obj];
    lock.unlock();
  }
  return result;
}
239
240 void RGWObjectCtx::set_atomic(rgw_obj& obj) {
241 std::unique_lock wl{lock};
242 assert (!obj.empty());
243 objs_state[obj].is_atomic = true;
244 }
245 void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
246 std::unique_lock wl{lock};
247 assert (!obj.empty());
248 objs_state[obj].prefetch_data = true;
249 }
250
251 void RGWObjectCtx::invalidate(const rgw_obj& obj) {
252 std::unique_lock wl{lock};
253 auto iter = objs_state.find(obj);
254 if (iter == objs_state.end()) {
255 return;
256 }
257 bool is_atomic = iter->second.is_atomic;
258 bool prefetch_data = iter->second.prefetch_data;
259
260 objs_state.erase(iter);
261
262 if (is_atomic || prefetch_data) {
263 auto& state = objs_state[obj];
264 state.is_atomic = is_atomic;
265 state.prefetch_data = prefetch_data;
266 }
267 }
268
269 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
270 {
271 write_version.ver = 1;
272 #define TAG_LEN 24
273
274 write_version.tag.clear();
275 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
276 }
277
278 class RGWMetaNotifierManager : public RGWCoroutinesManager {
279 RGWRados *store;
280 RGWHTTPManager http_manager;
281
282 public:
283 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
284 http_manager(store->ctx(), completion_mgr) {
285 http_manager.start();
286 }
287
288 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
289 rgw_http_param_pair pairs[] = { { "type", "metadata" },
290 { "notify", NULL },
291 { NULL, NULL } };
292
293 list<RGWCoroutinesStack *> stacks;
294 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
295 RGWRESTConn *conn = iter->second;
296 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
297 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
298
299 stacks.push_back(stack);
300 }
301 return run(stacks);
302 }
303 };
304
305 class RGWDataNotifierManager : public RGWCoroutinesManager {
306 RGWRados *store;
307 RGWHTTPManager http_manager;
308
309 public:
310 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
311 http_manager(store->ctx(), completion_mgr) {
312 http_manager.start();
313 }
314
315 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
316 rgw_http_param_pair pairs[] = { { "type", "data" },
317 { "notify", NULL },
318 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
319 { NULL, NULL } };
320
321 list<RGWCoroutinesStack *> stacks;
322 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
323 RGWRESTConn *conn = iter->second;
324 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
325 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
326
327 stacks.push_back(stack);
328 }
329 return run(stacks);
330 }
331 };
332
333 /* class RGWRadosThread */
334
// Create and launch the worker thread; Worker::entry() repeatedly calls
// process() until stop() is invoked.
void RGWRadosThread::start()
{
  worker = new Worker(cct, this);
  worker->create(thread_name.c_str());
}
340
341 void RGWRadosThread::stop()
342 {
343 down_flag = true;
344 stop_process();
345 if (worker) {
346 worker->signal();
347 worker->join();
348 }
349 delete worker;
350 worker = NULL;
351 }
352
// Thread main loop: call processor->process() repeatedly, sleeping between
// rounds. An interval of 0 means "no timer": block in wait() until signal().
// A positive interval is re-read every round so runtime reconfiguration
// takes effect, and the time spent in process() is subtracted from the sleep.
void *RGWRadosThread::Worker::entry() {
  uint64_t msec = processor->interval_msec();
  auto interval = std::chrono::milliseconds(msec);

  do {
    auto start = ceph::real_clock::now();
    int r = processor->process();
    if (r < 0) {
      dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
    }

    if (processor->going_down())
      break;

    // Elapsed wall-clock time of this round.
    auto end = ceph::real_clock::now() - start;

    uint64_t cur_msec = processor->interval_msec();
    if (cur_msec != msec) { /* was it reconfigured? */
      msec = cur_msec;
      interval = std::chrono::milliseconds(msec);
    }

    if (cur_msec > 0) {
      if (interval <= end)
        continue; // next round

      // Sleep only for the remainder of the interval.
      auto wait_time = interval - end;
      wait_interval(wait_time);
    } else {
      // No interval: block until explicitly signalled.
      wait();
    }
  } while (!processor->going_down());

  return NULL;
}
388
// Periodic thread (interval rgw_md_notify_interval_msec) that pushes
// metadata-log change notifications to peer zones; see process() below.
class RGWMetaNotifier : public RGWRadosThread {
  RGWMetaNotifierManager notify_mgr;
  RGWMetadataLog *const log;

  uint64_t interval_msec() override {
    return cct->_conf->rgw_md_notify_interval_msec;
  }
  void stop_process() override {
    notify_mgr.stop();
  }
public:
  RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
    : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}

  int process() override;
};
405
406 int RGWMetaNotifier::process()
407 {
408 set<int> shards;
409
410 log->read_clear_modified(shards);
411
412 if (shards.empty()) {
413 return 0;
414 }
415
416 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
417 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
418 }
419
420 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
421
422 return 0;
423 }
424
// Periodic thread (interval rgw_data_notify_interval_msec) that pushes
// data-log change notifications to peer zones; see process() below.
class RGWDataNotifier : public RGWRadosThread {
  RGWDataNotifierManager notify_mgr;

  uint64_t interval_msec() override {
    return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
  }
  void stop_process() override {
    notify_mgr.stop();
  }
public:
  RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}

  int process() override;
};
439
440 int RGWDataNotifier::process()
441 {
442 auto data_log = store->svc.datalog_rados->get_log();
443 if (!data_log) {
444 return 0;
445 }
446
447 map<int, set<string> > shards;
448
449 data_log->read_clear_modified(shards);
450
451 if (shards.empty()) {
452 return 0;
453 }
454
455 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
456 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
457 }
458
459 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
460
461 return 0;
462 }
463
// Common base for sync worker threads: adds a pure-virtual init() phase on
// top of the RGWRadosThread process() loop.
class RGWSyncProcessorThread : public RGWRadosThread {
public:
  RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
  RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
  ~RGWSyncProcessorThread() override {}
  int init() override = 0 ;
  int process() override = 0;
};
472
473 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
474 {
475 RGWMetaSyncStatusManager sync;
476
477 uint64_t interval_msec() override {
478 return 0; /* no interval associated, it'll run once until stopped */
479 }
480 void stop_process() override {
481 sync.stop();
482 }
483 public:
484 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados)
485 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
486
487 void wakeup_sync_shards(set<int>& shard_ids) {
488 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
489 sync.wakeup(*iter);
490 }
491 }
492 RGWMetaSyncStatusManager* get_manager() { return &sync; }
493
494 int init() override {
495 int ret = sync.init();
496 if (ret < 0) {
497 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
498 return ret;
499 }
500 return 0;
501 }
502
503 int process() override {
504 sync.run();
505 return 0;
506 }
507 };
508
509 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
510 {
511 PerfCountersRef counters;
512 RGWDataSyncStatusManager sync;
513 bool initialized;
514
515 uint64_t interval_msec() override {
516 if (initialized) {
517 return 0; /* no interval associated, it'll run once until stopped */
518 } else {
519 #define DATA_SYNC_INIT_WAIT_SEC 20
520 return DATA_SYNC_INIT_WAIT_SEC * 1000;
521 }
522 }
523 void stop_process() override {
524 sync.stop();
525 }
526 public:
527 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados,
528 const RGWZone* source_zone)
529 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
530 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
531 sync(_store, async_rados, source_zone->id, counters.get()),
532 initialized(false) {}
533
534 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
535 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
536 sync.wakeup(iter->first, iter->second);
537 }
538 }
539 RGWDataSyncStatusManager* get_manager() { return &sync; }
540
541 int init() override {
542 return 0;
543 }
544
545 int process() override {
546 while (!initialized) {
547 if (going_down()) {
548 return 0;
549 }
550 int ret = sync.init();
551 if (ret >= 0) {
552 initialized = true;
553 break;
554 }
555 /* we'll be back! */
556 return 0;
557 }
558 sync.run();
559 return 0;
560 }
561 };
562
// Thread that trims the metadata log, data log, and bucket index logs via
// coroutines. Runs continuously (interval 0) until stopped; each trim
// coroutine internally paces itself by trim_interval.
class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
{
  RGWCoroutinesManager crs;
  rgw::sal::RGWRadosStore *store;
  rgw::BucketTrimManager *bucket_trim;
  RGWHTTPManager http;
  const utime_t trim_interval;

  uint64_t interval_msec() override { return 0; }
  void stop_process() override { crs.stop(); }
public:
  RGWSyncLogTrimThread(rgw::sal::RGWRadosStore *store, rgw::BucketTrimManager *bucket_trim,
                       int interval)
    : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
      crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
      bucket_trim(bucket_trim),
      http(store->ctx(), crs.get_completion_mgr()),
      trim_interval(interval, 0)
  {}

  int init() override {
    return http.start();
  }
  int process() override {
    // Always trim the metadata log.
    list<RGWCoroutinesStack*> stacks;
    auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
    meta->call(create_meta_log_trim_cr(this, store, &http,
                                       cct->_conf->rgw_md_log_max_shards,
                                       trim_interval));
    stacks.push_back(meta);

    // Data/bucket log trimming only matters when the sync module exports data.
    if (store->svc()->zone->sync_module_exports_data()) {
      auto data = new RGWCoroutinesStack(store->ctx(), &crs);
      data->call(create_data_log_trim_cr(store, &http,
                                         cct->_conf->rgw_data_log_num_shards,
                                         trim_interval));
      stacks.push_back(data);

      auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
      bucket->call(bucket_trim->create_bucket_trim_cr(&http));
      stacks.push_back(bucket);
    }

    // Blocks until the coroutines finish or stop_process() stops the manager.
    crs.run(stacks);
    return 0;
  }

  // implements DoutPrefixProvider
  CephContext *get_cct() const override { return store->ctx(); }
  unsigned get_subsys() const override
  {
    return dout_subsys;
  }

  std::ostream& gen_prefix(std::ostream& out) const override
  {
    return out << "sync log trim: ";
  }

};
623
624 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
625 {
626 std::lock_guard l{meta_sync_thread_lock};
627 if (meta_sync_processor_thread) {
628 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
629 }
630 }
631
632 void RGWRados::wakeup_data_sync_shards(const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
633 {
634 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
635 std::lock_guard l{data_sync_thread_lock};
636 auto iter = data_sync_processor_threads.find(source_zone);
637 if (iter == data_sync_processor_threads.end()) {
638 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
639 return;
640 }
641
642 RGWDataSyncProcessorThread *thread = iter->second;
643 ceph_assert(thread);
644 thread->wakeup_sync_shards(shard_ids);
645 }
646
647 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
648 {
649 std::lock_guard l{meta_sync_thread_lock};
650 if (meta_sync_processor_thread) {
651 return meta_sync_processor_thread->get_manager();
652 }
653 return nullptr;
654 }
655
656 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
657 {
658 std::lock_guard l{data_sync_thread_lock};
659 auto thread = data_sync_processor_threads.find(source_zone);
660 if (thread == data_sync_processor_threads.end()) {
661 return nullptr;
662 }
663 return thread->second->get_manager();
664 }
665
666 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
667 {
668 IoCtx ioctx;
669 int r = open_pool_ctx(pool, ioctx, false);
670 if (r < 0) {
671 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
672 return r;
673 }
674
675 bool requires;
676 r = ioctx.pool_requires_alignment2(&requires);
677 if (r < 0) {
678 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
679 << r << dendl;
680 return r;
681 }
682
683 if (!requires) {
684 *alignment = 0;
685 return 0;
686 }
687
688 uint64_t align;
689 r = ioctx.pool_required_alignment2(&align);
690 if (r < 0) {
691 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
692 << r << dendl;
693 return r;
694 }
695 if (align != 0) {
696 ldout(cct, 20) << "required alignment=" << align << dendl;
697 }
698 *alignment = align;
699 return 0;
700 }
701
702 void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
703 {
704 if (alignment == 0) {
705 *max_size = size;
706 return;
707 }
708
709 if (size <= alignment) {
710 *max_size = alignment;
711 return;
712 }
713
714 *max_size = size - (size % alignment);
715 }
716
717 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
718 {
719 uint64_t alignment;
720 int r = get_required_alignment(pool, &alignment);
721 if (r < 0) {
722 return r;
723 }
724
725 if (palignment) {
726 *palignment = alignment;
727 }
728
729 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
730
731 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
732
733 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
734
735 return 0;
736 }
737
738 int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
739 uint64_t *max_chunk_size, uint64_t *palignment)
740 {
741 rgw_pool pool;
742 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
743 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
744 return -EIO;
745 }
746 return get_max_chunk_size(pool, max_chunk_size, palignment);
747 }
748
749 class RGWIndexCompletionManager;
750
// State carried across an asynchronous bucket-index completion: everything
// needed to replay the index "complete" op if it failed because the bucket
// was being resharded (see RGWIndexCompletionThread::process). Instances are
// owned and freed by the completion machinery (obj_complete_cb / the retry
// thread).
struct complete_op_data {
  ceph::mutex lock = ceph::make_mutex("complete_op_data");
  AioCompletion *rados_completion{nullptr};
  int manager_shard_id{-1};            // shard slot in RGWIndexCompletionManager
  RGWIndexCompletionManager *manager{nullptr};
  rgw_obj obj;
  RGWModifyOp op;
  string tag;
  rgw_bucket_entry_ver ver;
  cls_rgw_obj_key key;
  rgw_bucket_dir_entry_meta dir_meta;
  list<cls_rgw_obj_key> remove_objs;
  bool log_op;
  uint16_t bilog_op;
  rgw_zone_set zones_trace;

  bool stopped{false};                 // set by stop(); obj_complete_cb then just frees us

  // Mark the entry as abandoned; taken under 'lock' so it can't race the
  // rados completion callback.
  void stop() {
    std::lock_guard l{lock};
    stopped = true;
  }
};
774
// Background thread that retries bucket-index "complete" operations which
// raced with a bucket reshard (queued by
// RGWIndexCompletionManager::handle_completion).
class RGWIndexCompletionThread : public RGWRadosThread {
  RGWRados *store;

  uint64_t interval_msec() override {
    return 0;   // no timed interval: woken explicitly via signal()
  }

  // Pending retries, drained wholesale by process(); guarded by completions_lock.
  list<complete_op_data *> completions;

  ceph::mutex completions_lock =
    ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
public:
  RGWIndexCompletionThread(RGWRados *_store)
    : RGWRadosThread(_store, "index-complete"), store(_store) {}

  int process() override;

  // Queue a completion for retry and wake the worker thread.
  // Takes ownership of 'completion' (process() frees it).
  void add_completion(complete_op_data *completion) {
    {
      std::lock_guard l{completions_lock};
      completions.push_back(completion);
    }

    signal();
  }
};
801
// Drain the pending queue and retry each bucket-index completion, guarding
// against a concurrent reshard; entries are freed unconditionally via the
// unique_ptr. Errors are logged and the entry dropped — there is no caller
// left to report to.
int RGWIndexCompletionThread::process()
{
  list<complete_op_data *> comps;

  // Grab the whole queue in one shot to keep the lock hold time short.
  {
    std::lock_guard l{completions_lock};
    completions.swap(comps);
  }

  for (auto c : comps) {
    std::unique_ptr<complete_op_data> up{c};  // frees the entry on every path

    if (going_down()) {
      continue;  // shutting down: just discard the remaining entries
    }
    ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;

    RGWRados::BucketShard bs(store);
    RGWBucketInfo bucket_info;

    int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
    if (r < 0) {
      ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
      /* not much to do */
      continue;
    }

    // Replay the index op; guard_reshard re-resolves the shard and retries
    // the lambda if the bucket is (still) being resharded.
    r = store->guard_reshard(&bs, c->obj, bucket_info,
			     [&](RGWRados::BucketShard *bs) -> int {
			       librados::ObjectWriteOperation o;
			       cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
			       cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
							  c->log_op, c->bilog_op, &c->zones_trace);
			       return bs->bucket_obj.operate(&o, null_yield);
			     });
    if (r < 0) {
      ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
      /* ignoring error, can't do anything about it */
      continue;
    }
    // Record the change in the data log so peer zones pick it up.
    r = store->svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
    if (r < 0) {
      lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
    }
  }

  return 0;
}
850
851 class RGWIndexCompletionManager {
852 RGWRados *store{nullptr};
853 ceph::containers::tiny_vector<ceph::mutex> locks;
854 vector<set<complete_op_data *> > completions;
855
856 RGWIndexCompletionThread *completion_thread{nullptr};
857
858 int num_shards;
859
860 std::atomic<int> cur_shard {0};
861
862
863 public:
864 RGWIndexCompletionManager(RGWRados *_store) :
865 store(_store),
866 locks{ceph::make_lock_container<ceph::mutex>(
867 store->ctx()->_conf->rgw_thread_pool_size,
868 [](const size_t i) {
869 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
870 std::to_string(i));
871 })}
872 {
873 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
874 completions.resize(num_shards);
875 }
876 ~RGWIndexCompletionManager() {
877 stop();
878 }
879
880 int next_shard() {
881 int result = cur_shard % num_shards;
882 cur_shard++;
883 return result;
884 }
885
886 void create_completion(const rgw_obj& obj,
887 RGWModifyOp op, string& tag,
888 rgw_bucket_entry_ver& ver,
889 const cls_rgw_obj_key& key,
890 rgw_bucket_dir_entry_meta& dir_meta,
891 list<cls_rgw_obj_key> *remove_objs, bool log_op,
892 uint16_t bilog_op,
893 rgw_zone_set *zones_trace,
894 complete_op_data **result);
895 bool handle_completion(completion_t cb, complete_op_data *arg);
896
897 int start() {
898 completion_thread = new RGWIndexCompletionThread(store);
899 int ret = completion_thread->init();
900 if (ret < 0) {
901 return ret;
902 }
903 completion_thread->start();
904 return 0;
905 }
906 void stop() {
907 if (completion_thread) {
908 completion_thread->stop();
909 delete completion_thread;
910 }
911
912 for (int i = 0; i < num_shards; ++i) {
913 std::lock_guard l{locks[i]};
914 for (auto c : completions[i]) {
915 c->stop();
916 }
917 }
918 completions.clear();
919 }
920 };
921
// librados completion callback for asynchronous bucket-index ops. If the
// manager already stopped this entry, just free it; otherwise let the
// manager decide (handle_completion returns false only when it handed the
// entry to the retry thread, which then owns it).
static void obj_complete_cb(completion_t cb, void *arg)
{
  complete_op_data *completion = (complete_op_data *)arg;
  completion->lock.lock();
  if (completion->stopped) {
    completion->lock.unlock(); /* can drop lock, no one else is referencing us */
    delete completion;
    return;
  }
  bool need_delete = completion->manager->handle_completion(cb, completion);
  completion->lock.unlock();
  if (need_delete) {
    delete completion;
  }
}
937
938
939 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
940 RGWModifyOp op, string& tag,
941 rgw_bucket_entry_ver& ver,
942 const cls_rgw_obj_key& key,
943 rgw_bucket_dir_entry_meta& dir_meta,
944 list<cls_rgw_obj_key> *remove_objs, bool log_op,
945 uint16_t bilog_op,
946 rgw_zone_set *zones_trace,
947 complete_op_data **result)
948 {
949 complete_op_data *entry = new complete_op_data;
950
951 int shard_id = next_shard();
952
953 entry->manager_shard_id = shard_id;
954 entry->manager = this;
955 entry->obj = obj;
956 entry->op = op;
957 entry->tag = tag;
958 entry->ver = ver;
959 entry->key = key;
960 entry->dir_meta = dir_meta;
961 entry->log_op = log_op;
962 entry->bilog_op = bilog_op;
963
964 if (remove_objs) {
965 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
966 entry->remove_objs.push_back(*iter);
967 }
968 }
969
970 if (zones_trace) {
971 entry->zones_trace = *zones_trace;
972 } else {
973 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
974 }
975
976 *result = entry;
977
978 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
979
980 std::lock_guard l{locks[shard_id]};
981 completions[shard_id].insert(entry);
982 }
983
// Called from obj_complete_cb when a bucket-index op finishes. Returns true
// when the caller should delete 'arg'; returns false only after handing
// ownership to the retry thread (op failed with ERR_BUSY_RESHARDING).
bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
{
  int shard_id = arg->manager_shard_id;
  {
    std::lock_guard l{locks[shard_id]};

    auto& comps = completions[shard_id];

    // Entry already gone from its shard (e.g. manager stop() cleared the
    // sets): nothing to track, caller frees it.
    auto iter = comps.find(arg);
    if (iter == comps.end()) {
      return true;
    }

    comps.erase(iter);
  }

  // Only a reshard-busy failure is retried; any other result is final.
  int r = rados_aio_get_return_value(cb);
  if (r != -ERR_BUSY_RESHARDING) {
    return true;
  }
  completion_thread->add_completion(arg);
  return false;
}
1007
// Tear down all background machinery owned by RGWRados. Order matters:
// sync threads are stopped (under their locks) before being deleted in a
// second pass, then notifiers, tracers, per-subsystem workers, services,
// caches, and finally resharding state.
void RGWRados::finalize()
{
  // Pass 1: stop all sync threads while holding the thread-map locks.
  if (run_sync_thread) {
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread->stop();

    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      thread->stop();
    }
    if (sync_log_trimmer) {
      sync_log_trimmer->stop();
    }
  }
  // Pass 2: delete the now-stopped sync threads.
  if (run_sync_thread) {
    delete meta_sync_processor_thread;
    meta_sync_processor_thread = NULL;
    std::lock_guard dl{data_sync_thread_lock};
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      delete thread;
    }
    data_sync_processor_threads.clear();
    delete sync_log_trimmer;
    sync_log_trimmer = nullptr;
    bucket_trim = boost::none;
  }
  if (meta_notifier) {
    meta_notifier->stop();
    delete meta_notifier;
  }
  if (data_notifier) {
    data_notifier->stop();
    delete data_notifier;
  }
  delete sync_tracer;

  delete lc;
  lc = NULL;

  delete gc;
  gc = NULL;

  delete obj_expirer;
  obj_expirer = NULL;

  RGWQuotaHandler::free_handler(quota_handler);
  if (cr_registry) {
    // Reference-counted: drop our ref rather than deleting outright.
    cr_registry->put();
  }

  svc.shutdown();

  delete binfo_cache;
  delete obj_tombstone_cache;

  if (reshard_wait.get()) {
    reshard_wait->stop();
    reshard_wait.reset();
  }

  if (run_reshard_thread) {
    reshard->stop_processor();
  }
  delete reshard;
  delete index_completion_manager;
}
1076
1077 /**
1078 * Initialize the RADOS instance and prepare to do other ops
1079 * Returns 0 on success, -ERR# on failure.
1080 */
1081 int RGWRados::init_rados()
1082 {
1083 int ret = 0;
1084
1085 ret = rados.init_with_context(cct);
1086 if (ret < 0) {
1087 return ret;
1088 }
1089 ret = rados.connect();
1090 if (ret < 0) {
1091 return ret;
1092 }
1093
1094 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1095 new RGWCoroutinesManagerRegistry(cct)};
1096 ret = crs->hook_to_admin_command("cr dump");
1097 if (ret < 0) {
1098 return ret;
1099 }
1100
1101 cr_registry = crs.release();
1102 return ret;
1103 }
1104
1105 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
1106 {
1107 map<string,string> metadata = meta;
1108 metadata["num_handles"] = "1"s;
1109 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1110 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1111 metadata["zone_name"] = svc.zone->zone_name();
1112 metadata["zone_id"] = svc.zone->zone_id().id;
1113 string name = cct->_conf->name.get_id();
1114 if (name.compare(0, 4, "rgw.") == 0) {
1115 name = name.substr(4);
1116 }
1117 int ret = rados.service_daemon_register(daemon_type, name, metadata);
1118 if (ret < 0) {
1119 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1120 return ret;
1121 }
1122
1123 return 0;
1124 }
1125
1126 int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
1127 {
1128 int ret = rados.service_daemon_update_status(move(status));
1129 if (ret < 0) {
1130 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1131 return ret;
1132 }
1133
1134 return 0;
1135 }
1136
/**
 * Complete RGWRados initialization after the rados handle and service
 * stack are up: opens the service pools, starts the GC / lifecycle /
 * expirer / sync / reshard machinery as configured.
 * Returns 0 on success, -ERR# on failure.
 *
 * NOTE: the steps below are order-dependent (pools before processors,
 * services before threads); do not reorder.
 */
int RGWRados::init_complete()
{
  int ret;

  /*
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  sync_module = svc.sync_modules->get_sync_module();

  // open every service pool this object uses; each call creates the pool
  // if it does not exist yet
  ret = open_root_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx();
  if (ret < 0)
    return ret;

  pools_initialized = true;

  // garbage collection and object expiration; the worker threads only run
  // when the corresponding use_*_thread flags are set
  gc = new RGWGC();
  gc->initialize(cct, this);

  obj_expirer = new RGWObjectExpirer(this->store);

  if (use_gc_thread) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
    or there is no rest_master_conn */
  if (!svc.zone->need_to_sync()) {
    run_sync_thread = false;
  }

  // the metadata master notifies peers of metadata-log updates
  if (svc.zone->is_meta_master()) {
    auto md_log = svc.mdlog->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    // warn about zonegroup placement targets this zone has no pools for;
    // sync of objects in those targets would fail
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          == zone_params.placement_pools.end()){
        ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
                      << pt.second.name << " present in zonegroup" << dendl;
      }
    }
    auto async_processor = svc.rados->get_async_processor();
    std::lock_guard l{meta_sync_thread_lock};
    meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
    ret = meta_sync_processor_thread->init();
    if (ret < 0) {
      ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
      return ret;
    }
    meta_sync_processor_thread->start();

    // configure the bucket trim manager
    rgw::BucketTrimConfig config;
    rgw::configure_bucket_trim(cct, config);

    bucket_trim.emplace(this->store, config);
    ret = bucket_trim->init();
    if (ret < 0) {
      ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
      return ret;
    }
    svc.datalog_rados->set_observer(&*bucket_trim);

    // one data-sync thread per source zone that syncs to us
    std::lock_guard dl{data_sync_thread_lock};
    for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
      ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
      auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
      ret = thread->init();
      if (ret < 0) {
        ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
        return ret;
      }
      thread->start();
      data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
    }
    // optional periodic sync-log trimming (interval <= 0 disables it)
    auto interval = cct->_conf->rgw_sync_log_trim_interval;
    if (interval > 0) {
      sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
      ret = sync_log_trimmer->init();
      if (ret < 0) {
        ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
        return ret;
      }
      sync_log_trimmer->start();
    }
  }
  data_notifier = new RGWDataNotifier(this);
  data_notifier->start();

  binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
  binfo_cache->init(svc.cache);

  lc = new RGWLC();
  lc->initialize(cct, this->store);

  if (use_lc_thread)
    lc->start_processor();

  quota_handler = RGWQuotaHandler::generate_handler(this->store, quota_threads);

  // an explicit override takes precedence over the zone's configured shard
  // count; in either case clamp to the supported maximum
  bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
                             zone.bucket_index_max_shards);
  if (bucket_index_max_shards > get_max_bucket_shards()) {
    bucket_index_max_shards = get_max_bucket_shards();
    ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
      << get_max_bucket_shards() << dendl;
  }
  ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;

  bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */

  if (need_tombstone_cache) {
    obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
  }

  reshard_wait = std::make_shared<RGWReshardWait>();

  reshard = new RGWReshard(this->store);

  /* only the master zone in the zonegroup reshards buckets */
  run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
  if (run_reshard_thread)  {
    reshard->start_processor();
  }

  index_completion_manager = new RGWIndexCompletionManager(this);
  ret = index_completion_manager->start();

  return ret;
}
1304
1305 int RGWRados::init_svc(bool raw)
1306 {
1307 if (raw) {
1308 return svc.init_raw(cct, use_cache);
1309 }
1310
1311 return svc.init(cct, use_cache, run_sync_thread);
1312 }
1313
int RGWRados::init_ctl()
{
  // Wire the controller layer to the (already initialized) service layer.
  return ctl.init(&svc);
}
1318
1319 /**
1320 * Initialize the RADOS instance and prepare to do other ops
1321 * Returns 0 on success, -ERR# on failure.
1322 */
1323 int RGWRados::initialize()
1324 {
1325 int ret;
1326
1327 inject_notify_timeout_probability =
1328 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1329 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
1330
1331 ret = init_svc(false);
1332 if (ret < 0) {
1333 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
1334 return ret;
1335 }
1336
1337 ret = init_ctl();
1338 if (ret < 0) {
1339 ldout(cct, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
1340 return ret;
1341 }
1342
1343 host_id = svc.zone_utils->gen_host_id();
1344
1345 ret = init_rados();
1346 if (ret < 0)
1347 return ret;
1348
1349 return init_complete();
1350 }
1351
/**
 * Open the pool used as root for this gateway.
 * The trailing booleans ask rgw_init_ioctx to create the pool if missing
 * and to tag it as mostly-omap (see open_pool_ctx for the named params).
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::open_root_pool_ctx()
{
  return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
}
1360
int RGWRados::open_gc_pool_ctx()
{
  // create-if-missing, mostly-omap (see open_pool_ctx for the named params)
  return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
}
1365
int RGWRados::open_lc_pool_ctx()
{
  // create-if-missing, mostly-omap (see open_pool_ctx for the named params)
  return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
}
1370
int RGWRados::open_objexp_pool_ctx()
{
  // object expiration state lives in the zone's log pool
  return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
}
1375
int RGWRados::open_reshard_pool_ctx()
{
  // create-if-missing, mostly-omap (see open_pool_ctx for the named params)
  return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
}
1380
// Open (and if necessary create) an arbitrary pool into io_ctx.
// mostly_omap hints that the pool's data is omap-heavy.
int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx,
			    bool mostly_omap)
{
  constexpr bool create = true; // create the pool if it doesn't exist
  return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap);
}
1387
1388 /**** logs ****/
1389
// Iteration cursor over objects in the log pool; allocated by
// log_list_init() and freed by log_list_next() when the listing ends.
struct log_list_state {
  string prefix;                  // only report oids starting with this prefix
  librados::IoCtx io_ctx;
  librados::NObjectIterator obit; // current position in the pool listing
};
1395
1396 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1397 {
1398 log_list_state *state = new log_list_state;
1399 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1400 if (r < 0) {
1401 delete state;
1402 return r;
1403 }
1404 state->prefix = prefix;
1405 state->obit = state->io_ctx.nobjects_begin();
1406 *handle = (RGWAccessHandle)state;
1407 return 0;
1408 }
1409
1410 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1411 {
1412 log_list_state *state = static_cast<log_list_state *>(handle);
1413 while (true) {
1414 if (state->obit == state->io_ctx.nobjects_end()) {
1415 delete state;
1416 return -ENOENT;
1417 }
1418 if (state->prefix.length() &&
1419 state->obit->get_oid().find(state->prefix) != 0) {
1420 state->obit++;
1421 continue;
1422 }
1423 *name = state->obit->get_oid();
1424 state->obit++;
1425 break;
1426 }
1427 return 0;
1428 }
1429
1430 int RGWRados::log_remove(const string& name)
1431 {
1432 librados::IoCtx io_ctx;
1433 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
1434 if (r < 0)
1435 return r;
1436 return io_ctx.remove(name);
1437 }
1438
1439 struct log_show_state {
1440 librados::IoCtx io_ctx;
1441 bufferlist bl;
1442 bufferlist::const_iterator p;
1443 string name;
1444 uint64_t pos;
1445 bool eof;
1446 log_show_state() : pos(0), eof(false) {}
1447 };
1448
1449 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1450 {
1451 log_show_state *state = new log_show_state;
1452 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1453 if (r < 0) {
1454 delete state;
1455 return r;
1456 }
1457 state->name = name;
1458 *handle = (RGWAccessHandle)state;
1459 return 0;
1460 }
1461
// Decode the next entry from the log object into *entry.
// Returns 1 when an entry was decoded, 0 at end of file, -ERR# on error.
// Maintains a sliding read-ahead window (state->bl) over the object.
int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
{
  log_show_state *state = static_cast<log_show_state *>(handle);
  off_t off = state->p.get_off();

  ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
	   << " off " << off
	   << " eof " << (int)state->eof
	   << dendl;
  // read some?
  unsigned chunk = 1024*1024;
  // refill when less than half a chunk of undecoded data remains buffered
  if ((state->bl.length() - off) < chunk/2 && !state->eof) {
    bufferlist more;
    int r = state->io_ctx.read(state->name, more, chunk, state->pos);
    if (r < 0)
      return r;
    state->pos += r;
    // keep only the not-yet-decoded tail of the old buffer, then append
    // the freshly read data and reset the decode iterator to the front
    bufferlist old;
    try {
      old.substr_of(state->bl, off, state->bl.length() - off);
    } catch (buffer::error& err) {
      return -EINVAL;
    }
    state->bl.clear();
    state->bl.claim(old);
    state->bl.claim_append(more);
    state->p = state->bl.cbegin();
    // a short read means we have reached the end of the object
    if ((unsigned)r < chunk)
      state->eof = true;
    ldout(cct, 10) << " read " << r << dendl;
  }

  if (state->p.end())
    return 0; // end of file
  try {
    decode(*entry, state->p);
  }
  catch (const buffer::error &e) {
    return -EINVAL;
  }
  return 1;
}
1504
1505 /**
1506 * usage_log_hash: get usage log key hash, based on name and index
1507 *
1508 * Get the usage object name. Since a user may have more than 1
1509 * object holding that info (multiple shards), we use index to
1510 * specify that shard number. Once index exceeds max shards it
1511 * wraps.
1512 * If name is not being set, results for all users will be returned
1513 * and index will wrap only after total shards number.
1514 *
1515 * @param cct [in] ceph context
1516 * @param name [in] user name
1517 * @param hash [out] hash value
1518 * @param index [in] shard index number
1519 */
1520 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1521 {
1522 uint32_t val = index;
1523
1524 if (!name.empty()) {
1525 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
1526 val %= max_user_shards;
1527 val += ceph_str_hash_linux(name.c_str(), name.size());
1528 }
1529 char buf[17];
1530 int max_shards = cct->_conf->rgw_usage_max_shards;
1531 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1532 hash = buf;
1533 }
1534
1535 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1536 {
1537 uint32_t index = 0;
1538
1539 map<string, rgw_usage_log_info> log_objs;
1540
1541 string hash;
1542 string last_user;
1543
1544 /* restructure usage map, zone by object hash */
1545 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1546 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1547 const rgw_user_bucket& ub = iter->first;
1548 RGWUsageBatch& info = iter->second;
1549
1550 if (ub.user.empty()) {
1551 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1552 continue;
1553 }
1554
1555 if (ub.user != last_user) {
1556 /* index *should* be random, but why waste extra cycles
1557 in most cases max user shards is not going to exceed 1,
1558 so just incrementing it */
1559 usage_log_hash(cct, ub.user, hash, index++);
1560 }
1561 last_user = ub.user;
1562 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1563
1564 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1565 v.push_back(miter->second);
1566 }
1567 }
1568
1569 map<string, rgw_usage_log_info>::iterator liter;
1570
1571 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1572 int r = cls_obj_usage_log_add(liter->first, liter->second);
1573 if (r < 0)
1574 return r;
1575 }
1576 return 0;
1577 }
1578
1579 int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1580 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1581 rgw_usage_log_entry>& usage)
1582 {
1583 uint32_t num = max_entries;
1584 string hash, first_hash;
1585 string user_str = user.to_str();
1586 usage_log_hash(cct, user_str, first_hash, 0);
1587
1588 if (usage_iter.index) {
1589 usage_log_hash(cct, user_str, hash, usage_iter.index);
1590 } else {
1591 hash = first_hash;
1592 }
1593
1594 usage.clear();
1595
1596 do {
1597 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1598 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1599
1600 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
1601 usage_iter.read_iter, ret_usage, is_truncated);
1602 if (ret == -ENOENT)
1603 goto next;
1604
1605 if (ret < 0)
1606 return ret;
1607
1608 num -= ret_usage.size();
1609
1610 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1611 usage[iter->first].aggregate(iter->second);
1612 }
1613
1614 next:
1615 if (!*is_truncated) {
1616 usage_iter.read_iter.clear();
1617 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1618 }
1619 } while (num && !*is_truncated && hash != first_hash);
1620 return 0;
1621 }
1622
1623 int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
1624 {
1625 uint32_t index = 0;
1626 string hash, first_hash;
1627 string user_str = user.to_str();
1628 usage_log_hash(cct, user_str, first_hash, index);
1629
1630 hash = first_hash;
1631 do {
1632 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
1633
1634 if (ret < 0 && ret != -ENOENT)
1635 return ret;
1636
1637 usage_log_hash(cct, user_str, hash, ++index);
1638 } while (hash != first_hash);
1639
1640 return 0;
1641 }
1642
1643
1644 int RGWRados::clear_usage()
1645 {
1646 auto max_shards = cct->_conf->rgw_usage_max_shards;
1647 int ret=0;
1648 for (unsigned i=0; i < max_shards; i++){
1649 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
1650 ret = cls_obj_usage_log_clear(oid);
1651 if (ret < 0){
1652 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
1653 return ret;
1654 }
1655 }
1656 return ret;
1657 }
1658
1659 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
1660 {
1661 auto i = bl.cbegin();
1662 RGWAccessControlPolicy policy(cct);
1663 try {
1664 policy.decode_owner(i);
1665 } catch (buffer::error& err) {
1666 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1667 return -EIO;
1668 }
1669 *owner = policy.get_owner();
1670 return 0;
1671 }
1672
1673 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
1674 {
1675 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1676 if (aiter == attrset.end())
1677 return -EIO;
1678
1679 bufferlist& bl = aiter->second;
1680 auto iter = bl.cbegin();
1681 try {
1682 policy->decode(iter);
1683 } catch (buffer::error& err) {
1684 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1685 return -EIO;
1686 }
1687 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1688 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
1689 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
1690 s3policy->to_xml(*_dout);
1691 *_dout << dendl;
1692 }
1693 return 0;
1694 }
1695
1696
1697 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
1698 {
1699 rgw_bucket bucket = bucket_info.bucket;
1700 bucket.update_bucket_id(new_bucket_id);
1701
1702 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
1703
1704 bucket_info.objv_tracker.clear();
1705 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield);
1706 if (ret < 0) {
1707 return ret;
1708 }
1709
1710 return 0;
1711 }
1712
1713
/**
 * Get ordered listing of the objects in a bucket.
 *
 * max_p: maximum number of results to return
 * bucket: bucket to list contents of
 * prefix: only return results that match this prefix
 * delim: do not include results that match this string.
 *     Any skipped results will have the matching portion of their name
 *     inserted in common_prefixes with a "true" mark.
 * marker: if filled in, begin the listing with this object.
 * end_marker: if filled in, end the listing with this object.
 * result: the objects are put in here.
 * common_prefixes: if delim is filled in, any matching prefixes are
 * placed here.
 * is_truncated: if number of objects in the bucket is bigger than
 * max, then truncated.
 */
int RGWRados::Bucket::List::list_objects_ordered(
  int64_t max_p,
  vector<rgw_bucket_dir_entry> *result,
  map<string, bool> *common_prefixes,
  bool *is_truncated,
  optional_yield y)
{
  RGWRados *store = target->get_store();
  CephContext *cct = store->ctx();
  int shard_id = target->get_shard_id();

  int count = 0;
  bool truncated = true;
  // whether the OSD-side CLS code already applied delimiter filtering
  // (newer OSDs); if not, this function must filter below
  bool cls_filtered = false;
  const int64_t max = // protect against memory issues and negative vals
    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
  int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);

  result->clear();

  // use a local marker; either the marker will have a previous entry
  // or it will be empty; either way it's OK to copy
  rgw_obj_key marker_obj(params.marker.name,
			 params.marker.instance,
			 params.marker.ns);
  rgw_obj_index_key cur_marker;
  marker_obj.get_index_key(&cur_marker);

  rgw_obj_key end_marker_obj(params.end_marker.name,
			     params.end_marker.instance,
			     params.end_marker.ns);
  rgw_obj_index_key cur_end_marker;
  end_marker_obj.get_index_key(&cur_end_marker);
  const bool cur_end_marker_valid = !params.end_marker.empty();

  rgw_obj_key prefix_obj(params.prefix);
  prefix_obj.set_ns(params.ns);
  string cur_prefix = prefix_obj.get_index_key_name();
  string after_delim_s; /* needed in !params.delim.empty() AND later */

  if (!params.delim.empty()) {
    after_delim_s = cls_rgw_after_delim(params.delim);
    /* if marker points at a common prefix, fast forward it into its
     * upper bound string */
    int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
    if (delim_pos >= 0) {
      string s = cur_marker.name.substr(0, delim_pos);
      s.append(after_delim_s);
      cur_marker = s;
    }
  }

  // prev_marker tracks the previous attempt's position so we can detect
  // a stalled listing (no forward progress) and bail out
  rgw_obj_index_key prev_marker;
  uint16_t attempt = 0;
  while (true) {
    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
      " beginning attempt=" << ++attempt << dendl;

    // this loop is generally expected only to have a single
    // iteration; the standard exit is at the bottom of the loop, but
    // there's an error condition emergency exit as well

    if (attempt > 1 && !(prev_marker < cur_marker)) {
      // we've failed to make forward progress
      ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ <<
	": ERROR marker failed to make forward progress; attempt=" << attempt <<
	", prev_marker=" << prev_marker <<
	", cur_marker=" << cur_marker << dendl;
      break;
    }
    prev_marker = cur_marker;

    ent_map_t ent_map;
    ent_map.reserve(read_ahead);
    int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
					   shard_id,
					   cur_marker,
					   cur_prefix,
					   params.delim,
					   read_ahead + 1 - count,
					   params.list_versions,
					   attempt,
					   ent_map,
					   &truncated,
					   &cls_filtered,
					   &cur_marker,
					   y);
    if (r < 0) {
      return r;
    }

    for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
      rgw_bucket_dir_entry& entry = eiter->second;
      rgw_obj_index_key index_key = entry.key;
      rgw_obj_key obj(index_key);

      ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
	" considering entry " << entry.key << dendl;

      /* note that parse_raw_oid() here will not set the correct
       * object's instance, as rgw_obj_index_key encodes that
       * separately. We don't need to set the instance because it's
       * not needed for the checks here and we end up using the raw
       * entry for the return vector
       */
      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
      if (!valid) {
	ldout(cct, 0) << "ERROR: could not parse object name: " <<
	  obj.name << dendl;
	continue;
      }

      bool matched_ns = (obj.ns == params.ns);
      // skip entries not visible to a plain (non-versioned) listing
      if (!params.list_versions && !entry.is_visible()) {
	continue;
      }

      if (params.enforce_ns && !matched_ns) {
	if (!params.ns.empty()) {
	  /* we've iterated past the namespace we're searching -- done now */
	  truncated = false;
	  goto done;
	}

	/* we're not looking at the namespace this object is in, next! */
	continue;
      }

      if (cur_end_marker_valid && cur_end_marker <= index_key) {
	truncated = false;
	goto done;
      }

      // advance the resume markers only while we still have quota
      if (count < max) {
	params.marker = index_key;
	next_marker = index_key;
      }

      if (params.filter &&
	  ! params.filter->filter(obj.name, index_key.name)) {
	continue;
      }

      if (params.prefix.size() &&
	  0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
	continue;
      }

      if (!params.delim.empty()) {
	const int delim_pos = obj.name.find(params.delim, params.prefix.size());
	if (delim_pos >= 0) {
	  // run either the code where delimiter filtering is done a)
	  // in the OSD/CLS or b) here.
	  if (cls_filtered) {
	    // NOTE: this condition is for the newer versions of the
	    // OSD that does filtering on the CLS side

	    // should only find one delimiter at the end if it finds any
	    // after the prefix
	    if (delim_pos !=
		int(obj.name.length() - params.delim.length())) {
	      ldout(cct, 0) <<
		"WARNING: found delimiter in place other than the end of "
		"the prefix; obj.name=" << obj.name <<
		", prefix=" << params.prefix << dendl;
	    }
	    if (common_prefixes) {
	      if (count >= max) {
		truncated = true;
		goto done;
	      }

	      (*common_prefixes)[obj.name] = true;
	      count++;
	    }

	    continue;
	  } else {
	    // NOTE: this condition is for older versions of the OSD
	    // that do not filter on the CLS side, so the following code
	    // must do the filtering; once we reach version 16 of ceph,
	    // this code can be removed along with the conditional that
	    // can lead this way

	    /* extract key -with trailing delimiter- for CommonPrefix */
	    string prefix_key =
	      obj.name.substr(0, delim_pos + params.delim.length());

	    if (common_prefixes &&
		common_prefixes->find(prefix_key) == common_prefixes->end()) {
	      if (count >= max) {
		truncated = true;
		goto done;
	      }
	      next_marker = prefix_key;
	      (*common_prefixes)[prefix_key] = true;

	      count++;
	    }

	    continue;
	  } // if we're running an older OSD version
	} // if a delimiter was found after prefix
      } // if a delimiter was passed in

      if (count >= max) {
	truncated = true;
	goto done;
      }

      ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
	" adding entry " << entry.key << " to result" << dendl;

      result->emplace_back(std::move(entry));
      count++;
    } // eiter for loop

    // NOTE: the following conditional is needed by older versions of
    // the OSD that don't do delimiter filtering on the CLS side; once
    // we reach version 16 of ceph, the following conditional and the
    // code within can be removed
    if (!cls_filtered && !params.delim.empty()) {
      int marker_delim_pos =
	cur_marker.name.find(params.delim, cur_prefix.size());
      if (marker_delim_pos >= 0) {
	std::string skip_after_delim =
	  cur_marker.name.substr(0, marker_delim_pos);
	skip_after_delim.append(after_delim_s);

	ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;

	if (skip_after_delim > cur_marker.name) {
	  cur_marker = skip_after_delim;
	  ldout(cct, 20) << "setting cur_marker="
			 << cur_marker.name
			 << "[" << cur_marker.instance << "]"
			 << dendl;
	}
      }
    } // if older osd didn't do delimiter filtering

    ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
      " INFO end of outer loop, truncated=" << truncated <<
      ", count=" << count << ", attempt=" << attempt << dendl;

    if (!truncated || count >= (max + 1) / 2) {
      // if we finished listing, or if we're returning at least half the
      // requested entries, that's enough; S3 and swift protocols allow
      // returning fewer than max entries
      break;
    } else if (attempt > 8 && count >= 1) {
      // if we've made at least 8 attempts and we have some, but very
      // few, results, return with what we have
      break;
    }

    ldout(cct, 1) << "RGWRados::Bucket::List::" << __func__ <<
      " INFO ordered bucket listing requires read #" << (1 + attempt) <<
      dendl;
  } // read attempt loop

done:

  if (is_truncated) {
    *is_truncated = truncated;
  }

  return 0;
} // list_objects_ordered
2000
2001
/**
 * Get listing of the objects in a bucket and allow the results to be out
 * of order.
 *
 * Even though there are key differences with the ordered counterpart,
 * the parameters are the same to maintain some compatability.
 *
 * max: maximum number of results to return
 * bucket: bucket to list contents of
 * prefix: only return results that match this prefix
 * delim: should not be set; if it is we should have indicated an error
 * marker: if filled in, begin the listing with this object.
 * end_marker: if filled in, end the listing with this object.
 * result: the objects are put in here.
 * common_prefixes: this is never filled with an unordered list; the param
 *                  is maintained for compatibility
 * is_truncated: if number of objects in the bucket is bigger than max, then
 *               truncated.
 */
int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p,
						   vector<rgw_bucket_dir_entry> *result,
						   map<string, bool> *common_prefixes,
						   bool *is_truncated,
						   optional_yield y)
{
  RGWRados *store = target->get_store();
  CephContext *cct = store->ctx();
  int shard_id = target->get_shard_id();

  int count = 0;
  bool truncated = true;

  const int64_t max = // protect against memory issues and negative vals
    std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));

  // read a few extra in each call to cls_bucket_list_unordered in
  // case some are filtered out due to namespace matching, versioning,
  // filtering, etc.
  const int64_t max_read_ahead = 100;
  const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));

  result->clear();

  // use a local marker; either the marker will have a previous entry
  // or it will be empty; either way it's OK to copy
  rgw_obj_key marker_obj(params.marker.name,
			 params.marker.instance,
			 params.marker.ns);
  rgw_obj_index_key cur_marker;
  marker_obj.get_index_key(&cur_marker);

  rgw_obj_key end_marker_obj(params.end_marker.name,
			     params.end_marker.instance,
			     params.end_marker.ns);
  rgw_obj_index_key cur_end_marker;
  end_marker_obj.get_index_key(&cur_end_marker);
  const bool cur_end_marker_valid = !params.end_marker.empty();

  rgw_obj_key prefix_obj(params.prefix);
  prefix_obj.set_ns(params.ns);
  string cur_prefix = prefix_obj.get_index_key_name();

  while (truncated && count <= max) {
    std::vector<rgw_bucket_dir_entry> ent_list;
    ent_list.reserve(read_ahead);

    int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
					     shard_id,
					     cur_marker,
					     cur_prefix,
					     read_ahead,
					     params.list_versions,
					     ent_list,
					     &truncated,
					     &cur_marker,
					     y);
    if (r < 0)
      return r;

    // NB: while regions of ent_list will be sorted, we have no
    // guarantee that all items will be sorted since they can cross
    // shard boundaries

    for (auto& entry : ent_list) {
      rgw_obj_index_key index_key = entry.key;
      rgw_obj_key obj(index_key);

      // advance the resume markers only while we still have quota
      if (count < max) {
	params.marker.set(index_key);
	next_marker.set(index_key);
      }

      /* note that parse_raw_oid() here will not set the correct
       * object's instance, as rgw_obj_index_key encodes that
       * separately. We don't need to set the instance because it's
       * not needed for the checks here and we end up using the raw
       * entry for the return vector
       */
      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
      if (!valid) {
	ldout(cct, 0) << "ERROR: could not parse object name: " <<
	  obj.name << dendl;
	continue;
      }

      // skip entries not visible to a plain (non-versioned) listing
      if (!params.list_versions && !entry.is_visible()) {
	continue;
      }

      if (params.enforce_ns && obj.ns != params.ns) {
	continue;
      }

      if (cur_end_marker_valid && cur_end_marker <= index_key) {
	// we're not guaranteed items will come in order, so we have
	// to loop through all
	continue;
      }

      if (params.filter && !params.filter->filter(obj.name, index_key.name))
	continue;

      if (params.prefix.size() &&
	  (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
	continue;

      if (count >= max) {
	truncated = true;
	goto done;
      }

      result->emplace_back(std::move(entry));
      count++;
    } // for (auto& entry : ent_list)
  } // while (truncated && count <= max)

done:
  if (is_truncated)
    *is_truncated = truncated;

  return 0;
} // list_objects_unordered
2144
2145
2146 /**
2147 * create a rados pool, associated meta info
2148 * returns 0 on success, -ERR# otherwise.
2149 */
2150 int RGWRados::create_pool(const rgw_pool& pool)
2151 {
2152 librados::IoCtx io_ctx;
2153 constexpr bool create = true;
2154 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
2155 }
2156
2157 void RGWRados::create_bucket_id(string *bucket_id)
2158 {
2159 uint64_t iid = instance_id();
2160 uint64_t bid = next_bucket_id();
2161 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2162 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2163 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2164 *bucket_id = buf;
2165 }
2166
/**
 * Create a bucket: select a placement, initialize the bucket index
 * objects, and store the linked bucket info.
 *
 * owner: user that will own the bucket (recorded in info.owner)
 * bucket: in/out; marker and bucket_id are filled in here, either
 *         generated locally or copied from pmaster_bucket
 * zonegroup_id: zonegroup the bucket belongs to
 * placement_rule: requested placement (effective rule is selected below)
 * swift_ver_location: Swift versioning container name, may be empty
 * pquota_info: optional bucket quota to record in the info
 * attrs: bucket attributes stored together with the bucket info
 * info: out; fully populated bucket info
 * pobjv: optional explicit write version for the bucket info object
 * pep_objv: optional entrypoint object version (passed through)
 * creation_time: explicit creation time; zero means "now"
 * pmaster_bucket / pmaster_num_shards: when set (multisite), reuse the
 *         master zone's bucket marker/id and shard count
 * exclusive: fail if the bucket info already exists
 *
 * Returns 0 on success, negative error otherwise. On -EEXIST the
 * pre-existing bucket's info is returned through @info.
 */
int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
                            const string& zonegroup_id,
                            const rgw_placement_rule& placement_rule,
                            const string& swift_ver_location,
                            const RGWQuotaInfo * pquota_info,
                            map<std::string, bufferlist>& attrs,
                            RGWBucketInfo& info,
                            obj_version *pobjv,
                            obj_version *pep_objv,
                            real_time creation_time,
                            rgw_bucket *pmaster_bucket,
                            uint32_t *pmaster_num_shards,
                            bool exclusive)
{
#define MAX_CREATE_RETRIES 20 /* need to bound retries */
  rgw_placement_rule selected_placement_rule;
  RGWZonePlacementInfo rule_info;

  /* we can race with concurrent create/remove of the same bucket name,
   * so the whole sequence is retried a bounded number of times */
  for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
    int ret = 0;
    ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
                                            &selected_placement_rule, &rule_info);
    if (ret < 0)
      return ret;

    if (!pmaster_bucket) {
      /* locally initiated create: generate a fresh id, use it as marker too */
      create_bucket_id(&bucket.marker);
      bucket.bucket_id = bucket.marker;
    } else {
      /* multisite: mirror the master zone's identifiers */
      bucket.marker = pmaster_bucket->marker;
      bucket.bucket_id = pmaster_bucket->bucket_id;
    }

    RGWObjVersionTracker& objv_tracker = info.objv_tracker;

    objv_tracker.read_version.clear();

    if (pobjv) {
      objv_tracker.write_version = *pobjv;
    } else {
      objv_tracker.generate_new_write_ver(cct);
    }

    /* populate the new bucket info */
    info.bucket = bucket;
    info.owner = owner.user_id;
    info.zonegroup = zonegroup_id;
    info.placement_rule = selected_placement_rule;
    info.index_type = rule_info.index_type;
    info.swift_ver_location = swift_ver_location;
    info.swift_versioning = (!swift_ver_location.empty());
    if (pmaster_num_shards) {
      info.num_shards = *pmaster_num_shards;
    } else {
      info.num_shards = bucket_index_max_shards;
    }
    info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
    info.requester_pays = false;
    if (real_clock::is_zero(creation_time)) {
      info.creation_time = ceph::real_clock::now();
    } else {
      info.creation_time = creation_time;
    }
    if (pquota_info) {
      info.quota = *pquota_info;
    }

    /* create the bucket index objects before linking the bucket info */
    int r = svc.bi->init_index(info);
    if (r < 0) {
      return r;
    }

    ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
    if (ret == -ECANCELED) {
      ret = -EEXIST;
    }
    if (ret == -EEXIST) {
      /* we need to reread the info and return it, caller will have a use for it */
      RGWBucketInfo orig_info;
      r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
      if (r < 0) {
        if (r == -ENOENT) {
          /* raced with a concurrent removal; retry the whole creation */
          continue;
        }
        ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
        return r;
      }

      /* only remove it if it's a different bucket instance */
      if (orig_info.bucket.bucket_id != bucket.bucket_id) {
        /* clean up the index objects created for the losing instance */
        int r = svc.bi->clean_index(info);
        if (r < 0) {
          ldout(cct, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
        }
        r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield);
        if (r < 0) {
          ldout(cct, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
          /* continue anyway */
        }
      }

      /* hand the already-existing bucket's info back to the caller */
      info = std::move(orig_info);
      /* ret == -EEXIST here */
    }
    return ret;
  }

  /* this is highly unlikely */
  ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
  return -ENOENT;
}
2277
2278 bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
2279 {
2280 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2281 }
2282
2283 bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2284 {
2285 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
2286
2287 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
2288 }
2289
2290 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2291 {
2292 string oid, key;
2293 get_obj_bucket_and_oid_loc(obj, oid, key);
2294
2295 rgw_pool pool;
2296 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2297 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2298 return -EIO;
2299 }
2300
2301 int r = open_pool_ctx(pool, *ioctx, false);
2302 if (r < 0) {
2303 return r;
2304 }
2305
2306 ioctx->locator_set_key(key);
2307
2308 return 0;
2309 }
2310
2311 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2312 {
2313 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
2314
2315 rgw_pool pool;
2316 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2317 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2318 return -EIO;
2319 }
2320
2321 ref->pool = svc.rados->pool(pool);
2322
2323 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2324 .set_mostly_omap(false));
2325 if (r < 0) {
2326 ldout(cct, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
2327 return r;
2328 }
2329
2330 ref->pool.ioctx().locator_set_key(ref->obj.loc);
2331
2332 return 0;
2333 }
2334
2335 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
2336 {
2337 ref->obj = obj;
2338
2339 if (ref->obj.oid.empty()) {
2340 ref->obj.oid = obj.pool.to_str();
2341 ref->obj.pool = svc.zone->get_zone_params().domain_root;
2342 }
2343 ref->pool = svc.rados->pool(obj.pool);
2344 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2345 .set_mostly_omap(false));
2346 if (r < 0) {
2347 ldout(cct, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
2348 return r;
2349 }
2350
2351 ref->pool.ioctx().locator_set_key(ref->obj.loc);
2352
2353 return 0;
2354 }
2355
// System objects use the same raw-object addressing; simply delegate.
int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
{
  return get_raw_obj_ref(obj, ref);
}
2360
2361 /*
2362 * fixes an issue where head objects were supposed to have a locator created, but ended
2363 * up without one
2364 */
2365 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2366 {
2367 const rgw_bucket& bucket = bucket_info.bucket;
2368 string oid;
2369 string locator;
2370
2371 rgw_obj obj(bucket, key);
2372
2373 get_obj_bucket_and_oid_loc(obj, oid, locator);
2374
2375 if (locator.empty()) {
2376 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
2377 return 0;
2378 }
2379
2380 librados::IoCtx ioctx;
2381
2382 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
2383 if (ret < 0) {
2384 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2385 return ret;
2386 }
2387 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2388
2389 uint64_t size;
2390 bufferlist data;
2391
2392 struct timespec mtime_ts;
2393 map<string, bufferlist> attrs;
2394 librados::ObjectReadOperation op;
2395 op.getxattrs(&attrs, NULL);
2396 op.stat2(&size, &mtime_ts, NULL);
2397 #define HEAD_SIZE 512 * 1024
2398 op.read(0, HEAD_SIZE, &data, NULL);
2399
2400 ret = rgw_rados_operate(ioctx, oid, &op, &data, null_yield);
2401 if (ret < 0) {
2402 lderr(cct) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
2403 return ret;
2404 }
2405
2406 if (size > HEAD_SIZE) {
2407 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
2408 return -EIO;
2409 }
2410
2411 if (size != data.length()) {
2412 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
2413 return -EIO;
2414 }
2415
2416 if (copy_obj) {
2417 librados::ObjectWriteOperation wop;
2418
2419 wop.mtime2(&mtime_ts);
2420
2421 map<string, bufferlist>::iterator iter;
2422 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2423 wop.setxattr(iter->first.c_str(), iter->second);
2424 }
2425
2426 wop.write(0, data);
2427
2428 ioctx.locator_set_key(locator);
2429 rgw_rados_operate(ioctx, oid, &wop, null_yield);
2430 }
2431
2432 if (remove_bad) {
2433 ioctx.locator_set_key(string());
2434
2435 ret = ioctx.remove(oid);
2436 if (ret < 0) {
2437 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
2438 return ret;
2439 }
2440 }
2441
2442 return 0;
2443 }
2444
/**
 * Move a rados object to a different oid/locator by chunked copy and
 * then removing the source. A no-op when source and destination fully
 * coincide. On copy failure the partially written destination is NOT
 * cleaned up (see TODO at done_err).
 *
 * Returns 0 on success, negative error otherwise (-EIO when the number
 * of bytes copied disagrees with the stat'ed source size).
 */
int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
                             const string& src_oid, const string& src_locator,
                             librados::IoCtx& dst_ioctx,
                             const string& dst_oid, const string& dst_locator)
{

#define COPY_BUF_SIZE (4 * 1024 * 1024)
  bool done = false;
  uint64_t chunk_size = COPY_BUF_SIZE;
  uint64_t ofs = 0;
  int ret = 0;
  real_time mtime;
  struct timespec mtime_ts;
  uint64_t size;

  if (src_oid == dst_oid && src_locator == dst_locator) {
    return 0;
  }

  src_ioctx.locator_set_key(src_locator);
  dst_ioctx.locator_set_key(dst_locator);

  do {
    bufferlist data;
    ObjectReadOperation rop;
    ObjectWriteOperation wop;

    /* on the first chunk also stat the source so we learn its full size
     * and mtime */
    if (ofs == 0) {
      rop.stat2(&size, &mtime_ts, NULL);
      mtime = real_clock::from_timespec(mtime_ts);
    }
    rop.read(ofs, chunk_size, &data, NULL);
    ret = rgw_rados_operate(src_ioctx, src_oid, &rop, &data, null_yield);
    if (ret < 0) {
      goto done_err;
    }

    if (data.length() == 0) {
      break;
    }

    if (ofs == 0) {
      wop.create(true); /* make it exclusive */
      wop.mtime2(&mtime_ts); /* preserve the source mtime on the destination */
      /* NOTE: mtime is recomputed here and not read afterwards */
      mtime = real_clock::from_timespec(mtime_ts);
    }
    wop.write(ofs, data);
    ret = rgw_rados_operate(dst_ioctx, dst_oid, &wop, null_yield);
    if (ret < 0) {
      goto done_err;
    }
    ofs += data.length();
    /* a short read means we've hit the end of the source object */
    done = data.length() != chunk_size;
  } while (!done);

  /* sanity check: total copied bytes must match the stat'ed size */
  if (ofs != size) {
    lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
               << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
    ret = -EIO;
    goto done_err;
  }

  src_ioctx.remove(src_oid);

  return 0;

done_err:
  // TODO: clean up dst_oid if we created it
  lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
  return ret;
}
2516
2517 /*
2518 * fixes an issue where head objects were supposed to have a locator created, but ended
2519 * up without one
2520 */
2521 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
2522 {
2523 const rgw_bucket& bucket = bucket_info.bucket;
2524 rgw_obj obj(bucket, key);
2525
2526 if (need_fix) {
2527 *need_fix = false;
2528 }
2529
2530 rgw_rados_ref ref;
2531 int r = get_obj_head_ref(bucket_info, obj, &ref);
2532 if (r < 0) {
2533 return r;
2534 }
2535
2536 RGWObjState *astate = NULL;
2537 RGWObjectCtx rctx(this->store);
2538 r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
2539 if (r < 0)
2540 return r;
2541
2542 if (astate->manifest) {
2543 RGWObjManifest::obj_iterator miter;
2544 RGWObjManifest& manifest = *astate->manifest;
2545 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
2546 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
2547 rgw_obj loc;
2548 string oid;
2549 string locator;
2550
2551 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
2552
2553 if (loc.key.ns.empty()) {
2554 /* continue, we're only interested in tail objects */
2555 continue;
2556 }
2557
2558 auto& ioctx = ref.pool.ioctx();
2559
2560 get_obj_bucket_and_oid_loc(loc, oid, locator);
2561 ref.pool.ioctx().locator_set_key(locator);
2562
2563 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
2564
2565 r = ioctx.stat(oid, NULL, NULL);
2566 if (r != -ENOENT) {
2567 continue;
2568 }
2569
2570 string bad_loc;
2571 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2572
2573 /* create a new ioctx with the bad locator */
2574 librados::IoCtx src_ioctx;
2575 src_ioctx.dup(ioctx);
2576 src_ioctx.locator_set_key(bad_loc);
2577
2578 r = src_ioctx.stat(oid, NULL, NULL);
2579 if (r != 0) {
2580 /* cannot find a broken part */
2581 continue;
2582 }
2583 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
2584 if (need_fix) {
2585 *need_fix = true;
2586 }
2587 if (fix) {
2588 r = move_rados_obj(src_ioctx, oid, bad_loc, ioctx, oid, locator);
2589 if (r < 0) {
2590 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
2591 }
2592 }
2593 }
2594 }
2595
2596 return 0;
2597 }
2598
2599 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2600 const rgw_obj& obj,
2601 RGWBucketInfo* bucket_info_out)
2602 {
2603 bucket = _bucket;
2604
2605 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2606
2607 RGWBucketInfo bucket_info;
2608 RGWBucketInfo* bucket_info_p =
2609 bucket_info_out ? bucket_info_out : &bucket_info;
2610
2611 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
2612 if (ret < 0) {
2613 return ret;
2614 }
2615
2616 string oid;
2617
2618 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
2619 if (ret < 0) {
2620 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2621 return ret;
2622 }
2623 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
2624
2625 return 0;
2626 }
2627
2628 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2629 int sid,
2630 RGWBucketInfo* bucket_info_out)
2631 {
2632 bucket = _bucket;
2633 shard_id = sid;
2634
2635 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2636
2637 RGWBucketInfo bucket_info;
2638 RGWBucketInfo* bucket_info_p =
2639 bucket_info_out ? bucket_info_out : &bucket_info;
2640 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
2641 if (ret < 0) {
2642 return ret;
2643 }
2644
2645 string oid;
2646
2647 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, shard_id, &bucket_obj);
2648 if (ret < 0) {
2649 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2650 return ret;
2651 }
2652 ldout(store->ctx(), 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
2653
2654 return 0;
2655 }
2656
2657 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
2658 const rgw_obj& obj)
2659 {
2660 bucket = bucket_info.bucket;
2661
2662 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info,
2663 obj.get_hash_object(),
2664 &bucket_obj,
2665 &shard_id);
2666 if (ret < 0) {
2667 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2668 return ret;
2669 }
2670 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2671
2672 return 0;
2673 }
2674
2675 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
2676 {
2677 bucket = bucket_info.bucket;
2678 shard_id = sid;
2679
2680 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info, shard_id, &bucket_obj);
2681 if (ret < 0) {
2682 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2683 return ret;
2684 }
2685 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2686
2687 return 0;
2688 }
2689
2690
2691 /* Execute @handler on last item in bucket listing for bucket specified
2692 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2693 * to objects matching these criterias. */
2694 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
2695 const std::string& obj_prefix,
2696 const std::string& obj_delim,
2697 std::function<int(const rgw_bucket_dir_entry&)> handler)
2698 {
2699 RGWRados::Bucket target(this, bucket_info);
2700 RGWRados::Bucket::List list_op(&target);
2701
2702 list_op.params.prefix = obj_prefix;
2703 list_op.params.delim = obj_delim;
2704
2705 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
2706 << ", obj_prefix=" << obj_prefix
2707 << ", obj_delim=" << obj_delim
2708 << dendl;
2709
2710 bool is_truncated = false;
2711
2712 boost::optional<rgw_bucket_dir_entry> last_entry;
2713 /* We need to rewind to the last object in a listing. */
2714 do {
2715 /* List bucket entries in chunks. */
2716 static constexpr int MAX_LIST_OBJS = 100;
2717 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2718
2719 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
2720 &is_truncated, null_yield);
2721 if (ret < 0) {
2722 return ret;
2723 } else if (!entries.empty()) {
2724 last_entry = entries.back();
2725 }
2726 } while (is_truncated);
2727
2728 if (last_entry) {
2729 return handler(*last_entry);
2730 }
2731
2732 /* Empty listing - no items we can run handler on. */
2733 return 0;
2734 }
2735
2736
/**
 * Swift object versioning: before @obj is overwritten, archive its
 * current version into the bucket named by swift_ver_location, under a
 * name of the form "<3-hex-digit name length><name>/<secs>.<usecs>".
 * No-op when Swift versioning is not enabled on the bucket.
 *
 * Returns 0 on success, when the source does not exist, or when another
 * rgw process raced us and archived it first; negative error otherwise
 * (-ERR_PRECONDITION_FAILED when the archive bucket is missing or has a
 * different owner).
 */
int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
                                    const rgw_user& user,
                                    RGWBucketInfo& bucket_info,
                                    rgw_obj& obj,
                                    const DoutPrefixProvider *dpp,
                                    optional_yield y)
{
  if (! swift_versioning_enabled(bucket_info)) {
    return 0;
  }

  obj_ctx.set_atomic(obj);

  RGWObjState * state = nullptr;
  int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false, y);
  if (r < 0) {
    return r;
  }

  if (!state->exists) {
    /* nothing to archive */
    return 0;
  }

  /* archive object name: "<%03x length><name>/<mtime secs>.<usecs>"
   * (runtime-sized char array — relies on a compiler extension) */
  const string& src_name = obj.get_oid();
  char buf[src_name.size() + 32];
  struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
  snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
           src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);

  RGWBucketInfo dest_bucket_info;

  r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
  if (r < 0) {
    ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
    if (r == -ENOENT) {
      return -ERR_PRECONDITION_FAILED;
    }
    return r;
  }

  /* the archive bucket must belong to the same owner */
  if (dest_bucket_info.owner != bucket_info.owner) {
    return -ERR_PRECONDITION_FAILED;
  }

  rgw_obj dest_obj(dest_bucket_info.bucket, buf);

  if (dest_bucket_info.versioning_enabled()){
    gen_rand_obj_instance_name(&dest_obj);
  }

  obj_ctx.set_atomic(dest_obj);

  rgw_zone_id no_zone;

  /* server-side copy of the current version into the archive bucket */
  r = copy_obj(obj_ctx,
               user,
               NULL, /* req_info *info */
               no_zone,
               dest_obj,
               obj,
               dest_bucket_info,
               bucket_info,
               bucket_info.placement_rule,
               NULL, /* time_t *src_mtime */
               NULL, /* time_t *mtime */
               NULL, /* const time_t *mod_ptr */
               NULL, /* const time_t *unmod_ptr */
               false, /* bool high_precision_time */
               NULL, /* const char *if_match */
               NULL, /* const char *if_nomatch */
               RGWRados::ATTRSMOD_NONE,
               true, /* bool copy_if_newer */
               state->attrset,
               RGWObjCategory::Main,
               0, /* uint64_t olh_epoch */
               real_time(), /* time_t delete_at */
               NULL, /* string *version_id */
               NULL, /* string *ptag */
               NULL, /* string *petag */
               NULL, /* void (*progress_cb)(off_t, void *) */
               NULL, /* void *progress_data */
               dpp,
               null_yield);
  if (r == -ECANCELED || r == -ENOENT) {
    /* Has already been overwritten, meaning another rgw process already
     * copied it out */
    return 0;
  }

  return r;
}
2828
/**
 * Swift object versioning: restore the most recent archived version of
 * @obj from the archive bucket (swift_ver_location) back into place,
 * then delete the archived copy. No-op when Swift versioning is not
 * enabled. @restored is set true only when a copy actually happened.
 *
 * Returns 0 on success or empty archive listing; -EPERM when the
 * archive bucket has a different owner; -ERR_PRECONDITION_FAILED when
 * the archive bucket is S3-versioned; other negative errors otherwise.
 */
int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
                                       const rgw_user& user,
                                       RGWBucketInfo& bucket_info,
                                       rgw_obj& obj,
                                       bool& restored, /* out */
                                       const DoutPrefixProvider *dpp)
{
  if (! swift_versioning_enabled(bucket_info)) {
    return 0;
  }

  /* Bucket info of the bucket that stores previous versions of our object. */
  RGWBucketInfo archive_binfo;

  int ret = get_bucket_info(&svc, bucket_info.bucket.tenant,
                            bucket_info.swift_ver_location, archive_binfo,
                            nullptr, null_yield, nullptr);
  if (ret < 0) {
    return ret;
  }

  /* Abort the operation if the bucket storing our archive belongs to someone
   * else. This is a limitation in comparison to Swift as we aren't taking ACLs
   * into consideration. For we can live with that.
   *
   * TODO: delegate this check to un upper layer and compare with ACLs. */
  if (bucket_info.owner != archive_binfo.owner) {
    return -EPERM;
  }

  /* This code will be executed on latest version of the object.
   * (runs via on_last_entry_in_listing below; captures locals by
   * reference, so it must not outlive this function) */
  const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
    rgw_zone_id no_zone;

    /* We don't support object versioning of Swift API on those buckets that
     * are already versioned using the S3 mechanism. This affects also bucket
     * storing archived objects. Otherwise the delete operation would create
     * a deletion marker. */
    if (archive_binfo.versioned()) {
      restored = false;
      return -ERR_PRECONDITION_FAILED;
    }

    /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
     * irrelevant and may be safely skipped. */
    std::map<std::string, ceph::bufferlist> no_attrs;

    rgw_obj archive_obj(archive_binfo.bucket, entry.key);

    if (bucket_info.versioning_enabled()){
      gen_rand_obj_instance_name(&obj);
    }

    obj_ctx.set_atomic(archive_obj);
    obj_ctx.set_atomic(obj);

    /* copy the archived version back over the live object */
    int ret = copy_obj(obj_ctx,
                       user,
                       nullptr, /* req_info *info */
                       no_zone,
                       obj, /* dest obj */
                       archive_obj, /* src obj */
                       bucket_info, /* dest bucket info */
                       archive_binfo, /* src bucket info */
                       bucket_info.placement_rule, /* placement_rule */
                       nullptr, /* time_t *src_mtime */
                       nullptr, /* time_t *mtime */
                       nullptr, /* const time_t *mod_ptr */
                       nullptr, /* const time_t *unmod_ptr */
                       false, /* bool high_precision_time */
                       nullptr, /* const char *if_match */
                       nullptr, /* const char *if_nomatch */
                       RGWRados::ATTRSMOD_NONE,
                       true, /* bool copy_if_newer */
                       no_attrs,
                       RGWObjCategory::Main,
                       0, /* uint64_t olh_epoch */
                       real_time(), /* time_t delete_at */
                       nullptr, /* string *version_id */
                       nullptr, /* string *ptag */
                       nullptr, /* string *petag */
                       nullptr, /* void (*progress_cb)(off_t, void *) */
                       nullptr, /* void *progress_data */
                       dpp,
                       null_yield);
    if (ret == -ECANCELED || ret == -ENOENT) {
      /* Has already been overwritten, meaning another rgw process already
       * copied it out */
      return 0;
    } else if (ret < 0) {
      return ret;
    } else {
      restored = true;
    }

    /* Need to remove the archived copy. */
    ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
                     archive_binfo.versioning_status());

    return ret;
  };

  /* archived versions of this object share the prefix
   * "<%03x name length><name>" (see swift_versioning_copy) */
  const std::string& obj_name = obj.get_oid();
  const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
                                                         % obj_name);

  return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
                                  handler);
}
2938
/**
 * Write an object's head metadata (and optionally inline data) and
 * update the bucket index in a prepare/complete/cancel sequence.
 *
 * size / accounted_size: stored vs. accounted byte counts for the index
 * attrs: xattrs to set on the head; etag/content-type/acl are also
 *        captured from here for the bucket index entry
 * assume_noent: issue the write assuming the object does not exist;
 *        on -EEXIST the caller retries with assume_noent=false
 * modify_tail: whether this write may modify tail objects
 * _index_op: actually a RGWRados::Bucket::UpdateIndex* (cast below)
 * y: optional yield context
 *
 * Returns 0 on success. On a lost race the error may be translated per
 * the if-match/if-nomatch conditions (see done_cancel).
 */
int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
                                            map<string, bufferlist>& attrs,
                                            bool assume_noent, bool modify_tail,
                                            void *_index_op, optional_yield y)
{
  RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
  RGWRados *store = target->get_store();

  ObjectWriteOperation op;
#ifdef WITH_LTTNG
  /* request id only used for tracepoints below */
  const struct req_state* s = get_req_state();
  string req_id;
  if (!s) {
    // fake req_id
    req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
  } else {
    req_id = s->req_id;
  }
#endif

  RGWObjState *state;
  int r = target->get_state(&state, false, y, assume_noent);
  if (r < 0)
    return r;

  rgw_obj& obj = target->get_obj();

  if (obj.get_oid().empty()) {
    ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
    return -EIO;
  }

  rgw_rados_ref ref;
  r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
  if (r < 0)
    return r;

  bool is_olh = state->is_olh;

  /* PUT_OBJ_CREATE means a full overwrite: reset the object as part of
   * the atomic modification */
  bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;

  const string *ptag = meta.ptag;
  if (!ptag && !index_op->get_optag()->empty()) {
    ptag = index_op->get_optag();
  }
  r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
  if (r < 0)
    return r;

  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  /* buckets with a default object-lock rule stamp a retention xattr on
   * newly created objects unless one was supplied explicitly */
  if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
    auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
    if (iter == attrs.end()) {
      real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
      string mode = target->bucket_info.obj_lock.get_mode();
      RGWObjectRetention obj_retention(mode, lock_until_date);
      bufferlist bl;
      obj_retention.encode(bl);
      op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
    }
  }

  if (state->is_olh) {
    op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
  }

  struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
  op.mtime2(&mtime_ts);

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    op.write_full(*meta.data);
  }

  string etag;
  string content_type;
  bufferlist acl_bl;
  string storage_class;

  map<string, bufferlist>::iterator iter;
  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  if (meta.manifest) {
    storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;

    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    encode(*meta.manifest, bl);
    op.setxattr(RGW_ATTR_MANIFEST, bl);
  }

  /* queue all remaining xattrs, capturing etag/content-type/acl on the
   * way for the bucket index entry */
  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = rgw_bl_str(bl);
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }
  if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
    cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
  }

  if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
    bufferlist bl;
    encode(store->svc.zone->get_zone_short_id(), bl);
    op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
  }

  if (!storage_class.empty()) {
    bufferlist bl;
    bl.append(storage_class);
    op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
  }

  /* nothing was queued on the op — no work to do */
  if (!op.size())
    return 0;

  uint64_t epoch;
  int64_t poolid;
  bool orig_exists;
  uint64_t orig_size;

  if (!reset_obj) { //Multipart upload, it has immutable head.
    orig_exists = false;
    orig_size = 0;
  } else {
    orig_exists = state->exists;
    orig_size = state->accounted_size;
  }

  bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
                          !obj.key.instance.empty();

  bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);

  if (versioned_op) {
    index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
  }

  /* phase 1: mark the pending index entry */
  if (!index_op->is_prepared()) {
    tracepoint(rgw_rados, prepare_enter, req_id.c_str());
    r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag, y);
    tracepoint(rgw_rados, prepare_exit, req_id.c_str());
    if (r < 0)
      return r;
  }

  auto& ioctx = ref.pool.ioctx();

  /* phase 2: write the head object itself */
  tracepoint(rgw_rados, operate_enter, req_id.c_str());
  r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  tracepoint(rgw_rados, operate_exit, req_id.c_str());
  if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                 or -ENOENT if was removed, or -EEXIST if it did not exist
                 before and now it does */
    if (r == -EEXIST && assume_noent) {
      target->invalidate_state();
      return r;
    }
    goto done_cancel;
  }

  epoch = ioctx.get_last_version();
  poolid = ioctx.get_id();

  r = target->complete_atomic_modification();
  if (r < 0) {
    ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
  }

  /* phase 3: complete the bucket index entry */
  tracepoint(rgw_rados, complete_enter, req_id.c_str());
  r = index_op->complete(poolid, epoch, size, accounted_size,
                         meta.set_mtime, etag, content_type,
                         storage_class, &acl_bl,
                         meta.category, meta.remove_objs, meta.user_data, meta.appendable);
  tracepoint(rgw_rados, complete_exit, req_id.c_str());
  if (r < 0)
    goto done_cancel;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  /* note that index_op was using state so we couldn't invalidate it earlier */
  target->invalidate_state();
  state = NULL;

  if (versioned_op && meta.olh_epoch) {
    r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
    if (r < 0) {
      return r;
    }
  }

  if (!real_clock::is_zero(meta.delete_at)) {
    rgw_obj_index_key obj_key;
    obj.key.get_index_key(&obj_key);

    /* register an expiration hint; failure is logged but non-fatal */
    r = store->obj_expirer->hint_add(meta.delete_at, obj.bucket.tenant, obj.bucket.name,
                                     obj.bucket.bucket_id, obj_key);
    if (r < 0) {
      ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
      /* ignoring error, nothing we can do at this point */
    }
  }
  meta.canceled = false;

  /* update quota cache */
  if (meta.completeMultipart){
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       0, orig_size);
  }
  else {
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       accounted_size, orig_size);
  }
  return 0;

done_cancel:
  /* roll back the pending bucket index entry */
  int ret = index_op->cancel();
  if (ret < 0) {
    ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
  }

  meta.canceled = true;

  /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
      r = 0;
    }
  } else {
    if (meta.if_match != NULL) {
      // only overwrite existing object
      if (strcmp(meta.if_match, "*") == 0) {
        if (r == -ENOENT) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ECANCELED) {
          r = 0;
        }
      }
    }

    if (meta.if_nomatch != NULL) {
      // only create a new object
      if (strcmp(meta.if_nomatch, "*") == 0) {
        if (r == -EEXIST) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ENOENT) {
          r = 0;
        }
      }
    }
  }

  return r;
}
3224
3225 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
3226 map<string, bufferlist>& attrs, optional_yield y)
3227 {
3228 RGWBucketInfo& bucket_info = target->get_bucket_info();
3229
3230 RGWRados::Bucket bop(target->get_store(), bucket_info);
3231 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
3232 index_op.set_zones_trace(meta.zones_trace);
3233
3234 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3235 int r;
3236 if (assume_noent) {
3237 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
3238 if (r == -EEXIST) {
3239 assume_noent = false;
3240 }
3241 }
3242 if (!assume_noent) {
3243 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
3244 }
3245 return r;
3246 }
3247
3248 class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
3249 {
3250 CephContext* cct;
3251 rgw_obj obj;
3252 rgw::putobj::DataProcessor *filter;
3253 boost::optional<RGWPutObj_Compress>& compressor;
3254 boost::optional<rgw::putobj::ChunkProcessor> buffering;
3255 CompressorRef& plugin;
3256 rgw::putobj::ObjectProcessor *processor;
3257 void (*progress_cb)(off_t, void *);
3258 void *progress_data;
3259 bufferlist extra_data_bl;
3260 uint64_t extra_data_left{0};
3261 bool need_to_process_attrs{true};
3262 uint64_t data_len{0};
3263 map<string, bufferlist> src_attrs;
3264 uint64_t ofs{0};
3265 uint64_t lofs{0}; /* logical ofs */
3266 std::function<int(map<string, bufferlist>&)> attrs_handler;
3267 public:
3268 RGWRadosPutObj(CephContext* cct,
3269 CompressorRef& plugin,
3270 boost::optional<RGWPutObj_Compress>& compressor,
3271 rgw::putobj::ObjectProcessor *p,
3272 void (*_progress_cb)(off_t, void *),
3273 void *_progress_data,
3274 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
3275 cct(cct),
3276 filter(p),
3277 compressor(compressor),
3278 plugin(plugin),
3279 processor(p),
3280 progress_cb(_progress_cb),
3281 progress_data(_progress_data),
3282 attrs_handler(_attrs_handler) {}
3283
3284 int process_attrs(void) {
3285 if (extra_data_bl.length()) {
3286 JSONParser jp;
3287 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3288 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3289 return -EIO;
3290 }
3291
3292 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3293
3294 src_attrs.erase(RGW_ATTR_COMPRESSION);
3295 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3296
3297 // filter out olh attributes
3298 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3299 while (iter != src_attrs.end()) {
3300 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3301 break;
3302 }
3303 iter = src_attrs.erase(iter);
3304 }
3305 }
3306
3307 int ret = attrs_handler(src_attrs);
3308 if (ret < 0) {
3309 return ret;
3310 }
3311
3312 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3313 //do not compress if object is encrypted
3314 compressor = boost::in_place(cct, plugin, filter);
3315 // add a filter that buffers data so we don't try to compress tiny blocks.
3316 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3317 // compression ratio
3318 constexpr unsigned buffer_size = 512 * 1024;
3319 buffering = boost::in_place(&*compressor, buffer_size);
3320 filter = &*buffering;
3321 }
3322
3323 need_to_process_attrs = false;
3324
3325 return 0;
3326 }
3327
3328 int handle_data(bufferlist& bl, bool *pause) override {
3329 if (progress_cb) {
3330 progress_cb(data_len, progress_data);
3331 }
3332 if (extra_data_left) {
3333 uint64_t extra_len = bl.length();
3334 if (extra_len > extra_data_left)
3335 extra_len = extra_data_left;
3336
3337 bufferlist extra;
3338 bl.splice(0, extra_len, &extra);
3339 extra_data_bl.append(extra);
3340
3341 extra_data_left -= extra_len;
3342 if (extra_data_left == 0) {
3343 int res = process_attrs();
3344 if (res < 0)
3345 return res;
3346 }
3347 ofs += extra_len;
3348 if (bl.length() == 0) {
3349 return 0;
3350 }
3351 }
3352 if (need_to_process_attrs) {
3353 /* need to call process_attrs() even if we don't get any attrs,
3354 * need it to call attrs_handler().
3355 */
3356 int res = process_attrs();
3357 if (res < 0) {
3358 return res;
3359 }
3360 }
3361
3362 ceph_assert(uint64_t(ofs) >= extra_data_len);
3363
3364 uint64_t size = bl.length();
3365 ofs += size;
3366
3367 const uint64_t lofs = data_len;
3368 data_len += size;
3369
3370 return filter->process(std::move(bl), lofs);
3371 }
3372
3373 int flush() {
3374 return filter->process({}, data_len);
3375 }
3376
3377 bufferlist& get_extra_data() { return extra_data_bl; }
3378
3379 map<string, bufferlist>& get_attrs() { return src_attrs; }
3380
3381 void set_extra_data_len(uint64_t len) override {
3382 extra_data_left = len;
3383 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
3384 }
3385
3386 uint64_t get_data_len() {
3387 return data_len;
3388 }
3389 };
3390
3391 /*
3392 * prepare attrset depending on attrs_mod.
3393 */
3394 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3395 map<string, bufferlist>& attrs,
3396 RGWRados::AttrsMod attrs_mod)
3397 {
3398 switch (attrs_mod) {
3399 case RGWRados::ATTRSMOD_NONE:
3400 attrs = src_attrs;
3401 break;
3402 case RGWRados::ATTRSMOD_REPLACE:
3403 if (!attrs[RGW_ATTR_ETAG].length()) {
3404 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3405 }
3406 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3407 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3408 if (ttiter != src_attrs.end()) {
3409 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3410 }
3411 }
3412 break;
3413 case RGWRados::ATTRSMOD_MERGE:
3414 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3415 if (attrs.find(it->first) == attrs.end()) {
3416 attrs[it->first] = it->second;
3417 }
3418 }
3419 break;
3420 }
3421 }
3422
3423 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y)
3424 {
3425 map<string, bufferlist> attrset;
3426
3427 real_time mtime;
3428 uint64_t obj_size;
3429 RGWObjectCtx rctx(this->store);
3430
3431 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
3432 RGWRados::Object::Read read_op(&op_target);
3433
3434 read_op.params.attrs = &attrset;
3435 read_op.params.lastmod = &mtime;
3436 read_op.params.obj_size = &obj_size;
3437
3438 int ret = read_op.prepare(y);
3439 if (ret < 0)
3440 return ret;
3441
3442 attrset.erase(RGW_ATTR_ID_TAG);
3443 attrset.erase(RGW_ATTR_TAIL_TAG);
3444
3445 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
3446 read_op, obj_size - 1, obj, NULL, mtime, attrset,
3447 0, real_time(), NULL, dpp, y);
3448 }
3449
3450 struct obj_time_weight {
3451 real_time mtime;
3452 uint32_t zone_short_id;
3453 uint64_t pg_ver;
3454 bool high_precision;
3455
3456 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3457
3458 bool compare_low_precision(const obj_time_weight& rhs) {
3459 struct timespec l = ceph::real_clock::to_timespec(mtime);
3460 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3461 l.tv_nsec = 0;
3462 r.tv_nsec = 0;
3463 if (l > r) {
3464 return false;
3465 }
3466 if (l < r) {
3467 return true;
3468 }
3469 if (!zone_short_id || !rhs.zone_short_id) {
3470 /* don't compare zone ids, if one wasn't provided */
3471 return false;
3472 }
3473 if (zone_short_id != rhs.zone_short_id) {
3474 return (zone_short_id < rhs.zone_short_id);
3475 }
3476 return (pg_ver < rhs.pg_ver);
3477
3478 }
3479
3480 bool operator<(const obj_time_weight& rhs) {
3481 if (!high_precision || !rhs.high_precision) {
3482 return compare_low_precision(rhs);
3483 }
3484 if (mtime > rhs.mtime) {
3485 return false;
3486 }
3487 if (mtime < rhs.mtime) {
3488 return true;
3489 }
3490 if (!zone_short_id || !rhs.zone_short_id) {
3491 /* don't compare zone ids, if one wasn't provided */
3492 return false;
3493 }
3494 if (zone_short_id != rhs.zone_short_id) {
3495 return (zone_short_id < rhs.zone_short_id);
3496 }
3497 return (pg_ver < rhs.pg_ver);
3498 }
3499
3500 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3501 mtime = _mtime;
3502 zone_short_id = _short_id;
3503 pg_ver = _pg_ver;
3504 }
3505
3506 void init(RGWObjState *state) {
3507 mtime = state->mtime;
3508 zone_short_id = state->zone_short_id;
3509 pg_ver = state->pg_ver;
3510 }
3511 };
3512
3513 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3514 out << o.mtime;
3515
3516 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3517 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3518 }
3519
3520 return out;
3521 }
3522
3523 class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
3524 bufferlist extra_data;
3525 public:
3526 RGWGetExtraDataCB() {}
3527 int handle_data(bufferlist& bl, bool *pause) override {
3528 int bl_len = (int)bl.length();
3529 if (extra_data.length() < extra_data_len) {
3530 off_t max = extra_data_len - extra_data.length();
3531 if (max > bl_len) {
3532 max = bl_len;
3533 }
3534 bl.splice(0, max, &extra_data);
3535 }
3536 return bl_len;
3537 }
3538
3539 bufferlist& get_extra_data() {
3540 return extra_data;
3541 }
3542 };
3543
3544 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
3545 const rgw_user& user_id,
3546 req_info *info,
3547 const rgw_zone_id& source_zone,
3548 rgw_obj& src_obj,
3549 const RGWBucketInfo *src_bucket_info,
3550 real_time *src_mtime,
3551 uint64_t *psize,
3552 const real_time *mod_ptr,
3553 const real_time *unmod_ptr,
3554 bool high_precision_time,
3555 const char *if_match,
3556 const char *if_nomatch,
3557 map<string, bufferlist> *pattrs,
3558 map<string, string> *pheaders,
3559 string *version_id,
3560 string *ptag,
3561 string *petag)
3562 {
3563 /* source is in a different zonegroup, copy from there */
3564
3565 RGWRESTStreamRWRequest *in_stream_req;
3566 string tag;
3567 map<string, bufferlist> src_attrs;
3568 append_rand_alpha(cct, tag, tag, 32);
3569 obj_time_weight set_mtime_weight;
3570 set_mtime_weight.high_precision = high_precision_time;
3571
3572 RGWRESTConn *conn;
3573 if (source_zone.empty()) {
3574 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
3575 /* source is in the master zonegroup */
3576 conn = svc.zone->get_master_conn();
3577 } else {
3578 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
3579 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
3580 if (iter == zonegroup_conn_map.end()) {
3581 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3582 return -ENOENT;
3583 }
3584 conn = iter->second;
3585 }
3586 } else {
3587 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3588 auto iter = zone_conn_map.find(source_zone);
3589 if (iter == zone_conn_map.end()) {
3590 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3591 return -ENOENT;
3592 }
3593 conn = iter->second;
3594 }
3595
3596 RGWGetExtraDataCB cb;
3597 map<string, string> req_headers;
3598 real_time set_mtime;
3599
3600 const real_time *pmod = mod_ptr;
3601
3602 obj_time_weight dest_mtime_weight;
3603
3604 constexpr bool prepend_meta = true;
3605 constexpr bool get_op = true;
3606 constexpr bool rgwx_stat = true;
3607 constexpr bool sync_manifest = true;
3608 constexpr bool skip_decrypt = true;
3609 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
3610 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
3611 prepend_meta, get_op, rgwx_stat,
3612 sync_manifest, skip_decrypt,
3613 true, &cb, &in_stream_req);
3614 if (ret < 0) {
3615 return ret;
3616 }
3617
3618 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
3619 if (ret < 0) {
3620 return ret;
3621 }
3622
3623 bufferlist& extra_data_bl = cb.get_extra_data();
3624 if (extra_data_bl.length()) {
3625 JSONParser jp;
3626 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3627 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3628 return -EIO;
3629 }
3630
3631 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3632
3633 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3634 }
3635
3636 if (src_mtime) {
3637 *src_mtime = set_mtime;
3638 }
3639
3640 if (petag) {
3641 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3642 if (iter != src_attrs.end()) {
3643 bufferlist& etagbl = iter->second;
3644 *petag = etagbl.to_str();
3645 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3646 *petag = petag->substr(0, petag->size() - 1);
3647 }
3648 }
3649 }
3650
3651 if (pattrs) {
3652 *pattrs = std::move(src_attrs);
3653 }
3654
3655 return 0;
3656 }
3657
3658 int RGWFetchObjFilter_Default::filter(CephContext *cct,
3659 const rgw_obj_key& source_key,
3660 const RGWBucketInfo& dest_bucket_info,
3661 std::optional<rgw_placement_rule> dest_placement_rule,
3662 const map<string, bufferlist>& obj_attrs,
3663 std::optional<rgw_user> *poverride_owner,
3664 const rgw_placement_rule **prule)
3665 {
3666 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3667 if (!ptail_rule) {
3668 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3669 if (iter != obj_attrs.end()) {
3670 dest_rule.storage_class = iter->second.to_str();
3671 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3672 ptail_rule = &dest_rule;
3673 } else {
3674 ptail_rule = &dest_bucket_info.placement_rule;
3675 }
3676 }
3677 *prule = ptail_rule;
3678 return 0;
3679 }
3680
/*
 * Fetch an object from a remote zone (or the master zonegroup) over REST
 * and write it locally through an AtomicObjectProcessor.
 *
 * Highlights of the flow:
 *  - selects the REST connection from the zone / zonegroup conn maps;
 *  - streams the response through RGWRadosPutObj, which decodes the
 *    prepended source attrs and may insert a compression filter;
 *  - when copy_if_newer is set, sends the destination mtime as a
 *    conditional header and retries completion (up to MAX_COMPLETE_RETRY)
 *    if it races with a newer local write;
 *  - on -ERR_NOT_MODIFIED with copy_if_newer, may still link the OLH.
 *
 * Returns 0 on success (setting *bytes_transferred when requested),
 * negative error code otherwise.
 */
int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               req_info *info,
               const rgw_zone_id& source_zone,
               const rgw_obj& dest_obj,
               const rgw_obj& src_obj,
               const RGWBucketInfo& dest_bucket_info,
               const RGWBucketInfo *src_bucket_info,
               std::optional<rgw_placement_rule> dest_placement_rule,
               real_time *src_mtime,
               real_time *mtime,
               const real_time *mod_ptr,
               const real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               AttrsMod attrs_mod,
               bool copy_if_newer,
               map<string, bufferlist>& attrs,
               RGWObjCategory category,
               std::optional<uint64_t> olh_epoch,
               real_time delete_at,
               string *ptag,
               string *petag,
               void (*progress_cb)(off_t, void *),
               void *progress_data,
               const DoutPrefixProvider *dpp,
               RGWFetchObjFilter *filter,
               rgw_zone_set *zones_trace,
               std::optional<uint64_t>* bytes_transferred)
{
  /* source is in a different zonegroup, copy from there */

  RGWRESTStreamRWRequest *in_stream_req;
  string tag;
  int i;
  append_rand_alpha(cct, tag, tag, 32);
  obj_time_weight set_mtime_weight;
  set_mtime_weight.high_precision = high_precision_time;
  int ret;

  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
  using namespace rgw::putobj;
  AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, nullptr, user_id,
                                  obj_ctx, dest_obj, olh_epoch, tag, dpp, null_yield);
  /* pick the REST connection that can reach the source */
  RGWRESTConn *conn;
  auto& zone_conn_map = svc.zone->get_zone_conn_map();
  auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
  if (source_zone.empty()) {
    if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
      /* source is in the master zonegroup */
      conn = svc.zone->get_master_conn();
    } else {
      map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
      if (iter == zonegroup_conn_map.end()) {
        ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
        return -ENOENT;
      }
      conn = iter->second;
    }
  } else {
    auto iter = zone_conn_map.find(source_zone);
    if (iter == zone_conn_map.end()) {
      ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
      return -ENOENT;
    }
    conn = iter->second;
  }

  boost::optional<RGWPutObj_Compress> compressor;
  CompressorRef plugin;

  RGWFetchObjFilter_Default source_filter;
  if (!filter) {
    filter = &source_filter;
  }

  std::optional<rgw_user> override_owner;

  /* the attrs handler runs once the source attrs have been received,
   * before any payload bytes; it resolves placement, compression and
   * prepares the processor */
  RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
                    [&](map<string, bufferlist>& obj_attrs) {
                      const rgw_placement_rule *ptail_rule;

                      int ret = filter->filter(cct,
                                               src_obj.key,
                                               dest_bucket_info,
                                               dest_placement_rule,
                                               obj_attrs,
                                               &override_owner,
                                               &ptail_rule);
                      if (ret < 0) {
                        ldout(cct, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
                        return ret;
                      }

                      /* NOTE(review): ptail_rule is dereferenced after
                       * filter() returns; verify the filter never points
                       * it at storage that dies when filter() exits (e.g.
                       * the by-value dest_placement_rule parameter) */
                      processor.set_tail_placement(*ptail_rule);

                      const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
                      if (compression_type != "none") {
                        plugin = Compressor::create(cct, compression_type);
                        if (!plugin) {
                          ldout(cct, 1) << "Cannot load plugin for compression type "
                                        << compression_type << dendl;
                        }
                      }

                      ret = processor.prepare(null_yield);
                      if (ret < 0) {
                        return ret;
                      }
                      return 0;
                    });

  string etag;
  real_time set_mtime;
  uint64_t expected_size = 0;

  RGWObjState *dest_state = NULL;

  const real_time *pmod = mod_ptr;

  obj_time_weight dest_mtime_weight;

  if (copy_if_newer) {
    /* need to get mtime for destination */
    ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
    if (ret < 0)
      goto set_err_state;

    if (!real_clock::is_zero(dest_state->mtime)) {
      dest_mtime_weight.init(dest_state);
      pmod = &dest_mtime_weight.mtime;
    }
  }

  static constexpr bool prepend_meta = true;
  static constexpr bool get_op = true;
  static constexpr bool rgwx_stat = false;
  static constexpr bool sync_manifest = true;
  static constexpr bool skip_decrypt = true;
  ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
                      dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
                      prepend_meta, get_op, rgwx_stat,
                      sync_manifest, skip_decrypt,
                      true,
                      &cb, &in_stream_req);
  if (ret < 0) {
    goto set_err_state;
  }

  ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
                               &expected_size, nullptr, nullptr);
  if (ret < 0) {
    goto set_err_state;
  }
  ret = cb.flush();
  if (ret < 0) {
    goto set_err_state;
  }
  /* guard against a truncated transfer */
  if (cb.get_data_len() != expected_size) {
    ret = -EIO;
    ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
        << expected_size << " bytes but received " << cb.get_data_len() << dendl;
    goto set_err_state;
  }
  /* record compression metadata if a compressor was engaged */
  if (compressor && compressor->is_compressed()) {
    bufferlist tmp;
    RGWCompressionInfo cs_info;
    cs_info.compression_type = plugin->get_type_name();
    cs_info.orig_size = cb.get_data_len();
    cs_info.blocks = move(compressor->get_compression_blocks());
    encode(cs_info, tmp);
    cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
  }

  /* rewrite ownership and the stored ACL when the filter overrode the owner */
  if (override_owner) {
    processor.set_owner(*override_owner);

    auto& obj_attrs = cb.get_attrs();

    RGWUserInfo owner_info;
    if (ctl.user->get_info_by_uid(*override_owner, &owner_info, null_yield) < 0) {
      ldout(cct, 10) << "owner info does not exist" << dendl;
      return -EINVAL;
    }

    RGWAccessControlPolicy acl;

    auto aiter = obj_attrs.find(RGW_ATTR_ACL);
    if (aiter == obj_attrs.end()) {
      ldout(cct, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
      acl.create_default(owner_info.user_id, owner_info.display_name);
    } else {
      auto iter = aiter->second.cbegin();
      try {
        acl.decode(iter);
      } catch (buffer::error& err) {
        ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
        return -EIO;
      }
    }

    ACLOwner new_owner;
    new_owner.set_id(*override_owner);
    new_owner.set_name(owner_info.display_name);

    acl.set_owner(new_owner);

    bufferlist bl;
    acl.encode(bl);
    obj_attrs[RGW_ATTR_ACL] = std::move(bl);
  }

  if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
    cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
  } else {
    map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
    if (iter != cb.get_attrs().end()) {
      try {
        decode(delete_at, iter->second);
      } catch (buffer::error& err) {
        ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
      }
    }
  }

  if (src_mtime) {
    *src_mtime = set_mtime;
  }

  if (petag) {
    const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
    if (iter != cb.get_attrs().end()) {
      *petag = iter->second.to_str();
    }
  }

  //erase the append attr
  cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);

  if (source_zone.empty()) {
    set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
  } else {
    /* cross-zone sync keeps the source attrs verbatim */
    attrs = cb.get_attrs();
  }

  if (copy_if_newer) {
    uint64_t pg_ver = 0;
    auto i = attrs.find(RGW_ATTR_PG_VER);
    if (i != attrs.end() && i->second.length() > 0) {
      auto iter = i->second.cbegin();
      try {
        decode(pg_ver, iter);
      } catch (buffer::error& err) {
        ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
        /* non critical error */
      }
    }
    set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
  }

/* retry completion while we keep losing races against newer local writes */
#define MAX_COMPLETE_RETRY 100
  for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
    bool canceled = false;
    ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
                             attrs, delete_at, nullptr, nullptr, nullptr,
                             zones_trace, &canceled, null_yield);
    if (ret < 0) {
      goto set_err_state;
    }
    if (copy_if_newer && canceled) {
      ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
      obj_ctx.invalidate(dest_obj); /* object was overwritten */
      ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
      if (ret < 0) {
        ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
        goto set_err_state;
      }
      dest_mtime_weight.init(dest_state);
      dest_mtime_weight.high_precision = high_precision_time;
      /* only rewrite if what we fetched is still newer than what landed */
      if (!dest_state->exists ||
        dest_mtime_weight < set_mtime_weight) {
        ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
        continue;
      } else {
        ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
      }
    }
    break;
  }

  if (i == MAX_COMPLETE_RETRY) {
    ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
    ret = -EIO;
    goto set_err_state;
  }

  if (bytes_transferred) {
    *bytes_transferred = cb.get_data_len();
  }
  return 0;
set_err_state:
  if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
    // we may have already fetched during sync of OP_ADD, but were waiting
    // for OP_LINK_OLH to call set_olh() with a real olh_epoch
    if (olh_epoch && *olh_epoch > 0) {
      constexpr bool log_data_change = true;
      ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
                    *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
    } else {
      // we already have the latest copy
      ret = 0;
    }
  }
  return ret;
}
3997
3998
3999 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
4000 map<string, bufferlist>& src_attrs,
4001 RGWRados::Object::Read& read_op,
4002 const rgw_user& user_id,
4003 rgw_obj& dest_obj,
4004 real_time *mtime)
4005 {
4006 string etag;
4007
4008 RGWRESTStreamS3PutObj *out_stream_req;
4009
4010 auto rest_master_conn = svc.zone->get_master_conn();
4011
4012 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
4013 if (ret < 0) {
4014 return ret;
4015 }
4016
4017 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
4018 if (ret < 0) {
4019 delete out_stream_req;
4020 return ret;
4021 }
4022
4023 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4024 if (ret < 0)
4025 return ret;
4026
4027 return 0;
4028 }
4029
4030 /**
4031 * Copy an object.
4032 * dest_obj: the object to copy into
4033 * src_obj: the object to copy from
4034 * attrs: usage depends on attrs_mod parameter
4035 * attrs_mod: the modification mode of the attrs, may have the following values:
4036 * ATTRSMOD_NONE - the attributes of the source object will be
4037 * copied without modifications, attrs parameter is ignored;
4038 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4039 * parameter, source object attributes are not copied;
4040 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4041 * are overwritten by values contained in attrs parameter.
 * Errors from reading the source object are returned directly as the
 * negative return value; there is no separate error output parameter.
4043 * Returns: 0 on success, -ERR# otherwise.
4044 */
4045 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4046 const rgw_user& user_id,
4047 req_info *info,
4048 const rgw_zone_id& source_zone,
4049 rgw_obj& dest_obj,
4050 rgw_obj& src_obj,
4051 RGWBucketInfo& dest_bucket_info,
4052 RGWBucketInfo& src_bucket_info,
4053 const rgw_placement_rule& dest_placement,
4054 real_time *src_mtime,
4055 real_time *mtime,
4056 const real_time *mod_ptr,
4057 const real_time *unmod_ptr,
4058 bool high_precision_time,
4059 const char *if_match,
4060 const char *if_nomatch,
4061 AttrsMod attrs_mod,
4062 bool copy_if_newer,
4063 map<string, bufferlist>& attrs,
4064 RGWObjCategory category,
4065 uint64_t olh_epoch,
4066 real_time delete_at,
4067 string *version_id,
4068 string *ptag,
4069 string *petag,
4070 void (*progress_cb)(off_t, void *),
4071 void *progress_data,
4072 const DoutPrefixProvider *dpp,
4073 optional_yield y)
4074 {
4075 int ret;
4076 uint64_t obj_size;
4077 rgw_obj shadow_obj = dest_obj;
4078 string shadow_oid;
4079
4080 bool remote_src;
4081 bool remote_dest;
4082
4083 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4084 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4085
4086 auto& zonegroup = svc.zone->get_zonegroup();
4087
4088 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4089 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
4090
4091 if (remote_src && remote_dest) {
4092 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
4093 return -EINVAL;
4094 }
4095
4096 ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
4097
4098 if (remote_src || !source_zone.empty()) {
4099 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
4100 dest_obj, src_obj, dest_bucket_info, &src_bucket_info,
4101 dest_placement, src_mtime, mtime, mod_ptr,
4102 unmod_ptr, high_precision_time,
4103 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
4104 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4105 nullptr /* filter */);
4106 }
4107
4108 map<string, bufferlist> src_attrs;
4109 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4110 RGWRados::Object::Read read_op(&src_op_target);
4111
4112 read_op.conds.mod_ptr = mod_ptr;
4113 read_op.conds.unmod_ptr = unmod_ptr;
4114 read_op.conds.high_precision_time = high_precision_time;
4115 read_op.conds.if_match = if_match;
4116 read_op.conds.if_nomatch = if_nomatch;
4117 read_op.params.attrs = &src_attrs;
4118 read_op.params.lastmod = src_mtime;
4119 read_op.params.obj_size = &obj_size;
4120
4121 ret = read_op.prepare(y);
4122 if (ret < 0) {
4123 return ret;
4124 }
4125 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
    // The current implementation does not follow the S3 spec and may even
    // result in silent data corruption when copying multipart objects
    // across pools, so reject COPY operations on encrypted objects until
    // this is fully functional.
4130 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
4131 << " has not been implemented." << dendl;
4132 return -ERR_NOT_IMPLEMENTED;
4133 }
4134
4135 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4136 src_attrs.erase(RGW_ATTR_DELETE_AT);
4137
4138 set_copy_attrs(src_attrs, attrs, attrs_mod);
4139 attrs.erase(RGW_ATTR_ID_TAG);
4140 attrs.erase(RGW_ATTR_PG_VER);
4141 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4142 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4143 if (cmp != src_attrs.end())
4144 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4145
4146 RGWObjManifest manifest;
4147 RGWObjState *astate = NULL;
4148
4149 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate, y);
4150 if (ret < 0) {
4151 return ret;
4152 }
4153
4154 vector<rgw_raw_obj> ref_objs;
4155
4156 if (remote_dest) {
4157 /* dest is in a different zonegroup, copy it there */
4158 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4159 }
4160 uint64_t max_chunk_size;
4161
4162 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4163 if (ret < 0) {
4164 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
4165 return ret;
4166 }
4167
4168 rgw_pool src_pool;
4169 rgw_pool dest_pool;
4170
4171 const rgw_placement_rule *src_rule{nullptr};
4172
4173 if (astate->manifest) {
4174 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4175 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
4176 }
4177
4178 if (!src_rule || src_rule->empty()) {
4179 src_rule = &src_bucket_info.placement_rule;
4180 }
4181
4182 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
4183 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
4184 return -EIO;
4185 }
4186
4187 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
4188 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
4189 return -EIO;
4190 }
4191
4192 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
4193 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4194
4195 bool copy_data = (!astate->manifest) ||
4196 (*src_rule != dest_placement) ||
4197 (src_pool != dest_pool);
4198
4199 bool copy_first = false;
4200 if (astate->manifest) {
4201 if (!astate->manifest->has_tail()) {
4202 copy_data = true;
4203 } else {
4204 uint64_t head_size = astate->manifest->get_head_size();
4205
4206 if (head_size > 0) {
4207 if (head_size > max_chunk_size) {
4208 copy_data = true;
4209 } else {
4210 copy_first = true;
4211 }
4212 }
4213 }
4214 }
4215
4216 if (petag) {
4217 const auto iter = attrs.find(RGW_ATTR_ETAG);
4218 if (iter != attrs.end()) {
4219 *petag = iter->second.to_str();
4220 }
4221 }
4222
4223 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
4224 attrs.erase(RGW_ATTR_TAIL_TAG);
4225 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
4226 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
4227 }
4228
4229 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin();
4230
4231 if (copy_first) { // we need to copy first chunk, not increase refcount
4232 ++miter;
4233 }
4234
4235 rgw_rados_ref ref;
4236 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4237 if (ret < 0) {
4238 return ret;
4239 }
4240
4241 bufferlist first_chunk;
4242
4243 bool copy_itself = (dest_obj == src_obj);
4244 RGWObjManifest *pmanifest;
4245 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
4246
4247 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4248 RGWRados::Object::Write write_op(&dest_op_target);
4249
4250 string tag;
4251
4252 if (ptag) {
4253 tag = *ptag;
4254 }
4255
4256 if (tag.empty()) {
4257 append_rand_alpha(cct, tag, tag, 32);
4258 }
4259
4260 if (!copy_itself) {
4261 attrs.erase(RGW_ATTR_TAIL_TAG);
4262 manifest = *astate->manifest;
4263 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4264 if (tail_placement.bucket.name.empty()) {
4265 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4266 }
4267 string ref_tag;
4268 for (; miter != astate->manifest->obj_end(); ++miter) {
4269 ObjectWriteOperation op;
4270 ref_tag = tag + '\0';
4271 cls_refcount_get(op, ref_tag, true);
4272 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
4273
4274 auto& ioctx = ref.pool.ioctx();
4275 ioctx.locator_set_key(loc.loc);
4276
4277 ret = rgw_rados_operate(ioctx, loc.oid, &op, null_yield);
4278 if (ret < 0) {
4279 goto done_ret;
4280 }
4281
4282 ref_objs.push_back(loc);
4283 }
4284
4285 pmanifest = &manifest;
4286 } else {
4287 pmanifest = &(*astate->manifest);
4288 /* don't send the object's tail for garbage collection */
4289 astate->keep_tail = true;
4290 }
4291
4292 if (copy_first) {
4293 ret = read_op.read(0, max_chunk_size, first_chunk, y);
4294 if (ret < 0) {
4295 goto done_ret;
4296 }
4297
4298 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4299 } else {
4300 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4301 }
4302
4303 write_op.meta.data = &first_chunk;
4304 write_op.meta.manifest = pmanifest;
4305 write_op.meta.ptag = &tag;
4306 write_op.meta.owner = dest_bucket_info.owner;
4307 write_op.meta.mtime = mtime;
4308 write_op.meta.flags = PUT_OBJ_CREATE;
4309 write_op.meta.category = category;
4310 write_op.meta.olh_epoch = olh_epoch;
4311 write_op.meta.delete_at = delete_at;
4312 write_op.meta.modify_tail = !copy_itself;
4313
4314 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs, y);
4315 if (ret < 0) {
4316 goto done_ret;
4317 }
4318
4319 return 0;
4320
4321 done_ret:
4322 if (!copy_itself) {
4323 vector<rgw_raw_obj>::iterator riter;
4324
4325 /* rollback reference */
4326 string ref_tag = tag + '\0';
4327 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4328 ObjectWriteOperation op;
4329 cls_refcount_put(op, ref_tag, true);
4330
4331 ref.pool.ioctx().locator_set_key(riter->loc);
4332
4333 int r = rgw_rados_operate(ref.pool.ioctx(), riter->oid, &op, null_yield);
4334 if (r < 0) {
4335 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
4336 }
4337 }
4338 }
4339 return ret;
4340 }
4341
4342
/**
 * Copy an object's data by streaming it from read_op into an
 * AtomicObjectProcessor that writes dest_obj under dest_placement.
 *
 * end is the inclusive offset of the last byte to copy (callers pass
 * obj_size - 1). On success the destination write is completed with the
 * accounted size (the original size when the source is compressed) and,
 * if present, the source's etag is propagated to *petag.
 * Returns 0 on success, negative error code otherwise.
 */
int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
               RGWBucketInfo& dest_bucket_info,
               const rgw_placement_rule& dest_placement,
               RGWRados::Object::Read& read_op, off_t end,
               const rgw_obj& dest_obj,
               real_time *mtime,
               real_time set_mtime,
               map<string, bufferlist>& attrs,
               uint64_t olh_epoch,
               real_time delete_at,
               string *petag,
               const DoutPrefixProvider *dpp,
               optional_yield y)
{
  // random write tag identifies this upload for atomic-put bookkeeping
  string tag;
  append_rand_alpha(cct, tag, tag, 32);

  rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
  using namespace rgw::putobj;
  // do not change the null_yield in the initialization of this AtomicObjectProcessor
  // it causes crashes in the ragweed tests
  AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, &dest_placement,
                                  dest_bucket_info.owner, obj_ctx,
                                  dest_obj, olh_epoch, tag, dpp, null_yield);
  int ret = processor.prepare(y);
  if (ret < 0)
    return ret;

  off_t ofs = 0;

  do {
    bufferlist bl;
    // read() returns the number of bytes read (>= 0) or a negative error
    ret = read_op.read(ofs, end, bl, y);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
      return ret;
    }

    uint64_t read_len = ret;
    ret = processor.process(std::move(bl), ofs);
    if (ret < 0) {
      return ret;
    }

    ofs += read_len;
  } while (ofs <= end);

  // flush: an empty buffer signals end-of-data to the processor
  ret = processor.process({}, ofs);
  if (ret < 0) {
    return ret;
  }

  string etag;
  auto iter = attrs.find(RGW_ATTR_ETAG);
  if (iter != attrs.end()) {
    bufferlist& bl = iter->second;
    etag = bl.to_str();
    if (petag) {
      *petag = etag;
    }
  }

  uint64_t accounted_size;
  {
    bool compressed{false};
    RGWCompressionInfo cs_info;
    ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
    if (ret < 0) {
      ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
      return ret;
    }
    // pass original size if compressed
    accounted_size = compressed ? cs_info.orig_size : ofs;
  }

  return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
                            nullptr, nullptr, nullptr, nullptr, nullptr, y);
}
4422
4423 int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4424 RGWBucketInfo& bucket_info,
4425 rgw_obj& obj,
4426 const rgw_placement_rule& placement_rule,
4427 const real_time& mtime,
4428 uint64_t olh_epoch,
4429 const DoutPrefixProvider *dpp,
4430 optional_yield y)
4431 {
4432 map<string, bufferlist> attrs;
4433 real_time read_mtime;
4434 uint64_t obj_size;
4435
4436 obj_ctx.set_atomic(obj);
4437
4438 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4439 RGWRados::Object::Read read_op(&op_target);
4440
4441 read_op.params.attrs = &attrs;
4442 read_op.params.lastmod = &read_mtime;
4443 read_op.params.obj_size = &obj_size;
4444
4445 int ret = read_op.prepare(y);
4446 if (ret < 0) {
4447 return ret;
4448 }
4449
4450 if (read_mtime != mtime) {
4451 /* raced */
4452 return -ECANCELED;
4453 }
4454
4455 attrs.erase(RGW_ATTR_ID_TAG);
4456 attrs.erase(RGW_ATTR_TAIL_TAG);
4457
4458 ret = copy_obj_data(obj_ctx,
4459 bucket_info,
4460 placement_rule,
4461 read_op,
4462 obj_size - 1,
4463 obj,
4464 nullptr /* pmtime */,
4465 mtime,
4466 attrs,
4467 olh_epoch,
4468 real_time(),
4469 nullptr /* petag */,
4470 dpp,
4471 y);
4472 if (ret < 0) {
4473 return ret;
4474 }
4475
4476 return 0;
4477 }
4478
4479 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info, optional_yield y)
4480 {
4481 constexpr uint NUM_ENTRIES = 1000u;
4482
4483 rgw_obj_index_key marker;
4484 string prefix;
4485 bool is_truncated;
4486
4487 do {
4488 std::vector<rgw_bucket_dir_entry> ent_list;
4489 ent_list.reserve(NUM_ENTRIES);
4490
4491 int r = cls_bucket_list_unordered(bucket_info,
4492 RGW_NO_SHARD,
4493 marker,
4494 prefix,
4495 NUM_ENTRIES,
4496 true,
4497 ent_list,
4498 &is_truncated,
4499 &marker,
4500 y);
4501 if (r < 0) {
4502 return r;
4503 }
4504
4505 string ns;
4506 for (auto const& dirent : ent_list) {
4507 rgw_obj_key obj;
4508
4509 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
4510 return -ENOTEMPTY;
4511 }
4512 }
4513 } while (is_truncated);
4514
4515 return 0;
4516 }
4517
/**
 * Delete a bucket.
 * bucket_info: metadata of the bucket to delete
 * objv_tracker: entrypoint version tracker, used to detect racing updates
 * check_empty: when true, fail with -ENOTEMPTY if the bucket still has objects
 * Returns 0 on success, -ERR# otherwise.
 */
4523 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, bool check_empty)
4524 {
4525 const rgw_bucket& bucket = bucket_info.bucket;
4526 RGWSI_RADOS::Pool index_pool;
4527 map<int, string> bucket_objs;
4528 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4529 if (r < 0)
4530 return r;
4531
4532 if (check_empty) {
4533 r = check_bucket_empty(bucket_info, y);
4534 if (r < 0) {
4535 return r;
4536 }
4537 }
4538
4539 bool remove_ep = true;
4540
4541 if (objv_tracker.read_version.empty()) {
4542 RGWBucketEntryPoint ep;
4543 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4544 &ep,
4545 null_yield,
4546 RGWBucketCtl::Bucket::GetParams()
4547 .set_objv_tracker(&objv_tracker));
4548 if (r < 0 ||
4549 (!bucket_info.bucket.bucket_id.empty() &&
4550 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4551 if (r != -ENOENT) {
4552 ldout(cct, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
4553 /* we have no idea what caused the error, will not try to remove it */
4554 }
4555 /*
4556 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4557 * requested
4558 */
4559 remove_ep = false;
4560 }
4561 }
4562
4563 if (remove_ep) {
4564 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield,
4565 RGWBucketCtl::Bucket::RemoveParams()
4566 .set_objv_tracker(&objv_tracker));
4567 if (r < 0)
4568 return r;
4569 }
4570
4571 /* if the bucket is not synced we can remove the meta file */
4572 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
4573 RGWObjVersionTracker objv_tracker;
4574 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield);
4575 if (r < 0) {
4576 return r;
4577 }
4578
4579 /* remove bucket index objects asynchronously by best effort */
4580 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
4581 bucket_objs,
4582 cct->_conf->rgw_bucket_index_max_aio)();
4583 }
4584
4585 return 0;
4586 }
4587
4588 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
4589 {
4590 RGWBucketInfo info;
4591 map<string, bufferlist> attrs;
4592 int r;
4593 auto obj_ctx = svc.sysobj->init_obj_ctx();
4594
4595 if (bucket.bucket_id.empty()) {
4596 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
4597 } else {
4598 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield);
4599 }
4600 if (r < 0) {
4601 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4602 return r;
4603 }
4604
4605 info.owner = owner.get_id();
4606
4607 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4608 if (r < 0) {
4609 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4610 return r;
4611 }
4612
4613 return 0;
4614 }
4615
4616
4617 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
4618 {
4619 int ret = 0;
4620
4621 vector<rgw_bucket>::iterator iter;
4622
4623 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4624 rgw_bucket& bucket = *iter;
4625 if (enabled)
4626 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
4627 else
4628 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
4629
4630 RGWBucketInfo info;
4631 map<string, bufferlist> attrs;
4632 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
4633 if (r < 0) {
4634 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4635 ret = r;
4636 continue;
4637 }
4638 if (enabled) {
4639 info.flags &= ~BUCKET_SUSPENDED;
4640 } else {
4641 info.flags |= BUCKET_SUSPENDED;
4642 }
4643
4644 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4645 if (r < 0) {
4646 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4647 ret = r;
4648 continue;
4649 }
4650 }
4651 return ret;
4652 }
4653
4654 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
4655 {
4656 RGWBucketInfo bucket_info;
4657 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield);
4658 if (ret < 0) {
4659 return ret;
4660 }
4661
4662 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4663 return 0;
4664 }
4665
4666 int RGWRados::Object::complete_atomic_modification()
4667 {
4668 if ((!state->manifest)|| state->keep_tail)
4669 return 0;
4670
4671 cls_rgw_obj_chain chain;
4672 store->update_gc_chain(obj, *state->manifest, &chain);
4673
4674 if (chain.empty()) {
4675 return 0;
4676 }
4677
4678 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
4679 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4680 if (ret < 0) {
4681 //Delete objects inline if send chain to gc fails
4682 store->delete_objs_inline(chain, tag);
4683 }
4684 return 0;
4685 }
4686
4687 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
4688 {
4689 RGWObjManifest::obj_iterator iter;
4690 rgw_raw_obj raw_head;
4691 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
4692 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
4693 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
4694 if (mobj == raw_head)
4695 continue;
4696 cls_rgw_obj_key key(mobj.oid);
4697 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4698 }
4699 }
4700
// Hand a chain of tail objects to the garbage collector under `tag`.
// Thin wrapper over RGWGC::send_chain; returns its result.
int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
{
  return gc->send_chain(chain, tag);
}
4705
4706 void RGWRados::delete_objs_inline(cls_rgw_obj_chain& chain, const string& tag)
4707 {
4708 string last_pool;
4709 std::unique_ptr<IoCtx> ctx(new IoCtx);
4710 int ret = 0;
4711 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4712 cls_rgw_obj& obj = *liter;
4713 if (obj.pool != last_pool) {
4714 ctx.reset(new IoCtx);
4715 ret = rgw_init_ioctx(get_rados_handle(), obj.pool, *ctx);
4716 if (ret < 0) {
4717 last_pool = "";
4718 ldout(cct, 0) << "ERROR: failed to create ioctx pool=" <<
4719 obj.pool << dendl;
4720 continue;
4721 }
4722 last_pool = obj.pool;
4723 }
4724 ctx->locator_set_key(obj.loc);
4725 const string& oid = obj.key.name; /* just stored raw oid there */
4726 ldout(cct, 5) << "delete_objs_inline: removing " << obj.pool <<
4727 ":" << obj.key.name << dendl;
4728 ObjectWriteOperation op;
4729 cls_refcount_put(op, tag, true);
4730 ret = ctx->operate(oid, &op);
4731 if (ret < 0) {
4732 ldout(cct, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
4733 }
4734 }
4735 }
4736
4737 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
4738 map<RGWObjCategory, RGWStorageStats>& stats)
4739 {
4740 for (const auto& pair : header.stats) {
4741 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
4742 const rgw_bucket_category_stats& header_stats = pair.second;
4743
4744 RGWStorageStats& s = stats[category];
4745
4746 s.category = category;
4747 s.size += header_stats.total_size;
4748 s.size_rounded += header_stats.total_size_rounded;
4749 s.size_utilized += header_stats.actual_size;
4750 s.num_objects += header_stats.num_entries;
4751 }
4752 }
4753
4754 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
4755 map<RGWObjCategory, RGWStorageStats> *existing_stats,
4756 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
4757 {
4758 RGWSI_RADOS::Pool index_pool;
4759 // key - bucket index object id
4760 // value - bucket index check OP returned result with the given bucket index object (shard)
4761 map<int, string> oids;
4762 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
4763
4764 int ret = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &oids, nullptr);
4765 if (ret < 0) {
4766 return ret;
4767 }
4768
4769 for (auto& iter : oids) {
4770 bucket_objs_ret[iter.first] = rgw_cls_check_index_ret();
4771 }
4772
4773 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
4774 if (ret < 0) {
4775 return ret;
4776 }
4777
4778 // Aggregate results (from different shards if there is any)
4779 map<int, struct rgw_cls_check_index_ret>::iterator iter;
4780 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
4781 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
4782 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
4783 }
4784
4785 return 0;
4786 }
4787
4788 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
4789 {
4790 RGWSI_RADOS::Pool index_pool;
4791 map<int, string> bucket_objs;
4792
4793 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4794 if (r < 0) {
4795 return r;
4796 }
4797
4798 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
4799 }
4800
4801 int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
4802 {
4803 RGWSI_RADOS::Pool index_pool;
4804 map<int, string> bucket_objs;
4805
4806 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4807 if (r < 0) {
4808 return r;
4809 }
4810
4811 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
4812 }
4813
4814 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
4815 {
4816 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
4817 std::string oid, key;
4818 get_obj_bucket_and_oid_loc(obj, oid, key);
4819 if (!rctx)
4820 return 0;
4821
4822 RGWObjState *state = NULL;
4823
4824 int r = get_obj_state(rctx, bucket_info, obj, &state, false, y);
4825 if (r < 0)
4826 return r;
4827
4828 if (!state->is_atomic) {
4829 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
4830 return -EINVAL;
4831 }
4832
4833 string tag;
4834
4835 if (state->tail_tag.length() > 0) {
4836 tag = state->tail_tag.c_str();
4837 } else if (state->obj_tag.length() > 0) {
4838 tag = state->obj_tag.c_str();
4839 } else {
4840 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
4841 return -EINVAL;
4842 }
4843
4844 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
4845
4846 cls_rgw_obj_chain chain;
4847 update_gc_chain(state->obj, *state->manifest, &chain);
4848 return gc->async_defer_chain(tag, chain);
4849 }
4850
4851 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
4852 {
4853 list<string> prefixes;
4854 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
4855 cls_rgw_remove_obj(op, prefixes);
4856 }
4857
// Append an attr-prefix existence check to `op`; delegates to
// cls_rgw_obj_check_attrs_prefix (fail_if_exist inverts the condition).
void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
{
  cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
}
4862
// Append an mtime comparison check (of the given type/precision) to `op`;
// delegates to cls_rgw_obj_check_mtime.
void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
{
  cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
}
4867
// Metadata cached for a recently deleted object. Entries are added by
// Object::Delete::delete_obj and consulted by get_obj_state_impl when a
// stat returns -ENOENT, so deletion mtime / zone id / pg version can be
// reported without the object still existing.
struct tombstone_entry {
  ceph::real_time mtime;     // object mtime at deletion time
  uint32_t zone_short_id;
  uint64_t pg_ver;

  tombstone_entry() = default;
  explicit tombstone_entry(const RGWObjState& state)
    : mtime(state.mtime), zone_short_id(state.zone_short_id),
      pg_ver(state.pg_ver) {}
};
4878
/**
 * Delete an object, or create/advance a delete marker on a versioned bucket.
 * The target object and all options come from the enclosing
 * RGWRados::Object::Delete instance (target / params / result).
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::Object::Delete::delete_obj(optional_yield y)
{
  RGWRados *store = target->get_store();
  rgw_obj& src_obj = target->get_obj();
  const string& instance = src_obj.key.instance;
  rgw_obj obj = src_obj;

  // "null" is the pseudo version id of the plain (unversioned) object
  if (instance == "null") {
    obj.key.instance.clear();
  }

  bool explicit_marker_version = (!params.marker_version_id.empty());

  if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
    if (instance.empty() || explicit_marker_version) {
      // no specific version named: create a delete marker via the OLH
      rgw_obj marker = obj;

      if (!params.marker_version_id.empty()) {
        if (params.marker_version_id != "null") {
          marker.key.set_instance(params.marker_version_id);
        }
      } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
        // versioning fully enabled: give the marker a fresh instance id
        store->gen_rand_obj_instance_name(&marker);
      }

      result.version_id = marker.key.instance;
      if (result.version_id.empty())
        result.version_id = "null";
      result.delete_marker = true;

      struct rgw_bucket_dir_entry_meta meta;

      meta.owner = params.obj_owner.get_id().to_str();
      meta.owner_display_name = params.obj_owner.get_display_name();

      if (real_clock::is_zero(params.mtime)) {
        meta.mtime = real_clock::now();
      } else {
        meta.mtime = params.mtime;
      }

      // link the delete marker as the current OLH target
      int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
      if (r < 0) {
        return r;
      }
    } else {
      // a specific version was named: unlink just that instance
      rgw_bucket_dir_entry dirent;

      int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
      if (r < 0) {
        return r;
      }
      result.delete_marker = dirent.is_delete_marker();
      r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
      if (r < 0) {
        return r;
      }
      result.version_id = instance;
    }

    BucketShard *bs;
    int r = target->get_bucket_shard(&bs);
    if (r < 0) {
      ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
      return r;
    }

    // record the change in the data log so other zones can sync it
    r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
    if (r < 0) {
      lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
      return r;
    }

    return 0;
  }

  // non-versioned path: remove the head object itself
  rgw_rados_ref ref;
  int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
  if (r < 0) {
    return r;
  }

  RGWObjState *state;
  r = target->get_state(&state, false, y);
  if (r < 0)
    return r;

  ObjectWriteOperation op;

  // If-UnModified-Since precondition: refuse when the object changed after
  // params.unmod_since (checked locally here, and again atomically in cls)
  if (!real_clock::is_zero(params.unmod_since)) {
    struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
    struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
    if (!params.high_precision_time) {
      ctime.tv_nsec = 0;
      unmod.tv_nsec = 0;
    }

    ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
    if (ctime > unmod) {
      return -ERR_PRECONDITION_FAILED;
    }

    /* only delete object if mtime is less than or equal to params.unmod_since */
    store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
  }
  uint64_t obj_accounted_size = state->accounted_size;

  // multipart abort accounts the aggregate size of all uploaded parts
  if(params.abortmp) {
    obj_accounted_size = params.parts_accounted_size;
  }

  // object-expirer precondition: only delete when the stored delete_at
  // matches the expiration time this caller is acting on
  if (!real_clock::is_zero(params.expiration_time)) {
    bufferlist bl;
    real_time delete_at;

    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
      try {
        auto iter = bl.cbegin();
        decode(delete_at, iter);
      } catch (buffer::error& err) {
        ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
        return -EIO;
      }

      if (params.expiration_time != delete_at) {
        return -ERR_PRECONDITION_FAILED;
      }
    } else {
      return -ERR_PRECONDITION_FAILED;
    }
  }

  if (!state->exists) {
    target->invalidate_state();
    return -ENOENT;
  }

  r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false, y);
  if (r < 0)
    return r;

  RGWBucketInfo& bucket_info = target->get_bucket_info();

  RGWRados::Bucket bop(store, bucket_info);
  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

  index_op.set_zones_trace(params.zones_trace);
  index_op.set_bilog_flags(params.bilog_flags);

  // two-phase index update: prepare before the head op, complete/cancel after
  r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag, y);
  if (r < 0)
    return r;

  store->remove_rgw_head_obj(op);

  auto& ioctx = ref.pool.ioctx();
  r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);

  /* raced with another operation, object state is indeterminate */
  const bool need_invalidate = (r == -ECANCELED);

  int64_t poolid = ioctx.get_id();
  if (r >= 0) {
    // remember the deletion so later stats can be served from the cache
    tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
    if (obj_tombstone_cache) {
      tombstone_entry entry{*state};
      obj_tombstone_cache->add(obj, entry);
    }
    r = index_op.complete_del(poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);

    int ret = target->complete_atomic_modification();
    if (ret < 0) {
      ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
    }
    /* other than that, no need to propagate error */
  } else {
    int ret = index_op.cancel();
    if (ret < 0) {
      ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
    }
  }

  if (need_invalidate) {
    target->invalidate_state();
  }

  if (r < 0)
    return r;

  /* update quota cache */
  store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);

  return 0;
}
5079
5080 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5081 const RGWBucketInfo& bucket_info,
5082 const rgw_obj& obj,
5083 int versioning_status,
5084 uint16_t bilog_flags,
5085 const real_time& expiration_time,
5086 rgw_zone_set *zones_trace)
5087 {
5088 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5089 RGWRados::Object::Delete del_op(&del_target);
5090
5091 del_op.params.bucket_owner = bucket_info.owner;
5092 del_op.params.versioning_status = versioning_status;
5093 del_op.params.bilog_flags = bilog_flags;
5094 del_op.params.expiration_time = expiration_time;
5095 del_op.params.zones_trace = zones_trace;
5096
5097 return del_op.delete_obj(null_yield);
5098 }
5099
5100 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5101 {
5102 rgw_rados_ref ref;
5103 int r = get_raw_obj_ref(obj, &ref);
5104 if (r < 0) {
5105 return r;
5106 }
5107
5108 ObjectWriteOperation op;
5109
5110 op.remove();
5111 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
5112 if (r < 0)
5113 return r;
5114
5115 return 0;
5116 }
5117
5118 int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime)
5119 {
5120 std::string oid, key;
5121 get_obj_bucket_and_oid_loc(obj, oid, key);
5122
5123 auto obj_ctx = svc.sysobj->init_obj_ctx();
5124
5125 RGWBucketInfo bucket_info;
5126 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield);
5127 if (ret < 0) {
5128 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5129 return ret;
5130 }
5131
5132 RGWRados::Bucket bop(this, bucket_info);
5133 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5134
5135 return index_op.complete_del(-1 /* pool */, 0, mtime, NULL);
5136 }
5137
/*
 * Synthesize a deterministic tag for an object that lacks a stored one
 * (presumably written by an older version — TODO confirm against callers):
 * combine a tail oid from the manifest with an md5 over the manifest blob
 * and the etag, so the same object always regenerates the same tag.
 */
static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
{
  string tag;

  RGWObjManifest::obj_iterator mi = manifest.obj_begin();
  if (mi != manifest.obj_end()) {
    if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
      ++mi;
    tag = mi.get_location().get_raw_obj(store).oid;
    tag.append("_");
  }

  unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
  char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
  MD5 hash;
  hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());

  // fold the etag (when present) into the digest as well
  map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
  if (iter != attrset.end()) {
    bufferlist& bl = iter->second;
    hash.Update((const unsigned char *)bl.c_str(), bl.length());
  }

  hash.Final(md5);
  buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
  tag.append(md5_str);

  ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;

  // the trailing NUL is intentionally included in the stored tag
  tag_bl.append(tag.c_str(), tag.size() + 1);
}
5169
5170 static bool is_olh(map<string, bufferlist>& attrs)
5171 {
5172 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5173 return (iter != attrs.end());
5174 }
5175
5176 static bool has_olh_tag(map<string, bufferlist>& attrs)
5177 {
5178 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5179 return (iter != attrs.end());
5180 }
5181
5182 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5183 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
5184 {
5185 ceph_assert(olh_state->is_olh);
5186
5187 rgw_obj target;
5188 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5189 if (r < 0) {
5190 return r;
5191 }
5192 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false, y);
5193 if (r < 0) {
5194 return r;
5195 }
5196
5197 return 0;
5198 }
5199
/*
 * Resolve (and cache) the full RGWObjState for 'obj': stat the head object,
 * decode the bookkeeping xattrs (etag, compression, manifest, tags, pg ver,
 * source zone, OLH), and optionally follow an OLH to its current target.
 *
 * rctx         - per-request object state cache; the returned *state points
 *                into it and is owned by rctx, not the caller
 * bucket_info  - bucket the object lives in (placement rule is used to map
 *                obj to its raw head object)
 * state        - out: the (possibly cached) object state
 * follow_olh   - if the object is an OLH and no instance was requested,
 *                chase the link to the current version's state
 * assume_noent - skip the rados stat and treat the object as absent
 *
 * Returns 0 on success (including "object doesn't exist", which is encoded
 * in state->exists), negative error otherwise. May return -EAGAIN from the
 * OLH-following path; the get_obj_state() wrapper retries on that.
 */
int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
                                 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
{
  if (obj.empty()) {
    return -EINVAL;
  }

  /* only follow the OLH when the caller didn't pin a specific version */
  bool need_follow_olh = follow_olh && obj.key.instance.empty();

  RGWObjState *s = rctx->get_state(obj);
  ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
  *state = s;
  if (s->has_attrs) {
    /* cache hit: state was fully populated by an earlier call */
    if (s->is_olh && need_follow_olh) {
      return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
    }
    return 0;
  }

  s->obj = obj;

  rgw_raw_obj raw_obj;
  obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);

  int r = -ENOENT;

  if (!assume_noent) {
    /* stat the head object; also prefetch the first chunk of data if asked */
    r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
  }

  if (r == -ENOENT) {
    s->exists = false;
    s->has_attrs = true;
    /* a recently deleted object may still be in the tombstone cache; reuse
     * its mtime/zone/pg_ver so conditional copies can compare correctly */
    tombstone_entry entry;
    if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
      s->mtime = entry.mtime;
      s->zone_short_id = entry.zone_short_id;
      s->pg_ver = entry.pg_ver;
      ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
          << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
    } else {
      s->mtime = real_time();
    }
    return 0;
  }
  if (r < 0)
    return r;

  s->exists = true;
  s->has_attrs = true;
  s->accounted_size = s->size;

  auto iter = s->attrset.find(RGW_ATTR_ETAG);
  if (iter != s->attrset.end()) {
    /* get rid of extra null character at the end of the etag, as we used to store it like that */
    bufferlist& bletag = iter->second;
    if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
      bufferlist newbl;
      bletag.splice(0, bletag.length() - 1, &newbl);
      bletag.claim(newbl);
    }
  }

  iter = s->attrset.find(RGW_ATTR_COMPRESSION);
  const bool compressed = (iter != s->attrset.end());
  if (compressed) {
    // use uncompressed size for accounted_size
    try {
      RGWCompressionInfo info;
      auto p = iter->second.cbegin();
      decode(info, p);
      s->accounted_size = info.orig_size;
    } catch (buffer::error&) {
      dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
      return -EIO;
    }
  }

  iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
  if (iter != s->attrset.end()) {
    /* copy the (legacy) shadow object name into the fixed buffer */
    bufferlist bl = iter->second;
    bufferlist::iterator it = bl.begin();
    it.copy(bl.length(), s->shadow_obj);
    s->shadow_obj[bl.length()] = '\0';
  }
  s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
  auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
  if (ttiter != s->attrset.end()) {
    s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
  }

  bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
  if (manifest_bl.length()) {
    auto miter = manifest_bl.cbegin();
    try {
      s->manifest.emplace();
      decode(*s->manifest, miter);
      s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
                                                                         broken due to old bugs */
      /* the manifest, not the head stat, is authoritative for logical size */
      s->size = s->manifest->get_obj_size();
      if (!compressed)
        s->accounted_size = s->size;
    } catch (buffer::error& err) {
      ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
      return -EIO;
    }
    ldout(cct, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
        s->manifest->has_explicit_objs()) {
      RGWObjManifest::obj_iterator mi;
      for (mi = s->manifest->obj_begin(); mi != s->manifest->obj_end(); ++mi) {
        ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
      }
    }

    if (!s->obj_tag.length()) {
      /*
       * Uh oh, something's wrong, object with manifest should have tag. Let's
       * create one out of the manifest, would be unique
       */
      generate_fake_tag(this, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
      s->fake_tag = true;
    }
  }
  map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
  if (aiter != s->attrset.end()) {
    bufferlist& pg_ver_bl = aiter->second;
    if (pg_ver_bl.length()) {
      auto pgbl = pg_ver_bl.cbegin();
      try {
        decode(s->pg_ver, pgbl);
      } catch (buffer::error& err) {
        ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
      }
    }
  }
  aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
  if (aiter != s->attrset.end()) {
    bufferlist& zone_short_id_bl = aiter->second;
    if (zone_short_id_bl.length()) {
      auto zbl = zone_short_id_bl.cbegin();
      try {
        decode(s->zone_short_id, zbl);
      } catch (buffer::error& err) {
        ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
      }
    }
  }
  if (s->obj_tag.length())
    ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
  else
    ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;

  /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
   * it exist, and not only if is_olh() returns true
   */
  iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
  if (iter != s->attrset.end()) {
    s->olh_tag = iter->second;
  }

  if (is_olh(s->attrset)) {
    s->is_olh = true;

    ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;

    if (need_follow_olh) {
      return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
    } else if (obj.key.have_null_instance() && !s->manifest) {
      // read null version, and the head object only have olh info
      s->exists = false;
      return -ENOENT;
    }
  }

  return 0;
}
5377
5378 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
5379 bool follow_olh, optional_yield y, bool assume_noent)
5380 {
5381 int ret;
5382
5383 do {
5384 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
5385 } while (ret == -EAGAIN);
5386
5387 return ret;
5388 }
5389
5390 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest, optional_yield y)
5391 {
5392 RGWObjState *astate;
5393 int r = get_state(&astate, true, y);
5394 if (r < 0) {
5395 return r;
5396 }
5397
5398 *pmanifest = &(*astate->manifest);
5399
5400 return 0;
5401 }
5402
5403 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest, optional_yield y)
5404 {
5405 RGWObjState *state;
5406 int r = source->get_state(&state, true, y);
5407 if (r < 0)
5408 return r;
5409 if (!state->exists)
5410 return -ENOENT;
5411 if (!state->get_attr(name, dest))
5412 return -ENODATA;
5413
5414 return 0;
5415 }
5416
5417 int RGWRados::Object::Stat::stat_async()
5418 {
5419 RGWObjectCtx& ctx = source->get_ctx();
5420 rgw_obj& obj = source->get_obj();
5421 RGWRados *store = source->get_store();
5422
5423 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
5424 result.obj = obj;
5425 if (s->has_attrs) {
5426 state.ret = 0;
5427 result.size = s->size;
5428 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5429 result.attrs = s->attrset;
5430 result.manifest = s->manifest;
5431 return 0;
5432 }
5433
5434 string oid;
5435 string loc;
5436 get_obj_bucket_and_oid_loc(obj, oid, loc);
5437
5438 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5439 if (r < 0) {
5440 return r;
5441 }
5442
5443 librados::ObjectReadOperation op;
5444 op.stat2(&result.size, &result.mtime, NULL);
5445 op.getxattrs(&result.attrs, NULL);
5446 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
5447 state.io_ctx.locator_set_key(loc);
5448 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5449 if (r < 0) {
5450 ldout(store->ctx(), 5) << __func__
5451 << ": ERROR: aio_operate() returned ret=" << r
5452 << dendl;
5453 return r;
5454 }
5455
5456 return 0;
5457 }
5458
5459
5460 int RGWRados::Object::Stat::wait()
5461 {
5462 if (!state.completion) {
5463 return state.ret;
5464 }
5465
5466 state.completion->wait_for_complete();
5467 state.ret = state.completion->get_return_value();
5468 state.completion->release();
5469
5470 if (state.ret != 0) {
5471 return state.ret;
5472 }
5473
5474 return finish();
5475 }
5476
5477 int RGWRados::Object::Stat::finish()
5478 {
5479 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5480 if (iter != result.attrs.end()) {
5481 bufferlist& bl = iter->second;
5482 auto biter = bl.cbegin();
5483 try {
5484 result.manifest.emplace();
5485 decode(*result.manifest, biter);
5486 } catch (buffer::error& err) {
5487 RGWRados *store = source->get_store();
5488 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5489 return -EIO;
5490 }
5491 }
5492
5493 return 0;
5494 }
5495
5496 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
5497 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5498 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
5499 {
5500 if (!rctx)
5501 return 0;
5502
5503 int r = get_obj_state(rctx, bucket_info, obj, pstate, false, y);
5504 if (r < 0)
5505 return r;
5506
5507 return append_atomic_test(*pstate, op);
5508 }
5509
5510 int RGWRados::append_atomic_test(const RGWObjState* state,
5511 librados::ObjectOperation& op)
5512 {
5513 if (!state->is_atomic) {
5514 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
5515 return 0;
5516 }
5517
5518 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5519 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5520 } else {
5521 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
5522 }
5523 return 0;
5524 }
5525
// Thin forwarder: resolve this object's (cached) state via the owning store,
// using the Object's own ctx / bucket_info / obj.
int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
{
  return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
}
5530
// Drop this object's cached RGWObjState so the next get_state() call
// re-reads it from rados.
void RGWRados::Object::invalidate_state()
{
  ctx.invalidate(obj);
}
5535
/*
 * Prepare 'op' for an atomic write/removal of this object:
 *  - append a cmpxattr guard on the current object tag (racing-writer check),
 *  - evaluate HTTP-style if-match / if-nomatch preconditions against the
 *    cached state (etag / existence),
 *  - optionally reset the head object (reset_obj),
 *  - assign and set a fresh write tag unless this is a removal.
 *
 * ptag        - caller-supplied write tag; a random one is generated if null
 * removal_op  - the object is being deleted; skip the tag update
 * modify_tail - also stamp the tail tag so tail refcounting follows the head
 *
 * Returns 0 on success, -ERR_PRECONDITION_FAILED when a precondition fails,
 * or a negative error from get_state().
 */
int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
                                                  const char *if_match, const char *if_nomatch, bool removal_op,
                                                  bool modify_tail, optional_yield y)
{
  int r = get_state(&state, false, y);
  if (r < 0)
    return r;

  /* guard only when there is something real to compare against: a manifest
   * or a genuine (non-fake) object tag, or explicit caller preconditions */
  bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
                     if_match != NULL || if_nomatch != NULL) &&
                    (!state->fake_tag);

  if (!state->is_atomic) {
    ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;

    if (reset_obj) {
      op.create(false);
      store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
    }

    return 0;
  }

  if (need_guard) {
    /* first verify that the object wasn't replaced under */
    if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
      op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
      // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
    }

    if (if_match) {
      if (strcmp(if_match, "*") == 0) {
        // test the object is existing
        if (!state->exists) {
          return -ERR_PRECONDITION_FAILED;
        }
      } else {
        /* if-match against the stored etag */
        bufferlist bl;
        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
            strncmp(if_match, bl.c_str(), bl.length()) != 0) {
          return -ERR_PRECONDITION_FAILED;
        }
      }
    }

    if (if_nomatch) {
      if (strcmp(if_nomatch, "*") == 0) {
        // test the object is NOT existing
        if (state->exists) {
          return -ERR_PRECONDITION_FAILED;
        }
      } else {
        /* if-nomatch fails when the stored etag matches */
        bufferlist bl;
        if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
            strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
          return -ERR_PRECONDITION_FAILED;
        }
      }
    }
  }

  if (reset_obj) {
    if (state->exists) {
      /* recreate in place: wipe current head contents/attrs */
      op.create(false);
      store->remove_rgw_head_obj(op);
    } else {
      /* exclusive create: fail if someone raced us into existence */
      op.create(true);
    }
  }

  if (removal_op) {
    /* the object is being removed, no need to update its tag */
    return 0;
  }

  if (ptag) {
    state->write_tag = *ptag;
  } else {
    append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
  }
  bufferlist bl;
  /* tag is stored with its trailing NUL, hence size() + 1 */
  bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);

  ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;

  op.setxattr(RGW_ATTR_ID_TAG, bl);
  if (modify_tail) {
    op.setxattr(RGW_ATTR_TAIL_TAG, bl);
  }

  return 0;
}
5628
5629 /**
5630 * Set an attr on an object.
5631 * bucket: name of the bucket holding the object
5632 * obj: name of the object to set the attr on
5633 * name: the attr to set
5634 * bl: the contents of the attr
5635 * Returns: 0 on success, -ERR# otherwise.
5636 */
5637 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
5638 {
5639 map<string, bufferlist> attrs;
5640 attrs[name] = bl;
5641 return set_attrs(ctx, bucket_info, obj, attrs, NULL, null_yield);
5642 }
5643
/*
 * Set/remove a batch of xattrs on an object in a single rados write op,
 * guarded by the atomic test, and keep the bucket index entry in sync
 * (prepare/complete/cancel) when object state is available.
 *
 * ctx     - RGWObjectCtx* passed as void* (legacy signature)
 * attrs   - attrs to set (empty-valued entries are skipped)
 * rmattrs - optional attrs to remove
 *
 * Side effects: RGW_ATTR_DELETE_AT values are registered with the object
 * expirer; on success the cached RGWObjState's attrset/obj_tag are updated
 * in place so subsequent reads in this request see the new values.
 */
int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
                        map<string, bufferlist>& attrs,
                        map<string, bufferlist>* rmattrs,
                        optional_yield y)
{
  /* "null" version maps to the plain (instance-less) head object */
  rgw_obj obj = src_obj;
  if (obj.key.instance == "null") {
    obj.key.instance.clear();
  }

  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }
  RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);

  ObjectWriteOperation op;
  RGWObjState *state = NULL;

  /* guard against a concurrent replace; also loads 'state' */
  r = append_atomic_test(rctx, bucket_info, obj, op, &state, y);
  if (r < 0)
    return r;

  // ensure null version object exist
  if (src_obj.key.instance == "null" && !state->manifest) {
    return -ENOENT;
  }

  map<string, bufferlist>::iterator iter;
  if (rmattrs) {
    for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  const rgw_bucket& bucket = obj.bucket;

  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    /* empty values are silently skipped, not written as empty xattrs */
    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
      /* register the expiration time with the object expirer */
      real_time ts;
      try {
        decode(ts, bl);

        rgw_obj_index_key obj_key;
        obj.key.get_index_key(&obj_key);

        obj_expirer->hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
      } catch (buffer::error& err) {
        ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
      }
    }
  }

  /* nothing to set or remove: done */
  if (!op.size())
    return 0;

  // NOTE(review): obj_ctx appears unused below — presumably left over from
  // an earlier refactor; confirm before removing.
  RGWObjectCtx obj_ctx(this->store);

  bufferlist bl;
  RGWRados::Bucket bop(this, bucket_info);
  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

  if (state) {
    /* prepare a pending bucket-index entry under a fresh write tag, so a
     * crashed writer can be detected and the entry repaired (dir suggest) */
    string tag;
    append_rand_alpha(cct, tag, tag, 32);
    state->write_tag = tag;
    r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag, y);

    if (r < 0)
      return r;

    /* tag is stored with its trailing NUL, hence size() + 1 */
    bl.append(tag.c_str(), tag.size() + 1);
    op.setxattr(RGW_ATTR_ID_TAG, bl);
  }


  real_time mtime = real_clock::now();
  struct timespec mtime_ts = real_clock::to_timespec(mtime);
  op.mtime2(&mtime_ts);
  auto& ioctx = ref.pool.ioctx();
  // NOTE(review): uses null_yield rather than the caller's 'y' — confirm
  // this synchronous call is intentional.
  r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);
  if (state) {
    if (r >= 0) {
      /* write succeeded: complete the index entry with refreshed metadata */
      bufferlist acl_bl = attrs[RGW_ATTR_ACL];
      bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
      bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
      string etag = rgw_bl_str(etag_bl);
      string content_type = rgw_bl_str(content_type_bl);
      string storage_class;
      auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
      if (iter != attrs.end()) {
        storage_class = rgw_bl_str(iter->second);
      }
      uint64_t epoch = ioctx.get_last_version();
      int64_t poolid = ioctx.get_id();
      r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
                            mtime, etag, content_type, storage_class, &acl_bl,
                            RGWObjCategory::Main, NULL);
    } else {
      /* write failed: cancel the pending index entry */
      int ret = index_op.cancel();
      if (ret < 0) {
        ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
      }
    }
  }
  if (r < 0)
    return r;

  /* mirror the change into the cached object state */
  if (state) {
    state->obj_tag.swap(bl);
    if (rmattrs) {
      for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
        state->attrset.erase(iter->first);
      }
    }

    for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
      state->attrset[iter->first] = iter->second;
    }

    auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
    if (iter != state->attrset.end()) {
      iter->second = state->obj_tag;
    }
  }

  return 0;
}
5782
/*
 * Prepare a read: resolve the object's state (following OLH), set up the
 * head-object ioctx, export requested metadata (attrs/size/mtime/target
 * object), and evaluate conditional-request parameters:
 *   If-(Un)Modified-Since  -> -ERR_NOT_MODIFIED / -ERR_PRECONDITION_FAILED
 *   If-Match / If-NoMatch  -> -ERR_PRECONDITION_FAILED / -ERR_NOT_MODIFIED
 * Returns 0 when the read may proceed.
 */
int RGWRados::Object::Read::prepare(optional_yield y)
{
  RGWRados *store = source->get_store();
  CephContext *cct = store->ctx();

  bufferlist etag;

  map<string, bufferlist>::iterator iter;

  RGWObjState *astate;
  int r = source->get_state(&astate, true, y);
  if (r < 0)
    return r;

  if (!astate->exists) {
    return -ENOENT;
  }

  const RGWBucketInfo& bucket_info = source->get_bucket_info();

  /* remember the resolved (possibly OLH-followed) object and its raw head */
  state.obj = astate->obj;
  store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);

  state.cur_pool = state.head_obj.pool;
  state.cur_ioctx = &state.io_ctxs[state.cur_pool];

  r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
  if (r < 0) {
    return r;
  }
  if (params.target_obj) {
    *params.target_obj = state.obj;
  }
  if (params.attrs) {
    *params.attrs = astate->attrset;
    if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
      for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
        ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
      }
    }
  }

  /* Convert all times go GMT to make them compatible */
  if (conds.mod_ptr || conds.unmod_ptr) {
    /* obj_time_weight also folds in zone id / pg ver to break mtime ties */
    obj_time_weight src_weight;
    src_weight.init(astate);
    src_weight.high_precision = conds.high_precision_time;

    obj_time_weight dest_weight;
    dest_weight.high_precision = conds.high_precision_time;

    /* per HTTP semantics, If-NoMatch overrides If-Modified-Since and
     * If-Match overrides If-Unmodified-Since */
    if (conds.mod_ptr && !conds.if_nomatch) {
      dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
      ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
      if (!(dest_weight < src_weight)) {
        return -ERR_NOT_MODIFIED;
      }
    }

    if (conds.unmod_ptr && !conds.if_match) {
      dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
      ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
      if (dest_weight < src_weight) {
        return -ERR_PRECONDITION_FAILED;
      }
    }
  }
  if (conds.if_match || conds.if_nomatch) {
    r = get_attr(RGW_ATTR_ETAG, etag, y);
    if (r < 0)
      return r;

    if (conds.if_match) {
      string if_match_str = rgw_string_unquote(conds.if_match);
      ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
      if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
        return -ERR_PRECONDITION_FAILED;
      }
    }

    if (conds.if_nomatch) {
      string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
      ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
      if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
        return -ERR_NOT_MODIFIED;
      }
    }
  }

  if (params.obj_size)
    *params.obj_size = astate->size;
  if (params.lastmod)
    *params.lastmod = astate->mtime;

  return 0;
}
5879
5880 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
5881 {
5882 if (ofs < 0) {
5883 ofs += obj_size;
5884 if (ofs < 0)
5885 ofs = 0;
5886 end = obj_size - 1;
5887 } else if (end < 0) {
5888 end = obj_size - 1;
5889 }
5890
5891 if (obj_size > 0) {
5892 if (ofs >= (off_t)obj_size) {
5893 return -ERANGE;
5894 }
5895 if (end >= (off_t)obj_size) {
5896 end = obj_size - 1;
5897 }
5898 }
5899 return 0;
5900 }
5901
/*
 * Run 'call' against the resolved bucket shard, transparently handling a
 * concurrent reshard: when the shard returns -ERR_BUSY_RESHARDING, block
 * until resharding completes, switch to the new bucket id, invalidate the
 * cached shard and retry. The retry counter is reset after a completed
 * reshard so the post-reshard attempts get a full budget.
 *
 * pbs - optional out: the shard the successful call ran against.
 * Returns 0 on success or the last error from the shard call / reshard wait.
 */
int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
{
  RGWRados *store = target->get_store();
  BucketShard *bs;
  int r;

#define NUM_RESHARD_RETRIES 10
  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
    int ret = get_bucket_shard(&bs);
    if (ret < 0) {
      ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
      return ret;
    }
    r = call(bs);
    if (r != -ERR_BUSY_RESHARDING) {
      break;
    }
    ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
    string new_bucket_id;
    r = store->block_while_resharding(bs, &new_bucket_id,
                                      target->bucket_info, null_yield);
    if (r == -ERR_BUSY_RESHARDING) {
      /* still resharding: burn a retry and wait again */
      continue;
    }
    if (r < 0) {
      return r;
    }
    ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
    i = 0; /* resharding is finished, make sure we can retry */
    r = target->update_bucket_id(new_bucket_id);
    if (r < 0) {
      ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
      return r;
    }
    /* shard layout changed; force get_bucket_shard() to re-resolve */
    invalidate_bs();
  } // for loop

  if (r < 0) {
    return r;
  }

  if (pbs) {
    *pbs = bs;
  }

  return 0;
}
5949
5950 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag, optional_yield y)
5951 {
5952 if (blind) {
5953 return 0;
5954 }
5955 RGWRados *store = target->get_store();
5956
5957 if (write_tag && write_tag->length()) {
5958 optag = string(write_tag->c_str(), write_tag->length());
5959 } else {
5960 if (optag.empty()) {
5961 append_rand_alpha(store->ctx(), optag, optag, 32);
5962 }
5963 }
5964
5965 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
5966 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, y, zones_trace);
5967 });
5968
5969 if (r < 0) {
5970 return r;
5971 }
5972 prepared = true;
5973
5974 return 0;
5975 }
5976
/*
 * Complete a previously prepared bucket-index ADD for this object: build the
 * dir entry (sizes, mtime, etag, owner from the ACL, content type, storage
 * class, etc.) and send the cls "complete add". Also appends to the data
 * changes log so multisite peers notice the update.
 *
 * poolid/epoch - head write version, used by the index to order completions
 * Returns the cls completion result; a datalog write failure is only logged.
 * No-op for blind buckets.
 */
int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
                                            uint64_t size, uint64_t accounted_size,
                                            ceph::real_time& ut, const string& etag,
                                            const string& content_type, const string& storage_class,
                                            bufferlist *acl_bl,
                                            RGWObjCategory category,
                                            list<rgw_obj_index_key> *remove_objs, const string *user_data,
                                            bool appendable)
{
  if (blind) {
    return 0;
  }
  RGWRados *store = target->get_store();
  BucketShard *bs;

  int ret = get_bucket_shard(&bs);
  if (ret < 0) {
    ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
    return ret;
  }

  rgw_bucket_dir_entry ent;
  obj.key.get_index_key(&ent.key);
  ent.meta.size = size;
  ent.meta.accounted_size = accounted_size;
  ent.meta.mtime = ut;
  ent.meta.etag = etag;
  ent.meta.storage_class = storage_class;
  if (user_data)
    ent.meta.user_data = *user_data;

  /* derive the index entry's owner fields from the object ACL; decode
   * failure is tolerated and leaves the owner empty */
  ACLOwner owner;
  if (acl_bl && acl_bl->length()) {
    int ret = store->decode_policy(*acl_bl, &owner);
    if (ret < 0) {
      ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
    }
  }
  ent.meta.owner = owner.get_id().to_str();
  ent.meta.owner_display_name = owner.get_display_name();
  ent.meta.content_type = content_type;
  ent.meta.appendable = appendable;

  ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);

  /* log the change regardless of the completion result, so peers following
   * this shard advance their markers */
  int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
  if (r < 0) {
    lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
  }

  return ret;
}
6029
6030 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6031 real_time& removed_mtime,
6032 list<rgw_obj_index_key> *remove_objs)
6033 {
6034 if (blind) {
6035 return 0;
6036 }
6037 RGWRados *store = target->get_store();
6038 BucketShard *bs;
6039
6040 int ret = get_bucket_shard(&bs);
6041 if (ret < 0) {
6042 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6043 return ret;
6044 }
6045
6046 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
6047
6048 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6049 if (r < 0) {
6050 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6051 }
6052
6053 return ret;
6054 }
6055
6056
6057 int RGWRados::Bucket::UpdateIndex::cancel()
6058 {
6059 if (blind) {
6060 return 0;
6061 }
6062 RGWRados *store = target->get_store();
6063 BucketShard *bs;
6064
6065 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6066 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6067 });
6068
6069 /*
6070 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6071 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6072 * have no way to tell that they're all caught up
6073 */
6074 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6075 if (r < 0) {
6076 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6077 }
6078
6079 return ret;
6080 }
6081
/*
 * Synchronously read the byte range [ofs, end] (inclusive, clamped to the
 * object size) into 'bl'. For manifest (multi-part/striped) objects the read
 * is mapped onto the stripe containing 'ofs'; a single call reads at most
 * one stripe and at most the pool's max chunk size, so callers loop until
 * they have the whole range. Head reads carry the atomic guard and may be
 * satisfied (fully or partially) from prefetched data in the object state.
 *
 * Returns the number of bytes appended to 'bl', or a negative error.
 */
int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y)
{
  RGWRados *store = source->get_store();
  CephContext *cct = store->ctx();

  rgw_raw_obj read_obj;
  uint64_t read_ofs = ofs;
  uint64_t len, read_len;
  bool reading_from_head = true;
  ObjectReadOperation op;

  bool merge_bl = false;
  bufferlist *pbl = &bl;
  bufferlist read_bl;
  uint64_t max_chunk_size;

  RGWObjState *astate;
  int r = source->get_state(&astate, true, y);
  if (r < 0)
    return r;

  /* clamp the requested range to the object's logical size */
  if (astate->size == 0) {
    end = 0;
  } else if (end >= (int64_t)astate->size) {
    end = astate->size - 1;
  }

  if (end < 0)
    len = 0;
  else
    len = end - ofs + 1;

  if (astate->manifest && astate->manifest->has_tail()) {
    /* now get the relevant object part */
    RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);

    uint64_t stripe_ofs = iter.get_stripe_ofs();
    read_obj = iter.get_location().get_raw_obj(store);
    /* don't read past the end of this stripe */
    len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
    read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
    reading_from_head = (read_obj == state.head_obj);
  } else {
    read_obj = state.head_obj;
  }

  r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
  if (r < 0) {
    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
    return r;
  }

  if (len > max_chunk_size)
    len = max_chunk_size;


  read_len = len;

  if (reading_from_head) {
    /* only when reading from the head object do we need to do the atomic test */
    r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
    if (r < 0)
      return r;

    if (astate && astate->prefetch_data) {
      /* whole request satisfied by the prefetched head data? */
      if (!ofs && astate->data.length() >= len) {
        bl = astate->data;
        return bl.length();
      }

      /* partial overlap: copy what we have, rados-read the remainder into
       * read_bl and merge it below */
      if (ofs < astate->data.length()) {
        unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
        astate->data.begin(ofs).copy(copy_len, bl);
        read_len -= copy_len;
        read_ofs += copy_len;
        if (!read_len)
          return bl.length();

        merge_bl = true;
        pbl = &read_bl;
      }
    }
  }

  ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
  op.read(read_ofs, read_len, pbl, NULL);

  /* tail stripes may live in a different pool; (re)open and cache an ioctx */
  if (state.cur_pool != read_obj.pool) {
    auto iter = state.io_ctxs.find(read_obj.pool);
    if (iter == state.io_ctxs.end()) {
      state.cur_ioctx = &state.io_ctxs[read_obj.pool];
      r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false);
      if (r < 0) {
        ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
        return r;
      }
    } else {
      state.cur_ioctx = &iter->second;
    }
    state.cur_pool = read_obj.pool;
  }

  state.cur_ioctx->locator_set_key(read_obj.loc);

  r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
  ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;

  if (r < 0) {
    return r;
  }

  if (merge_bl) {
    bl.append(read_bl);
  }

  return bl.length();
}
6198
/*
 * Callback context for the async GET path: collects completed rados read
 * results and delivers them to the client callback strictly in offset order,
 * even though the underlying aio reads may complete out of order.
 */
struct get_obj_data {
  RGWRados* store;
  RGWGetDataCB* client_cb;
  rgw::Aio* aio;
  uint64_t offset; // next offset to write to client
  rgw::AioResultList completed; // completed read results, sorted by offset
  optional_yield yield;

  get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio,
               uint64_t offset, optional_yield yield)
    : store(store), client_cb(cb), aio(aio), offset(offset), yield(yield) {}

  // Fold newly completed reads into 'completed' and hand every contiguous
  // run starting at 'offset' to the client callback. Result ids are the
  // reads' logical object offsets, which is what makes the ordering work.
  int flush(rgw::AioResultList&& results) {
    int r = rgw::check_for_errors(results);
    if (r < 0) {
      return r;
    }

    auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
    results.sort(cmp); // merge() requires results to be sorted first
    completed.merge(results, cmp); // merge results in sorted order

    while (!completed.empty() && completed.front().id == offset) {
      auto bl = std::move(completed.front().data);
      completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});

      offset += bl.length();
      int r = client_cb->handle_data(bl, 0, bl.length());
      if (r < 0) {
        return r;
      }
    }
    return 0;
  }

  void cancel() {
    // wait for all completions to drain and ignore the results
    aio->drain();
  }

  // Wait for all outstanding reads and flush them to the client in order;
  // on error, drain the remaining completions without delivering them.
  int drain() {
    auto c = aio->wait();
    while (!c.empty()) {
      int r = flush(std::move(c));
      if (r < 0) {
        cancel();
        return r;
      }
      c = aio->wait();
    }
    return flush(std::move(c));
  }
};
6252
6253 static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6254 off_t read_ofs, off_t len, bool is_head_obj,
6255 RGWObjState *astate, void *arg)
6256 {
6257 struct get_obj_data *d = (struct get_obj_data *)arg;
6258
6259 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6260 is_head_obj, astate, arg);
6261 }
6262
/*
 * Per-extent callback for the async GET path: issue one throttled aio read
 * for [read_ofs, read_ofs+len) of 'read_obj'. Head reads get the atomic
 * guard and may be served (partially) from prefetched data, which is pushed
 * to the client immediately. Completions that the throttle hands back are
 * flushed to the client in offset order via get_obj_data::flush().
 */
int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
                                 off_t read_ofs, off_t len, bool is_head_obj,
                                 RGWObjState *astate, void *arg)
{
  ObjectReadOperation op;
  struct get_obj_data *d = (struct get_obj_data *)arg;
  string oid, key;

  if (is_head_obj) {
    /* only when reading from the head object do we need to do the atomic test */
    int r = append_atomic_test(astate, op);
    if (r < 0)
      return r;

    /* serve the leading part of the range from prefetched head data */
    if (astate &&
        obj_ofs < astate->data.length()) {
      unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);

      r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
      if (r < 0)
        return r;

      /* advance past the prefetched portion; the rest is read below */
      len -= chunk_len;
      d->offset += chunk_len;
      read_ofs += chunk_len;
      obj_ofs += chunk_len;
      if (!len)
        return 0;
    }
  }

  auto obj = d->store->svc.rados->obj(read_obj);
  int r = obj.open();
  if (r < 0) {
    ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
    return r;
  }

  ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
  op.read(read_ofs, len, nullptr, nullptr);

  const uint64_t cost = len;
  const uint64_t id = obj_ofs; // use logical object offset for sorting replies

  /* may block on the throttle window; returns already-completed reads */
  auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);

  return d->flush(std::move(completed));
}
6311
6312 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb,
6313 optional_yield y)
6314 {
6315 RGWRados *store = source->get_store();
6316 CephContext *cct = store->ctx();
6317 RGWObjectCtx& obj_ctx = source->get_ctx();
6318 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6319 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
6320
6321 auto aio = rgw::make_throttle(window_size, y);
6322 get_obj_data data(store, cb, &*aio, ofs, y);
6323
6324 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
6325 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
6326 if (r < 0) {
6327 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6328 data.cancel(); // drain completions without writing back to client
6329 return r;
6330 }
6331
6332 return data.drain();
6333 }
6334
/*
 * Walk an object's byte range [ofs, end] (end is inclusive) and invoke cb
 * once per chunk of at most max_chunk_size bytes. For manifest-backed
 * (striped/multipart) objects each chunk is mapped onto the stripe that
 * contains it; otherwise every read targets the head object.
 */
int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
                          const RGWBucketInfo& bucket_info, const rgw_obj& obj,
                          off_t ofs, off_t end, uint64_t max_chunk_size,
                          iterate_obj_cb cb, void *arg, optional_yield y)
{
  rgw_raw_obj head_obj;
  rgw_raw_obj read_obj;
  uint64_t read_ofs = ofs;
  uint64_t len;
  bool reading_from_head = true;
  RGWObjState *astate = NULL;

  obj_to_raw(bucket_info.placement_rule, obj, &head_obj);

  int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false, y);
  if (r < 0) {
    return r;
  }

  // 'end' is inclusive, so the total byte count is end - ofs + 1
  if (end < 0)
    len = 0;
  else
    len = end - ofs + 1;

  if (astate->manifest) {
    /* now get the relevant object stripe */
    RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);

    RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end();

    for (; iter != obj_end && ofs <= end; ++iter) {
      off_t stripe_ofs = iter.get_stripe_ofs();
      off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();

      // emit one or more chunks covering the portion of [ofs, end] that
      // falls within the current stripe
      while (ofs < next_stripe_ofs && ofs <= end) {
        read_obj = iter.get_location().get_raw_obj(this);
        uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
        // translate the logical offset into an offset within the stripe's
        // rados object
        read_ofs = iter.location_ofs() + (ofs - stripe_ofs);

        if (read_len > max_chunk_size) {
          read_len = max_chunk_size;
        }

        // the head object needs special handling in the callback (atomic
        // test, possible inline data)
        reading_from_head = (read_obj == head_obj);
        r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
        if (r < 0) {
          return r;
        }

        len -= read_len;
        ofs += read_len;
      }
    }
  } else {
    // no manifest: all data lives in the head object; logical offset equals
    // the physical read offset
    while (ofs <= end) {
      read_obj = head_obj;
      uint64_t read_len = std::min(len, max_chunk_size);

      r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
      if (r < 0) {
        return r;
      }

      len -= read_len;
      ofs += read_len;
    }
  }

  return 0;
}
6405
6406 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6407 {
6408 rgw_rados_ref ref;
6409 int r = get_obj_head_ref(bucket_info, obj, &ref);
6410 if (r < 0) {
6411 return r;
6412 }
6413
6414 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, null_yield);
6415 }
6416
6417 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6418 {
6419 rgw_rados_ref ref;
6420 int r = get_obj_head_ref(bucket_info, obj, &ref);
6421 if (r < 0) {
6422 return r;
6423 }
6424
6425 bufferlist outbl;
6426
6427 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
6428 }
6429
/*
 * Prepare an OLH (object logical head) object for modification by stamping a
 * "pending operation" xattr on it. Creates the OLH object and its obj/olh
 * tags if they don't exist yet; otherwise guards the write with the cached
 * olh tag so racing re-creations are detected.
 *
 * On success *op_tag receives the generated pending-op tag; callers pass it
 * to the bucket index op so the pending attr can be removed once the olh log
 * is applied.
 */
int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
{
  ObjectWriteOperation op;

  // the OLH is always the instance-less head object
  ceph_assert(olh_obj.key.instance.empty());

  bool has_tag = (state.exists && has_olh_tag(state.attrset));

  if (!state.exists) {
    op.create(true);  // exclusive create; a racing create yields -EEXIST
  } else {
    op.assert_exists();
    // preserve the existing mtime across the xattr-only update
    struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
    op.mtime2(&mtime_ts);
  }

  /*
   * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
   * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
   * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
   * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
   * log will reflect that.
   *
   * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
   * is used for object data instance, olh_tag for olh instance.
   */
  if (has_tag) {
    /* guard against racing writes */
    bucket_index_guard_olh_op(state, op);
  }

  if (!has_tag) {
    /* obj tag */
    string obj_tag = gen_rand_alphanumeric_lower(cct, 32);

    bufferlist bl;
    bl.append(obj_tag.c_str(), obj_tag.size());
    op.setxattr(RGW_ATTR_ID_TAG, bl);

    // keep the cached state in sync with what we're writing
    state.attrset[RGW_ATTR_ID_TAG] = bl;
    state.obj_tag = bl;

    /* olh tag */
    string olh_tag = gen_rand_alphanumeric_lower(cct, 32);

    bufferlist olh_bl;
    olh_bl.append(olh_tag.c_str(), olh_tag.size());
    op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);

    state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
    state.olh_tag = olh_bl;
    state.is_olh = true;

    // an empty version attr marks this as an olh with no applied log yet
    bufferlist verbl;
    op.setxattr(RGW_ATTR_OLH_VER, verbl);
  }

  bufferlist bl;
  RGWOLHPendingInfo pending_info;
  pending_info.time = real_clock::now();
  encode(pending_info, bl);

#define OLH_PENDING_TAG_LEN 32
  /* tag will start with current time epoch, this so that entries are sorted by time */
  char buf[32];
  utime_t ut(pending_info.time);
  snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
  *op_tag = buf;

  // pad the 16-char epoch prefix with random chars up to OLH_PENDING_TAG_LEN
  string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());

  op_tag->append(s);

  string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
  attr_name.append(*op_tag);

  op.setxattr(attr_name.c_str(), bl);

  int ret = obj_operate(bucket_info, olh_obj, &op);
  if (ret < 0) {
    return ret;
  }

  state.exists = true;
  state.attrset[attr_name] = bl;

  return 0;
}
6518
6519 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
6520 {
6521 int ret;
6522
6523 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
6524 if (ret == -EEXIST) {
6525 ret = -ECANCELED;
6526 }
6527
6528 return ret;
6529 }
6530
/*
 * Run a bucket-index operation, retrying while the index shard reports an
 * in-progress reshard (-ERR_BUSY_RESHARDING). Once a reshard completes the
 * op is re-targeted at the bucket's new bucket_id and the retry budget is
 * reset, so a finished reshard never counts against the caller.
 */
int RGWRados::guard_reshard(BucketShard *bs,
                            const rgw_obj& obj_instance,
                            const RGWBucketInfo& bucket_info,
                            std::function<int(BucketShard *)> call)
{
  rgw_obj obj;
  const rgw_obj *pobj = &obj_instance;
  int r;

  for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
    r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
    if (r < 0) {
      ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
      return r;
    }
    r = call(bs);
    if (r != -ERR_BUSY_RESHARDING) {
      break;  // success, or an error unrelated to resharding
    }
    ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
    string new_bucket_id;
    r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
    if (r == -ERR_BUSY_RESHARDING) {
      continue;  // still resharding; burn a retry and wait again
    }
    if (r < 0) {
      return r;
    }
    ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
    i = 0; /* resharding is finished, make sure we can retry */

    // re-target the op at the post-reshard bucket instance; bs->init() picks
    // it up at the top of the next iteration
    obj = *pobj;
    obj.bucket.update_bucket_id(new_bucket_id);
    pobj = &obj;
  } // for loop

  // r holds the result of the last call(bs) / block_while_resharding()
  if (r < 0) {
    return r;
  }

  return 0;
}
6573
/*
 * Wait for an in-progress bucket reshard to finish, polling the shard's
 * resharding status up to num_retries times. On completion the refreshed
 * bucket_id is written to *new_bucket_id. If the resharding flag looks
 * stale (we can grab the reshard lock ourselves), the flag is cleared and
 * the status is re-checked immediately. Returns -ERR_BUSY_RESHARDING if the
 * reshard is still running after all retries.
 */
int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
                                     string *new_bucket_id,
                                     const RGWBucketInfo& bucket_info,
                                     optional_yield y)
{
  int ret = 0;
  cls_rgw_bucket_instance_entry entry;

  // since we want to run this recovery code from two distinct places,
  // let's just put it in a lambda so we can easily re-use; if the
  // lambda successfully fetches a new bucket id, it sets
  // new_bucket_id and returns 0, otherwise it returns a negative
  // error code
  auto fetch_new_bucket_id =
    [this, &bucket_info](const std::string& log_tag,
                         std::string* new_bucket_id) -> int {
      RGWBucketInfo fresh_bucket_info = bucket_info;
      int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
      if (ret < 0) {
        ldout(cct, 0) << __func__ <<
          " ERROR: failed to refresh bucket info after reshard at " <<
          log_tag << ": " << cpp_strerror(-ret) << dendl;
        return ret;
      }
      *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
      return 0;
    };

  constexpr int num_retries = 10;
  for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
    auto& ref = bs->bucket_obj.get_ref();
    ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
    if (ret == -ENOENT) {
      // shard object is gone; treat the reshard as done and fetch the
      // refreshed bucket id
      return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
    } else if (ret < 0) {
      ldout(cct, 0) << __func__ <<
        " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
        dendl;
      return ret;
    }

    if (!entry.resharding_in_progress()) {
      return fetch_new_bucket_id("get_bucket_resharding_succeeded",
                                 new_bucket_id);
    }

    ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
      (i < num_retries ? "retrying" : "too many retries") << dendl;

    if (i == num_retries) {
      break;
    }

    // If bucket is erroneously marked as resharding (e.g., crash or
    // other error) then fix it. If we can take the bucket reshard
    // lock then it means no other resharding should be taking place,
    // and we're free to clear the flags.
    {
      // since we expect to do this rarely, we'll do our work in a
      // block and erase our work after each try

      RGWObjectCtx obj_ctx(this->store);
      const rgw_bucket& b = bs->bucket;
      std::string bucket_id = b.get_key();
      RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
      ret = reshard_lock.lock();
      if (ret < 0) {
        ldout(cct, 20) << __func__ <<
          " INFO: failed to take reshard lock for bucket " <<
          bucket_id << "; expected if resharding underway" << dendl;
      } else {
        ldout(cct, 10) << __func__ <<
          " INFO: was able to take reshard lock for bucket " <<
          bucket_id << dendl;
        ret = RGWBucketReshard::clear_resharding(this->store, bucket_info);
        if (ret < 0) {
          reshard_lock.unlock();
          ldout(cct, 0) << __func__ <<
            " ERROR: failed to clear resharding flags for bucket " <<
            bucket_id << dendl;
        } else {
          reshard_lock.unlock();
          ldout(cct, 5) << __func__ <<
            " INFO: apparently successfully cleared resharding flags for "
            "bucket " << bucket_id << dendl;
          continue; // if we apparently succeed immediately test again
        } // if clear resharding succeeded
      } // if taking of lock succeeded
    } // block to encapsulate recovery from incomplete reshard

    // back off before polling again
    ret = reshard_wait->wait(y);
    if (ret < 0) {
      ldout(cct, 0) << __func__ <<
        " ERROR: bucket is still resharding, please retry" << dendl;
      return ret;
    }
  } // for loop

  // retry budget exhausted; ask the caller to retry later
  ldout(cct, 0) << __func__ <<
    " ERROR: bucket is still resharding, please retry" << dendl;
  return -ERR_BUSY_RESHARDING;
}
6676
/*
 * Record an OLH "link" in the bucket index: make obj_instance (or a delete
 * marker) the current version of its logical head. Runs under
 * guard_reshard(), and the index-side op is guarded by the cached olh tag.
 */
int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
                                    bool delete_marker,
                                    const string& op_tag,
                                    struct rgw_bucket_dir_entry_meta *meta,
                                    uint64_t olh_epoch,
                                    real_time unmod_since, bool high_precision_time,
                                    rgw_zone_set *_zones_trace, bool log_data_change)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  // record this zone in the set of zones that have seen the change
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());

  BucketShard bs(this);

  r = guard_reshard(&bs, obj_instance, bucket_info,
                    [&](BucketShard *bs) -> int {
                      cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
                      auto& ref = bs->bucket_obj.get_ref();
                      librados::ObjectWriteOperation op;
                      cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
                      cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
                                              delete_marker, op_tag, meta, olh_epoch,
                                              unmod_since, high_precision_time,
                                              svc.zone->get_zone().log_data, zones_trace);
                      return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
                    });
  if (r < 0) {
    ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
    return r;
  }

  // NOTE(review): a datalog write failure is logged but not propagated —
  // confirm the link is intentionally treated as successful in that case
  r = svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
  if (r < 0) {
    ldout(cct, 0) << "ERROR: failed writing data log" << dendl;
  }

  return 0;
}
6723
6724 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
6725 {
6726 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
6727 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
6728 }
6729
6730 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
6731 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
6732 {
6733 rgw_rados_ref ref;
6734 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6735 if (r < 0) {
6736 return r;
6737 }
6738
6739 rgw_zone_set zones_trace;
6740 if (_zones_trace) {
6741 zones_trace = *_zones_trace;
6742 }
6743 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
6744
6745 BucketShard bs(this);
6746
6747 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6748 r = guard_reshard(&bs, obj_instance, bucket_info,
6749 [&](BucketShard *bs) -> int {
6750 auto& ref = bs->bucket_obj.get_ref();
6751 librados::ObjectWriteOperation op;
6752 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6753 cls_rgw_bucket_unlink_instance(op, key, op_tag,
6754 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
6755 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
6756 });
6757 if (r < 0) {
6758 ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
6759 return r;
6760 }
6761
6762 return 0;
6763 }
6764
6765 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
6766 const rgw_obj& obj_instance, uint64_t ver_marker,
6767 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
6768 bool *is_truncated)
6769 {
6770 rgw_rados_ref ref;
6771 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6772 if (r < 0) {
6773 return r;
6774 }
6775
6776 BucketShard bs(this);
6777 int ret =
6778 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
6779 if (ret < 0) {
6780 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
6781 return ret;
6782 }
6783
6784 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6785
6786 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6787
6788 ret = guard_reshard(&bs, obj_instance, bucket_info,
6789 [&](BucketShard *bs) -> int {
6790 auto& ref = bs->bucket_obj.get_ref();
6791 ObjectReadOperation op;
6792 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6793
6794 rgw_cls_read_olh_log_ret log_ret;
6795 int op_ret = 0;
6796 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
6797 bufferlist outbl;
6798 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
6799 if (r < 0) {
6800 return r;
6801 }
6802 if (op_ret < 0) {
6803 return op_ret;
6804 }
6805
6806 *log = std::move(log_ret.log);
6807 *is_truncated = log_ret.is_truncated;
6808 return r;
6809 });
6810 if (ret < 0) {
6811 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
6812 return ret;
6813 }
6814
6815 return 0;
6816 }
6817
// a multisite sync bug resulted in the OLH head attributes being overwritten by
// the attributes from another zone, causing link_olh() to fail endlessly due to
// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
                         const rgw_obj& obj)
{
  // fetch the current olh entry from the bucket index
  rgw_bucket_olh_entry olh;
  int r = bi_get_olh(bucket_info, obj, &olh);
  if (r < 0) {
    ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
    return r;
  }
  if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
    return 0;
  }

  ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
                << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;

  // rewrite OLH_ID_TAG and OLH_INFO from current olh
  ObjectWriteOperation op;
  // assert this is the same olh tag we think we're fixing
  bucket_index_guard_olh_op(*state, op);
  // preserve existing mtime
  struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
  op.mtime2(&mtime_ts);
  {
    // replace the head's olh tag with the index's authoritative tag
    bufferlist bl;
    bl.append(olh.tag.c_str(), olh.tag.size());
    op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
  }
  {
    // rebuild OLH_INFO (current target key + delete-marker flag) from the
    // bucket index entry
    RGWOLHInfo info;
    info.target = rgw_obj(bucket_info.bucket, olh.key);
    info.removed = olh.delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }
  rgw_rados_ref ref;
  r = get_obj_head_ref(bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }
  r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r < 0) {
    ldout(cct, 0) << "repair_olh failed to write olh attributes with "
                  << cpp_strerror(r) << dendl;
    return r;
  }
  return 0;
}
6872
/*
 * Trim bucket index olh log entries for the object up to (and including)
 * version 'ver'. Called after the log has been applied to the OLH head
 * object. Guarded against concurrent resharding; the index op is guarded by
 * the cached olh tag.
 */
int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  BucketShard bs(this);
  int ret =
    bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
  if (ret < 0) {
    ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());

  // the olh log lives under the head (instance-less) index key
  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());

  ret = guard_reshard(&bs, obj_instance, bucket_info,
                      [&](BucketShard *pbs) -> int {
                        ObjectWriteOperation op;
                        cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
                        cls_rgw_trim_olh_log(op, key, ver, olh_tag);
                        return pbs->bucket_obj.operate(&op, null_yield);
                      });
  if (ret < 0) {
    ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
    return ret;
  }

  return 0;
}
6907
/*
 * Remove the olh entry for the object from the bucket index. The index op
 * is guarded by the cached olh tag, so an OLH that was re-created in the
 * meantime is left untouched. Guarded against concurrent resharding.
 */
int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
  if (r < 0) {
    return r;
  }

  BucketShard bs(this);

  string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());

  // the olh index entry is keyed by the head (instance-less) key
  cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());

  int ret = guard_reshard(&bs, obj_instance, bucket_info,
                          [&](BucketShard *pbs) -> int {
                            ObjectWriteOperation op;
                            auto& ref = pbs->bucket_obj.get_ref();
                            cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
                            cls_rgw_clear_olh(op, key, olh_tag);
                            return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
                          });
  if (ret < 0) {
    ldout(cct, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
    return ret;
  }

  return 0;
}
6937
6938 static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
6939 {
6940 try {
6941 auto biter = bl.cbegin();
6942 decode(*olh, biter);
6943 return 0;
6944 } catch (buffer::error& err) {
6945 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
6946 return -EIO;
6947 }
6948 }
6949
/*
 * Apply pending olh log entries to the OLH head object and the bucket
 * index: link the winning instance (or delete marker), delete unlinked
 * instances, trim the applied portion of the log, and remove the OLH object
 * entirely once it is unlinked with no pending operations. *plast_ver is
 * set to the highest log version seen so the caller can resume reading.
 */
int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
                            bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
                            uint64_t *plast_ver, rgw_zone_set* zones_trace)
{
  if (log.empty()) {
    return 0;
  }

  librados::ObjectWriteOperation op;

  uint64_t last_ver = log.rbegin()->first;
  *plast_ver = last_ver;

  map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();

  // guard on the olh tag, and require the stored version to be <= what we
  // are applying so the version attr never moves backwards
  op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
  op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);

  bufferlist ver_bl;
  string last_ver_s = to_string(last_ver);
  ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
  op.setxattr(RGW_ATTR_OLH_VER, ver_bl);

  // preserve the existing mtime across the attr update
  struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
  op.mtime2(&mtime_ts);

  bool need_to_link = false;
  uint64_t link_epoch = 0;
  cls_rgw_obj_key key;          // currently-linked target (if any)
  bool delete_marker = false;
  list<cls_rgw_obj_key> remove_instances;
  bool need_to_remove = false;

  // decode current epoch and instance
  auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
  if (olh_ver != state.attrset.end()) {
    std::string str = olh_ver->second.to_str();
    std::string err;
    link_epoch = strict_strtoll(str.c_str(), 10, &err);
  }
  auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
  if (olh_info != state.attrset.end()) {
    RGWOLHInfo info;
    int r = decode_olh_info(cct, olh_info->second, &info);
    if (r < 0) {
      return r;
    }
    info.target.key.get_index_key(&key);
    delete_marker = info.removed;
  }

  // replay the log in version order, tracking the final link/unlink state
  for (iter = log.begin(); iter != log.end(); ++iter) {
    vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
    for (; viter != iter->second.end(); ++viter) {
      rgw_bucket_olh_log_entry& entry = *viter;

      ldout(cct, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
                     << " key=" << entry.key.name << "[" << entry.key.instance << "] "
                     << (entry.delete_marker ? "(delete)" : "") << dendl;
      switch (entry.op) {
      case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
        remove_instances.push_back(entry.key);
        break;
      case CLS_RGW_OLH_OP_LINK_OLH:
        // only overwrite a link of the same epoch if its key sorts before
        if (link_epoch < iter->first || key.instance.empty() ||
            key.instance > entry.key.instance) {
          ldout(cct, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
                         << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
          need_to_link = true;
          need_to_remove = false;
          key = entry.key;
          delete_marker = entry.delete_marker;
        } else {
          ldout(cct, 20) << "apply_olh skipping key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
                         << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
        }
        break;
      case CLS_RGW_OLH_OP_UNLINK_OLH:
        need_to_remove = true;
        need_to_link = false;
        break;
      default:
        ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
        return -EIO;
      }
      // this entry's pending attr is being applied now; remove it
      string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
      attr_name.append(entry.op_tag);
      op.rmxattr(attr_name.c_str());
    }
  }

  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, obj, &ref);
  if (r < 0) {
    return r;
  }

  const rgw_bucket& bucket = obj.bucket;

  if (need_to_link) {
    // point OLH_INFO at the winning instance / delete marker
    rgw_obj target(bucket, key);
    RGWOLHInfo info;
    info.target = target;
    info.removed = delete_marker;
    bufferlist bl;
    encode(info, bl);
    op.setxattr(RGW_ATTR_OLH_INFO, bl);
  }

  /* first remove object instances */
  for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
       liter != remove_instances.end(); ++liter) {
    cls_rgw_obj_key& key = *liter;
    rgw_obj obj_instance(bucket, key);
    int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
    if (ret < 0 && ret != -ENOENT) {  // -ENOENT: instance already gone, fine
      ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
      return ret;
    }
  }

  /* update olh object */
  r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
  if (r == -ECANCELED) {
    r = 0;  // a racing writer already applied these entries
  }
  if (r < 0) {
    ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
    return r;
  }

  r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
  if (r < 0) {
    ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
    return r;
  }

  if (need_to_remove) {
    // the OLH ends up unlinked; remove the head object itself, but only if
    // nothing changed underneath us and no new pending ops appeared
    ObjectWriteOperation rm_op;

    rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
    rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
    cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
    rm_op.remove();

    r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
    if (r == -ECANCELED) {
      return 0; /* someone else won this race */
    } else {
      /*
       * only clear if was successful, otherwise we might clobber pending operations on this object
       */
      // NOTE(review): errors other than -ECANCELED also fall through to this
      // branch and clear the index olh entries — confirm that matches the
      // comment's intent
      r = bucket_index_clear_olh(bucket_info, state, obj);
      if (r < 0) {
        ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
        return r;
      }
    }
  }

  return 0;
}
7113
7114 /*
7115 * read olh log and apply it
7116 */
7117 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7118 {
7119 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7120 bool is_truncated;
7121 uint64_t ver_marker = 0;
7122
7123 do {
7124 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7125 if (ret < 0) {
7126 return ret;
7127 }
7128 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7129 if (ret < 0) {
7130 return ret;
7131 }
7132 } while (is_truncated);
7133
7134 return 0;
7135 }
7136
/*
 * Link target_obj (or a delete marker) as the current version of its OLH.
 * Retries up to MAX_ECANCELED_RETRY times on -ECANCELED (lost race with
 * another writer), invalidating the cached obj state before each retry, then
 * applies the olh log to bring the OLH object and bucket index in sync.
 */
int RGWRados::set_olh(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
                      uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
                      optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
{
  string op_tag;

  // the OLH object is the instance-less head of target_obj
  rgw_obj olh_obj = target_obj;
  olh_obj.key.instance.clear();

  RGWObjState *state = NULL;

  int ret = 0;
  int i;

#define MAX_ECANCELED_RETRY 100
  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // previous attempt lost a race; drop cached state so it's re-read
      obj_ctx.invalidate(olh_obj);
    }

    ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
    if (ret < 0) {
      return ret;
    }

    ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
    if (ret < 0) {
      ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }
    ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
                                op_tag, meta, olh_epoch, unmod_since, high_precision_time,
                                zones_trace, log_data_change);
    if (ret < 0) {
      ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        // the bucket index rejected the link_olh() due to olh tag mismatch;
        // attempt to reconstruct olh head attributes based on the bucket index
        int r2 = repair_olh(state, bucket_info, olh_obj);
        if (r2 < 0 && r2 != -ECANCELED) {
          return r2;
        }
        continue;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  // NOTE(review): zones_trace is not forwarded to update_olh() here, unlike
  // in unlink_obj_instance() — confirm whether that is intentional
  ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    ret = 0;
  }
  if (ret < 0) {
    ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
7205
/*
 * Remove a specific object instance from its OLH: stamp a pending op on the
 * OLH object, write an "unlink instance" entry to the bucket index, then
 * apply the olh log. Retries up to MAX_ECANCELED_RETRY times on -ECANCELED
 * (lost race with another writer), invalidating the cached obj state first.
 */
int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
                                  uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
{
  string op_tag;

  // the OLH object is the instance-less head of target_obj
  rgw_obj olh_obj = target_obj;
  olh_obj.key.instance.clear();

  RGWObjState *state = NULL;

  int ret = 0;
  int i;

  for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
    if (ret == -ECANCELED) {
      // previous attempt lost a race; drop cached state so it's re-read
      obj_ctx.invalidate(olh_obj);
    }

    ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
    if (ret < 0)
      return ret;

    ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
    if (ret < 0) {
      ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }

    string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());

    ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
    if (ret < 0) {
      ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
      if (ret == -ECANCELED) {
        continue;
      }
      return ret;
    }
    break;
  }

  if (i == MAX_ECANCELED_RETRY) {
    ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
    return -EIO;
  }

  ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
  if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
    return 0;
  }
  if (ret < 0) {
    ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
    return ret;
  }

  return 0;
}
7266
7267 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7268 {
7269 #define OBJ_INSTANCE_LEN 32
7270 char buf[OBJ_INSTANCE_LEN + 1];
7271
7272 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7273 no underscore for instance name due to the way we encode the raw keys */
7274
7275 target_key->set_instance(buf);
7276 }
7277
/* Convenience overload: generate a random instance id on an rgw_obj's key. */
void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
{
  gen_rand_obj_instance_name(&target_obj->key);
}
7282
7283 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7284 {
7285 map<string, bufferlist> attrset;
7286
7287 ObjectReadOperation op;
7288 op.getxattrs(&attrset, NULL);
7289
7290 int r = obj_operate(bucket_info, obj, &op);
7291 if (r < 0) {
7292 return r;
7293 }
7294
7295 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7296 if (iter == attrset.end()) { /* not an olh */
7297 return -EINVAL;
7298 }
7299
7300 return decode_olh_info(cct, iter->second, olh);
7301 }
7302
/* Split pending OLH entries into those that have timed out (moved into
 * *rm_pending_entries and erased from pending_entries) and those still
 * within rgw_olh_pending_timeout_sec (left in pending_entries). */
void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
                                         map<string, bufferlist> *rm_pending_entries)
{
  map<string, bufferlist>::iterator iter = pending_entries.begin();

  real_time now = real_clock::now();

  while (iter != pending_entries.end()) {
    auto biter = iter->second.cbegin();
    RGWOLHPendingInfo pending_info;
    try {
      decode(pending_info, biter);
    } catch (buffer::error& err) {
      /* skipping bad entry, we could remove it but it might hide a bug */
      ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
      ++iter;
      continue;
    }

    /* advance before erasing: erase() invalidates the erased iterator only */
    map<string, bufferlist>::iterator cur_iter = iter;
    ++iter;
    if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
      (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
      pending_entries.erase(cur_iter);
    } else {
      /* entries names are sorted by time (rounded to a second) */
      break;
    }
  }
}
7333
/* Remove expired pending-op xattrs from an OLH head object, batching the
 * rmxattr calls (max 1000 per osd op) and guarding each op with the OLH tag
 * so we lose cleanly to concurrent OLH rewrites. */
int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
{
  rgw_rados_ref ref;
  int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
  if (r < 0) {
    return r;
  }

  // trim no more than 1000 entries per osd op
  constexpr int max_entries = 1000;

  auto i = pending_attrs.begin();
  while (i != pending_attrs.end()) {
    ObjectWriteOperation op;
    /* assert the OLH tag is unchanged; fails with -ECANCELED on a race */
    bucket_index_guard_olh_op(state, op);

    for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
      op.rmxattr(i->first.c_str());
    }

    r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
    if (r == -ENOENT || r == -ECANCELED) {
      /* raced with some other change, shouldn't sweat about it */
      return 0;
    }
    if (r < 0) {
      ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
      return r;
    }
  }
  return 0;
}
7366
/* Resolve an OLH object to the rgw_obj it currently points at.
 * Expires timed-out pending entries, applies any remaining pending log via
 * update_olh(), then decodes the OLH info attr. Returns -EINVAL if the
 * object is not an OLH and -ENOENT if the OLH is marked removed. */
int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
{
  /* collect the RGW_ATTR_OLH_PENDING_PREFIX xattrs from the cached state */
  map<string, bufferlist> pending_entries;
  rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);

  map<string, bufferlist> rm_pending_entries;
  check_pending_olh_entries(pending_entries, &rm_pending_entries);

  if (!rm_pending_entries.empty()) {
    int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
    if (ret < 0) {
      ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
      return ret;
    }
  }
  if (!pending_entries.empty()) {
    /* unexpired pending ops: replay the OLH log before reading the target */
    ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;

    int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
    if (ret < 0) {
      return ret;
    }
  }

  auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
  if (iter == state->attrset.end()) {
    /* no OLH info attr -> not an OLH object */
    return -EINVAL;
  }

  RGWOLHInfo olh;
  int ret = decode_olh_info(cct, iter->second, &olh);
  if (ret < 0) {
    return ret;
  }

  if (olh.removed) {
    return -ENOENT;
  }

  *target = olh.target;

  return 0;
}
7410
/* Stat a raw rados object in a single compound read op.
 * Any of psize/pmtime/epoch/attrs/first_chunk may be null to skip that
 * piece of information; objv_tracker, when set, adds its read guard.
 * attrs is filtered down to RGW_ATTR_PREFIX keys. */
int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
                           map<string, bufferlist> *attrs, bufferlist *first_chunk,
                           RGWObjVersionTracker *objv_tracker, optional_yield y)
{
  rgw_rados_ref ref;
  int r = get_raw_obj_ref(obj, &ref);
  if (r < 0) {
    return r;
  }

  map<string, bufferlist> unfiltered_attrset;
  uint64_t size = 0;
  struct timespec mtime_ts;

  ObjectReadOperation op;
  if (objv_tracker) {
    objv_tracker->prepare_op_for_read(&op);
  }
  if (attrs) {
    op.getxattrs(&unfiltered_attrset, NULL);
  }
  if (psize || pmtime) {
    op.stat2(&size, &mtime_ts, NULL);
  }
  if (first_chunk) {
    op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
  }
  bufferlist outbl;
  r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);

  /* record the osdmap-visible object version even if the op failed */
  if (epoch) {
    *epoch = ref.pool.ioctx().get_last_version();
  }

  if (r < 0)
    return r;

  if (psize)
    *psize = size;
  if (pmtime)
    *pmtime = ceph::real_clock::from_timespec(mtime_ts);
  if (attrs) {
    /* keep only rgw attributes */
    rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
  }

  return 0;
}
7458
7459 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
7460 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7461 {
7462 vector<rgw_bucket_dir_header> headers;
7463 map<int, string> bucket_instance_ids;
7464 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7465 if (r < 0) {
7466 return r;
7467 }
7468
7469 ceph_assert(headers.size() == bucket_instance_ids.size());
7470
7471 auto iter = headers.begin();
7472 map<int, string>::iterator viter = bucket_instance_ids.begin();
7473 BucketIndexShardsManager ver_mgr;
7474 BucketIndexShardsManager master_ver_mgr;
7475 BucketIndexShardsManager marker_mgr;
7476 char buf[64];
7477 for(; iter != headers.end(); ++iter, ++viter) {
7478 accumulate_raw_stats(*iter, stats);
7479 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7480 ver_mgr.add(viter->first, string(buf));
7481 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7482 master_ver_mgr.add(viter->first, string(buf));
7483 if (shard_id >= 0) {
7484 *max_marker = iter->max_marker;
7485 } else {
7486 marker_mgr.add(viter->first, iter->max_marker);
7487 }
7488 if (syncstopped != NULL)
7489 *syncstopped = iter->syncstopped;
7490 }
7491 ver_mgr.to_string(bucket_ver);
7492 master_ver_mgr.to_string(master_ver);
7493 if (shard_id < 0) {
7494 marker_mgr.to_string(max_marker);
7495 }
7496 return 0;
7497 }
7498
/* Aggregates the async per-shard dir-header responses for a bucket-stats
 * request. When the last pending response arrives (pendings reaches 0) it
 * delivers the accumulated stats (or the first error) to the user callback
 * and drops its reference. unset_cb() disarms delivery if the caller has to
 * bail out after some AIOs were already issued. */
class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
  RGWGetBucketStats_CB *cb;
  uint32_t pendings;           // responses still outstanding
  map<RGWObjCategory, RGWStorageStats> stats;
  int ret_code;                // first error seen, 0 if none
  bool should_cb;              // false once unset_cb() disarms delivery
  ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");

public:
  RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
    : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
  {}

  /* called once per shard with that shard's dir header (or error) */
  void handle_response(int r, rgw_bucket_dir_header& header) override {
    std::lock_guard l{lock};
    if (should_cb) {
      if ( r >= 0) {
        accumulate_raw_stats(header, stats);
      } else {
        ret_code = r;
      }

      // Are we all done?
      if (--pendings == 0) {
        if (!ret_code) {
          cb->set_response(&stats);
        }
        cb->handle_response(ret_code);
        cb->put();
      }
    }
  }

  /* prevent any further delivery to the user callback */
  void unset_cb() {
    std::lock_guard l{lock};
    should_cb = false;
  }
};
7537
/* Kick off async stats collection across all index shards of a bucket.
 * On failure we drop the caller's ctx ref ourselves and, if some AIOs were
 * already in flight, disarm the aggregation context so they complete
 * silently. The local get_ctx ref is always released here; in-flight AIOs
 * hold their own references. */
int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
{
  int num_aio = 0;
  RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
  ceph_assert(get_ctx);
  int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
  if (r < 0) {
    ctx->put();
    if (num_aio) {
      /* some shard reads already launched; make sure they don't call back */
      get_ctx->unset_cb();
    }
  }
  get_ctx->put();
  return r;
}
7553
7554 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
7555 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y)
7556 {
7557 rgw_bucket bucket;
7558 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7559
7560 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y);
7561 }
7562
7563 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
7564 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y)
7565 {
7566 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7567 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7568 y,
7569 RGWBucketCtl::BucketInstance::GetParams()
7570 .set_mtime(pmtime)
7571 .set_attrs(pattrs)
7572 .set_bectx_params(bectx_params));
7573 }
7574
7575 int RGWRados::get_bucket_info(RGWServices *svc,
7576 const string& tenant, const string& bucket_name,
7577 RGWBucketInfo& info,
7578 real_time *pmtime,
7579 optional_yield y, map<string, bufferlist> *pattrs)
7580 {
7581 auto obj_ctx = svc->sysobj->init_obj_ctx();
7582 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7583 rgw_bucket bucket;
7584 bucket.tenant = tenant;
7585 bucket.name = bucket_name;
7586 return ctl.bucket->read_bucket_info(bucket, &info, y,
7587 RGWBucketCtl::BucketInstance::GetParams()
7588 .set_mtime(pmtime)
7589 .set_attrs(pattrs)
7590 .set_bectx_params(bectx_params));
7591 }
7592
7593 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7594 ceph::real_time *pmtime,
7595 map<string, bufferlist> *pattrs)
7596 {
7597 rgw_bucket bucket = info.bucket;
7598 bucket.bucket_id.clear();
7599
7600 auto rv = info.objv_tracker.read_version;
7601
7602 return ctl.bucket->read_bucket_info(bucket, &info, null_yield,
7603 RGWBucketCtl::BucketInstance::GetParams()
7604 .set_mtime(pmtime)
7605 .set_attrs(pattrs)
7606 .set_refresh_version(rv));
7607 }
7608
/* Store a bucket instance record through the bucket ctl layer. */
int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
                                       real_time mtime, map<string, bufferlist> *pattrs)
{
  return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield,
                                                RGWBucketCtl::BucketInstance::PutParams()
                                                .set_exclusive(exclusive)
                                                .set_mtime(mtime)
                                                .set_attrs(pattrs));
}
7618
/* Store bucket instance info and, when needed, the bucket entrypoint that
 * links tenant/name to it. The entrypoint is (re)written when the instance
 * record did not previously exist or when create_entry_point is set.
 * pep_objv, if non-null, supplies/receives the entrypoint object version. */
int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
                                     map<string, bufferlist> *pattrs, bool create_entry_point)
{
  bool create_head = !info.has_instance_obj || create_entry_point;

  int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
  if (ret < 0) {
    return ret;
  }

  if (!create_head)
    return 0; /* done! */

  RGWBucketEntryPoint entry_point;
  entry_point.bucket = info.bucket;
  entry_point.owner = info.owner;
  entry_point.creation_time = info.creation_time;
  entry_point.linked = true;
  RGWObjVersionTracker ot;
  if (pep_objv && !pep_objv->tag.empty()) {
    /* caller dictates the entrypoint version */
    ot.write_version = *pep_objv;
  } else {
    ot.generate_new_write_ver(cct);
    if (pep_objv) {
      /* report back the version we generated */
      *pep_objv = ot.write_version;
    }
  }
  ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, RGWBucketCtl::Bucket::PutParams()
                                                 .set_exclusive(exclusive)
                                                 .set_objv_tracker(&ot)
                                                 .set_mtime(mtime));
  if (ret < 0)
    return ret;

  return 0;
}
7655
/* Refresh count/size/size_rounded (main category only) for each bucket in
 * m by reading its index headers. Also copies the placement rule from the
 * bucket instance. Returns the number of buckets processed, or a negative
 * error on the first failure. */
int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
{
  auto obj_ctx = svc.sysobj->init_obj_ctx();

  map<string, RGWBucketEnt>::iterator iter;
  for (iter = m.begin(); iter != m.end(); ++iter) {
    RGWBucketEnt& ent = iter->second;
    rgw_bucket& bucket = ent.bucket;
    /* reset before accumulating across shards */
    ent.count = 0;
    ent.size = 0;
    ent.size_rounded = 0;

    vector<rgw_bucket_dir_header> headers;

    RGWBucketInfo bucket_info;
    int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield);
    if (ret < 0) {
      return ret;
    }

    int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
    if (r < 0)
      return r;

    /* sum the main-category stats across all shard headers */
    auto hiter = headers.begin();
    for (; hiter != headers.end(); ++hiter) {
      RGWObjCategory category = main_category;
      auto iter = (hiter->stats).find(category);
      if (iter != hiter->stats.end()) {
        struct rgw_bucket_category_stats& stats = iter->second;
        ent.count += stats.num_entries;
        ent.size += stats.total_size;
        ent.size_rounded += stats.total_size_rounded;
      }
    }

    // fill in placement_rule from the bucket instance for use in swift's
    // per-storage policy statistics
    ent.placement_rule = std::move(bucket_info.placement_rule);
  }

  return m.size();
}
7699
/* Fire-and-forget async append of bl (size bytes) to a raw object.
 * The completion is released immediately; librados keeps the op alive, so
 * the caller never learns the final result — only submission errors. */
int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
{
  rgw_rados_ref ref;
  int r = get_raw_obj_ref(obj, &ref);
  if (r < 0) {
    return r;
  }
  librados::Rados *rad = get_rados_handle();
  librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);

  r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
  completion->release();
  return r;
}
7714
7715 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
7716 {
7717 librados::IoCtx& io_ctx = ctx.io_ctx;
7718 librados::NObjectIterator& iter = ctx.iter;
7719
7720 int r = open_pool_ctx(pool, io_ctx, false);
7721 if (r < 0)
7722 return r;
7723
7724 iter = io_ctx.nobjects_begin();
7725
7726 return 0;
7727 }
7728
/* Begin iterating a pool from a serialized ObjectCursor position.
 * Returns -EINVAL on an unparseable cursor; exceptions thrown by
 * nobjects_begin() are translated into negative errno values. */
int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
{
  librados::IoCtx& io_ctx = ctx.io_ctx;
  librados::NObjectIterator& iter = ctx.iter;

  int r = open_pool_ctx(pool, io_ctx, false);
  if (r < 0)
    return r;

  librados::ObjectCursor oc;
  if (!oc.from_str(cursor)) {
    ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
    return -EINVAL;
  }

  try {
    iter = io_ctx.nobjects_begin(oc);
    return 0;
  } catch (const std::system_error& e) {
    /* carries a specific errno; preserve it */
    r = -e.code().value();
    ldout(cct, 10) << "nobjects_begin threw " << e.what()
                   << ", returning " << r << dendl;
    return r;
  } catch (const std::exception& e) {
    /* no errno available; fall back to EIO (-5) */
    ldout(cct, 10) << "nobjects_begin threw " << e.what()
                   << ", returning -5" << dendl;
    return -EIO;
  }
}
7758
/* Serialize the current iteration position so it can be resumed later via
 * pool_iterate_begin(pool, cursor, ctx). */
string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
{
  return ctx.iter.get_cursor().to_str();
}
7763
7764 static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
7765 vector<rgw_bucket_dir_entry>& objs,
7766 bool *is_truncated, RGWAccessListFilter *filter)
7767 {
7768 librados::IoCtx& io_ctx = ctx.io_ctx;
7769 librados::NObjectIterator& iter = ctx.iter;
7770
7771 if (iter == io_ctx.nobjects_end())
7772 return -ENOENT;
7773
7774 uint32_t i;
7775
7776 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
7777 rgw_bucket_dir_entry e;
7778
7779 string oid = iter->get_oid();
7780 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7781
7782 // fill it in with initial values; we may correct later
7783 if (filter && !filter->filter(oid, oid))
7784 continue;
7785
7786 e.key = oid;
7787 objs.push_back(e);
7788 }
7789
7790 if (is_truncated)
7791 *is_truncated = (iter != io_ctx.nobjects_end());
7792
7793 return objs.size();
7794 }
7795
/* Exception-safe wrapper around do_pool_iterate(): NObjectIterator's
 * operator++() can throw, so translate exceptions into negative errnos. */
int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
                           bool *is_truncated, RGWAccessListFilter *filter)
{
  // catch exceptions from NObjectIterator::operator++()
  try {
    return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
  } catch (const std::system_error& e) {
    /* carries a specific errno; preserve it */
    int r = -e.code().value();
    ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
                   << ", returning " << r << dendl;
    return r;
  } catch (const std::exception& e) {
    /* no errno available; fall back to EIO (-5) */
    ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
                   << ", returning -5" << dendl;
    return -EIO;
  }
}
7813
7814 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7815 {
7816 if (!ctx->initialized) {
7817 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7818 if (r < 0) {
7819 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7820 return r;
7821 }
7822 ctx->initialized = true;
7823 }
7824 return 0;
7825 }
7826
7827 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
7828 RGWListRawObjsCtx& ctx, list<string>& oids,
7829 bool *is_truncated)
7830 {
7831 if (!ctx.initialized) {
7832 return -EINVAL;
7833 }
7834 RGWAccessListFilterPrefix filter(prefix_filter);
7835 vector<rgw_bucket_dir_entry> objs;
7836 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
7837 if (r < 0) {
7838 if(r != -ENOENT)
7839 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7840 return r;
7841 }
7842
7843 vector<rgw_bucket_dir_entry>::iterator iter;
7844 for (iter = objs.begin(); iter != objs.end(); ++iter) {
7845 oids.push_back(iter->key.name);
7846 }
7847
7848 return oids.size();
7849 }
7850
7851 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
7852 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
7853 bool *is_truncated)
7854 {
7855 if (!ctx.initialized) {
7856 int r = list_raw_objects_init(pool, string(), &ctx);
7857 if (r < 0) {
7858 return r;
7859 }
7860 }
7861
7862 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
7863 }
7864
/* Expose the underlying pool iterator's resumable cursor. */
string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
{
  return pool_iterate_get_cursor(ctx.iter_ctx);
}
7869
7870 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7871 rgw_bucket_dir_entry *dirent)
7872 {
7873 rgw_cls_bi_entry bi_entry;
7874 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
7875 if (r < 0 && r != -ENOENT) {
7876 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7877 }
7878 if (r < 0) {
7879 return r;
7880 }
7881 auto iter = bi_entry.data.cbegin();
7882 try {
7883 decode(*dirent, iter);
7884 } catch (buffer::error& err) {
7885 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7886 return -EIO;
7887 }
7888
7889 return 0;
7890 }
7891
7892 int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7893 rgw_bucket_olh_entry *olh)
7894 {
7895 rgw_cls_bi_entry bi_entry;
7896 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
7897 if (r < 0 && r != -ENOENT) {
7898 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7899 }
7900 if (r < 0) {
7901 return r;
7902 }
7903 auto iter = bi_entry.data.cbegin();
7904 try {
7905 decode(*olh, iter);
7906 } catch (buffer::error& err) {
7907 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7908 return -EIO;
7909 }
7910
7911 return 0;
7912 }
7913
/* Read a raw bucket-index entry of the given type for obj from the shard
 * that owns its key. */
int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
                     BIIndexType index_type, rgw_cls_bi_entry *entry)
{
  /* resolve the index shard that holds this object's key */
  BucketShard bs(this);
  int ret = bs.init(bucket_info, obj);
  if (ret < 0) {
    ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);

  auto& ref = bs.bucket_obj.get_ref();

  return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
}
7930
/* Queue a raw index-entry write onto an existing write op (not executed
 * here; the caller submits the op). */
void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
{
  auto& ref = bs.bucket_obj.get_ref();
  cls_rgw_bi_put(op, ref.obj.oid, entry);
}
7936
7937 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
7938 {
7939 auto& ref = bs.bucket_obj.get_ref();
7940 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7941 if (ret < 0)
7942 return ret;
7943
7944 return 0;
7945 }
7946
7947 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7948 {
7949 BucketShard bs(this);
7950 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7951 if (ret < 0) {
7952 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7953 return ret;
7954 }
7955
7956 return bi_put(bs, entry);
7957 }
7958
/* List raw index entries for obj_name's shard starting at marker, up to
 * max entries. -ENOENT (missing shard object) is treated as an empty,
 * non-truncated result being propagated as the error after clearing
 * *is_truncated. */
int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
{
  rgw_obj obj(bucket, obj_name);
  BucketShard bs(this);
  int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
  if (ret < 0) {
    ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
    return ret;
  }

  auto& ref = bs.bucket_obj.get_ref();
  ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name, marker, max, entries, is_truncated);
  if (ret == -ENOENT) {
    /* nothing to list; make sure the caller doesn't keep paging */
    *is_truncated = false;
  }
  if (ret < 0)
    return ret;

  return 0;
}
7979
7980 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7981 {
7982 auto& ref = bs.bucket_obj.get_ref();
7983 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, filter_obj, marker, max, entries, is_truncated);
7984 if (ret < 0)
7985 return ret;
7986
7987 return 0;
7988 }
7989
7990 int RGWRados::bi_remove(BucketShard& bs)
7991 {
7992 auto& ref = bs.bucket_obj.get_ref();
7993 int ret = ref.pool.ioctx().remove(ref.obj.oid);
7994 if (ret == -ENOENT) {
7995 ret = 0;
7996 }
7997 if (ret < 0) {
7998 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
7999 return ret;
8000 }
8001
8002 return 0;
8003 }
8004
8005 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8006 {
8007 BucketShard bs(this);
8008 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
8009 if (ret < 0) {
8010 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8011 return ret;
8012 }
8013
8014 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8015 }
8016
/* Execute a synchronous write op against an object in the GC pool. */
int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
{
  return rgw_rados_operate(gc_pool_ctx, oid, op, null_yield);
}
8021
/* Submit an async write op against an object in the GC pool; the caller
 * owns the completion. */
int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
                             librados::ObjectWriteOperation *op)
{
  return gc_pool_ctx.aio_operate(oid, c, op);
}
8027
/* Execute a synchronous read op against an object in the GC pool. */
int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
{
  return rgw_rados_operate(gc_pool_ctx, oid, op, pbl, null_yield);
}
8032
/* Delegate GC object listing to the garbage collector. */
int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
{
  return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
}
8037
/* Run one pass of garbage collection (optionally expired entries only). */
int RGWRados::process_gc(bool expired_only)
{
  return gc->process(expired_only);
}
8042
/* Delegate lifecycle-progress listing to the LC subsystem. */
int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
{
  return lc->list_lc_progress(marker, max_entries, progress_map);
}
8047
/* Run one pass of lifecycle processing. */
int RGWRados::process_lc()
{
  return lc->process();
}
8052
/* Run the object expirer over all of its shards up to the current time. */
bool RGWRados::process_expire_objects()
{
  return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
}
8057
/* Phase one of a bucket-index transaction: record a prepare entry (tagged
 * with `tag`) on the object's index shard. The op is guarded so it fails
 * with -ERR_BUSY_RESHARDING while the bucket is resharding. The local zone
 * is added to zones_trace to stop sync loops. */
int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
                                 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
{
  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  /* mark this zone as having seen the change */
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  ObjectWriteOperation o;
  cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
  return bs.bucket_obj.operate(&o, y);
}
8073
/* Phase two of a bucket-index transaction: asynchronously complete the
 * prepared entry identified by `tag`, writing the final dir entry metadata.
 * Completion handling (retry on reshard etc.) is driven by the
 * index_completion_manager's rados callback. */
int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
                                  int64_t pool, uint64_t epoch,
                                  rgw_bucket_dir_entry& ent, RGWObjCategory category,
                                  list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
{
  ObjectWriteOperation o;
  rgw_bucket_dir_entry_meta dir_meta;
  dir_meta = ent.meta;
  dir_meta.category = category;

  rgw_zone_set zones_trace;
  if (_zones_trace) {
    zones_trace = *_zones_trace;
  }
  /* mark this zone as having seen the change */
  zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());

  /* version of the head object the index entry should point at */
  rgw_bucket_entry_ver ver;
  ver.pool = pool;
  ver.epoch = epoch;
  cls_rgw_obj_key key(ent.key.name, ent.key.instance);
  cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
  cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
                             svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
  complete_op_data *arg;
  index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
                                              svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
  librados::AioCompletion *completion = arg->rados_completion;
  int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
  completion->release(); /* can't reference arg here, as it might have already been released */
  return ret;
}
8105
/* Complete a prepared index op as an object add/overwrite. */
int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
                                   int64_t pool, uint64_t epoch,
                                   rgw_bucket_dir_entry& ent, RGWObjCategory category,
                                   list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
{
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
}
8113
/* Complete a prepared index op as an object deletion, recording the
 * removal mtime in the (otherwise empty) dir entry. */
int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
                                   int64_t pool, uint64_t epoch,
                                   rgw_obj& obj,
                                   real_time& removed_mtime,
                                   list<rgw_obj_index_key> *remove_objs,
                                   uint16_t bilog_flags,
                                   rgw_zone_set *zones_trace)
{
  rgw_bucket_dir_entry ent;
  ent.meta.mtime = removed_mtime;
  obj.key.get_index_key(&ent.key);
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
                             ent, RGWObjCategory::None, remove_objs,
                             bilog_flags, zones_trace);
}
8129
/* Cancel a prepared index op (e.g. after the data write failed); pool -1
 * and epoch 0 mark the entry version as invalid. */
int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
{
  rgw_bucket_dir_entry ent;
  obj.key.get_index_key(&ent.key);
  return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
                             -1 /* pool id */, 0, ent,
                             RGWObjCategory::None, NULL, bilog_flags,
                             zones_trace);
}
8139
/* Set the pending-op tag timeout on every index shard of the bucket,
 * fanning out with at most rgw_bucket_index_max_aio concurrent calls. */
int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
{
  RGWSI_RADOS::Pool index_pool;
  map<int, string> bucket_objs;
  int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
  if (r < 0)
    return r;

  return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
}
8150
8151
8152 uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8153 uint32_t num_shards)
8154 {
8155 // We want to minimize the chances that when num_shards >>
8156 // num_entries that we return much fewer than num_entries to the
8157 // client. Given all the overhead of making a cls call to the osd,
8158 // returning a few entries is not much more work than returning one
8159 // entry. This minimum might be better tuned based on future
8160 // experiments where num_shards >> num_entries. (Note: ">>" should
8161 // be interpreted as "much greater than".)
8162 constexpr uint32_t min_read = 8;
8163
8164 // The following is based on _"Balls into Bins" -- A Simple and
8165 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8166 // cases when num_shards >> num_entries (it almost serves as a
8167 // ceiling calculation). We also assume alpha is 1.0 and extract it
8168 // from the calculation. Future work could involve memoizing some of
8169 // the transcendental functions to minimize repeatedly re-calling
8170 // them with the same parameters, which we expect to be the case the
8171 // majority of the time.
8172 uint32_t calc_read =
8173 1 +
8174 static_cast<uint32_t>((num_entries / num_shards) +
8175 sqrt((2 * num_entries) *
8176 log(num_shards) / num_shards));
8177
8178 return std::max(min_read, calc_read);
8179 }
8180
8181
8182 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
8183 const int shard_id,
8184 const rgw_obj_index_key& start_after,
8185 const string& prefix,
8186 const string& delimiter,
8187 const uint32_t num_entries,
8188 const bool list_versions,
8189 const uint16_t expansion_factor,
8190 ent_map_t& m,
8191 bool* is_truncated,
8192 bool* cls_filtered,
8193 rgw_obj_index_key *last_entry,
8194 optional_yield y,
8195 check_filter_t force_check_filter)
8196 {
8197 /* expansion_factor allows the number of entries to read to grow
8198 * exponentially; this is used when earlier reads are producing too
8199 * few results, perhaps due to filtering or to a series of
8200 * namespaced entries */
8201
8202 ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
8203 " start_after=\"" << start_after.name <<
8204 "[" << start_after.instance <<
8205 "]\", prefix=\"" << prefix <<
8206 "\" num_entries=" << num_entries <<
8207 ", list_versions=" << list_versions <<
8208 ", expansion_factor=" << expansion_factor << dendl;
8209
8210 m.clear();
8211
8212 RGWSI_RADOS::Pool index_pool;
8213 // key - oid (for different shards if there is any)
8214 // value - list result for the corresponding oid (shard), it is filled by
8215 // the AIO callback
8216 map<int, string> shard_oids;
8217 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id,
8218 &index_pool, &shard_oids,
8219 nullptr);
8220 if (r < 0) {
8221 return r;
8222 }
8223
8224 const uint32_t shard_count = shard_oids.size();
8225 uint32_t num_entries_per_shard;
8226 if (expansion_factor == 0) {
8227 num_entries_per_shard =
8228 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8229 } else if (expansion_factor <= 11) {
8230 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8231 num_entries_per_shard =
8232 std::min(num_entries,
8233 (uint32_t(1 << (expansion_factor - 1)) *
8234 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8235 } else {
8236 num_entries_per_shard = num_entries;
8237 }
8238
8239 ldout(cct, 10) << "RGWRados::" << __func__ <<
8240 " request from each of " << shard_count <<
8241 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8242 num_entries << " total entries" << dendl;
8243
8244 auto& ioctx = index_pool.ioctx();
8245 map<int, rgw_cls_list_ret> shard_list_results;
8246 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8247 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8248 num_entries_per_shard,
8249 list_versions, shard_oids, shard_list_results,
8250 cct->_conf->rgw_bucket_index_max_aio)();
8251 if (r < 0) {
8252 return r;
8253 }
8254
8255 // to manage the iterators through each shard's list results
8256 struct ShardTracker {
8257 const size_t shard_idx;
8258 rgw_cls_list_ret& result;
8259 const std::string& oid_name;
8260 RGWRados::ent_map_t::iterator cursor;
8261 RGWRados::ent_map_t::iterator end;
8262
8263 // manages an iterator through a shard and provides other
8264 // accessors
8265 ShardTracker(size_t _shard_idx,
8266 rgw_cls_list_ret& _result,
8267 const std::string& _oid_name):
8268 shard_idx(_shard_idx),
8269 result(_result),
8270 oid_name(_oid_name),
8271 cursor(_result.dir.m.begin()),
8272 end(_result.dir.m.end())
8273 {}
8274
8275 inline const std::string& entry_name() const {
8276 return cursor->first;
8277 }
8278 rgw_bucket_dir_entry& dir_entry() const {
8279 return cursor->second;
8280 }
8281 inline bool is_truncated() const {
8282 return result.is_truncated;
8283 }
8284 inline ShardTracker& advance() {
8285 ++cursor;
8286 // return a self-reference to allow for chaining of calls, such
8287 // as x.advance().at_end()
8288 return *this;
8289 }
8290 inline bool at_end() const {
8291 return cursor == end;
8292 }
8293 }; // ShardTracker
8294
8295 // add the next unique candidate, or return false if we reach the end
8296 auto next_candidate = [] (ShardTracker& t,
8297 std::map<std::string, size_t>& candidates,
8298 size_t tracker_idx) {
8299 while (!t.at_end()) {
8300 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8301 return;
8302 }
8303 t.advance(); // skip duplicate common prefixes
8304 }
8305 };
8306
8307 // one tracker per shard requested (may not be all shards)
8308 std::vector<ShardTracker> results_trackers;
8309 results_trackers.reserve(shard_list_results.size());
8310 for (auto& r : shard_list_results) {
8311 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8312
8313 // if any *one* shard's result is trucated, the entire result is
8314 // truncated
8315 *is_truncated = *is_truncated || r.second.is_truncated;
8316
8317 // unless *all* are shards are cls_filtered, the entire result is
8318 // not filtered
8319 *cls_filtered = *cls_filtered && r.second.cls_filtered;
8320 }
8321
8322 // create a map to track the next candidate entry from ShardTracker
8323 // (key=candidate, value=index into results_trackers); as we consume
8324 // entries from shards, we replace them with the next entries in the
8325 // shards until we run out
8326 map<string, size_t> candidates;
8327 size_t tracker_idx = 0;
8328 for (auto& t : results_trackers) {
8329 // it's important that the values in the map refer to the index
8330 // into the results_trackers vector, which may not be the same
8331 // as the shard number (i.e., when not all shards are requested)
8332 next_candidate(t, candidates, tracker_idx);
8333 ++tracker_idx;
8334 }
8335
8336 rgw_bucket_dir_entry*
8337 last_entry_visited = nullptr; // to set last_entry (marker)
8338 map<string, bufferlist> updates;
8339 uint32_t count = 0;
8340 while (count < num_entries && !candidates.empty()) {
8341 r = 0;
8342 // select the next entry in lexical order (first key in map);
8343 // again tracker_idx is not necessarily shard number, but is index
8344 // into results_trackers vector
8345 tracker_idx = candidates.begin()->second;
8346 auto& tracker = results_trackers.at(tracker_idx);
8347 last_entry_visited = &tracker.dir_entry();
8348 const string& name = tracker.entry_name();
8349 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8350
8351 ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " <<
8352 dirent.key << " from shard " << tracker.shard_idx << dendl;
8353
8354 const bool force_check =
8355 force_check_filter && force_check_filter(dirent.key.name);
8356
8357 if ((!dirent.exists &&
8358 !dirent.is_delete_marker() &&
8359 !dirent.is_common_prefix()) ||
8360 !dirent.pending_map.empty() ||
8361 force_check) {
8362 /* there are uncommitted ops. We need to check the current
8363 * state, and if the tags are old we need to do clean-up as
8364 * well. */
8365 librados::IoCtx sub_ctx;
8366 sub_ctx.dup(ioctx);
8367 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
8368 updates[tracker.oid_name], y);
8369 if (r < 0 && r != -ENOENT) {
8370 return r;
8371 }
8372 } else {
8373 r = 0;
8374 }
8375
8376 if (r >= 0) {
8377 ldout(cct, 10) << "RGWRados::" << __func__ << ": got " <<
8378 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8379 m[name] = std::move(dirent);
8380 ++count;
8381 } else {
8382 ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " <<
8383 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8384 }
8385
8386 // refresh the candidates map
8387 candidates.erase(candidates.begin());
8388 tracker.advance();
8389
8390 next_candidate(tracker, candidates, tracker_idx);
8391
8392 if (tracker.at_end() && tracker.is_truncated()) {
8393 // once we exhaust one shard that is truncated, we need to stop,
8394 // as we cannot be certain that one of the next entries needs to
8395 // come from that shard; S3 and swift protocols allow returning
8396 // fewer than what was requested
8397 break;
8398 }
8399 } // while we haven't provided requested # of result entries
8400
8401 // suggest updates if there are any
8402 for (auto& miter : updates) {
8403 if (miter.second.length()) {
8404 ObjectWriteOperation o;
8405 cls_rgw_suggest_changes(o, miter.second);
8406 // we don't care if we lose suggested updates, send them off blindly
8407 AioCompletion *c =
8408 librados::Rados::aio_create_completion(nullptr, nullptr);
8409 ioctx.aio_operate(miter.first, c, &o);
8410 c->release();
8411 }
8412 } // updates loop
8413
8414 // determine truncation by checking if all the returned entries are
8415 // consumed or not
8416 *is_truncated = false;
8417 for (const auto& t : results_trackers) {
8418 if (!t.at_end() || t.is_truncated()) {
8419 *is_truncated = true;
8420 break;
8421 }
8422 }
8423
8424 ldout(cct, 20) << "RGWRados::" << __func__ <<
8425 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8426 dendl;
8427
8428 if (*is_truncated && count < num_entries) {
8429 ldout(cct, 10) << "RGWRados::" << __func__ <<
8430 ": INFO requested " << num_entries << " entries but returning " <<
8431 count << ", which is truncated" << dendl;
8432 }
8433
8434 if (last_entry_visited != nullptr && last_entry) {
8435 // since we'll not need this any more, might as well move it...
8436 *last_entry = std::move(last_entry_visited->key);
8437 ldout(cct, 20) << "RGWRados::" << __func__ <<
8438 ": returning, last_entry=" << *last_entry << dendl;
8439 } else {
8440 ldout(cct, 20) << "RGWRados::" << __func__ <<
8441 ": returning, last_entry NOT SET" << dendl;
8442 }
8443
8444 return 0;
8445 }
8446
8447
8448 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
8449 int shard_id,
8450 const rgw_obj_index_key& start_after,
8451 const string& prefix,
8452 uint32_t num_entries,
8453 bool list_versions,
8454 std::vector<rgw_bucket_dir_entry>& ent_list,
8455 bool *is_truncated,
8456 rgw_obj_index_key *last_entry,
8457 optional_yield y,
8458 check_filter_t force_check_filter) {
8459 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
8460 " start_after " << start_after.name << "[" << start_after.instance <<
8461 "] num_entries " << num_entries << dendl;
8462
8463 ent_list.clear();
8464 static MultipartMetaFilter multipart_meta_filter;
8465
8466 *is_truncated = false;
8467 RGWSI_RADOS::Pool index_pool;
8468
8469 map<int, string> oids;
8470 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, nullptr);
8471 if (r < 0)
8472 return r;
8473
8474 auto& ioctx = index_pool.ioctx();
8475
8476 const uint32_t num_shards = oids.size();
8477
8478 rgw_obj_index_key marker = start_after;
8479 uint32_t current_shard;
8480 if (shard_id >= 0) {
8481 current_shard = shard_id;
8482 } else if (start_after.empty()) {
8483 current_shard = 0u;
8484 } else {
8485 // at this point we have a marker (start_after) that has something
8486 // in it, so we need to get to the bucket shard index, so we can
8487 // start reading from there
8488
8489 std::string key;
8490 // test whether object name is a multipart meta name
8491 if(! multipart_meta_filter.filter(start_after.name, key)) {
8492 // if multipart_meta_filter fails, must be "regular" (i.e.,
8493 // unadorned) and the name is the key
8494 key = start_after.name;
8495 }
8496
8497 // now convert the key (oid) to an rgw_obj_key since that will
8498 // separate out the namespace, name, and instance
8499 rgw_obj_key obj_key;
8500 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
8501 if (!parsed) {
8502 ldout(cct, 0) <<
8503 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
8504 "start marker: '" << start_after << "'" << dendl;
8505 return -EINVAL;
8506 } else if (obj_key.name.empty()) {
8507 // if the name is empty that means the object name came in with
8508 // a namespace only, and therefore we need to start our scan at
8509 // the first bucket index shard
8510 current_shard = 0u;
8511 } else {
8512 // so now we have the key used to compute the bucket index shard
8513 // and can extract the specific shard from it
8514 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
8515 }
8516 }
8517
8518 uint32_t count = 0u;
8519 map<string, bufferlist> updates;
8520 rgw_obj_index_key last_added_entry;
8521 while (count <= num_entries &&
8522 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8523 current_shard < num_shards)) {
8524 const std::string& oid = oids[current_shard];
8525 rgw_cls_list_ret result;
8526
8527 librados::ObjectReadOperation op;
8528 string empty_delimiter;
8529 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8530 num_entries,
8531 list_versions, &result);
8532 r = rgw_rados_operate(ioctx, oid, &op, nullptr, null_yield);
8533 if (r < 0)
8534 return r;
8535
8536 for (auto& entry : result.dir.m) {
8537 rgw_bucket_dir_entry& dirent = entry.second;
8538
8539 bool force_check = force_check_filter &&
8540 force_check_filter(dirent.key.name);
8541 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8542 !dirent.pending_map.empty() ||
8543 force_check) {
8544 /* there are uncommitted ops. We need to check the current state,
8545 * and if the tags are old we need to do cleanup as well. */
8546 librados::IoCtx sub_ctx;
8547 sub_ctx.dup(ioctx);
8548 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
8549 if (r < 0 && r != -ENOENT) {
8550 return r;
8551 }
8552 } else {
8553 r = 0;
8554 }
8555
8556 // at this point either r >=0 or r == -ENOENT
8557 if (r >= 0) { // i.e., if r != -ENOENT
8558 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
8559 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8560
8561 if (count < num_entries) {
8562 marker = last_added_entry = dirent.key; // double assign
8563 ent_list.emplace_back(std::move(dirent));
8564 ++count;
8565 } else {
8566 *is_truncated = true;
8567 goto check_updates;
8568 }
8569 } else { // r == -ENOENT
8570 // in the case of -ENOENT, make sure we're advancing marker
8571 // for possible next call to CLSRGWIssueBucketList
8572 marker = dirent.key;
8573 }
8574 } // entry for loop
8575
8576 if (!result.is_truncated) {
8577 // if we reached the end of the shard read next shard
8578 ++current_shard;
8579 marker = rgw_obj_index_key();
8580 }
8581 } // shard loop
8582
8583 check_updates:
8584
8585 // suggest updates if there is any
8586 map<string, bufferlist>::iterator miter = updates.begin();
8587 for (; miter != updates.end(); ++miter) {
8588 if (miter->second.length()) {
8589 ObjectWriteOperation o;
8590 cls_rgw_suggest_changes(o, miter->second);
8591 // we don't care if we lose suggested updates, send them off blindly
8592 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8593 ioctx.aio_operate(miter->first, c, &o);
8594 c->release();
8595 }
8596 }
8597
8598 if (last_entry && !ent_list.empty()) {
8599 *last_entry = last_added_entry;
8600 }
8601
8602 return 0;
8603 } // RGWRados::cls_bucket_list_unordered
8604
8605
8606 int RGWRados::cls_obj_usage_log_add(const string& oid,
8607 rgw_usage_log_info& info)
8608 {
8609 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8610
8611 rgw_rados_ref ref;
8612 int r = get_raw_obj_ref(obj, &ref);
8613 if (r < 0) {
8614 return r;
8615 }
8616
8617 ObjectWriteOperation op;
8618 cls_rgw_usage_log_add(op, info);
8619
8620 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8621 return r;
8622 }
8623
8624 int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
8625 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
8626 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
8627 bool *is_truncated)
8628 {
8629 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8630
8631 rgw_rados_ref ref;
8632 int r = get_raw_obj_ref(obj, &ref);
8633 if (r < 0) {
8634 return r;
8635 }
8636
8637 *is_truncated = false;
8638
8639 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
8640 max_entries, read_iter, usage, is_truncated);
8641
8642 return r;
8643 }
8644
8645 static int cls_rgw_usage_log_trim_repeat(rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
8646 {
8647 bool done = false;
8648 do {
8649 librados::ObjectWriteOperation op;
8650 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
8651 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8652 if (r == -ENODATA)
8653 done = true;
8654 else if (r < 0)
8655 return r;
8656 } while (!done);
8657
8658 return 0;
8659 }
8660
8661 int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
8662 uint64_t start_epoch, uint64_t end_epoch)
8663 {
8664 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8665
8666 rgw_rados_ref ref;
8667 int r = get_raw_obj_ref(obj, &ref);
8668 if (r < 0) {
8669 return r;
8670 }
8671
8672 r = cls_rgw_usage_log_trim_repeat(ref, user, bucket, start_epoch, end_epoch);
8673 return r;
8674 }
8675
8676 int RGWRados::cls_obj_usage_log_clear(string& oid)
8677 {
8678 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8679
8680 rgw_rados_ref ref;
8681 int r = get_raw_obj_ref(obj, &ref);
8682 if (r < 0) {
8683 return r;
8684 }
8685 librados::ObjectWriteOperation op;
8686 cls_rgw_usage_log_clear(op);
8687 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8688 return r;
8689 }
8690
8691
8692 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
8693 {
8694 RGWSI_RADOS::Pool index_pool;
8695 string dir_oid;
8696
8697 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
8698
8699 int r = svc.bi_rados->open_bucket_index(bucket_info, &index_pool, &dir_oid);
8700 if (r < 0)
8701 return r;
8702
8703 bufferlist updates;
8704
8705 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
8706 rgw_bucket_dir_entry entry;
8707 entry.key = *iter;
8708 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
8709 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
8710 updates.append(CEPH_RGW_REMOVE | suggest_flag);
8711 encode(entry, updates);
8712 }
8713
8714 bufferlist out;
8715
8716 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
8717
8718 return r;
8719 }
8720
// Reconcile a bucket index entry (list_state) with the actual state of
// the head object it describes.
//
// Reads the object state; if the object no longer exists, a
// CEPH_RGW_REMOVE suggestion for the entry is appended to
// suggested_updates and -ENOENT is returned.  Otherwise `object` is
// populated from the head state (size, mtime, etag, content type,
// owner), multipart manifest parts are removed from the index, and a
// CEPH_RGW_UPDATE suggestion refreshing list_state is appended.
//
// io_ctx            - ioctx to issue reads through (taken by value; its
//                     locator key is overwritten below)
// bucket_info       - bucket that owns the entry
// list_state        - index entry being checked (in/out: version, meta,
//                     tag and exists flag are refreshed)
// object            - out: dir entry filled from the head object state
// suggested_updates - out: encoded dir-suggest ops are appended here
// y                 - optional yield context for the state read
//
// Returns 0 on success, -ENOENT if the object is gone (after queueing
// the removal suggestion), or another negative error code.
int RGWRados::check_disk_state(librados::IoCtx io_ctx,
                               const RGWBucketInfo& bucket_info,
                               rgw_bucket_dir_entry& list_state,
                               rgw_bucket_dir_entry& object,
                               bufferlist& suggested_updates,
                               optional_yield y)
{
  const rgw_bucket& bucket = bucket_info.bucket;
  // when the zone logs data changes, mark suggestions so they get logged too
  uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);

  std::string loc;

  rgw_obj obj(bucket, list_state.key);

  string oid;
  get_obj_bucket_and_oid_loc(obj, oid, loc);

  // sanity check: the locator we would generate should match the one
  // recorded in the index entry
  if (loc != list_state.locator) {
    ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
  }

  // use the listed locator for the reads below
  io_ctx.locator_set_key(list_state.locator);

  RGWObjState *astate = NULL;
  RGWObjectCtx rctx(this->store);
  int r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
  if (r < 0)
    return r;

  list_state.pending_map.clear(); // we don't need this and it inflates size
  if (!list_state.is_delete_marker() && !astate->exists) {
      /* object doesn't exist right now -- hopefully because it's
       * marked as !exists and got deleted */
    if (list_state.exists) {
      /* FIXME: what should happen now? Work out if there are any
       * non-bad ways this could happen (there probably are, but annoying
       * to handle!) */
    }
    // encode a suggested removal of that key
    list_state.ver.epoch = io_ctx.get_last_version();
    list_state.ver.pool = io_ctx.get_id();
    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
    return -ENOENT;
  }

  string etag;
  string content_type;
  ACLOwner owner;

  // copy the authoritative head-object metadata into the output entry
  object.meta.size = astate->size;
  object.meta.accounted_size = astate->accounted_size;
  object.meta.mtime = astate->mtime;

  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
  if (iter != astate->attrset.end()) {
    etag = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
  if (iter != astate->attrset.end()) {
    content_type = rgw_bl_str(iter->second);
  }
  iter = astate->attrset.find(RGW_ATTR_ACL);
  if (iter != astate->attrset.end()) {
    r = decode_policy(iter->second, &owner);
    if (r < 0) {
      dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
    }
  }

  // walk the manifest and drop index entries for multipart parts; the
  // parts themselves are not user-visible objects
  if (astate->manifest) {
    RGWObjManifest::obj_iterator miter;
    RGWObjManifest& manifest = *astate->manifest;
    for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
      rgw_obj loc;
      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);

      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
	dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
	r = delete_obj_index(loc, astate->mtime);
	if (r < 0) {
	  // best effort: log the failure but keep reconciling
	  dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
	}
      }
    }
  }

  object.meta.etag = etag;
  object.meta.content_type = content_type;
  object.meta.owner = owner.get_id().to_str();
  object.meta.owner_display_name = owner.get_display_name();

  // encode suggested updates
  list_state.ver.pool = io_ctx.get_id();
  list_state.ver.epoch = astate->epoch;
  list_state.meta.size = object.meta.size;
  list_state.meta.accounted_size = object.meta.accounted_size;
  list_state.meta.mtime = object.meta.mtime;
  list_state.meta.category = main_category;
  list_state.meta.etag = etag;
  list_state.meta.content_type = content_type;
  if (astate->obj_tag.length() > 0)
    list_state.tag = astate->obj_tag.c_str();
  list_state.meta.owner = owner.get_id().to_str();
  list_state.meta.owner_display_name = owner.get_display_name();

  list_state.exists = true;
  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
  return 0;
}
8831
8832 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
8833 {
8834 RGWSI_RADOS::Pool index_pool;
8835 map<int, string> oids;
8836 map<int, struct rgw_cls_list_ret> list_results;
8837 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
8838 if (r < 0) {
8839 ldout(cct, 20) << "cls_bucket_head: open_bucket_index() returned "
8840 << r << dendl;
8841 return r;
8842 }
8843
8844 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
8845 if (r < 0) {
8846 ldout(cct, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
8847 << r << dendl;
8848 return r;
8849 }
8850
8851 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
8852 for(; iter != list_results.end(); ++iter) {
8853 headers.push_back(std::move(iter->second.dir.header));
8854 }
8855 return 0;
8856 }
8857
8858 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
8859 {
8860 RGWSI_RADOS::Pool index_pool;
8861 map<int, string> bucket_objs;
8862 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
8863 if (r < 0)
8864 return r;
8865
8866 map<int, string>::iterator iter = bucket_objs.begin();
8867 for (; iter != bucket_objs.end(); ++iter) {
8868 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
8869 if (r < 0) {
8870 ctx->put();
8871 break;
8872 } else {
8873 (*num_aio)++;
8874 }
8875 }
8876 return r;
8877 }
8878
8879 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
8880 const rgw_bucket& bucket,
8881 uint64_t num_objs)
8882 {
8883 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
8884 return 0;
8885 }
8886
8887 bool need_resharding = false;
8888 uint32_t num_source_shards =
8889 (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8890 const uint32_t max_dynamic_shards =
8891 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
8892
8893 if (num_source_shards >= max_dynamic_shards) {
8894 return 0;
8895 }
8896
8897 uint32_t suggested_num_shards = 0;
8898 const uint64_t max_objs_per_shard =
8899 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
8900
8901 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
8902 num_objs, need_resharding, &suggested_num_shards);
8903 if (! need_resharding) {
8904 return 0;
8905 }
8906
8907 const uint32_t final_num_shards =
8908 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
8909 max_dynamic_shards);
8910 // final verification, so we don't reduce number of shards
8911 if (final_num_shards <= num_source_shards) {
8912 return 0;
8913 }
8914
8915 ldout(cct, 20) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
8916 " needs resharding; current num shards " << bucket_info.num_shards <<
8917 "; new num shards " << final_num_shards << " (suggested " <<
8918 suggested_num_shards << ")" << dendl;
8919
8920 return add_bucket_to_reshard(bucket_info, final_num_shards);
8921 }
8922
8923 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
8924 {
8925 RGWReshard reshard(this->store);
8926
8927 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8928
8929 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
8930 if (new_num_shards <= num_source_shards) {
8931 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
8932 return 0;
8933 }
8934
8935 cls_rgw_reshard_entry entry;
8936 entry.time = real_clock::now();
8937 entry.tenant = bucket_info.owner.tenant;
8938 entry.bucket_name = bucket_info.bucket.name;
8939 entry.bucket_id = bucket_info.bucket.bucket_id;
8940 entry.old_num_shards = num_source_shards;
8941 entry.new_num_shards = new_num_shards;
8942
8943 return reshard.add(entry);
8944 }
8945
8946 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
8947 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
8948 {
8949 // if we only check size, then num_objs will set to 0
8950 if(check_size_only)
8951 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
8952
8953 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
8954 }
8955
8956 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
8957 int *shard_id)
8958 {
8959 int r = 0;
8960 switch (bucket_info.bucket_index_shard_hash_type) {
8961 case RGWBucketInfo::MOD:
8962 if (!bucket_info.num_shards) {
8963 if (shard_id) {
8964 *shard_id = -1;
8965 }
8966 } else {
8967 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, bucket_info.num_shards);
8968 if (shard_id) {
8969 *shard_id = (int)sid;
8970 }
8971 }
8972 break;
8973 default:
8974 r = -ENOTSUP;
8975 }
8976 return r;
8977 }
8978
8979 uint64_t RGWRados::instance_id()
8980 {
8981 return get_rados_handle()->get_instance_id();
8982 }
8983
8984 uint64_t RGWRados::next_bucket_id()
8985 {
8986 std::lock_guard l{bucket_id_lock};
8987 return ++max_bucket_id;
8988 }
8989
8990 librados::Rados* RGWRados::get_rados_handle()
8991 {
8992 return &rados;
8993 }
8994
8995 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
8996 {
8997 rgw_rados_ref ref;
8998 int ret = get_raw_obj_ref(obj, &ref);
8999 if (ret < 0) {
9000 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9001 return ret;
9002 }
9003
9004 ObjectWriteOperation op;
9005 list<string> prefixes;
9006 cls_rgw_remove_obj(op, prefixes);
9007
9008 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9009 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
9010 if (ret < 0) {
9011 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9012 c->release();
9013 return ret;
9014 }
9015
9016 handles.push_back(c);
9017
9018 return 0;
9019 }
9020
// Asynchronously delete the head object `obj`, optionally keeping the
// bucket index consistent by bracketing the removal with index
// prepare/delete operations.
//
// obj                   - head object to remove
// bucket_info           - bucket the object belongs to
// astate                - cached object state; its write_tag is used for
//                         the index prepare op and its mtime for the
//                         index delete
// handles               - out: the in-flight AioCompletion is appended;
//                         caller must wait on it and release it
// keep_index_consistent - when true, prepare the index entry before the
//                         aio delete and remove it afterwards
// y                     - optional yield context for the index prepare
//
// Returns 0 on success or a negative error code.
int RGWRados::delete_obj_aio(const rgw_obj& obj,
                             RGWBucketInfo& bucket_info, RGWObjState *astate,
                             list<librados::AioCompletion *>& handles, bool keep_index_consistent,
                             optional_yield y)
{
  rgw_rados_ref ref;
  int ret = get_obj_head_ref(bucket_info, obj, &ref);
  if (ret < 0) {
    lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
    return ret;
  }

  if (keep_index_consistent) {
    RGWRados::Bucket bop(this, bucket_info);
    RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

    // mark the index entry as having a pending delete before the aio
    // removal of the head object is issued
    ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag, y);
    if (ret < 0) {
      lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
      return ret;
    }
  }

  ObjectWriteOperation op;
  list<string> prefixes;
  cls_rgw_remove_obj(op, prefixes);

  AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
  ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
  if (ret < 0) {
    lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
    c->release();
    return ret;
  }

  handles.push_back(c);

  if (keep_index_consistent) {
    // remove the index entry; note this is issued without waiting for
    // the aio head-object delete above to complete
    ret = delete_obj_index(obj, astate->mtime);
    if (ret < 0) {
      lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
      return ret;
    }
  }
  return ret;
}