1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab ft=cpp
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <sstream>
9
10 #include <boost/algorithm/string.hpp>
11 #include <string_view>
12
13 #include <boost/container/flat_set.hpp>
14 #include <boost/format.hpp>
15 #include <boost/optional.hpp>
16 #include <boost/utility/in_place_factory.hpp>
17
18 #include "common/ceph_json.h"
19
20 #include "common/errno.h"
21 #include "common/Formatter.h"
22 #include "common/Throttle.h"
23
24 #include "rgw_sal.h"
25 #include "rgw_zone.h"
26 #include "rgw_cache.h"
27 #include "rgw_acl.h"
28 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
29 #include "rgw_aio_throttle.h"
30 #include "rgw_bucket.h"
31 #include "rgw_rest_conn.h"
32 #include "rgw_cr_rados.h"
33 #include "rgw_cr_rest.h"
34 #include "rgw_datalog.h"
35 #include "rgw_putobj_processor.h"
36
37 #include "cls/rgw/cls_rgw_ops.h"
38 #include "cls/rgw/cls_rgw_client.h"
39 #include "cls/rgw/cls_rgw_const.h"
40 #include "cls/refcount/cls_refcount_client.h"
41 #include "cls/version/cls_version_client.h"
42 #include "osd/osd_types.h"
43
44 #include "rgw_tools.h"
45 #include "rgw_coroutine.h"
46 #include "rgw_compression.h"
47 #include "rgw_etag_verifier.h"
48 #include "rgw_worker.h"
49 #include "rgw_notify.h"
50
51 #undef fork // fails to compile RGWPeriod::fork() below
52
53 #include "common/Clock.h"
54
55 using namespace librados;
56
57 #include <string>
58 #include <iostream>
59 #include <vector>
60 #include <atomic>
61 #include <list>
62 #include <map>
63 #include "include/random.h"
64
65 #include "rgw_gc.h"
66 #include "rgw_lc.h"
67
68 #include "rgw_object_expirer_core.h"
69 #include "rgw_sync.h"
70 #include "rgw_sync_counters.h"
71 #include "rgw_sync_trace.h"
72 #include "rgw_trim_datalog.h"
73 #include "rgw_trim_mdlog.h"
74 #include "rgw_data_sync.h"
75 #include "rgw_realm_watcher.h"
76 #include "rgw_reshard.h"
77
78 #include "services/svc_zone.h"
79 #include "services/svc_zone_utils.h"
80 #include "services/svc_quota.h"
81 #include "services/svc_sync_modules.h"
82 #include "services/svc_sys_obj.h"
83 #include "services/svc_sys_obj_cache.h"
84 #include "services/svc_bucket.h"
85 #include "services/svc_mdlog.h"
86
87 #include "compressor/Compressor.h"
88
89 #ifdef WITH_LTTNG
90 #define TRACEPOINT_DEFINE
91 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92 #include "tracing/rgw_rados.h"
93 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
94 #undef TRACEPOINT_DEFINE
95 #else
96 #define tracepoint(...)
97 #endif
98
99 #define dout_context g_ceph_context
100 #define dout_subsys ceph_subsys_rgw
101
102
103 static string shadow_ns = "shadow";
104 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
105 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
106
107 static RGWObjCategory main_category = RGWObjCategory::Main;
108 #define RGW_USAGE_OBJ_PREFIX "usage."
109
111
112
113 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
114 const rgw_placement_rule& head_placement_rule,
115 const rgw_obj& obj, rgw_pool *pool)
116 {
117 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
118 RGWZonePlacementInfo placement;
119 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
120 return false;
121 }
122
123 if (!obj.in_extra_data) {
124 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
125 } else {
126 *pool = placement.get_data_extra_pool();
127 }
128 }
129
130 return true;
131 }
132
133 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
134 const rgw_placement_rule& head_placement_rule,
135 const rgw_obj& obj, rgw_raw_obj *raw_obj)
136 {
137 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
138
139 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
140 }
141
142 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
143 {
144 if (!is_raw) {
145 rgw_raw_obj r;
146 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
147 return r;
148 }
149 return raw_obj;
150 }
151
152 rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RGWStore* store) const
153 {
154 if (!is_raw) {
155 rgw_raw_obj r;
156 store->get_raw_obj(placement_rule, obj, &r);
157 return r;
158 }
159 return raw_obj;
160 }
161
162 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
163 {
164 obj_version *check_objv = version_for_check();
165
166 if (check_objv) {
167 cls_version_check(*op, *check_objv, VER_COND_EQ);
168 }
169
170 cls_version_read(*op, &read_version);
171 }
172
173 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
174 {
175 obj_version *check_objv = version_for_check();
176 obj_version *modify_version = version_for_write();
177
178 if (check_objv) {
179 cls_version_check(*op, *check_objv, VER_COND_EQ);
180 }
181
182 if (modify_version) {
183 cls_version_set(*op, *modify_version);
184 } else {
185 cls_version_inc(*op);
186 }
187 }
188
189 void RGWObjVersionTracker::apply_write()
190 {
191 const bool checked = (read_version.ver != 0);
192 const bool incremented = (write_version.ver == 0); // prepare_op_for_write() used cls_version_inc() rather than cls_version_set()
193
194 if (checked && incremented) {
195 // apply cls_version_inc() so our next operation can recheck it
196 ++read_version.ver;
197 } else {
198 read_version = write_version;
199 }
200 write_version = obj_version();
201 }
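// Illustrative sketch (not code used by this file): how a caller typically
// pairs the tracker with a read/modify/write cycle. `ioctx` and `oid` are
// hypothetical placeholders, not names defined here.
//
//   RGWObjVersionTracker objv;
//
//   librados::ObjectReadOperation rop;
//   objv.prepare_op_for_read(&rop);   // records the current version in objv.read_version
//   bufferlist bl;
//   ioctx.operate(oid, &rop, &bl);
//
//   librados::ObjectWriteOperation wop;
//   objv.prepare_op_for_write(&wop);  // re-checks the version we read, then bumps it
//   ioctx.operate(oid, &wop);
//   objv.apply_write();               // fold the bumped version back into read_version
//
// If a concurrent writer changed the object in between, the cls_version_check()
// added by prepare_op_for_write() fails the write (typically with -ECANCELED)
// and the caller can re-read and retry.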
202
203 RGWObjState::RGWObjState() {
204 }
205
206 RGWObjState::~RGWObjState() {
207 }
208
209 RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
210 is_atomic = rhs.is_atomic;
211 has_attrs = rhs.has_attrs;
212 exists = rhs.exists;
213 size = rhs.size;
214 accounted_size = rhs.accounted_size;
215 mtime = rhs.mtime;
216 epoch = rhs.epoch;
217 if (rhs.obj_tag.length()) {
218 obj_tag = rhs.obj_tag;
219 }
220 if (rhs.tail_tag.length()) {
221 tail_tag = rhs.tail_tag;
222 }
223 write_tag = rhs.write_tag;
224 fake_tag = rhs.fake_tag;
225 manifest = rhs.manifest;
226 shadow_obj = rhs.shadow_obj;
227 has_data = rhs.has_data;
228 if (rhs.data.length()) {
229 data = rhs.data;
230 }
231 prefetch_data = rhs.prefetch_data;
232 keep_tail = rhs.keep_tail;
233 is_olh = rhs.is_olh;
234 objv_tracker = rhs.objv_tracker;
235 pg_ver = rhs.pg_ver;
236 }
237
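// Note: get_state() below uses a double-checked locking pattern -- a shared
// lock for the common lookup case, dropped and re-taken exclusively only when
// the entry has to be created (operator[] then finds or inserts it). Because
// objs_state is a std::map, the returned pointer stays valid across later
// insertions by other callers.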
238 RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
239 RGWObjState *result;
240 typename std::map<rgw_obj, RGWObjState>::iterator iter;
241 lock.lock_shared();
242 assert (!obj.empty());
243 iter = objs_state.find(obj);
244 if (iter != objs_state.end()) {
245 result = &iter->second;
246 lock.unlock_shared();
247 } else {
248 lock.unlock_shared();
249 lock.lock();
250 result = &objs_state[obj];
251 lock.unlock();
252 }
253 return result;
254 }
255
256 void RGWObjectCtx::set_atomic(rgw_obj& obj) {
257 std::unique_lock wl{lock};
258 assert (!obj.empty());
259 objs_state[obj].is_atomic = true;
260 }
261 void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
262 std::unique_lock wl{lock};
263 assert (!obj.empty());
264 objs_state[obj].prefetch_data = true;
265 }
266
267 void RGWObjectCtx::invalidate(const rgw_obj& obj) {
268 std::unique_lock wl{lock};
269 auto iter = objs_state.find(obj);
270 if (iter == objs_state.end()) {
271 return;
272 }
273 bool is_atomic = iter->second.is_atomic;
274 bool prefetch_data = iter->second.prefetch_data;
275
276 objs_state.erase(iter);
277
278 if (is_atomic || prefetch_data) {
279 auto& state = objs_state[obj];
280 state.is_atomic = is_atomic;
281 state.prefetch_data = prefetch_data;
282 }
283 }
284
285 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
286 {
287 write_version.ver = 1;
288 #define TAG_LEN 24
289
290 write_version.tag.clear();
291 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
292 }
293
294 class RGWMetaNotifierManager : public RGWCoroutinesManager {
295 RGWRados *store;
296 RGWHTTPManager http_manager;
297
298 public:
299 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
300 http_manager(store->ctx(), completion_mgr) {
301 http_manager.start();
302 }
303
304 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
305 rgw_http_param_pair pairs[] = { { "type", "metadata" },
306 { "notify", NULL },
307 { NULL, NULL } };
308
309 list<RGWCoroutinesStack *> stacks;
310 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
311 RGWRESTConn *conn = iter->second;
312 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
313 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
314
315 stacks.push_back(stack);
316 }
317 return run(dpp, stacks);
318 }
319 };
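// Both notifier managers (RGWMetaNotifierManager above, RGWDataNotifierManager
// below) fan the notification out as an HTTP POST to /admin/log on every peer
// connection, one coroutine stack per peer, and run() waits for all of them.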
320
321 class RGWDataNotifierManager : public RGWCoroutinesManager {
322 RGWRados *store;
323 RGWHTTPManager http_manager;
324
325 public:
326 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
327 http_manager(store->ctx(), completion_mgr) {
328 http_manager.start();
329 }
330
331 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
332 bc::flat_map<int, bc::flat_set<string> >& shards) {
333 rgw_http_param_pair pairs[] = { { "type", "data" },
334 { "notify", NULL },
335 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
336 { NULL, NULL } };
337
338 list<RGWCoroutinesStack *> stacks;
339 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
340 RGWRESTConn *conn = iter->second;
341 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
342 stack->call(new RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
343
344 stacks.push_back(stack);
345 }
346 return run(dpp, stacks);
347 }
348 };
349
350 /* class RGWRadosThread */
351
352 void RGWRadosThread::start()
353 {
354 worker = new Worker(cct, this);
355 worker->create(thread_name.c_str());
356 }
357
358 void RGWRadosThread::stop()
359 {
360 down_flag = true;
361 stop_process();
362 if (worker) {
363 worker->signal();
364 worker->join();
365 }
366 delete worker;
367 worker = NULL;
368 }
369
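// Generic worker loop: call process(), then sleep for whatever remains of
// interval_msec() (re-read on every pass so runtime reconfiguration takes
// effect); an interval of 0 means "run once, then block until signalled or
// stopped".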
370 void *RGWRadosThread::Worker::entry() {
371 uint64_t msec = processor->interval_msec();
372 auto interval = std::chrono::milliseconds(msec);
373
374 do {
375 auto start = ceph::real_clock::now();
376 int r = processor->process(this);
377 if (r < 0) {
378 ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
379 }
380
381 if (processor->going_down())
382 break;
383
384 auto end = ceph::real_clock::now() - start;
385
386 uint64_t cur_msec = processor->interval_msec();
387 if (cur_msec != msec) { /* was it reconfigured? */
388 msec = cur_msec;
389 interval = std::chrono::milliseconds(msec);
390 }
391
392 if (cur_msec > 0) {
393 if (interval <= end)
394 continue; // next round
395
396 auto wait_time = interval - end;
397 wait_interval(wait_time);
398 } else {
399 wait();
400 }
401 } while (!processor->going_down());
402
403 return NULL;
404 }
405
406 class RGWMetaNotifier : public RGWRadosThread {
407 RGWMetaNotifierManager notify_mgr;
408 RGWMetadataLog *const log;
409
410 uint64_t interval_msec() override {
411 return cct->_conf->rgw_md_notify_interval_msec;
412 }
413 void stop_process() override {
414 notify_mgr.stop();
415 }
416 public:
417 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
418 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
419
420 int process(const DoutPrefixProvider *dpp) override;
421 };
422
423 int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
424 {
425 set<int> shards;
426
427 log->read_clear_modified(shards);
428
429 if (shards.empty()) {
430 return 0;
431 }
432
433 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
434 ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
435 }
436
437 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
438
439 return 0;
440 }
441
442 class RGWDataNotifier : public RGWRadosThread {
443 RGWDataNotifierManager notify_mgr;
444
445 uint64_t interval_msec() override {
446 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
447 }
448 void stop_process() override {
449 notify_mgr.stop();
450 }
451 public:
452 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
453
454 int process(const DoutPrefixProvider *dpp) override;
455 };
456
457 int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
458 {
459 auto data_log = store->svc.datalog_rados;
460 if (!data_log) {
461 return 0;
462 }
463
464 auto shards = data_log->read_clear_modified();
465
466 if (shards.empty()) {
467 return 0;
468 }
469
470 for (const auto& [shard_id, keys] : shards) {
471 ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
472 << shard_id << ": " << keys << dendl;
473 }
474
475 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
476
477 return 0;
478 }
479
480 class RGWSyncProcessorThread : public RGWRadosThread {
481 public:
482 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
483 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
484 ~RGWSyncProcessorThread() override {}
485 int init(const DoutPrefixProvider *dpp) override = 0 ;
486 int process(const DoutPrefixProvider *dpp) override = 0;
487 };
488
489 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
490 {
491 RGWMetaSyncStatusManager sync;
492
493 uint64_t interval_msec() override {
494 return 0; /* no interval associated, it'll run once until stopped */
495 }
496 void stop_process() override {
497 sync.stop();
498 }
499 public:
500 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados)
501 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
502
503 void wakeup_sync_shards(set<int>& shard_ids) {
504 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
505 sync.wakeup(*iter);
506 }
507 }
508 RGWMetaSyncStatusManager* get_manager() { return &sync; }
509
510 int init(const DoutPrefixProvider *dpp) override {
511 int ret = sync.init(dpp);
512 if (ret < 0) {
513 ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
514 return ret;
515 }
516 return 0;
517 }
518
519 int process(const DoutPrefixProvider *dpp) override {
520 sync.run(dpp, null_yield);
521 return 0;
522 }
523 };
524
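// Data sync may not be able to initialize until its source zone is reachable,
// so unlike the meta sync thread this one reports a non-zero interval
// (DATA_SYNC_INIT_WAIT_SEC) until sync.init() succeeds; process() retries the
// init on each pass and only then runs the sync continuously.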
525 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
526 {
527 PerfCountersRef counters;
528 RGWDataSyncStatusManager sync;
529 bool initialized;
530
531 uint64_t interval_msec() override {
532 if (initialized) {
533 return 0; /* no interval associated, it'll run once until stopped */
534 } else {
535 #define DATA_SYNC_INIT_WAIT_SEC 20
536 return DATA_SYNC_INIT_WAIT_SEC * 1000;
537 }
538 }
539 void stop_process() override {
540 sync.stop();
541 }
542 public:
543 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados,
544 const RGWZone* source_zone)
545 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
546 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
547 sync(_store, async_rados, source_zone->id, counters.get()),
548 initialized(false) {}
549
550 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
551 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
552 sync.wakeup(iter->first, iter->second);
553 }
554 }
555 RGWDataSyncStatusManager* get_manager() { return &sync; }
556
557 int init(const DoutPrefixProvider *dpp) override {
558 return 0;
559 }
560
561 int process(const DoutPrefixProvider *dpp) override {
562 while (!initialized) {
563 if (going_down()) {
564 return 0;
565 }
566 int ret = sync.init(dpp);
567 if (ret >= 0) {
568 initialized = true;
569 break;
570 }
571 /* we'll be back! */
572 return 0;
573 }
574 sync.run(dpp);
575 return 0;
576 }
577 };
578
579 class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
580 {
581 RGWCoroutinesManager crs;
582 rgw::sal::RGWRadosStore *store;
583 rgw::BucketTrimManager *bucket_trim;
584 RGWHTTPManager http;
585 const utime_t trim_interval;
586
587 uint64_t interval_msec() override { return 0; }
588 void stop_process() override { crs.stop(); }
589 public:
590 RGWSyncLogTrimThread(rgw::sal::RGWRadosStore *store, rgw::BucketTrimManager *bucket_trim,
591 int interval)
592 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
593 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
594 bucket_trim(bucket_trim),
595 http(store->ctx(), crs.get_completion_mgr()),
596 trim_interval(interval, 0)
597 {}
598
599 int init(const DoutPrefixProvider *dpp) override {
600 return http.start();
601 }
602 int process(const DoutPrefixProvider *dpp) override {
603 list<RGWCoroutinesStack*> stacks;
604 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
605 meta->call(create_meta_log_trim_cr(this, store, &http,
606 cct->_conf->rgw_md_log_max_shards,
607 trim_interval));
608 stacks.push_back(meta);
609
610 if (store->svc()->zone->sync_module_exports_data()) {
611 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
612 data->call(create_data_log_trim_cr(this, store, &http,
613 cct->_conf->rgw_data_log_num_shards,
614 trim_interval));
615 stacks.push_back(data);
616
617 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
618 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
619 stacks.push_back(bucket);
620 }
621
622 crs.run(dpp, stacks);
623 return 0;
624 }
625
626 // implements DoutPrefixProvider
627 CephContext *get_cct() const override { return store->ctx(); }
628 unsigned get_subsys() const override
629 {
630 return dout_subsys;
631 }
632
633 std::ostream& gen_prefix(std::ostream& out) const override
634 {
635 return out << "sync log trim: ";
636 }
637
638 };
639
640 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
641 {
642 std::lock_guard l{meta_sync_thread_lock};
643 if (meta_sync_processor_thread) {
644 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
645 }
646 }
647
648 void RGWRados::wakeup_data_sync_shards(const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
649 {
650 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
651 std::lock_guard l{data_sync_thread_lock};
652 auto iter = data_sync_processor_threads.find(source_zone);
653 if (iter == data_sync_processor_threads.end()) {
654 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
655 return;
656 }
657
658 RGWDataSyncProcessorThread *thread = iter->second;
659 ceph_assert(thread);
660 thread->wakeup_sync_shards(shard_ids);
661 }
662
663 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
664 {
665 std::lock_guard l{meta_sync_thread_lock};
666 if (meta_sync_processor_thread) {
667 return meta_sync_processor_thread->get_manager();
668 }
669 return nullptr;
670 }
671
672 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
673 {
674 std::lock_guard l{data_sync_thread_lock};
675 auto thread = data_sync_processor_threads.find(source_zone);
676 if (thread == data_sync_processor_threads.end()) {
677 return nullptr;
678 }
679 return thread->second->get_manager();
680 }
681
682 int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
683 {
684 IoCtx ioctx;
685 int r = open_pool_ctx(dpp, pool, ioctx, false);
686 if (r < 0) {
687 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
688 return r;
689 }
690
691 bool requires;
692 r = ioctx.pool_requires_alignment2(&requires);
693 if (r < 0) {
694 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
695 << r << dendl;
696 return r;
697 }
698
699 if (!requires) {
700 *alignment = 0;
701 return 0;
702 }
703
704 uint64_t align;
705 r = ioctx.pool_required_alignment2(&align);
706 if (r < 0) {
707 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
708 << r << dendl;
709 return r;
710 }
711 if (align != 0) {
712 ldout(cct, 20) << "required alignment=" << align << dendl;
713 }
714 *alignment = align;
715 return 0;
716 }
717
718 void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
719 {
720 if (alignment == 0) {
721 *max_size = size;
722 return;
723 }
724
725 if (size <= alignment) {
726 *max_size = alignment;
727 return;
728 }
729
730 *max_size = size - (size % alignment);
731 }
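// Example: size=10 MiB with alignment=4 MiB yields max_size=8 MiB (the largest
// multiple of the alignment not exceeding size); if size <= alignment the
// alignment itself is returned so at least one aligned chunk can be written.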
732
733 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
734 {
735 uint64_t alignment;
736 int r = get_required_alignment(dpp, pool, &alignment);
737 if (r < 0) {
738 return r;
739 }
740
741 if (palignment) {
742 *palignment = alignment;
743 }
744
745 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
746
747 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
748
749 ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
750
751 return 0;
752 }
753
754 int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
755 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
756 {
757 rgw_pool pool;
758 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
759 ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
760 return -EIO;
761 }
762 return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
763 }
764
765 class RGWIndexCompletionManager;
766
767 struct complete_op_data {
768 ceph::mutex lock = ceph::make_mutex("complete_op_data");
769 AioCompletion *rados_completion{nullptr};
770 int manager_shard_id{-1};
771 RGWIndexCompletionManager *manager{nullptr};
772 rgw_obj obj;
773 RGWModifyOp op;
774 string tag;
775 rgw_bucket_entry_ver ver;
776 cls_rgw_obj_key key;
777 rgw_bucket_dir_entry_meta dir_meta;
778 list<cls_rgw_obj_key> remove_objs;
779 bool log_op;
780 uint16_t bilog_op;
781 rgw_zone_set zones_trace;
782
783 bool stopped{false};
784
785 void stop() {
786 std::lock_guard l{lock};
787 stopped = true;
788 }
789 };
790
791 class RGWIndexCompletionThread : public RGWRadosThread, public DoutPrefixProvider {
792 RGWRados *store;
793
794 uint64_t interval_msec() override {
795 return 0;
796 }
797
798 list<complete_op_data *> completions;
799
800 ceph::mutex completions_lock =
801 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
802 public:
803 RGWIndexCompletionThread(RGWRados *_store)
804 : RGWRadosThread(_store, "index-complete"), store(_store) {}
805
806 int process(const DoutPrefixProvider *dpp) override;
807
808 void add_completion(complete_op_data *completion) {
809 {
810 std::lock_guard l{completions_lock};
811 completions.push_back(completion);
812 }
813
814 signal();
815 }
816
817 CephContext *get_cct() const override { return store->ctx(); }
818 unsigned get_subsys() const override { return dout_subsys; }
819 std::ostream& gen_prefix(std::ostream& out) const override { return out << "rgw index completion thread: "; }
820 };
821
822 int RGWIndexCompletionThread::process(const DoutPrefixProvider *dpp)
823 {
824 list<complete_op_data *> comps;
825
826 {
827 std::lock_guard l{completions_lock};
828 completions.swap(comps);
829 }
830
831 for (auto c : comps) {
832 std::unique_ptr<complete_op_data> up{c};
833
834 if (going_down()) {
835 continue;
836 }
837 ldpp_dout(this, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
838
839 RGWRados::BucketShard bs(store);
840 RGWBucketInfo bucket_info;
841
842 int r = bs.init(c->obj.bucket, c->obj, &bucket_info, this);
843 if (r < 0) {
844 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
845 /* not much to do */
846 continue;
847 }
848
849 r = store->guard_reshard(this, &bs, c->obj, bucket_info,
850 [&](RGWRados::BucketShard *bs) -> int {
851 librados::ObjectWriteOperation o;
852 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
853 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
854 c->log_op, c->bilog_op, &c->zones_trace);
855 return bs->bucket_obj.operate(this, &o, null_yield);
856 });
857 if (r < 0) {
858 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
859 /* ignoring error, can't do anything about it */
860 continue;
861 }
862 r = store->svc.datalog_rados->add_entry(this, bucket_info, bs.shard_id);
863 if (r < 0) {
864 ldpp_dout(this, -1) << "ERROR: failed writing data log" << dendl;
865 }
866 }
867
868 return 0;
869 }
870
871 class RGWIndexCompletionManager {
872 RGWRados *store{nullptr};
873 ceph::containers::tiny_vector<ceph::mutex> locks;
874 vector<set<complete_op_data *> > completions;
875
876 RGWIndexCompletionThread *completion_thread{nullptr};
877
878 int num_shards;
879
880 std::atomic<int> cur_shard {0};
881
882
883 public:
884 RGWIndexCompletionManager(RGWRados *_store) :
885 store(_store),
886 locks{ceph::make_lock_container<ceph::mutex>(
887 store->ctx()->_conf->rgw_thread_pool_size,
888 [](const size_t i) {
889 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
890 std::to_string(i));
891 })}
892 {
893 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
894 completions.resize(num_shards);
895 }
896 ~RGWIndexCompletionManager() {
897 stop();
898 }
899
900 int next_shard() {
901 int result = cur_shard % num_shards;
902 cur_shard++;
903 return result;
904 }
905
906 void create_completion(const rgw_obj& obj,
907 RGWModifyOp op, string& tag,
908 rgw_bucket_entry_ver& ver,
909 const cls_rgw_obj_key& key,
910 rgw_bucket_dir_entry_meta& dir_meta,
911 list<cls_rgw_obj_key> *remove_objs, bool log_op,
912 uint16_t bilog_op,
913 rgw_zone_set *zones_trace,
914 complete_op_data **result);
915 bool handle_completion(completion_t cb, complete_op_data *arg);
916
917 int start(const DoutPrefixProvider *dpp) {
918 completion_thread = new RGWIndexCompletionThread(store);
919 int ret = completion_thread->init(dpp);
920 if (ret < 0) {
921 return ret;
922 }
923 completion_thread->start();
924 return 0;
925 }
926 void stop() {
927 if (completion_thread) {
928 completion_thread->stop();
929 delete completion_thread;
930 }
931
932 for (int i = 0; i < num_shards; ++i) {
933 std::lock_guard l{locks[i]};
934 for (auto c : completions[i]) {
935 c->stop();
936 }
937 }
938 completions.clear();
939 }
940 };
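// Completion flow (as wired up below): create_completion() registers an
// AioCompletion whose callback is obj_complete_cb(); handle_completion() drops
// entries that completed normally and forwards any that failed with
// -ERR_BUSY_RESHARDING to RGWIndexCompletionThread, which retries the bucket
// index completion under guard_reshard() and then appends a data log entry.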
941
942 static void obj_complete_cb(completion_t cb, void *arg)
943 {
944 complete_op_data *completion = (complete_op_data *)arg;
945 completion->lock.lock();
946 if (completion->stopped) {
947 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
948 delete completion;
949 return;
950 }
951 bool need_delete = completion->manager->handle_completion(cb, completion);
952 completion->lock.unlock();
953 if (need_delete) {
954 delete completion;
955 }
956 }
957
958
959 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
960 RGWModifyOp op, string& tag,
961 rgw_bucket_entry_ver& ver,
962 const cls_rgw_obj_key& key,
963 rgw_bucket_dir_entry_meta& dir_meta,
964 list<cls_rgw_obj_key> *remove_objs, bool log_op,
965 uint16_t bilog_op,
966 rgw_zone_set *zones_trace,
967 complete_op_data **result)
968 {
969 complete_op_data *entry = new complete_op_data;
970
971 int shard_id = next_shard();
972
973 entry->manager_shard_id = shard_id;
974 entry->manager = this;
975 entry->obj = obj;
976 entry->op = op;
977 entry->tag = tag;
978 entry->ver = ver;
979 entry->key = key;
980 entry->dir_meta = dir_meta;
981 entry->log_op = log_op;
982 entry->bilog_op = bilog_op;
983
984 if (remove_objs) {
985 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
986 entry->remove_objs.push_back(*iter);
987 }
988 }
989
990 if (zones_trace) {
991 entry->zones_trace = *zones_trace;
992 } else {
993 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
994 }
995
996 *result = entry;
997
998 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
999
1000 std::lock_guard l{locks[shard_id]};
1001 completions[shard_id].insert(entry);
1002 }
1003
1004 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1005 {
1006 int shard_id = arg->manager_shard_id;
1007 {
1008 std::lock_guard l{locks[shard_id]};
1009
1010 auto& comps = completions[shard_id];
1011
1012 auto iter = comps.find(arg);
1013 if (iter == comps.end()) {
1014 return true;
1015 }
1016
1017 comps.erase(iter);
1018 }
1019
1020 int r = rados_aio_get_return_value(cb);
1021 if (r != -ERR_BUSY_RESHARDING) {
1022 return true;
1023 }
1024 completion_thread->add_completion(arg);
1025 return false;
1026 }
1027
1028 void RGWRados::finalize()
1029 {
1030 if (run_sync_thread) {
1031 std::lock_guard l{meta_sync_thread_lock};
1032 meta_sync_processor_thread->stop();
1033
1034 std::lock_guard dl{data_sync_thread_lock};
1035 for (auto iter : data_sync_processor_threads) {
1036 RGWDataSyncProcessorThread *thread = iter.second;
1037 thread->stop();
1038 }
1039 if (sync_log_trimmer) {
1040 sync_log_trimmer->stop();
1041 }
1042 }
1043 if (run_sync_thread) {
1044 delete meta_sync_processor_thread;
1045 meta_sync_processor_thread = NULL;
1046 std::lock_guard dl{data_sync_thread_lock};
1047 for (auto iter : data_sync_processor_threads) {
1048 RGWDataSyncProcessorThread *thread = iter.second;
1049 delete thread;
1050 }
1051 data_sync_processor_threads.clear();
1052 delete sync_log_trimmer;
1053 sync_log_trimmer = nullptr;
1054 bucket_trim = boost::none;
1055 }
1056 if (meta_notifier) {
1057 meta_notifier->stop();
1058 delete meta_notifier;
1059 }
1060 if (data_notifier) {
1061 data_notifier->stop();
1062 delete data_notifier;
1063 }
1064 delete sync_tracer;
1065
1066 delete lc;
1067 lc = NULL;
1068
1069 delete gc;
1070 gc = NULL;
1071
1072 delete obj_expirer;
1073 obj_expirer = NULL;
1074
1075 RGWQuotaHandler::free_handler(quota_handler);
1076 if (cr_registry) {
1077 cr_registry->put();
1078 }
1079
1080 svc.shutdown();
1081
1082 delete binfo_cache;
1083 delete obj_tombstone_cache;
1084
1085 if (reshard_wait.get()) {
1086 reshard_wait->stop();
1087 reshard_wait.reset();
1088 }
1089
1090 if (run_reshard_thread) {
1091 reshard->stop_processor();
1092 }
1093 delete reshard;
1094 delete index_completion_manager;
1095
1096 rgw::notify::shutdown();
1097 }
1098
1099 /**
1100 * Initialize the RADOS instance and prepare to do other ops
1101 * Returns 0 on success, -ERR# on failure.
1102 */
1103 int RGWRados::init_rados()
1104 {
1105 int ret = 0;
1106
1107 ret = rados.init_with_context(cct);
1108 if (ret < 0) {
1109 return ret;
1110 }
1111 ret = rados.connect();
1112 if (ret < 0) {
1113 return ret;
1114 }
1115
1116 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1117 new RGWCoroutinesManagerRegistry(cct)};
1118 ret = crs->hook_to_admin_command("cr dump");
1119 if (ret < 0) {
1120 return ret;
1121 }
1122
1123 cr_registry = crs.release();
1124 return ret;
1125 }
1126
1127 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
1128 {
1129 string name = cct->_conf->name.get_id();
1130 if (name.compare(0, 4, "rgw.") == 0) {
1131 name = name.substr(4);
1132 }
1133 map<string,string> metadata = meta;
1134 metadata["num_handles"] = "1"s;
1135 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1136 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1137 metadata["zone_name"] = svc.zone->zone_name();
1138 metadata["zone_id"] = svc.zone->zone_id().id;
1139 metadata["realm_name"] = svc.zone->get_realm().get_name();
1140 metadata["realm_id"] = svc.zone->get_realm().get_id();
1141 metadata["id"] = name;
1142 int ret = rados.service_daemon_register(
1143 daemon_type,
1144 stringify(rados.get_instance_id()),
1145 metadata);
1146 if (ret < 0) {
1147 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1148 return ret;
1149 }
1150
1151 return 0;
1152 }
1153
1154 int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
1155 {
1156 int ret = rados.service_daemon_update_status(move(status));
1157 if (ret < 0) {
1158 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1159 return ret;
1160 }
1161
1162 return 0;
1163 }
1164
1165 /**
1166  * Complete store initialization: open the control pools and start background workers.
1167 * Returns 0 on success, -ERR# on failure.
1168 */
1169 int RGWRados::init_complete(const DoutPrefixProvider *dpp)
1170 {
1171 int ret;
1172
1173 /*
1174 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1175 */
1176 sync_module = svc.sync_modules->get_sync_module();
1177
1178 ret = open_root_pool_ctx(dpp);
1179 if (ret < 0)
1180 return ret;
1181
1182 ret = open_gc_pool_ctx(dpp);
1183 if (ret < 0)
1184 return ret;
1185
1186 ret = open_lc_pool_ctx(dpp);
1187 if (ret < 0)
1188 return ret;
1189
1190 ret = open_objexp_pool_ctx(dpp);
1191 if (ret < 0)
1192 return ret;
1193
1194 ret = open_reshard_pool_ctx(dpp);
1195 if (ret < 0)
1196 return ret;
1197
1198 ret = open_notif_pool_ctx(dpp);
1199 if (ret < 0)
1200 return ret;
1201
1202 pools_initialized = true;
1203
1204 if (use_gc) {
1205 gc = new RGWGC();
1206 gc->initialize(cct, this);
1207 } else {
1208 ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
1209 }
1210
1211 obj_expirer = new RGWObjectExpirer(this->store);
1212
1213 if (use_gc_thread && use_gc) {
1214 gc->start_processor();
1215 obj_expirer->start_processor();
1216 }
1217
1218 auto& current_period = svc.zone->get_current_period();
1219 auto& zonegroup = svc.zone->get_zonegroup();
1220 auto& zone_params = svc.zone->get_zone_params();
1221 auto& zone = svc.zone->get_zone();
1222
1223 /* no point of running sync thread if we don't have a master zone configured
1224 or there is no rest_master_conn */
1225 if (!svc.zone->need_to_sync()) {
1226 run_sync_thread = false;
1227 }
1228
1229 if (svc.zone->is_meta_master()) {
1230 auto md_log = svc.mdlog->get_log(current_period.get_id());
1231 meta_notifier = new RGWMetaNotifier(this, md_log);
1232 meta_notifier->start();
1233 }
1234
1235 /* init it anyway, might run sync through radosgw-admin explicitly */
1236 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1237 sync_tracer->init(this);
1238 ret = sync_tracer->hook_to_admin_command();
1239 if (ret < 0) {
1240 return ret;
1241 }
1242
1243 if (run_sync_thread) {
1244 for (const auto &pt: zonegroup.placement_targets) {
1245 if (zone_params.placement_pools.find(pt.second.name)
1246 == zone_params.placement_pools.end()){
1247 ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
1248 << pt.second.name << " present in zonegroup" << dendl;
1249 }
1250 }
1251 auto async_processor = svc.rados->get_async_processor();
1252 std::lock_guard l{meta_sync_thread_lock};
1253 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
1254 ret = meta_sync_processor_thread->init(dpp);
1255 if (ret < 0) {
1256 ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
1257 return ret;
1258 }
1259 meta_sync_processor_thread->start();
1260
1261 // configure the bucket trim manager
1262 rgw::BucketTrimConfig config;
1263 rgw::configure_bucket_trim(cct, config);
1264
1265 bucket_trim.emplace(this->store, config);
1266 ret = bucket_trim->init();
1267 if (ret < 0) {
1268 ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
1269 return ret;
1270 }
1271 svc.datalog_rados->set_observer(&*bucket_trim);
1272
1273 std::lock_guard dl{data_sync_thread_lock};
1274 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
1275 ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
1276 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
1277 ret = thread->init(dpp);
1278 if (ret < 0) {
1279 ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
1280 return ret;
1281 }
1282 thread->start();
1283 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
1284 }
1285 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1286 if (interval > 0) {
1287 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
1288 ret = sync_log_trimmer->init(dpp);
1289 if (ret < 0) {
1290 ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
1291 return ret;
1292 }
1293 sync_log_trimmer->start();
1294 }
1295 }
1296 data_notifier = new RGWDataNotifier(this);
1297 data_notifier->start();
1298
1299 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1300 binfo_cache->init(svc.cache);
1301
1302 lc = new RGWLC();
1303 lc->initialize(cct, this->store);
1304
1305 if (use_lc_thread)
1306 lc->start_processor();
1307
1308 quota_handler = RGWQuotaHandler::generate_handler(dpp, this->store, quota_threads);
1309
1310 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
1311 zone.bucket_index_max_shards);
1312 if (bucket_index_max_shards > get_max_bucket_shards()) {
1313 bucket_index_max_shards = get_max_bucket_shards();
1314 ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
1315 << get_max_bucket_shards() << dendl;
1316 }
1317 ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
1318
1319 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1320
1321 if (need_tombstone_cache) {
1322 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1323 }
1324
1325 reshard_wait = std::make_shared<RGWReshardWait>();
1326
1327 reshard = new RGWReshard(this->store);
1328
1329 /* only the master zone in the zonegroup reshards buckets */
1330 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
1331 if (run_reshard_thread) {
1332 reshard->start_processor();
1333 }
1334
1335 index_completion_manager = new RGWIndexCompletionManager(this);
1336 ret = index_completion_manager->start(dpp);
1337 if (ret < 0) {
1338 return ret;
1339 }
1340 ret = rgw::notify::init(cct, store, dpp);
1341 if (ret < 0 ) {
1342 ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
1343 }
1344
1345 return ret;
1346 }
1347
1348 int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
1349 {
1350 if (raw) {
1351 return svc.init_raw(cct, use_cache, null_yield, dpp);
1352 }
1353
1354 return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
1355 }
1356
1357 int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
1358 {
1359 return ctl.init(&svc, dpp);
1360 }
1361
1362 /**
1363 * Initialize the RADOS instance and prepare to do other ops
1364 * Returns 0 on success, -ERR# on failure.
1365 */
1366 int RGWRados::initialize(const DoutPrefixProvider *dpp)
1367 {
1368 int ret;
1369
1370 inject_notify_timeout_probability =
1371 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1372 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
1373
1374 ret = init_svc(false, dpp);
1375 if (ret < 0) {
1376 ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
1377 return ret;
1378 }
1379
1380 ret = init_ctl(dpp);
1381 if (ret < 0) {
1382 ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
1383 return ret;
1384 }
1385
1386 host_id = svc.zone_utils->gen_host_id();
1387
1388 ret = init_rados();
1389 if (ret < 0)
1390 return ret;
1391
1392 return init_complete(dpp);
1393 }
1394
1395 /**
1396 * Open the pool used as root for this gateway
1397 * Returns: 0 on success, -ERR# otherwise.
1398 */
1399 int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
1400 {
1401 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
1402 }
1403
1404 int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
1405 {
1406 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
1407 }
1408
1409 int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
1410 {
1411 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
1412 }
1413
1414 int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
1415 {
1416 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
1417 }
1418
1419 int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
1420 {
1421 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
1422 }
1423
1424 int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
1425 {
1426 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
1427 }
1428
1429 int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
1430 bool mostly_omap)
1431 {
1432 constexpr bool create = true; // create the pool if it doesn't exist
1433 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
1434 }
1435
1436 /**** logs ****/
1437
1438 struct log_list_state {
1439 string prefix;
1440 librados::IoCtx io_ctx;
1441 librados::NObjectIterator obit;
1442 };
1443
1444 int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
1445 {
1446 log_list_state *state = new log_list_state;
1447 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1448 if (r < 0) {
1449 delete state;
1450 return r;
1451 }
1452 state->prefix = prefix;
1453 state->obit = state->io_ctx.nobjects_begin();
1454 *handle = (RGWAccessHandle)state;
1455 return 0;
1456 }
1457
1458 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1459 {
1460 log_list_state *state = static_cast<log_list_state *>(handle);
1461 while (true) {
1462 if (state->obit == state->io_ctx.nobjects_end()) {
1463 delete state;
1464 return -ENOENT;
1465 }
1466 if (state->prefix.length() &&
1467 state->obit->get_oid().find(state->prefix) != 0) {
1468 state->obit++;
1469 continue;
1470 }
1471 *name = state->obit->get_oid();
1472 state->obit++;
1473 break;
1474 }
1475 return 0;
1476 }
1477
1478 int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
1479 {
1480 librados::IoCtx io_ctx;
1481 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
1482 if (r < 0)
1483 return r;
1484 return io_ctx.remove(name);
1485 }
1486
1487 struct log_show_state {
1488 librados::IoCtx io_ctx;
1489 bufferlist bl;
1490 bufferlist::const_iterator p;
1491 string name;
1492 uint64_t pos;
1493 bool eof;
1494 log_show_state() : pos(0), eof(false) {}
1495 };
1496
1497 int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
1498 {
1499 log_show_state *state = new log_show_state;
1500 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1501 if (r < 0) {
1502 delete state;
1503 return r;
1504 }
1505 state->name = name;
1506 *handle = (RGWAccessHandle)state;
1507 return 0;
1508 }
1509
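// log_show_next() streams the log object in 1 MiB chunks: it keeps any
// undecoded tail from the previous read, appends newly read bytes, decodes one
// rgw_log_entry per call, and returns 1 for an entry, 0 at end of file, or a
// negative error.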
1510 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1511 {
1512 log_show_state *state = static_cast<log_show_state *>(handle);
1513 off_t off = state->p.get_off();
1514
1515 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1516 << " off " << off
1517 << " eof " << (int)state->eof
1518 << dendl;
1519 // read some?
1520 unsigned chunk = 1024*1024;
1521 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1522 bufferlist more;
1523 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1524 if (r < 0)
1525 return r;
1526 state->pos += r;
1527 bufferlist old;
1528 try {
1529 old.substr_of(state->bl, off, state->bl.length() - off);
1530 } catch (buffer::error& err) {
1531 return -EINVAL;
1532 }
1533 state->bl = std::move(old);
1534 state->bl.claim_append(more);
1535 state->p = state->bl.cbegin();
1536 if ((unsigned)r < chunk)
1537 state->eof = true;
1538 ldout(cct, 10) << " read " << r << dendl;
1539 }
1540
1541 if (state->p.end())
1542 return 0; // end of file
1543 try {
1544 decode(*entry, state->p);
1545 }
1546 catch (const buffer::error &e) {
1547 return -EINVAL;
1548 }
1549 return 1;
1550 }
1551
1552 /**
1553 * usage_log_hash: get usage log key hash, based on name and index
1554 *
1555  * Get the usage object name. Since a user may have more than one
1556  * object holding that info (multiple shards), index selects the
1557  * shard number; once index exceeds the maximum shard count it
1558  * wraps.
1559  * If name is empty, results cover all users, and the index wraps
1560  * only after the total number of shards.
1561 *
1562 * @param cct [in] ceph context
1563 * @param name [in] user name
1564 * @param hash [out] hash value
1565 * @param index [in] shard index number
1566 */
1567 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1568 {
1569 uint32_t val = index;
1570
1571 if (!name.empty()) {
1572 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
1573 val %= max_user_shards;
1574 val += ceph_str_hash_linux(name.c_str(), name.size());
1575 }
1576 char buf[17];
1577 int max_shards = cct->_conf->rgw_usage_max_shards;
1578 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1579 hash = buf;
1580 }
1581
1582 int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1583 {
1584 uint32_t index = 0;
1585
1586 map<string, rgw_usage_log_info> log_objs;
1587
1588 string hash;
1589 string last_user;
1590
1591 /* restructure usage map, zone by object hash */
1592 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1593 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1594 const rgw_user_bucket& ub = iter->first;
1595 RGWUsageBatch& info = iter->second;
1596
1597 if (ub.user.empty()) {
1598 ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1599 continue;
1600 }
1601
1602 if (ub.user != last_user) {
1603 /* the index should ideally be random, but in most cases max user shards
1604 does not exceed 1, so simply incrementing it avoids wasting extra
1605 cycles */
1606 usage_log_hash(cct, ub.user, hash, index++);
1607 }
1608 last_user = ub.user;
1609 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1610
1611 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1612 v.push_back(miter->second);
1613 }
1614 }
1615
1616 map<string, rgw_usage_log_info>::iterator liter;
1617
1618 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1619 int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
1620 if (r < 0)
1621 return r;
1622 }
1623 return 0;
1624 }
1625
1626 int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1627 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1628 rgw_usage_log_entry>& usage)
1629 {
1630 uint32_t num = max_entries;
1631 string hash, first_hash;
1632 string user_str = user.to_str();
1633 usage_log_hash(cct, user_str, first_hash, 0);
1634
1635 if (usage_iter.index) {
1636 usage_log_hash(cct, user_str, hash, usage_iter.index);
1637 } else {
1638 hash = first_hash;
1639 }
1640
1641 usage.clear();
1642
1643 do {
1644 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1645 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1646
1647 int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
1648 usage_iter.read_iter, ret_usage, is_truncated);
1649 if (ret == -ENOENT)
1650 goto next;
1651
1652 if (ret < 0)
1653 return ret;
1654
1655 num -= ret_usage.size();
1656
1657 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1658 usage[iter->first].aggregate(iter->second);
1659 }
1660
1661 next:
1662 if (!*is_truncated) {
1663 usage_iter.read_iter.clear();
1664 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1665 }
1666 } while (num && !*is_truncated && hash != first_hash);
1667 return 0;
1668 }
1669
1670 int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
1671 {
1672 uint32_t index = 0;
1673 string hash, first_hash;
1674 string user_str = user.to_str();
1675 usage_log_hash(cct, user_str, first_hash, index);
1676
1677 hash = first_hash;
1678 do {
1679 int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
1680
1681 if (ret < 0 && ret != -ENOENT)
1682 return ret;
1683
1684 usage_log_hash(cct, user_str, hash, ++index);
1685 } while (hash != first_hash);
1686
1687 return 0;
1688 }
1689
1690
1691 int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
1692 {
1693 auto max_shards = cct->_conf->rgw_usage_max_shards;
1694 int ret=0;
1695 for (unsigned i=0; i < max_shards; i++){
1696 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
1697 ret = cls_obj_usage_log_clear(dpp, oid);
1698 if (ret < 0){
1699 ldpp_dout(dpp, 0) << "usage clear on oid=" << oid << " failed with ret=" << ret << dendl;
1700 return ret;
1701 }
1702 }
1703 return ret;
1704 }
1705
1706 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
1707 {
1708 auto i = bl.cbegin();
1709 RGWAccessControlPolicy policy(cct);
1710 try {
1711 policy.decode_owner(i);
1712 } catch (buffer::error& err) {
1713 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1714 return -EIO;
1715 }
1716 *owner = policy.get_owner();
1717 return 0;
1718 }
1719
1720 int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
1721 {
1722 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1723 if (aiter == attrset.end())
1724 return -EIO;
1725
1726 bufferlist& bl = aiter->second;
1727 auto iter = bl.cbegin();
1728 try {
1729 policy->decode(iter);
1730 } catch (buffer::error& err) {
1731 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1732 return -EIO;
1733 }
1734 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1735 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
1736 ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
1737 s3policy->to_xml(*_dout);
1738 *_dout << dendl;
1739 }
1740 return 0;
1741 }
1742
1743
1744 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
1745 {
1746 rgw_bucket bucket = bucket_info.bucket;
1747 bucket.update_bucket_id(new_bucket_id);
1748
1749 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
1750
1751 bucket_info.objv_tracker.clear();
1752 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
1753 if (ret < 0) {
1754 return ret;
1755 }
1756
1757 return 0;
1758 }
1759
1760
1761 /**
1762 * Get ordered listing of the objects in a bucket.
1763 *
1764 * max_p: maximum number of results to return
1765 * bucket: bucket to list contents of
1766 * prefix: only return results that match this prefix
1767 * delim: delimiter; results whose names contain it after the prefix are
1768 * not returned individually. The matching portion of their name (up to and
1769 * including the delimiter) is inserted in common_prefixes with a "true" mark.
1770 * marker: if filled in, begin the listing with this object.
1771 * end_marker: if filled in, end the listing with this object.
1772 * result: the objects are put in here.
1773 * common_prefixes: if delim is filled in, any matching prefixes are
1774 * placed here.
1775 * is_truncated: set to true if there were more matching entries than
1776 * max, i.e. the listing was cut short.
1777 */
1778 int RGWRados::Bucket::List::list_objects_ordered(
1779 const DoutPrefixProvider *dpp,
1780 int64_t max_p,
1781 vector<rgw_bucket_dir_entry> *result,
1782 map<string, bool> *common_prefixes,
1783 bool *is_truncated,
1784 optional_yield y)
1785 {
1786 RGWRados *store = target->get_store();
1787 CephContext *cct = store->ctx();
1788 int shard_id = target->get_shard_id();
1789
1790 int count = 0;
1791 bool truncated = true;
1792 bool cls_filtered = false;
1793 const int64_t max = // protect against memory issues and negative vals
1794 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
1795 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
1796
1797 result->clear();
1798
1799 // use a local marker; either the marker will have a previous entry
1800 // or it will be empty; either way it's OK to copy
1801 rgw_obj_key marker_obj(params.marker.name,
1802 params.marker.instance,
1803 params.ns.empty() ? params.marker.ns : params.ns);
1804 rgw_obj_index_key cur_marker;
1805 marker_obj.get_index_key(&cur_marker);
1806
1807 rgw_obj_key end_marker_obj(params.end_marker.name,
1808 params.end_marker.instance,
1809 params.ns.empty() ? params.end_marker.ns : params.ns);
1810 rgw_obj_index_key cur_end_marker;
1811 end_marker_obj.get_index_key(&cur_end_marker);
1812 const bool cur_end_marker_valid = !params.end_marker.empty();
1813
1814 rgw_obj_key prefix_obj(params.prefix);
1815 prefix_obj.set_ns(params.ns);
1816 string cur_prefix = prefix_obj.get_index_key_name();
1817 string after_delim_s; /* needed in !params.delim.empty() AND later */
1818
1819 if (!params.delim.empty()) {
1820 after_delim_s = cls_rgw_after_delim(params.delim);
1821 /* if marker points at a common prefix, fast forward it into its
1822 * upper bound string */
1823 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
1824 if (delim_pos >= 0) {
1825 string s = cur_marker.name.substr(0, delim_pos);
1826 s.append(after_delim_s);
1827 cur_marker = s;
1828 }
1829 }
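 // e.g. (illustrative): with delim="/" and marker "photos/2021/cat.jpg", the
 // marker becomes "photos" + after_delim_s, which sorts just past every key
 // sharing the "photos/" common prefix, so the next CLS listing resumes after
 // that whole group instead of re-reading it.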
1830
1831 rgw_obj_index_key prev_marker;
1832 for (uint16_t attempt = 1; /* empty */; ++attempt) {
1833 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
1834 " starting attempt " << attempt << dendl;
1835
1836 if (attempt > 1 && !(prev_marker < cur_marker)) {
1837 // we've failed to make forward progress
1838 ldpp_dout(dpp, 0) << "RGWRados::Bucket::List::" << __func__ <<
1839 ": ERROR marker failed to make forward progress; attempt=" << attempt <<
1840 ", prev_marker=" << prev_marker <<
1841 ", cur_marker=" << cur_marker << dendl;
1842 break;
1843 }
1844 prev_marker = cur_marker;
1845
1846 ent_map_t ent_map;
1847 ent_map.reserve(read_ahead);
1848 int r = store->cls_bucket_list_ordered(dpp,
1849 target->get_bucket_info(),
1850 shard_id,
1851 cur_marker,
1852 cur_prefix,
1853 params.delim,
1854 read_ahead + 1 - count,
1855 params.list_versions,
1856 attempt,
1857 ent_map,
1858 &truncated,
1859 &cls_filtered,
1860 &cur_marker,
1861 y);
1862 if (r < 0) {
1863 return r;
1864 }
1865
1866 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
1867 rgw_bucket_dir_entry& entry = eiter->second;
1868 rgw_obj_index_key index_key = entry.key;
1869 rgw_obj_key obj(index_key);
1870
1871 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
1872 " considering entry " << entry.key << dendl;
1873
1874 /* note that parse_raw_oid() here will not set the correct
1875 * object's instance, as rgw_obj_index_key encodes that
1876 * separately. We don't need to set the instance because it's
1877 * not needed for the checks here and we end up using the raw
1878 * entry for the return vector
1879 */
1880 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1881 if (!valid) {
1882 ldpp_dout(dpp, 0) << "ERROR: could not parse object name: " <<
1883 obj.name << dendl;
1884 continue;
1885 }
1886
1887 bool matched_ns = (obj.ns == params.ns);
1888 if (!params.list_versions && !entry.is_visible()) {
1889 continue;
1890 }
1891
1892 if (params.enforce_ns && !matched_ns) {
1893 if (!params.ns.empty()) {
1894 /* we've iterated past the namespace we're searching -- done now */
1895 truncated = false;
1896 goto done;
1897 }
1898
1899 /* we're not looking at the namespace this object is in, next! */
1900 continue;
1901 }
1902
1903 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1904 truncated = false;
1905 goto done;
1906 }
1907
1908 if (count < max) {
1909 params.marker = index_key;
1910 next_marker = index_key;
1911 }
1912
1913 if (params.filter &&
1914 ! params.filter->filter(obj.name, index_key.name)) {
1915 continue;
1916 }
1917
1918 if (params.prefix.size() &&
1919 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
1920 continue;
1921 }
1922
1923 if (!params.delim.empty()) {
1924 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1925 if (delim_pos >= 0) {
1926 // run either the code where delimiter filtering is done a)
1927 // in the OSD/CLS or b) here.
1928 if (cls_filtered) {
1929 // NOTE: this branch is for newer versions of the OSD
1930 // that do the delimiter filtering on the CLS side
1931
1932 // should only find one delimiter at the end if it finds any
1933 // after the prefix
1934 if (delim_pos !=
1935 int(obj.name.length() - params.delim.length())) {
1936 ldpp_dout(dpp, 0) <<
1937 "WARNING: found delimiter in place other than the end of "
1938 "the prefix; obj.name=" << obj.name <<
1939 ", prefix=" << params.prefix << dendl;
1940 }
1941 if (common_prefixes) {
1942 if (count >= max) {
1943 truncated = true;
1944 goto done;
1945 }
1946
1947 (*common_prefixes)[obj.name] = true;
1948 count++;
1949 }
1950
1951 continue;
1952 } else {
1953 // NOTE: this branch is for older versions of the OSD
1954 // that do not filter on the CLS side, so the following code
1955 // must do the filtering; once we reach version 16 of ceph,
1956 // this code can be removed along with the conditional that
1957 // leads here
1958
1959 /* extract key -with trailing delimiter- for CommonPrefix */
1960 string prefix_key =
1961 obj.name.substr(0, delim_pos + params.delim.length());
1962
1963 if (common_prefixes &&
1964 common_prefixes->find(prefix_key) == common_prefixes->end()) {
1965 if (count >= max) {
1966 truncated = true;
1967 goto done;
1968 }
1969 next_marker = prefix_key;
1970 (*common_prefixes)[prefix_key] = true;
1971
1972 count++;
1973 }
1974
1975 continue;
1976 } // if we're running an older OSD version
1977 } // if a delimiter was found after prefix
1978 } // if a delimiter was passed in
1979
1980 if (count >= max) {
1981 truncated = true;
1982 goto done;
1983 }
1984
1985 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
1986 " adding entry " << entry.key << " to result" << dendl;
1987
1988 result->emplace_back(std::move(entry));
1989 count++;
1990 } // eiter for loop
1991
1992 // NOTE: the following conditional is needed by older versions of
1993 // the OSD that don't do delimiter filtering on the CLS side; once
1994 // we reach version 16 of ceph, the following conditional and the
1995 // code within can be removed
1996 if (!cls_filtered && !params.delim.empty()) {
1997 int marker_delim_pos =
1998 cur_marker.name.find(params.delim, cur_prefix.size());
1999 if (marker_delim_pos >= 0) {
2000 std::string skip_after_delim =
2001 cur_marker.name.substr(0, marker_delim_pos);
2002 skip_after_delim.append(after_delim_s);
2003
2004 ldpp_dout(dpp, 20) << "skip_after_delim=" << skip_after_delim << dendl;
2005
2006 if (skip_after_delim > cur_marker.name) {
2007 cur_marker = skip_after_delim;
2008 ldpp_dout(dpp, 20) << "setting cur_marker="
2009 << cur_marker.name
2010 << "[" << cur_marker.instance << "]"
2011 << dendl;
2012 }
2013 }
2014 } // if older osd didn't do delimiter filtering
2015
2016 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
2017 " INFO end of outer loop, truncated=" << truncated <<
2018 ", count=" << count << ", attempt=" << attempt << dendl;
2019
2020 if (!truncated || count >= (max + 1) / 2) {
2021 // if we finished listing, or if we're returning at least half the
2022 // requested entries, that's enough; S3 and Swift protocols allow
2023 // returning fewer than max entries
2024 break;
2025 } else if (attempt > 8 && count >= 1) {
2026 // if we've made at least 8 attempts and we have some, but very
2027 // few, results, return with what we have
2028 break;
2029 }
2030 } // for (uint16_t attempt...
2031
2032 done:
2033
2034 if (is_truncated) {
2035 *is_truncated = truncated;
2036 }
2037
2038 return 0;
2039 } // list_objects_ordered
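// Note (added for clarity, not in the original source): the early-exit
// heuristic above means a caller asking for max entries may legitimately get
// fewer back. For example, with max=1000 the loop stops once the listing is
// exhausted, once at least (max + 1) / 2 = 500 filtered entries have been
// collected, or after more than 8 attempts have yielded at least one entry;
// both the S3 and Swift protocols allow returning fewer than max results.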
2040
2041
2042 /**
2043 * Get listing of the objects in a bucket and allow the results to be out
2044 * of order.
2045 *
2046 * Even though there are key differences with the ordered counterpart,
2047 * the parameters are the same to maintain some compatibility.
2048 *
2049 * max: maximum number of results to return
2050 * bucket: bucket to list contents of
2051 * prefix: only return results that match this prefix
2052 * delim: should not be set; if it is, an error should already have been raised
2053 * marker: if filled in, begin the listing with this object.
2054 * end_marker: if filled in, end the listing with this object.
2055 * result: the objects are put in here.
2056 * common_prefixes: this is never filled with an unordered list; the param
2057 * is maintained for compatibility
2058 * is_truncated: set to true if the number of matching objects in the bucket
2059 * exceeds max.
2060 */
2061 int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
2062 int64_t max_p,
2063 vector<rgw_bucket_dir_entry> *result,
2064 map<string, bool> *common_prefixes,
2065 bool *is_truncated,
2066 optional_yield y)
2067 {
2068 RGWRados *store = target->get_store();
2069 int shard_id = target->get_shard_id();
2070
2071 int count = 0;
2072 bool truncated = true;
2073
2074 const int64_t max = // protect against memory issues and negative vals
2075 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2076
2077 // read a few extra in each call to cls_bucket_list_unordered in
2078 // case some are filtered out due to namespace matching, versioning,
2079 // filtering, etc.
2080 const int64_t max_read_ahead = 100;
2081 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2082
2083 result->clear();
2084
2085 // use a local marker; either the marker will have a previous entry
2086 // or it will be empty; either way it's OK to copy
2087 rgw_obj_key marker_obj(params.marker.name,
2088 params.marker.instance,
2089 params.ns.empty() ? params.marker.ns : params.ns);
2090 rgw_obj_index_key cur_marker;
2091 marker_obj.get_index_key(&cur_marker);
2092
2093 rgw_obj_key end_marker_obj(params.end_marker.name,
2094 params.end_marker.instance,
2095 params.ns.empty() ? params.end_marker.ns : params.ns);
2096 rgw_obj_index_key cur_end_marker;
2097 end_marker_obj.get_index_key(&cur_end_marker);
2098 const bool cur_end_marker_valid = !params.end_marker.empty();
2099
2100 rgw_obj_key prefix_obj(params.prefix);
2101 prefix_obj.set_ns(params.ns);
2102 string cur_prefix = prefix_obj.get_index_key_name();
2103
2104 while (truncated && count <= max) {
2105 std::vector<rgw_bucket_dir_entry> ent_list;
2106 ent_list.reserve(read_ahead);
2107
2108 int r = store->cls_bucket_list_unordered(dpp,
2109 target->get_bucket_info(),
2110 shard_id,
2111 cur_marker,
2112 cur_prefix,
2113 read_ahead,
2114 params.list_versions,
2115 ent_list,
2116 &truncated,
2117 &cur_marker,
2118 y);
2119 if (r < 0)
2120 return r;
2121
2122 // NB: while regions of ent_list will be sorted, we have no
2123 // guarantee that all items will be sorted since they can cross
2124 // shard boundaries
2125
2126 for (auto& entry : ent_list) {
2127 rgw_obj_index_key index_key = entry.key;
2128 rgw_obj_key obj(index_key);
2129
2130 if (count < max) {
2131 params.marker.set(index_key);
2132 next_marker.set(index_key);
2133 }
2134
2135 /* note that parse_raw_oid() here will not set the object's
2136 * instance correctly, as rgw_obj_index_key encodes that
2137 * separately. We don't need to set the instance because it's
2138 * not needed for the checks here and we end up using the raw
2139 * entry for the return vector
2140 */
2141 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2142 if (!valid) {
2143 ldpp_dout(dpp, 0) << "ERROR: could not parse object name: " <<
2144 obj.name << dendl;
2145 continue;
2146 }
2147
2148 if (!params.list_versions && !entry.is_visible()) {
2149 continue;
2150 }
2151
2152 if (params.enforce_ns && obj.ns != params.ns) {
2153 continue;
2154 }
2155
2156 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2157 // we're not guaranteed items will come in order, so we have
2158 // to loop through all
2159 continue;
2160 }
2161
2162 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2163 continue;
2164
2165 if (params.prefix.size() &&
2166 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2167 continue;
2168
2169 if (count >= max) {
2170 truncated = true;
2171 goto done;
2172 }
2173
2174 result->emplace_back(std::move(entry));
2175 count++;
2176 } // for (auto& entry : ent_list)
2177 } // while (truncated && count <= max)
2178
2179 done:
2180 if (is_truncated)
2181 *is_truncated = truncated;
2182
2183 return 0;
2184 } // list_objects_unordered
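// Illustrative usage sketch (not part of the original source; names and
// values here are assumptions for illustration): callers typically drive
// these listing methods through RGWRados::Bucket::List, e.g.
//
//   RGWRados::Bucket target(store, bucket_info);
//   RGWRados::Bucket::List list_op(&target);
//   list_op.params.prefix = "photos/";
//   list_op.params.delim = "/";   // ordered listing only
//   std::vector<rgw_bucket_dir_entry> entries;
//   std::map<std::string, bool> prefixes;
//   bool truncated = false;
//   int r = list_op.list_objects(dpp, 1000, &entries, &prefixes,
//                                &truncated, null_yield);
//
// On success, entries holds up to 1000 objects, prefixes holds the common
// prefixes collapsed by the delimiter, and truncated indicates whether a
// follow-up call (with params.marker advanced) is needed.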
2185
2186
2187 /**
2188 * create a rados pool and its associated meta info
2189 * returns 0 on success, -ERR# otherwise.
2190 */
2191 int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
2192 {
2193 librados::IoCtx io_ctx;
2194 constexpr bool create = true;
2195 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
2196 }
2197
2198 void RGWRados::create_bucket_id(string *bucket_id)
2199 {
2200 uint64_t iid = instance_id();
2201 uint64_t bid = next_bucket_id();
2202 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2203 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2204 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2205 *bucket_id = buf;
2206 }
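// For illustration (not in the original source): the generated id has the
// form "<zone_id>.<instance_id>.<bucket_counter>", so a hypothetical zone id
// "zone1" with instance id 4133 and counter 57 yields "zone1.4133.57".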
2207
2208 int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
2209 const string& zonegroup_id,
2210 const rgw_placement_rule& placement_rule,
2211 const string& swift_ver_location,
2212 const RGWQuotaInfo * pquota_info,
2213 map<std::string, bufferlist>& attrs,
2214 RGWBucketInfo& info,
2215 obj_version *pobjv,
2216 obj_version *pep_objv,
2217 real_time creation_time,
2218 rgw_bucket *pmaster_bucket,
2219 uint32_t *pmaster_num_shards,
2220 optional_yield y,
2221 const DoutPrefixProvider *dpp,
2222 bool exclusive)
2223 {
2224 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2225 rgw_placement_rule selected_placement_rule;
2226 RGWZonePlacementInfo rule_info;
2227
2228 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2229 int ret = 0;
2230 ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
2231 &selected_placement_rule, &rule_info, y);
2232 if (ret < 0)
2233 return ret;
2234
2235 if (!pmaster_bucket) {
2236 create_bucket_id(&bucket.marker);
2237 bucket.bucket_id = bucket.marker;
2238 } else {
2239 bucket.marker = pmaster_bucket->marker;
2240 bucket.bucket_id = pmaster_bucket->bucket_id;
2241 }
2242
2243 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2244
2245 objv_tracker.read_version.clear();
2246
2247 if (pobjv) {
2248 objv_tracker.write_version = *pobjv;
2249 } else {
2250 objv_tracker.generate_new_write_ver(cct);
2251 }
2252
2253 info.bucket = bucket;
2254 info.owner = owner.user_id;
2255 info.zonegroup = zonegroup_id;
2256 info.placement_rule = selected_placement_rule;
2257 info.swift_ver_location = swift_ver_location;
2258 info.swift_versioning = (!swift_ver_location.empty());
2259
2260 init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
2261 pmaster_num_shards ?
2262 std::optional{*pmaster_num_shards} :
2263 std::nullopt,
2264 rule_info.index_type);
2265
2266 info.requester_pays = false;
2267 if (real_clock::is_zero(creation_time)) {
2268 info.creation_time = ceph::real_clock::now();
2269 } else {
2270 info.creation_time = creation_time;
2271 }
2272 if (pquota_info) {
2273 info.quota = *pquota_info;
2274 }
2275
2276 int r = svc.bi->init_index(dpp, info);
2277 if (r < 0) {
2278 return r;
2279 }
2280
2281 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
2282 if (ret == -ECANCELED) {
2283 ret = -EEXIST;
2284 }
2285 if (ret == -EEXIST) {
2286 /* we need to reread the info and return it, caller will have a use for it */
2287 RGWBucketInfo orig_info;
2288 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
2289 if (r < 0) {
2290 if (r == -ENOENT) {
2291 continue;
2292 }
2293 ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
2294 return r;
2295 }
2296
2297 /* only remove it if it's a different bucket instance */
2298 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
2299 int r = svc.bi->clean_index(dpp, info);
2300 if (r < 0) {
2301 ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
2302 }
2303 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
2304 if (r < 0) {
2305 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
2306 /* continue anyway */
2307 }
2308 }
2309
2310 info = std::move(orig_info);
2311 /* ret == -EEXIST here */
2312 }
2313 return ret;
2314 }
2315
2316 /* this is highly unlikely */
2317 ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2318 return -ENOENT;
2319 }
2320
2321 bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
2322 {
2323 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2324 }
2325
2326 bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2327 {
2328 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
2329
2330 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
2331 }
2332
2333 int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2334 {
2335 string oid, key;
2336 get_obj_bucket_and_oid_loc(obj, oid, key);
2337
2338 rgw_pool pool;
2339 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2340 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2341 return -EIO;
2342 }
2343
2344 int r = open_pool_ctx(dpp, pool, *ioctx, false);
2345 if (r < 0) {
2346 return r;
2347 }
2348
2349 ioctx->locator_set_key(key);
2350
2351 return 0;
2352 }
2353
2354 int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2355 {
2356 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
2357
2358 rgw_pool pool;
2359 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2360 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2361 return -EIO;
2362 }
2363
2364 ref->pool = svc.rados->pool(pool);
2365
2366 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
2367 .set_mostly_omap(false));
2368 if (r < 0) {
2369 ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
2370 return r;
2371 }
2372
2373 ref->pool.ioctx().locator_set_key(ref->obj.loc);
2374
2375 return 0;
2376 }
2377
2378 int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
2379 {
2380 ref->obj = obj;
2381
2382 if (ref->obj.oid.empty()) {
2383 ref->obj.oid = obj.pool.to_str();
2384 ref->obj.pool = svc.zone->get_zone_params().domain_root;
2385 }
2386 ref->pool = svc.rados->pool(obj.pool);
2387 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
2388 .set_mostly_omap(false));
2389 if (r < 0) {
2390 ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
2391 return r;
2392 }
2393
2394 ref->pool.ioctx().locator_set_key(ref->obj.loc);
2395
2396 return 0;
2397 }
2398
2399 int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
2400 {
2401 return get_raw_obj_ref(dpp, obj, ref);
2402 }
2403
2404 /*
2405 * fixes an issue where head objects were supposed to have a locator created, but ended
2406 * up without one
2407 */
2408 int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2409 {
2410 const rgw_bucket& bucket = bucket_info.bucket;
2411 string oid;
2412 string locator;
2413
2414 rgw_obj obj(bucket, key);
2415
2416 get_obj_bucket_and_oid_loc(obj, oid, locator);
2417
2418 if (locator.empty()) {
2419 ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
2420 return 0;
2421 }
2422
2423 librados::IoCtx ioctx;
2424
2425 int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
2426 if (ret < 0) {
2427 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2428 return ret;
2429 }
2430 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2431
2432 uint64_t size;
2433 bufferlist data;
2434
2435 struct timespec mtime_ts;
2436 map<string, bufferlist> attrs;
2437 librados::ObjectReadOperation op;
2438 op.getxattrs(&attrs, NULL);
2439 op.stat2(&size, &mtime_ts, NULL);
2440 #define HEAD_SIZE (512 * 1024)
2441 op.read(0, HEAD_SIZE, &data, NULL);
2442
2443 ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
2444 if (ret < 0) {
2445 ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
2446 return ret;
2447 }
2448
2449 if (size > HEAD_SIZE) {
2450 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
2451 return -EIO;
2452 }
2453
2454 if (size != data.length()) {
2455 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
2456 return -EIO;
2457 }
2458
2459 if (copy_obj) {
2460 librados::ObjectWriteOperation wop;
2461
2462 wop.mtime2(&mtime_ts);
2463
2464 map<string, bufferlist>::iterator iter;
2465 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2466 wop.setxattr(iter->first.c_str(), iter->second);
2467 }
2468
2469 wop.write(0, data);
2470
2471 ioctx.locator_set_key(locator);
2472 rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
2473 }
2474
2475 if (remove_bad) {
2476 ioctx.locator_set_key(string());
2477
2478 ret = ioctx.remove(oid);
2479 if (ret < 0) {
2480 ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
2481 return ret;
2482 }
2483 }
2484
2485 return 0;
2486 }
2487
2488 int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
2489 librados::IoCtx& src_ioctx,
2490 const string& src_oid, const string& src_locator,
2491 librados::IoCtx& dst_ioctx,
2492 const string& dst_oid, const string& dst_locator)
2493 {
2494
2495 #define COPY_BUF_SIZE (4 * 1024 * 1024)
2496 bool done = false;
2497 uint64_t chunk_size = COPY_BUF_SIZE;
2498 uint64_t ofs = 0;
2499 int ret = 0;
2500 real_time mtime;
2501 struct timespec mtime_ts;
2502 uint64_t size;
2503
2504 if (src_oid == dst_oid && src_locator == dst_locator) {
2505 return 0;
2506 }
2507
2508 src_ioctx.locator_set_key(src_locator);
2509 dst_ioctx.locator_set_key(dst_locator);
2510
2511 do {
2512 bufferlist data;
2513 ObjectReadOperation rop;
2514 ObjectWriteOperation wop;
2515
2516 if (ofs == 0) {
2517 rop.stat2(&size, &mtime_ts, NULL);
2518 mtime = real_clock::from_timespec(mtime_ts);
2519 }
2520 rop.read(ofs, chunk_size, &data, NULL);
2521 ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
2522 if (ret < 0) {
2523 goto done_err;
2524 }
2525
2526 if (data.length() == 0) {
2527 break;
2528 }
2529
2530 if (ofs == 0) {
2531 wop.create(true); /* make it exclusive */
2532 wop.mtime2(&mtime_ts);
2533 mtime = real_clock::from_timespec(mtime_ts);
2534 }
2535 wop.write(ofs, data);
2536 ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
2537 if (ret < 0) {
2538 goto done_err;
2539 }
2540 ofs += data.length();
2541 done = data.length() != chunk_size;
2542 } while (!done);
2543
2544 if (ofs != size) {
2545 ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
2546 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2547 ret = -EIO;
2548 goto done_err;
2549 }
2550
2551 src_ioctx.remove(src_oid);
2552
2553 return 0;
2554
2555 done_err:
2556 // TODO: clean up dst_oid if we created it
2557 ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
2558 return ret;
2559 }
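// Illustrative usage (not part of the original source): fix_tail_obj_locator()
// below uses this helper to relocate a tail object that was written under a
// bad locator, roughly as
//
//   r = move_rados_obj(dpp, src_ioctx, oid, bad_loc,   // read side
//                      ioctx, oid, locator);           // write side
//
// The copy proceeds in COPY_BUF_SIZE (4 MiB) chunks and only removes the
// source object after the copied length matches the source size.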
2560
2561 /*
2562 * fixes an issue where tail objects were supposed to have a locator created, but ended
2563 * up without one
2564 */
2565 int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
2566 {
2567 const rgw_bucket& bucket = bucket_info.bucket;
2568 rgw_obj obj(bucket, key);
2569
2570 if (need_fix) {
2571 *need_fix = false;
2572 }
2573
2574 rgw_rados_ref ref;
2575 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
2576 if (r < 0) {
2577 return r;
2578 }
2579
2580 RGWObjState *astate = NULL;
2581 RGWObjectCtx rctx(this->store);
2582 r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
2583 if (r < 0)
2584 return r;
2585
2586 if (astate->manifest) {
2587 RGWObjManifest::obj_iterator miter;
2588 RGWObjManifest& manifest = *astate->manifest;
2589 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
2590 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
2591 rgw_obj loc;
2592 string oid;
2593 string locator;
2594
2595 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
2596
2597 if (loc.key.ns.empty()) {
2598 /* continue, we're only interested in tail objects */
2599 continue;
2600 }
2601
2602 auto& ioctx = ref.pool.ioctx();
2603
2604 get_obj_bucket_and_oid_loc(loc, oid, locator);
2605 ref.pool.ioctx().locator_set_key(locator);
2606
2607 ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
2608
2609 r = ioctx.stat(oid, NULL, NULL);
2610 if (r != -ENOENT) {
2611 continue;
2612 }
2613
2614 string bad_loc;
2615 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2616
2617 /* create a new ioctx with the bad locator */
2618 librados::IoCtx src_ioctx;
2619 src_ioctx.dup(ioctx);
2620 src_ioctx.locator_set_key(bad_loc);
2621
2622 r = src_ioctx.stat(oid, NULL, NULL);
2623 if (r != 0) {
2624 /* cannot find a broken part */
2625 continue;
2626 }
2627 ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
2628 if (need_fix) {
2629 *need_fix = true;
2630 }
2631 if (fix) {
2632 r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
2633 if (r < 0) {
2634 ldpp_dout(dpp, -1) << "ERROR: move_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
2635 }
2636 }
2637 }
2638 }
2639
2640 return 0;
2641 }
2642
2643 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2644 const rgw_obj& obj,
2645 RGWBucketInfo* bucket_info_out,
2646 const DoutPrefixProvider *dpp)
2647 {
2648 bucket = _bucket;
2649
2650 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2651
2652 RGWBucketInfo bucket_info;
2653 RGWBucketInfo* bucket_info_p =
2654 bucket_info_out ? bucket_info_out : &bucket_info;
2655
2656 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
2657 if (ret < 0) {
2658 return ret;
2659 }
2660
2661 string oid;
2662
2663 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
2664 if (ret < 0) {
2665 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2666 return ret;
2667 }
2668 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
2669
2670 return 0;
2671 }
2672
2673 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2674 int sid, const rgw::bucket_index_layout_generation& idx_layout,
2675 RGWBucketInfo* bucket_info_out,
2676 const DoutPrefixProvider *dpp)
2677 {
2678 bucket = _bucket;
2679 shard_id = sid;
2680
2681 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2682
2683
2684 RGWBucketInfo bucket_info;
2685 RGWBucketInfo* bucket_info_p =
2686 bucket_info_out ? bucket_info_out : &bucket_info;
2687 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
2688 if (ret < 0) {
2689 return ret;
2690 }
2691
2692 string oid;
2693
2694 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, shard_id, idx_layout, &bucket_obj);
2695 if (ret < 0) {
2696 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2697 return ret;
2698 }
2699 ldpp_dout(dpp, 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
2700
2701 return 0;
2702 }
2703
2704 int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
2705 const rgw_obj& obj)
2706 {
2707 bucket = bucket_info.bucket;
2708
2709 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
2710 obj.get_hash_object(),
2711 &bucket_obj,
2712 &shard_id);
2713 if (ret < 0) {
2714 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2715 return ret;
2716 }
2717 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
2718
2719 return 0;
2720 }
2721
2722 int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int sid)
2723 {
2724 bucket = bucket_info.bucket;
2725 shard_id = sid;
2726
2727 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, idx_layout, &bucket_obj);
2728 if (ret < 0) {
2729 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2730 return ret;
2731 }
2732 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
2733
2734 return 0;
2735 }
2736
2737
2738 /* Execute @handler on last item in bucket listing for bucket specified
2739 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2740 * to objects matching these criteria. */
2741 int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
2742 RGWBucketInfo& bucket_info,
2743 const std::string& obj_prefix,
2744 const std::string& obj_delim,
2745 std::function<int(const rgw_bucket_dir_entry&)> handler)
2746 {
2747 RGWRados::Bucket target(this, bucket_info);
2748 RGWRados::Bucket::List list_op(&target);
2749
2750 list_op.params.prefix = obj_prefix;
2751 list_op.params.delim = obj_delim;
2752
2753 ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
2754 << ", obj_prefix=" << obj_prefix
2755 << ", obj_delim=" << obj_delim
2756 << dendl;
2757
2758 bool is_truncated = false;
2759
2760 boost::optional<rgw_bucket_dir_entry> last_entry;
2761 /* We need to rewind to the last object in a listing. */
2762 do {
2763 /* List bucket entries in chunks. */
2764 static constexpr int MAX_LIST_OBJS = 100;
2765 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2766
2767 int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
2768 &is_truncated, null_yield);
2769 if (ret < 0) {
2770 return ret;
2771 } else if (!entries.empty()) {
2772 last_entry = entries.back();
2773 }
2774 } while (is_truncated);
2775
2776 if (last_entry) {
2777 return handler(*last_entry);
2778 }
2779
2780 /* Empty listing - no items we can run handler on. */
2781 return 0;
2782 }
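// Illustrative usage (not from the original source): swift_versioning_restore()
// below passes a lambda as @handler so it can act on the newest archived copy,
// along the lines of
//
//   ret = on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
//           [&](const rgw_bucket_dir_entry& entry) -> int {
//             /* copy the archived entry back, then remove it */
//             return 0;
//           });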
2783
2784 bool RGWRados::swift_versioning_enabled(rgw::sal::RGWBucket* bucket) const
2785 {
2786 return bucket->get_info().has_swift_versioning() &&
2787 bucket->get_info().swift_ver_location.size();
2788 }
2789
2790 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2791 const rgw_user& user,
2792 rgw::sal::RGWBucket* bucket,
2793 rgw::sal::RGWObject* obj,
2794 const DoutPrefixProvider *dpp,
2795 optional_yield y)
2796 {
2797 if (! swift_versioning_enabled(bucket)) {
2798 return 0;
2799 }
2800
2801 obj->set_atomic(&obj_ctx);
2802
2803 RGWObjState * state = nullptr;
2804 int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj->get_obj(), &state, false, y);
2805 if (r < 0) {
2806 return r;
2807 }
2808
2809 if (!state->exists) {
2810 return 0;
2811 }
2812
2813 const string& src_name = obj->get_oid();
2814 char buf[src_name.size() + 32];
2815 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2816 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2817 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
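/* For illustration (not in the original source): a 9-character source name
 * such as "photo.jpg" written at time 1622544000.123456 becomes
 * "009photo.jpg/1622544000.123456"; the leading %03x field encodes the
 * source name length in hex. */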
2818
2819 RGWBucketInfo dest_bucket_info;
2820
2821 r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
2822 if (r < 0) {
2823 ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
2824 if (r == -ENOENT) {
2825 return -ERR_PRECONDITION_FAILED;
2826 }
2827 return r;
2828 }
2829
2830 if (dest_bucket_info.owner != bucket->get_info().owner) {
2831 return -ERR_PRECONDITION_FAILED;
2832 }
2833
2834 rgw::sal::RGWRadosBucket dest_bucket(store, dest_bucket_info);
2835 rgw::sal::RGWRadosObject dest_obj(store, rgw_obj_key(buf), &dest_bucket);
2836
2837 if (dest_bucket_info.versioning_enabled()){
2838 dest_obj.gen_rand_obj_instance_name();
2839 }
2840
2841 dest_obj.set_atomic(&obj_ctx);
2842
2843 rgw_zone_id no_zone;
2844
2845 r = copy_obj(obj_ctx,
2846 user,
2847 NULL, /* req_info *info */
2848 no_zone,
2849 &dest_obj,
2850 obj,
2851 &dest_bucket,
2852 bucket,
2853 bucket->get_placement_rule(),
2854 NULL, /* time_t *src_mtime */
2855 NULL, /* time_t *mtime */
2856 NULL, /* const time_t *mod_ptr */
2857 NULL, /* const time_t *unmod_ptr */
2858 false, /* bool high_precision_time */
2859 NULL, /* const char *if_match */
2860 NULL, /* const char *if_nomatch */
2861 RGWRados::ATTRSMOD_NONE,
2862 true, /* bool copy_if_newer */
2863 state->attrset,
2864 RGWObjCategory::Main,
2865 0, /* uint64_t olh_epoch */
2866 real_time(), /* time_t delete_at */
2867 NULL, /* string *version_id */
2868 NULL, /* string *ptag */
2869 NULL, /* string *petag */
2870 NULL, /* void (*progress_cb)(off_t, void *) */
2871 NULL, /* void *progress_data */
2872 dpp,
2873 null_yield);
2874 if (r == -ECANCELED || r == -ENOENT) {
2875 /* Has already been overwritten, meaning another rgw process already
2876 * copied it out */
2877 return 0;
2878 }
2879
2880 return r;
2881 }
2882
2883 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
2884 const rgw_user& user,
2885 rgw::sal::RGWBucket* bucket,
2886 rgw::sal::RGWObject* obj,
2887 bool& restored, /* out */
2888 const DoutPrefixProvider *dpp)
2889 {
2890 if (! swift_versioning_enabled(bucket)) {
2891 return 0;
2892 }
2893
2894 /* Bucket info of the bucket that stores previous versions of our object. */
2895 RGWBucketInfo archive_binfo;
2896
2897 int ret = get_bucket_info(&svc, bucket->get_tenant(),
2898 bucket->get_info().swift_ver_location,
2899 archive_binfo, nullptr, null_yield, nullptr);
2900 if (ret < 0) {
2901 return ret;
2902 }
2903
2904 /* Abort the operation if the bucket storing our archive belongs to someone
2905 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
2906 * into consideration. For now we can live with that.
2907 *
2908 * TODO: delegate this check to an upper layer and compare with ACLs. */
2909 if (bucket->get_info().owner != archive_binfo.owner) {
2910 return -EPERM;
2911 }
2912
2913 /* This code will be executed on the latest version of the object. */
2914 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
2915 rgw_zone_id no_zone;
2916
2917 /* We don't support object versioning of Swift API on those buckets that
2918 * are already versioned using the S3 mechanism. This affects also bucket
2919 * storing archived objects. Otherwise the delete operation would create
2920 * a deletion marker. */
2921 if (archive_binfo.versioned()) {
2922 restored = false;
2923 return -ERR_PRECONDITION_FAILED;
2924 }
2925
2926 /* We are requesting ATTRSMOD_NONE so the attrs argument is
2927 * irrelevant and may safely be left empty. */
2928 std::map<std::string, ceph::bufferlist> no_attrs;
2929
2930 rgw::sal::RGWRadosBucket archive_bucket(store, archive_binfo);
2931 rgw::sal::RGWRadosObject archive_obj(store, entry.key, &archive_bucket);
2932
2933 if (bucket->versioning_enabled()){
2934 obj->gen_rand_obj_instance_name();
2935 }
2936
2937 archive_obj.set_atomic(&obj_ctx);
2938 obj->set_atomic(&obj_ctx);
2939
2940 int ret = copy_obj(obj_ctx,
2941 user,
2942 nullptr, /* req_info *info */
2943 no_zone,
2944 obj, /* dest obj */
2945 &archive_obj, /* src obj */
2946 bucket, /* dest bucket info */
2947 &archive_bucket, /* src bucket info */
2948 bucket->get_placement_rule(), /* placement_rule */
2949 nullptr, /* time_t *src_mtime */
2950 nullptr, /* time_t *mtime */
2951 nullptr, /* const time_t *mod_ptr */
2952 nullptr, /* const time_t *unmod_ptr */
2953 false, /* bool high_precision_time */
2954 nullptr, /* const char *if_match */
2955 nullptr, /* const char *if_nomatch */
2956 RGWRados::ATTRSMOD_NONE,
2957 true, /* bool copy_if_newer */
2958 no_attrs,
2959 RGWObjCategory::Main,
2960 0, /* uint64_t olh_epoch */
2961 real_time(), /* time_t delete_at */
2962 nullptr, /* string *version_id */
2963 nullptr, /* string *ptag */
2964 nullptr, /* string *petag */
2965 nullptr, /* void (*progress_cb)(off_t, void *) */
2966 nullptr, /* void *progress_data */
2967 dpp,
2968 null_yield);
2969 if (ret == -ECANCELED || ret == -ENOENT) {
2970 /* Has already been overwritten, meaning another rgw process already
2971 * copied it out */
2972 return 0;
2973 } else if (ret < 0) {
2974 return ret;
2975 } else {
2976 restored = true;
2977 }
2978
2979 /* Need to remove the archived copy. */
2980 ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj.get_obj(),
2981 archive_binfo.versioning_status());
2982
2983 return ret;
2984 };
2985
2986 const std::string& obj_name = obj->get_oid();
2987 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
2988 % obj_name);
2989
2990 return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
2991 handler);
2992 }
2993
2994 int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
2995 uint64_t size, uint64_t accounted_size,
2996 map<string, bufferlist>& attrs,
2997 bool assume_noent, bool modify_tail,
2998 void *_index_op, optional_yield y)
2999 {
3000 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
3001 RGWRados *store = target->get_store();
3002
3003 ObjectWriteOperation op;
3004 #ifdef WITH_LTTNG
3005 const struct req_state* s = get_req_state();
3006 string req_id;
3007 if (!s) {
3008 // fake req_id
3009 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3010 } else {
3011 req_id = s->req_id;
3012 }
3013 #endif
3014
3015 RGWObjState *state;
3016 int r = target->get_state(dpp, &state, false, y, assume_noent);
3017 if (r < 0)
3018 return r;
3019
3020 rgw_obj& obj = target->get_obj();
3021
3022 if (obj.get_oid().empty()) {
3023 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
3024 return -EIO;
3025 }
3026
3027 rgw_rados_ref ref;
3028 r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
3029 if (r < 0)
3030 return r;
3031
3032 bool is_olh = state->is_olh;
3033
3034 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3035
3036 const string *ptag = meta.ptag;
3037 if (!ptag && !index_op->get_optag()->empty()) {
3038 ptag = index_op->get_optag();
3039 }
3040 r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
3041 if (r < 0)
3042 return r;
3043
3044 if (real_clock::is_zero(meta.set_mtime)) {
3045 meta.set_mtime = real_clock::now();
3046 }
3047
3048 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
3049 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
3050 if (iter == attrs.end()) {
3051 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3052 string mode = target->bucket_info.obj_lock.get_mode();
3053 RGWObjectRetention obj_retention(mode, lock_until_date);
3054 bufferlist bl;
3055 obj_retention.encode(bl);
3056 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3057 }
3058 }
3059
3060 if (state->is_olh) {
3061 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3062 }
3063
3064 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3065 op.mtime2(&mtime_ts);
3066
3067 if (meta.data) {
3068 /* if we want to overwrite the data, we also want to overwrite the
3069 xattrs, so just remove the object */
3070 op.write_full(*meta.data);
3071 }
3072
3073 string etag;
3074 string content_type;
3075 bufferlist acl_bl;
3076 string storage_class;
3077
3078 map<string, bufferlist>::iterator iter;
3079 if (meta.rmattrs) {
3080 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3081 const string& name = iter->first;
3082 op.rmxattr(name.c_str());
3083 }
3084 }
3085
3086 if (meta.manifest) {
3087 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3088
3089 /* remove existing manifest attr */
3090 iter = attrs.find(RGW_ATTR_MANIFEST);
3091 if (iter != attrs.end())
3092 attrs.erase(iter);
3093
3094 bufferlist bl;
3095 encode(*meta.manifest, bl);
3096 op.setxattr(RGW_ATTR_MANIFEST, bl);
3097 }
3098
3099 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3100 const string& name = iter->first;
3101 bufferlist& bl = iter->second;
3102
3103 if (!bl.length())
3104 continue;
3105
3106 op.setxattr(name.c_str(), bl);
3107
3108 if (name.compare(RGW_ATTR_ETAG) == 0) {
3109 etag = rgw_bl_str(bl);
3110 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
3111 content_type = rgw_bl_str(bl);
3112 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3113 acl_bl = bl;
3114 }
3115 }
3116 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3117 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3118 }
3119
3120 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3121 bufferlist bl;
3122 encode(store->svc.zone->get_zone_short_id(), bl);
3123 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3124 }
3125
3126 if (!storage_class.empty()) {
3127 bufferlist bl;
3128 bl.append(storage_class);
3129 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3130 }
3131
3132 if (!op.size())
3133 return 0;
3134
3135 uint64_t epoch;
3136 int64_t poolid;
3137 bool orig_exists;
3138 uint64_t orig_size;
3139
3140 if (!reset_obj) { //Multipart upload; it has an immutable head.
3141 orig_exists = false;
3142 orig_size = 0;
3143 } else {
3144 orig_exists = state->exists;
3145 orig_size = state->accounted_size;
3146 }
3147
3148 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3149 !obj.key.instance.empty();
3150
3151 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3152
3153 if (versioned_op) {
3154 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3155 }
3156
3157 if (!index_op->is_prepared()) {
3158 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
3159 r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
3160 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
3161 if (r < 0)
3162 return r;
3163 }
3164
3165 auto& ioctx = ref.pool.ioctx();
3166
3167 tracepoint(rgw_rados, operate_enter, req_id.c_str());
3168 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
3169 tracepoint(rgw_rados, operate_exit, req_id.c_str());
3170 if (r < 0) { /* we can expect to get -ECANCELED if the object was replaced under us,
3171 or -ENOENT if it was removed, or -EEXIST if it did not exist
3172 before and now it does */
3173 if (r == -EEXIST && assume_noent) {
3174 target->invalidate_state();
3175 return r;
3176 }
3177 goto done_cancel;
3178 }
3179
3180 epoch = ioctx.get_last_version();
3181 poolid = ioctx.get_id();
3182
3183 r = target->complete_atomic_modification(dpp);
3184 if (r < 0) {
3185 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3186 }
3187
3188 tracepoint(rgw_rados, complete_enter, req_id.c_str());
3189 r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
3190 meta.set_mtime, etag, content_type,
3191 storage_class, &acl_bl,
3192 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3193 tracepoint(rgw_rados, complete_exit, req_id.c_str());
3194 if (r < 0)
3195 goto done_cancel;
3196
3197 if (meta.mtime) {
3198 *meta.mtime = meta.set_mtime;
3199 }
3200
3201 /* note that index_op was using state so we couldn't invalidate it earlier */
3202 target->invalidate_state();
3203 state = NULL;
3204
3205 if (versioned_op && meta.olh_epoch) {
3206 r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
3207 if (r < 0) {
3208 return r;
3209 }
3210 }
3211
3212 if (!real_clock::is_zero(meta.delete_at)) {
3213 rgw_obj_index_key obj_key;
3214 obj.key.get_index_key(&obj_key);
3215
3216 r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
3217 obj.bucket.bucket_id, obj_key);
3218 if (r < 0) {
3219 ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3220 /* ignoring error, nothing we can do at this point */
3221 }
3222 }
3223 meta.canceled = false;
3224
3225 /* update quota cache */
3226 if (meta.completeMultipart){
3227 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3228 0, orig_size);
3229 }
3230 else {
3231 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3232 accounted_size, orig_size);
3233 }
3234 return 0;
3235
3236 done_cancel:
3237 int ret = index_op->cancel(dpp);
3238 if (ret < 0) {
3239 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
3240 }
3241
3242 meta.canceled = true;
3243
3244 /* we lost in a race. There are a few options:
3245 * - existing object was rewritten (ECANCELED)
3246 * - non-existing object was created (EEXIST)
3247 * - object was removed (ENOENT)
3248 * we should treat it as a success
3249 */
3250 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3251 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3252 r = 0;
3253 }
3254 } else {
3255 if (meta.if_match != NULL) {
3256 // only overwrite existing object
3257 if (strcmp(meta.if_match, "*") == 0) {
3258 if (r == -ENOENT) {
3259 r = -ERR_PRECONDITION_FAILED;
3260 } else if (r == -ECANCELED) {
3261 r = 0;
3262 }
3263 }
3264 }
3265
3266 if (meta.if_nomatch != NULL) {
3267 // only create a new object
3268 if (strcmp(meta.if_nomatch, "*") == 0) {
3269 if (r == -EEXIST) {
3270 r = -ERR_PRECONDITION_FAILED;
3271 } else if (r == -ENOENT) {
3272 r = 0;
3273 }
3274 }
3275 }
3276 }
3277
3278 return r;
3279 }
3280
3281 int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
3282 map<string, bufferlist>& attrs, optional_yield y)
3283 {
3284 RGWBucketInfo& bucket_info = target->get_bucket_info();
3285
3286 RGWRados::Bucket bop(target->get_store(), bucket_info);
3287 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
3288 index_op.set_zones_trace(meta.zones_trace);
3289
3290 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3291 int r;
3292 if (assume_noent) {
3293 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
3294 if (r == -EEXIST) {
3295 assume_noent = false;
3296 }
3297 }
3298 if (!assume_noent) {
3299 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
3300 }
3301 return r;
3302 }
3303
3304 class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
3305 {
3306 const DoutPrefixProvider *dpp;
3307 CephContext* cct;
3308 rgw_obj obj;
3309 rgw::putobj::DataProcessor *filter;
3310 boost::optional<RGWPutObj_Compress>& compressor;
3311 bool try_etag_verify;
3312 rgw::putobj::etag_verifier_ptr etag_verifier;
3313 boost::optional<rgw::putobj::ChunkProcessor> buffering;
3314 CompressorRef& plugin;
3315 rgw::putobj::ObjectProcessor *processor;
3316 void (*progress_cb)(off_t, void *);
3317 void *progress_data;
3318 bufferlist extra_data_bl, manifest_bl;
3319 std::optional<RGWCompressionInfo> compression_info;
3320 uint64_t extra_data_left{0};
3321 bool need_to_process_attrs{true};
3322 uint64_t data_len{0};
3323 map<string, bufferlist> src_attrs;
3324 uint64_t ofs{0};
3325 uint64_t lofs{0}; /* logical ofs */
3326 std::function<int(map<string, bufferlist>&)> attrs_handler;
3327 public:
3328 RGWRadosPutObj(const DoutPrefixProvider *dpp,
3329 CephContext* cct,
3330 CompressorRef& plugin,
3331 boost::optional<RGWPutObj_Compress>& compressor,
3332 rgw::putobj::ObjectProcessor *p,
3333 void (*_progress_cb)(off_t, void *),
3334 void *_progress_data,
3335 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
3336 dpp(dpp),
3337 cct(cct),
3338 filter(p),
3339 compressor(compressor),
3340 try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
3341 plugin(plugin),
3342 processor(p),
3343 progress_cb(_progress_cb),
3344 progress_data(_progress_data),
3345 attrs_handler(_attrs_handler) {}
3346
3347 int process_attrs(void) {
3348 if (extra_data_bl.length()) {
3349 JSONParser jp;
3350 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3351 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3352 return -EIO;
3353 }
3354
3355 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3356
3357 auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
3358 if (iter != src_attrs.end()) {
3359 const bufferlist bl = std::move(iter->second);
3360 src_attrs.erase(iter); // don't preserve source compression info
3361
3362 if (try_etag_verify) {
3363 // if we're trying to verify etags, we need to convert compressed
3364 // ranges in the manifest back into logical multipart part offsets
3365 RGWCompressionInfo info;
3366 bool compressed = false;
3367 int r = rgw_compression_info_from_attr(bl, compressed, info);
3368 if (r < 0) {
3369 ldpp_dout(dpp, 4) << "failed to decode compression info, "
3370 "disabling etag verification" << dendl;
3371 try_etag_verify = false;
3372 } else if (compressed) {
3373 compression_info = std::move(info);
3374 }
3375 }
3376 }
3377 /* We need the manifest to recompute the ETag for verification */
3378 iter = src_attrs.find(RGW_ATTR_MANIFEST);
3379 if (iter != src_attrs.end()) {
3380 manifest_bl = std::move(iter->second);
3381 src_attrs.erase(iter);
3382 }
3383
3384 // filter out olh attributes
3385 iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3386 while (iter != src_attrs.end()) {
3387 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3388 break;
3389 }
3390 iter = src_attrs.erase(iter);
3391 }
3392 }
3393
3394 int ret = attrs_handler(src_attrs);
3395 if (ret < 0) {
3396 return ret;
3397 }
3398
3399 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3400 //do not compress if object is encrypted
3401 compressor = boost::in_place(cct, plugin, filter);
3402 // add a filter that buffers data so we don't try to compress tiny blocks.
3403 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3404 // compression ratio
3405 constexpr unsigned buffer_size = 512 * 1024;
3406 buffering = boost::in_place(&*compressor, buffer_size);
3407 filter = &*buffering;
3408 }
3409
3410 /*
3411 * Presently we don't support ETag based verification if encryption is
3412 * requested. We can enable simultaneous support once we have a mechanism
3413 * to know the sequence in which the filters must be applied.
3414 */
3415 if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3416 ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
3417 compression_info,
3418 etag_verifier);
3419 if (ret < 0) {
3420 ldpp_dout(dpp, 4) << "failed to initialize etag verifier, "
3421 "disabling etag verification" << dendl;
3422 } else {
3423 filter = etag_verifier.get();
3424 }
3425 }
3426
3427 need_to_process_attrs = false;
3428
3429 return 0;
3430 }
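// Note (added for clarity, not in the original source): after process_attrs()
// runs, the write path through handle_data() is, when both compression and
// etag verification end up enabled, roughly
//   etag_verifier -> buffering (ChunkProcessor) -> compressor -> processor
// with the corresponding stages skipped when either feature is disabled.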
3431
3432 int handle_data(bufferlist& bl, bool *pause) override {
3433 if (progress_cb) {
3434 progress_cb(data_len, progress_data);
3435 }
3436 if (extra_data_left) {
3437 uint64_t extra_len = bl.length();
3438 if (extra_len > extra_data_left)
3439 extra_len = extra_data_left;
3440
3441 bufferlist extra;
3442 bl.splice(0, extra_len, &extra);
3443 extra_data_bl.append(extra);
3444
3445 extra_data_left -= extra_len;
3446 if (extra_data_left == 0) {
3447 int res = process_attrs();
3448 if (res < 0)
3449 return res;
3450 }
3451 ofs += extra_len;
3452 if (bl.length() == 0) {
3453 return 0;
3454 }
3455 }
3456 if (need_to_process_attrs) {
3457 /* need to call process_attrs() even if we don't get any attrs,
3458 * since we still need it to call attrs_handler().
3459 */
3460 int res = process_attrs();
3461 if (res < 0) {
3462 return res;
3463 }
3464 }
3465
3466 ceph_assert(uint64_t(ofs) >= extra_data_len);
3467
3468 uint64_t size = bl.length();
3469 ofs += size;
3470
3471 const uint64_t lofs = data_len;
3472 data_len += size;
3473
3474 return filter->process(std::move(bl), lofs);
3475 }
3476
3477 int flush() {
3478 return filter->process({}, data_len);
3479 }
3480
3481 bufferlist& get_extra_data() { return extra_data_bl; }
3482
3483 map<string, bufferlist>& get_attrs() { return src_attrs; }
3484
3485 void set_extra_data_len(uint64_t len) override {
3486 extra_data_left = len;
3487 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
3488 }
3489
3490 uint64_t get_data_len() {
3491 return data_len;
3492 }
3493
3494 std::string get_verifier_etag() {
3495 if (etag_verifier) {
3496 etag_verifier->calculate_etag();
3497 return etag_verifier->get_calculated_etag();
3498 } else {
3499 return "";
3500 }
3501 }
3502 };
3503
3504 /*
3505 * prepare attrset depending on attrs_mod.
3506 */
3507 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3508 map<string, bufferlist>& attrs,
3509 RGWRados::AttrsMod attrs_mod)
3510 {
3511 switch (attrs_mod) {
3512 case RGWRados::ATTRSMOD_NONE:
3513 attrs = src_attrs;
3514 break;
3515 case RGWRados::ATTRSMOD_REPLACE:
3516 if (!attrs[RGW_ATTR_ETAG].length()) {
3517 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3518 }
3519 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3520 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3521 if (ttiter != src_attrs.end()) {
3522 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3523 }
3524 }
3525 break;
3526 case RGWRados::ATTRSMOD_MERGE:
3527 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3528 if (attrs.find(it->first) == attrs.end()) {
3529 attrs[it->first] = it->second;
3530 }
3531 }
3532 break;
3533 }
3534 }
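// Illustrative behavior (not part of the original source), assuming a source
// object carrying {ETAG: "abc", X: "1"} and a copy request carrying {X: "2"}:
//   ATTRSMOD_NONE    -> destination gets the source set {ETAG: "abc", X: "1"}
//   ATTRSMOD_REPLACE -> destination keeps {X: "2"}, inheriting only ETAG (and
//                       TAIL_TAG, if present) from the source when unset
//   ATTRSMOD_MERGE   -> destination keeps {X: "2"} and gains source-only
//                       attrs, here {ETAG: "abc"}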
3535
3536 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw::sal::RGWObject* obj, const DoutPrefixProvider *dpp, optional_yield y)
3537 {
3538 RGWObjectCtx rctx(this->store);
3539 rgw::sal::RGWRadosBucket bucket(store, dest_bucket_info);
3540
3541 return obj->copy_obj_data(rctx, &bucket, obj, 0, NULL, dpp, y);
3542 }
3543
3544 struct obj_time_weight {
3545 real_time mtime;
3546 uint32_t zone_short_id;
3547 uint64_t pg_ver;
3548 bool high_precision;
3549
3550 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3551
3552 bool compare_low_precision(const obj_time_weight& rhs) {
3553 struct timespec l = ceph::real_clock::to_timespec(mtime);
3554 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3555 l.tv_nsec = 0;
3556 r.tv_nsec = 0;
3557 if (l > r) {
3558 return false;
3559 }
3560 if (l < r) {
3561 return true;
3562 }
3563 if (!zone_short_id || !rhs.zone_short_id) {
3564 /* don't compare zone ids, if one wasn't provided */
3565 return false;
3566 }
3567 if (zone_short_id != rhs.zone_short_id) {
3568 return (zone_short_id < rhs.zone_short_id);
3569 }
3570 return (pg_ver < rhs.pg_ver);
3571
3572 }
3573
3574 bool operator<(const obj_time_weight& rhs) {
3575 if (!high_precision || !rhs.high_precision) {
3576 return compare_low_precision(rhs);
3577 }
3578 if (mtime > rhs.mtime) {
3579 return false;
3580 }
3581 if (mtime < rhs.mtime) {
3582 return true;
3583 }
3584 if (!zone_short_id || !rhs.zone_short_id) {
3585 /* don't compare zone ids, if one wasn't provided */
3586 return false;
3587 }
3588 if (zone_short_id != rhs.zone_short_id) {
3589 return (zone_short_id < rhs.zone_short_id);
3590 }
3591 return (pg_ver < rhs.pg_ver);
3592 }
3593
3594 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3595 mtime = _mtime;
3596 zone_short_id = _short_id;
3597 pg_ver = _pg_ver;
3598 }
3599
3600 void init(RGWObjState *state) {
3601 mtime = state->mtime;
3602 zone_short_id = state->zone_short_id;
3603 pg_ver = state->pg_ver;
3604 }
3605 };
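// Worked example (added for clarity, not in the original source): with
// high_precision unset on either side, mtimes 12:00:00.100 and 12:00:00.900
// compare as equal once the nanosecond fields are zeroed, so the ordering
// falls through to zone_short_id and then pg_ver as tie breakers; if either
// zone_short_id is 0, the comparison stops and reports "not less".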
3606
3607 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3608 out << o.mtime;
3609
3610 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3611 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3612 }
3613
3614 return out;
3615 }
3616
3617 class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
3618 bufferlist extra_data;
3619 public:
3620 RGWGetExtraDataCB() {}
3621 int handle_data(bufferlist& bl, bool *pause) override {
3622 int bl_len = (int)bl.length();
3623 if (extra_data.length() < extra_data_len) {
3624 off_t max = extra_data_len - extra_data.length();
3625 if (max > bl_len) {
3626 max = bl_len;
3627 }
3628 bl.splice(0, max, &extra_data);
3629 }
3630 return bl_len;
3631 }
3632
3633 bufferlist& get_extra_data() {
3634 return extra_data;
3635 }
3636 };
3637
3638 int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
3639 RGWObjectCtx& obj_ctx,
3640 const rgw_user& user_id,
3641 req_info *info,
3642 const rgw_zone_id& source_zone,
3643 rgw::sal::RGWObject* src_obj,
3644 const RGWBucketInfo *src_bucket_info,
3645 real_time *src_mtime,
3646 uint64_t *psize,
3647 const real_time *mod_ptr,
3648 const real_time *unmod_ptr,
3649 bool high_precision_time,
3650 const char *if_match,
3651 const char *if_nomatch,
3652 map<string, bufferlist> *pattrs,
3653 map<string, string> *pheaders,
3654 string *version_id,
3655 string *ptag,
3656 string *petag)
3657 {
3658 /* source is in a different zonegroup, copy from there */
3659
3660 RGWRESTStreamRWRequest *in_stream_req;
3661 string tag;
3662 map<string, bufferlist> src_attrs;
3663 append_rand_alpha(cct, tag, tag, 32);
3664 obj_time_weight set_mtime_weight;
3665 set_mtime_weight.high_precision = high_precision_time;
3666
3667 RGWRESTConn *conn;
3668 if (source_zone.empty()) {
3669 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
3670 /* source is in the master zonegroup */
3671 conn = svc.zone->get_master_conn();
3672 } else {
3673 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
3674 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
3675 if (iter == zonegroup_conn_map.end()) {
3676 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3677 return -ENOENT;
3678 }
3679 conn = iter->second;
3680 }
3681 } else {
3682 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3683 auto iter = zone_conn_map.find(source_zone);
3684 if (iter == zone_conn_map.end()) {
3685 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3686 return -ENOENT;
3687 }
3688 conn = iter->second;
3689 }
3690
3691 RGWGetExtraDataCB cb;
3692 map<string, string> req_headers;
3693 real_time set_mtime;
3694
3695 const real_time *pmod = mod_ptr;
3696
3697 obj_time_weight dest_mtime_weight;
3698
3699 constexpr bool prepend_meta = true;
3700 constexpr bool get_op = true;
3701 constexpr bool rgwx_stat = true;
3702 constexpr bool sync_manifest = true;
3703 constexpr bool skip_decrypt = true;
3704 int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
3705 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
3706 prepend_meta, get_op, rgwx_stat,
3707 sync_manifest, skip_decrypt,
3708 true, &cb, &in_stream_req);
3709 if (ret < 0) {
3710 return ret;
3711 }
3712
3713 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
3714 nullptr, pheaders, null_yield);
3715 if (ret < 0) {
3716 return ret;
3717 }
3718
3719 bufferlist& extra_data_bl = cb.get_extra_data();
3720 if (extra_data_bl.length()) {
3721 JSONParser jp;
3722 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3723 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3724 return -EIO;
3725 }
3726
3727 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3728
3729 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3730 }
3731
3732 if (src_mtime) {
3733 *src_mtime = set_mtime;
3734 }
3735
3736 if (petag) {
3737 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3738 if (iter != src_attrs.end()) {
3739 bufferlist& etagbl = iter->second;
3740 *petag = etagbl.to_str();
3741 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3742 *petag = petag->substr(0, petag->size() - 1);
3743 }
3744 }
3745 }
3746
3747 if (pattrs) {
3748 *pattrs = std::move(src_attrs);
3749 }
3750
3751 return 0;
3752 }
3753
3754 int RGWFetchObjFilter_Default::filter(CephContext *cct,
3755 const rgw_obj_key& source_key,
3756 const RGWBucketInfo& dest_bucket_info,
3757 std::optional<rgw_placement_rule> dest_placement_rule,
3758 const map<string, bufferlist>& obj_attrs,
3759 std::optional<rgw_user> *poverride_owner,
3760 const rgw_placement_rule **prule)
3761 {
3762 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3763 if (!ptail_rule) {
3764 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3765 if (iter != obj_attrs.end()) {
3766 dest_rule.storage_class = iter->second.to_str();
3767 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3768 ptail_rule = &dest_rule;
3769 } else {
3770 ptail_rule = &dest_bucket_info.placement_rule;
3771 }
3772 }
3773 *prule = ptail_rule;
3774 return 0;
3775 }
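// Editor's note: hedged summary, not part of the original source. The default
// filter above resolves the tail placement rule with this precedence: an
// explicit dest_placement_rule wins; otherwise the source object's
// RGW_ATTR_STORAGE_CLASS is grafted onto the destination bucket's rule via
// inherit_from(); otherwise the destination bucket's rule is used as-is.
// Worked example with hypothetical values:
//
//   dest bucket rule             : name = "default-placement", storage_class = "STANDARD"
//   source RGW_ATTR_STORAGE_CLASS: "COLD"
//   resulting tail rule          : name = "default-placement", storage_class = "COLD"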
3776
3777 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3778 const rgw_user& user_id,
3779 req_info *info,
3780 const rgw_zone_id& source_zone,
3781 rgw::sal::RGWObject* dest_obj,
3782 rgw::sal::RGWObject* src_obj,
3783 rgw::sal::RGWBucket* dest_bucket,
3784 rgw::sal::RGWBucket* src_bucket,
3785 std::optional<rgw_placement_rule> dest_placement_rule,
3786 real_time *src_mtime,
3787 real_time *mtime,
3788 const real_time *mod_ptr,
3789 const real_time *unmod_ptr,
3790 bool high_precision_time,
3791 const char *if_match,
3792 const char *if_nomatch,
3793 AttrsMod attrs_mod,
3794 bool copy_if_newer,
3795 rgw::sal::RGWAttrs& attrs,
3796 RGWObjCategory category,
3797 std::optional<uint64_t> olh_epoch,
3798 real_time delete_at,
3799 string *ptag,
3800 string *petag,
3801 void (*progress_cb)(off_t, void *),
3802 void *progress_data,
3803 const DoutPrefixProvider *dpp,
3804 RGWFetchObjFilter *filter,
3805 rgw_zone_set *zones_trace,
3806 std::optional<uint64_t>* bytes_transferred)
3807 {
3808 /* source is in a different zonegroup, copy from there */
3809
3810 RGWRESTStreamRWRequest *in_stream_req;
3811 string tag;
3812 int i;
3813 append_rand_alpha(cct, tag, tag, 32);
3814 obj_time_weight set_mtime_weight;
3815 set_mtime_weight.high_precision = high_precision_time;
3816 int ret;
3817
3818 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
3819 using namespace rgw::putobj;
3820 AtomicObjectProcessor processor(&aio, this->store, dest_bucket, nullptr, user_id,
3821 obj_ctx, dest_obj->clone(), olh_epoch,
3822 tag, dpp, null_yield);
3823 RGWRESTConn *conn;
3824 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3825 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
3826 if (source_zone.empty()) {
3827 if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
3828 /* source is in the master zonegroup */
3829 conn = svc.zone->get_master_conn();
3830 } else {
3831 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
3832 if (iter == zonegroup_conn_map.end()) {
3833 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << src_bucket->get_info().zonegroup << dendl;
3834 return -ENOENT;
3835 }
3836 conn = iter->second;
3837 }
3838 } else {
3839 auto iter = zone_conn_map.find(source_zone);
3840 if (iter == zone_conn_map.end()) {
3841 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3842 return -ENOENT;
3843 }
3844 conn = iter->second;
3845 }
3846
3847 boost::optional<RGWPutObj_Compress> compressor;
3848 CompressorRef plugin;
3849
3850 RGWFetchObjFilter_Default source_filter;
3851 if (!filter) {
3852 filter = &source_filter;
3853 }
3854
3855 std::optional<rgw_user> override_owner;
3856
3857 RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
3858 [&](map<string, bufferlist>& obj_attrs) {
3859 const rgw_placement_rule *ptail_rule;
3860
3861 int ret = filter->filter(cct,
3862 src_obj->get_key(),
3863 dest_bucket->get_info(),
3864 dest_placement_rule,
3865 obj_attrs,
3866 &override_owner,
3867 &ptail_rule);
3868 if (ret < 0) {
3869 ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
3870 return ret;
3871 }
3872
3873 processor.set_tail_placement(*ptail_rule);
3874
3875 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
3876 if (compression_type != "none") {
3877 plugin = Compressor::create(cct, compression_type);
3878 if (!plugin) {
3879 ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
3880 << compression_type << dendl;
3881 }
3882 }
3883
3884 ret = processor.prepare(null_yield);
3885 if (ret < 0) {
3886 return ret;
3887 }
3888 return 0;
3889 });
3890
3891 string etag;
3892 real_time set_mtime;
3893 uint64_t expected_size = 0;
3894
3895 RGWObjState *dest_state = NULL;
3896
3897 const real_time *pmod = mod_ptr;
3898
3899 obj_time_weight dest_mtime_weight;
3900
3901 if (copy_if_newer) {
3902 /* need to get mtime for destination */
3903 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
3904 if (ret < 0)
3905 goto set_err_state;
3906
3907 if (!real_clock::is_zero(dest_state->mtime)) {
3908 dest_mtime_weight.init(dest_state);
3909 pmod = &dest_mtime_weight.mtime;
3910 }
3911 }
3912
3913 static constexpr bool prepend_meta = true;
3914 static constexpr bool get_op = true;
3915 static constexpr bool rgwx_stat = false;
3916 static constexpr bool sync_manifest = true;
3917 static constexpr bool skip_decrypt = true;
3918 ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
3919 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
3920 prepend_meta, get_op, rgwx_stat,
3921 sync_manifest, skip_decrypt,
3922 true,
3923 &cb, &in_stream_req);
3924 if (ret < 0) {
3925 goto set_err_state;
3926 }
3927
3928 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
3929 &expected_size, nullptr, nullptr, null_yield);
3930 if (ret < 0) {
3931 goto set_err_state;
3932 }
3933 ret = cb.flush();
3934 if (ret < 0) {
3935 goto set_err_state;
3936 }
3937 if (cb.get_data_len() != expected_size) {
3938 ret = -EIO;
3939 ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
3940 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
3941 goto set_err_state;
3942 }
3943 if (compressor && compressor->is_compressed()) {
3944 bufferlist tmp;
3945 RGWCompressionInfo cs_info;
3946 cs_info.compression_type = plugin->get_type_name();
3947 cs_info.orig_size = cb.get_data_len();
3948 cs_info.compressor_message = compressor->get_compressor_message();
3949 cs_info.blocks = move(compressor->get_compression_blocks());
3950 encode(cs_info, tmp);
3951 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
3952 }
3953
3954 if (override_owner) {
3955 processor.set_owner(*override_owner);
3956
3957 auto& obj_attrs = cb.get_attrs();
3958
3959 RGWUserInfo owner_info;
3960 if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
3961 ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
3962 return -EINVAL;
3963 }
3964
3965 RGWAccessControlPolicy acl;
3966
3967 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
3968 if (aiter == obj_attrs.end()) {
3969 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
3970 acl.create_default(owner_info.user_id, owner_info.display_name);
3971 } else {
3972 auto iter = aiter->second.cbegin();
3973 try {
3974 acl.decode(iter);
3975 } catch (buffer::error& err) {
3976 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
3977 return -EIO;
3978 }
3979 }
3980
3981 ACLOwner new_owner;
3982 new_owner.set_id(*override_owner);
3983 new_owner.set_name(owner_info.display_name);
3984
3985 acl.set_owner(new_owner);
3986
3987 bufferlist bl;
3988 acl.encode(bl);
3989 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
3990 }
3991
3992 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
3993 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
3994 } else {
3995 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
3996 if (iter != cb.get_attrs().end()) {
3997 try {
3998 decode(delete_at, iter->second);
3999 } catch (buffer::error& err) {
4000 ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
4001 }
4002 }
4003 }
4004
4005 if (src_mtime) {
4006 *src_mtime = set_mtime;
4007 }
4008
4009 if (petag) {
4010 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4011 if (iter != cb.get_attrs().end()) {
4012 *petag = iter->second.to_str();
4013 }
4014 }
4015
4016 // erase the append attr
4017 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4018
4019 if (source_zone.empty()) {
4020 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4021 } else {
4022 attrs = cb.get_attrs();
4023 }
4024
4025 if (copy_if_newer) {
4026 uint64_t pg_ver = 0;
4027 auto i = attrs.find(RGW_ATTR_PG_VER);
4028 if (i != attrs.end() && i->second.length() > 0) {
4029 auto iter = i->second.cbegin();
4030 try {
4031 decode(pg_ver, iter);
4032 } catch (buffer::error& err) {
4033 ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
4034 /* non critical error */
4035 }
4036 }
4037 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
4038 }
4039
4040 /* Perform ETag verification if we have computed the object's MD5 sum at our end */
4041 if (const auto& verifier_etag = cb.get_verifier_etag();
4042 !verifier_etag.empty()) {
4043 string trimmed_etag = etag;
4044
4045 /* Remove the leading and trailing double quotes from etag */
4046 trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
4047 trimmed_etag.end());
4048
4049 if (verifier_etag != trimmed_etag) {
4050 ret = -EIO;
4051 ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
4052 << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
4053 goto set_err_state;
4054 }
4055 }
4056
4057 #define MAX_COMPLETE_RETRY 100
4058 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
4059 bool canceled = false;
4060 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4061 attrs, delete_at, nullptr, nullptr, nullptr,
4062 zones_trace, &canceled, null_yield);
4063 if (ret < 0) {
4064 goto set_err_state;
4065 }
4066
4067 if (copy_if_newer && canceled) {
4068 ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
4069 obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
4070 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
4071 if (ret < 0) {
4072 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_obj_state() returned ret=" << ret << dendl;
4073 goto set_err_state;
4074 }
4075 dest_mtime_weight.init(dest_state);
4076 dest_mtime_weight.high_precision = high_precision_time;
4077 if (!dest_state->exists ||
4078 dest_mtime_weight < set_mtime_weight) {
4079 ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4080 continue;
4081 } else {
4082 ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4083 }
4084 }
4085 break;
4086 }
4087
4088 if (i == MAX_COMPLETE_RETRY) {
4089 ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
4090 ret = -EIO;
4091 goto set_err_state;
4092 }
4093
4094 if (bytes_transferred) {
4095 *bytes_transferred = cb.get_data_len();
4096 }
4097 return 0;
4098 set_err_state:
4099 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
4100 // we may have already fetched during sync of OP_ADD, but were waiting
4101 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4102 if (olh_epoch && *olh_epoch > 0) {
4103 constexpr bool log_data_change = true;
4104 ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), false, nullptr,
4105 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
4106 } else {
4107 // we already have the latest copy
4108 ret = 0;
4109 }
4110 }
4111 return ret;
4112 }
4113
4114
4115 int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
4116 RGWObjState *astate,
4117 map<string, bufferlist>& src_attrs,
4118 RGWRados::Object::Read& read_op,
4119 const rgw_user& user_id,
4120 rgw::sal::RGWObject* dest_obj,
4121 real_time *mtime)
4122 {
4123 string etag;
4124
4125 RGWRESTStreamS3PutObj *out_stream_req;
4126
4127 auto rest_master_conn = svc.zone->get_master_conn();
4128
4129 int ret = rest_master_conn->put_obj_async(dpp, user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
4130 if (ret < 0) {
4131 return ret;
4132 }
4133
4134 ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
4135 if (ret < 0) {
4136 delete out_stream_req;
4137 return ret;
4138 }
4139
4140 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
4141 if (ret < 0)
4142 return ret;
4143
4144 return 0;
4145 }
4146
4147 /**
4148 * Copy an object.
4149 * dest_obj: the object to copy into
4150 * src_obj: the object to copy from
4151 * attrs: usage depends on attrs_mod parameter
4152 * attrs_mod: the modification mode of the attrs, may have the following values:
4153 * ATTRSMOD_NONE - the attributes of the source object will be
4154 * copied without modifications, attrs parameter is ignored;
4155 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4156 * parameter, source object attributes are not copied;
4157 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4158 * are overwritten by values contained in attrs parameter.
4159 * err: stores any errors resulting from the get of the original object
4160 * Returns: 0 on success, -ERR# otherwise.
4161 */
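// Editor's note: hedged illustration of the attrs_mod modes documented above,
// not part of the original source; the plain std::map stand-ins and values are
// hypothetical.
//
//   std::map<std::string, std::string> src   = {{"color", "red"}, {"size", "L"}};
//   std::map<std::string, std::string> attrs = {{"color", "blue"}};
//
//   ATTRSMOD_NONE    -> {color=red,  size=L}   // source copied, attrs ignored
//   ATTRSMOD_REPLACE -> {color=blue}           // attrs only, source attrs dropped
//   ATTRSMOD_MERGE   -> {color=blue, size=L}   // source kept, conflicts taken from attrs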
4162 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4163 const rgw_user& user_id,
4164 req_info *info,
4165 const rgw_zone_id& source_zone,
4166 rgw::sal::RGWObject* dest_obj,
4167 rgw::sal::RGWObject* src_obj,
4168 rgw::sal::RGWBucket* dest_bucket,
4169 rgw::sal::RGWBucket* src_bucket,
4170 const rgw_placement_rule& dest_placement,
4171 real_time *src_mtime,
4172 real_time *mtime,
4173 const real_time *mod_ptr,
4174 const real_time *unmod_ptr,
4175 bool high_precision_time,
4176 const char *if_match,
4177 const char *if_nomatch,
4178 AttrsMod attrs_mod,
4179 bool copy_if_newer,
4180 rgw::sal::RGWAttrs& attrs,
4181 RGWObjCategory category,
4182 uint64_t olh_epoch,
4183 real_time delete_at,
4184 string *version_id,
4185 string *ptag,
4186 string *petag,
4187 void (*progress_cb)(off_t, void *),
4188 void *progress_data,
4189 const DoutPrefixProvider *dpp,
4190 optional_yield y)
4191 {
4192 int ret;
4193 uint64_t obj_size;
4194 rgw_obj shadow_obj = dest_obj->get_obj();
4195 string shadow_oid;
4196
4197 bool remote_src;
4198 bool remote_dest;
4199
4200 append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
4201 shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
4202
4203 auto& zonegroup = svc.zone->get_zonegroup();
4204
4205 remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
4206 remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
4207
4208 if (remote_src && remote_dest) {
4209 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
4210 return -EINVAL;
4211 }
4212
4213 ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
4214
4215 if (remote_src || !source_zone.empty()) {
4216 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
4217 dest_obj, src_obj, dest_bucket, src_bucket,
4218 dest_placement, src_mtime, mtime, mod_ptr,
4219 unmod_ptr, high_precision_time,
4220 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
4221 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4222 nullptr /* filter */);
4223 }
4224
4225 map<string, bufferlist> src_attrs;
4226 RGWRados::Object src_op_target(this, src_bucket->get_info(), obj_ctx, src_obj->get_obj());
4227 RGWRados::Object::Read read_op(&src_op_target);
4228
4229 read_op.conds.mod_ptr = mod_ptr;
4230 read_op.conds.unmod_ptr = unmod_ptr;
4231 read_op.conds.high_precision_time = high_precision_time;
4232 read_op.conds.if_match = if_match;
4233 read_op.conds.if_nomatch = if_nomatch;
4234 read_op.params.attrs = &src_attrs;
4235 read_op.params.lastmod = src_mtime;
4236 read_op.params.obj_size = &obj_size;
4237
4238 ret = read_op.prepare(y, dpp);
4239 if (ret < 0) {
4240 return ret;
4241 }
4242 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4243 // The current implementation does not follow the S3 spec and may even
4244 // silently corrupt data when copying multipart objects across pools,
4245 // so reject COPY operations on encrypted objects until this is fully
4246 // functional.
4247 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
4248 << " has not been implemented." << dendl;
4249 return -ERR_NOT_IMPLEMENTED;
4250 }
4251
4252 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4253 src_attrs.erase(RGW_ATTR_DELETE_AT);
4254
4255 set_copy_attrs(src_attrs, attrs, attrs_mod);
4256 attrs.erase(RGW_ATTR_ID_TAG);
4257 attrs.erase(RGW_ATTR_PG_VER);
4258 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4259 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4260 if (cmp != src_attrs.end())
4261 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4262
4263 RGWObjManifest manifest;
4264 RGWObjState *astate = NULL;
4265
4266 ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj->get_obj(), &astate, y);
4267 if (ret < 0) {
4268 return ret;
4269 }
4270
4271 vector<rgw_raw_obj> ref_objs;
4272
4273 if (remote_dest) {
4274 /* dest is in a different zonegroup, copy it there */
4275 return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
4276 }
4277 uint64_t max_chunk_size;
4278
4279 ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
4280 if (ret < 0) {
4281 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
4282 return ret;
4283 }
4284
4285 rgw_pool src_pool;
4286 rgw_pool dest_pool;
4287
4288 const rgw_placement_rule *src_rule{nullptr};
4289
4290 if (astate->manifest) {
4291 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4292 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
4293 }
4294
4295 if (!src_rule || src_rule->empty()) {
4296 src_rule = &src_bucket->get_placement_rule();
4297 }
4298
4299 if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
4300 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
4301 return -EIO;
4302 }
4303
4304 if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
4305 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
4306 return -EIO;
4307 }
4308
4309 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
4310 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4311
4312 bool copy_data = (!astate->manifest) ||
4313 (*src_rule != dest_placement) ||
4314 (src_pool != dest_pool);
4315
4316 bool copy_first = false;
4317 if (astate->manifest) {
4318 if (!astate->manifest->has_tail()) {
4319 copy_data = true;
4320 } else {
4321 uint64_t head_size = astate->manifest->get_head_size();
4322
4323 if (head_size > 0) {
4324 if (head_size > max_chunk_size) {
4325 copy_data = true;
4326 } else {
4327 copy_first = true;
4328 }
4329 }
4330 }
4331 }
4332
4333 if (petag) {
4334 const auto iter = attrs.find(RGW_ATTR_ETAG);
4335 if (iter != attrs.end()) {
4336 *petag = iter->second.to_str();
4337 }
4338 }
4339
4340 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
4341 attrs.erase(RGW_ATTR_TAIL_TAG);
4342 return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
4343 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
4344 }
4345
4346 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin(dpp);
4347
4348 if (copy_first) { // we need to copy first chunk, not increase refcount
4349 ++miter;
4350 }
4351
4352 rgw_rados_ref ref;
4353 ret = get_raw_obj_ref(dpp, miter.get_location().get_raw_obj(store), &ref);
4354 if (ret < 0) {
4355 return ret;
4356 }
4357
4358 bufferlist first_chunk;
4359
4360 bool copy_itself = (dest_obj == src_obj);
4361 RGWObjManifest *pmanifest;
4362 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
4363
4364 RGWRados::Object dest_op_target(this, dest_bucket->get_info(), obj_ctx, dest_obj->get_obj());
4365 RGWRados::Object::Write write_op(&dest_op_target);
4366
4367 string tag;
4368
4369 if (ptag) {
4370 tag = *ptag;
4371 }
4372
4373 if (tag.empty()) {
4374 append_rand_alpha(cct, tag, tag, 32);
4375 }
4376
4377 if (!copy_itself) {
4378 attrs.erase(RGW_ATTR_TAIL_TAG);
4379 manifest = *astate->manifest;
4380 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4381 if (tail_placement.bucket.name.empty()) {
4382 manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
4383 }
4384 string ref_tag;
4385 for (; miter != astate->manifest->obj_end(dpp); ++miter) {
4386 ObjectWriteOperation op;
4387 ref_tag = tag + '\0';
4388 cls_refcount_get(op, ref_tag, true);
4389 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store);
4390
4391 auto& ioctx = ref.pool.ioctx();
4392 ioctx.locator_set_key(loc.loc);
4393
4394 ret = rgw_rados_operate(dpp, ioctx, loc.oid, &op, null_yield);
4395 if (ret < 0) {
4396 goto done_ret;
4397 }
4398
4399 ref_objs.push_back(loc);
4400 }
4401
4402 pmanifest = &manifest;
4403 } else {
4404 pmanifest = &(*astate->manifest);
4405 /* don't send the object's tail for garbage collection */
4406 astate->keep_tail = true;
4407 }
4408
4409 if (copy_first) {
4410 ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
4411 if (ret < 0) {
4412 goto done_ret;
4413 }
4414
4415 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
4416 } else {
4417 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
4418 }
4419
4420 write_op.meta.data = &first_chunk;
4421 write_op.meta.manifest = pmanifest;
4422 write_op.meta.ptag = &tag;
4423 write_op.meta.owner = dest_bucket->get_info().owner;
4424 write_op.meta.mtime = mtime;
4425 write_op.meta.flags = PUT_OBJ_CREATE;
4426 write_op.meta.category = category;
4427 write_op.meta.olh_epoch = olh_epoch;
4428 write_op.meta.delete_at = delete_at;
4429 write_op.meta.modify_tail = !copy_itself;
4430
4431 ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
4432 if (ret < 0) {
4433 goto done_ret;
4434 }
4435
4436 return 0;
4437
4438 done_ret:
4439 if (!copy_itself) {
4440 vector<rgw_raw_obj>::iterator riter;
4441
4442 /* rollback reference */
4443 string ref_tag = tag + '\0';
4444 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4445 ObjectWriteOperation op;
4446 cls_refcount_put(op, ref_tag, true);
4447
4448 ref.pool.ioctx().locator_set_key(riter->loc);
4449
4450 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), riter->oid, &op, null_yield);
4451 if (r < 0) {
4452 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
4453 }
4454 }
4455 }
4456 return ret;
4457 }
4458
4459
4460 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4461 rgw::sal::RGWBucket* bucket,
4462 const rgw_placement_rule& dest_placement,
4463 RGWRados::Object::Read& read_op, off_t end,
4464 rgw::sal::RGWObject* dest_obj,
4465 real_time *mtime,
4466 real_time set_mtime,
4467 rgw::sal::RGWAttrs& attrs,
4468 uint64_t olh_epoch,
4469 real_time delete_at,
4470 string *petag,
4471 const DoutPrefixProvider *dpp,
4472 optional_yield y)
4473 {
4474 string tag;
4475 append_rand_alpha(cct, tag, tag, 32);
4476
4477 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4478 using namespace rgw::putobj;
4479 // do not change the null_yield in the initialization of this AtomicObjectProcessor;
4480 // doing so causes crashes in the ragweed tests
4481 AtomicObjectProcessor processor(&aio, this->store, bucket, &dest_placement,
4482 bucket->get_info().owner, obj_ctx,
4483 dest_obj->clone(), olh_epoch, tag,
4484 dpp, null_yield);
4485 int ret = processor.prepare(y);
4486 if (ret < 0)
4487 return ret;
4488
4489 off_t ofs = 0;
4490
4491 do {
4492 bufferlist bl;
4493 ret = read_op.read(ofs, end, bl, y, dpp);
4494 if (ret < 0) {
4495 ldpp_dout(dpp, 0) << "ERROR: failed to read object data, ret = " << ret << dendl;
4496 return ret;
4497 }
4498
4499 uint64_t read_len = ret;
4500 ret = processor.process(std::move(bl), ofs);
4501 if (ret < 0) {
4502 return ret;
4503 }
4504
4505 ofs += read_len;
4506 } while (ofs <= end);
4507
4508 // flush
4509 ret = processor.process({}, ofs);
4510 if (ret < 0) {
4511 return ret;
4512 }
4513
4514 string etag;
4515 auto iter = attrs.find(RGW_ATTR_ETAG);
4516 if (iter != attrs.end()) {
4517 bufferlist& bl = iter->second;
4518 etag = bl.to_str();
4519 if (petag) {
4520 *petag = etag;
4521 }
4522 }
4523
4524 uint64_t accounted_size;
4525 {
4526 bool compressed{false};
4527 RGWCompressionInfo cs_info;
4528 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4529 if (ret < 0) {
4530 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
4531 return ret;
4532 }
4533 // pass original size if compressed
4534 accounted_size = compressed ? cs_info.orig_size : ofs;
4535 }
4536
4537 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
4538 nullptr, nullptr, nullptr, nullptr, nullptr, y);
4539 }
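// Editor's note: hedged sketch of the putobj processor contract used by
// copy_obj_data() above, not part of the original source. Data is fed in
// offset order and an empty buffer signals the final flush at the total
// length (chunk names and sizes are hypothetical):
//
//   processor.prepare(y);
//   processor.process(std::move(chunk0), 0);           // bytes [0, len0)
//   processor.process(std::move(chunk1), len0);        // bytes [len0, len0 + len1)
//   processor.process({}, len0 + len1);                // flush
//   processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
//                      nullptr, nullptr, nullptr, nullptr, nullptr, y);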
4540
4541 int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4542 rgw::sal::RGWBucket* bucket,
4543 rgw::sal::RGWObject& obj,
4544 const rgw_placement_rule& placement_rule,
4545 const real_time& mtime,
4546 uint64_t olh_epoch,
4547 const DoutPrefixProvider *dpp,
4548 optional_yield y)
4549 {
4550 rgw::sal::RGWAttrs attrs;
4551 real_time read_mtime;
4552 uint64_t obj_size;
4553
4554 obj.set_atomic(&obj_ctx);
4555 RGWRados::Object op_target(this, bucket->get_info(), obj_ctx, obj.get_obj());
4556 RGWRados::Object::Read read_op(&op_target);
4557
4558 read_op.params.attrs = &attrs;
4559 read_op.params.lastmod = &read_mtime;
4560 read_op.params.obj_size = &obj_size;
4561
4562 int ret = read_op.prepare(y, dpp);
4563 if (ret < 0) {
4564 return ret;
4565 }
4566
4567 if (read_mtime != mtime) {
4568 /* raced */
4569 return -ECANCELED;
4570 }
4571
4572 attrs.erase(RGW_ATTR_ID_TAG);
4573 attrs.erase(RGW_ATTR_TAIL_TAG);
4574
4575 ret = copy_obj_data(obj_ctx,
4576 bucket,
4577 placement_rule,
4578 read_op,
4579 obj_size - 1,
4580 &obj,
4581 nullptr /* pmtime */,
4582 mtime,
4583 attrs,
4584 olh_epoch,
4585 real_time(),
4586 nullptr /* petag */,
4587 dpp,
4588 y);
4589 if (ret < 0) {
4590 return ret;
4591 }
4592
4593 return 0;
4594 }
4595
4596 int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
4597 {
4598 constexpr uint NUM_ENTRIES = 1000u;
4599
4600 rgw_obj_index_key marker;
4601 string prefix;
4602 bool is_truncated;
4603
4604 do {
4605 std::vector<rgw_bucket_dir_entry> ent_list;
4606 ent_list.reserve(NUM_ENTRIES);
4607
4608 int r = cls_bucket_list_unordered(dpp,
4609 bucket_info,
4610 RGW_NO_SHARD,
4611 marker,
4612 prefix,
4613 NUM_ENTRIES,
4614 true,
4615 ent_list,
4616 &is_truncated,
4617 &marker,
4618 y);
4619 if (r < 0) {
4620 return r;
4621 }
4622
4623 string ns;
4624 for (auto const& dirent : ent_list) {
4625 rgw_obj_key obj;
4626
4627 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
4628 return -ENOTEMPTY;
4629 }
4630 }
4631 } while (is_truncated);
4632
4633 return 0;
4634 }
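// Editor's note: hedged clarification, not part of the original source. The
// loop above only treats the bucket as non-empty when a listed entry parses
// back into the plain (empty) namespace; entries that live in a special
// namespace do not by themselves block deletion. Hypothetical example:
//
//   rgw_obj_key obj;
//   rgw_obj_key::oid_to_key_in_ns("photo.jpg", &obj, "");                 // true  -> -ENOTEMPTY
//   rgw_obj_key::oid_to_key_in_ns("_multipart_photo.jpg.meta", &obj, ""); // false -> ignored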
4635
4636 /**
4637 * Delete a bucket.
4638 * bucket: the name of the bucket to delete
4639 * Returns 0 on success, -ERR# otherwise.
4640 */
4641 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
4642 {
4643 const rgw_bucket& bucket = bucket_info.bucket;
4644 RGWSI_RADOS::Pool index_pool;
4645 map<int, string> bucket_objs;
4646 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4647 if (r < 0)
4648 return r;
4649
4650 if (check_empty) {
4651 r = check_bucket_empty(dpp, bucket_info, y);
4652 if (r < 0) {
4653 return r;
4654 }
4655 }
4656
4657 bool remove_ep = true;
4658
4659 if (objv_tracker.read_version.empty()) {
4660 RGWBucketEntryPoint ep;
4661 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4662 &ep,
4663 null_yield,
4664 dpp,
4665 RGWBucketCtl::Bucket::GetParams()
4666 .set_objv_tracker(&objv_tracker));
4667 if (r < 0 ||
4668 (!bucket_info.bucket.bucket_id.empty() &&
4669 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4670 if (r != -ENOENT) {
4671 ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
4672 /* we have no idea what caused the error, will not try to remove it */
4673 }
4674 /*
4675 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4676 * requested
4677 */
4678 remove_ep = false;
4679 }
4680 }
4681
4682 if (remove_ep) {
4683 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
4684 RGWBucketCtl::Bucket::RemoveParams()
4685 .set_objv_tracker(&objv_tracker));
4686 if (r < 0)
4687 return r;
4688 }
4689
4690 /* if the bucket is not synced we can remove the meta file */
4691 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
4692 RGWObjVersionTracker objv_tracker;
4693 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
4694 if (r < 0) {
4695 return r;
4696 }
4697
4698 /* remove bucket index objects asynchronously, on a best-effort basis */
4699 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
4700 bucket_objs,
4701 cct->_conf->rgw_bucket_index_max_aio)();
4702 }
4703
4704 return 0;
4705 }
4706
4707 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
4708 {
4709 RGWBucketInfo info;
4710 map<string, bufferlist> attrs;
4711 int r;
4712 auto obj_ctx = svc.sysobj->init_obj_ctx();
4713
4714 if (bucket.bucket_id.empty()) {
4715 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
4716 } else {
4717 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield, dpp);
4718 }
4719 if (r < 0) {
4720 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4721 return r;
4722 }
4723
4724 info.owner = owner.get_id();
4725
4726 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
4727 if (r < 0) {
4728 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4729 return r;
4730 }
4731
4732 return 0;
4733 }
4734
4735
4736 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
4737 {
4738 int ret = 0;
4739
4740 vector<rgw_bucket>::iterator iter;
4741
4742 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4743 rgw_bucket& bucket = *iter;
4744 if (enabled) {
4745 ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
4746 } else {
4747 ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
4748 }
4749
4750 RGWBucketInfo info;
4751 map<string, bufferlist> attrs;
4752 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
4753 if (r < 0) {
4754 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4755 ret = r;
4756 continue;
4757 }
4758 if (enabled) {
4759 info.flags &= ~BUCKET_SUSPENDED;
4760 } else {
4761 info.flags |= BUCKET_SUSPENDED;
4762 }
4763
4764 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
4765 if (r < 0) {
4766 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4767 ret = r;
4768 continue;
4769 }
4770 }
4771 return ret;
4772 }
4773
4774 int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
4775 {
4776 RGWBucketInfo bucket_info;
4777 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
4778 if (ret < 0) {
4779 return ret;
4780 }
4781
4782 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4783 return 0;
4784 }
4785
4786 int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
4787 {
4788 if ((!state->manifest)|| state->keep_tail)
4789 return 0;
4790
4791 cls_rgw_obj_chain chain;
4792 store->update_gc_chain(dpp, obj, *state->manifest, &chain);
4793
4794 if (chain.empty()) {
4795 return 0;
4796 }
4797
4798 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
4799 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4800 if (ret < 0) {
4801 // Delete objects inline if sending the chain to gc fails
4802 store->delete_objs_inline(dpp, chain, tag);
4803 }
4804 return 0;
4805 }
4806
4807 void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
4808 {
4809 RGWObjManifest::obj_iterator iter;
4810 rgw_raw_obj raw_head;
4811 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
4812 for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
4813 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(store);
4814 if (mobj == raw_head)
4815 continue;
4816 cls_rgw_obj_key key(mobj.oid);
4817 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4818 }
4819 }
4820
4821 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
4822 {
4823 if (chain.empty()) {
4824 return 0;
4825 }
4826
4827 return gc->send_chain(chain, tag);
4828 }
4829
4830 void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
4831 {
4832 string last_pool;
4833 std::unique_ptr<IoCtx> ctx(new IoCtx);
4834 int ret = 0;
4835 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4836 cls_rgw_obj& obj = *liter;
4837 if (obj.pool != last_pool) {
4838 ctx.reset(new IoCtx);
4839 ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
4840 if (ret < 0) {
4841 last_pool = "";
4842 ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
4843 obj.pool << dendl;
4844 continue;
4845 }
4846 last_pool = obj.pool;
4847 }
4848 ctx->locator_set_key(obj.loc);
4849 const string& oid = obj.key.name; /* just stored raw oid there */
4850 ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
4851 ":" << obj.key.name << dendl;
4852 ObjectWriteOperation op;
4853 cls_refcount_put(op, tag, true);
4854 ret = ctx->operate(oid, &op);
4855 if (ret < 0) {
4856 ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
4857 }
4858 }
4859 }
4860
4861 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
4862 map<RGWObjCategory, RGWStorageStats>& stats)
4863 {
4864 for (const auto& pair : header.stats) {
4865 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
4866 const rgw_bucket_category_stats& header_stats = pair.second;
4867
4868 RGWStorageStats& s = stats[category];
4869
4870 s.category = category;
4871 s.size += header_stats.total_size;
4872 s.size_rounded += header_stats.total_size_rounded;
4873 s.size_utilized += header_stats.actual_size;
4874 s.num_objects += header_stats.num_entries;
4875 }
4876 }
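// Editor's note: hedged worked example, not part of the original source. If
// two index shards both report 3 entries totalling 12 KiB under
// RGWObjCategory::Main, the aggregation above yields (values hypothetical):
//
//   stats[RGWObjCategory::Main].num_objects == 6
//   stats[RGWObjCategory::Main].size        == 24576   // bytes
//
// Categories absent from a shard's header simply contribute nothing.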
4877
4878 int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
4879 map<RGWObjCategory, RGWStorageStats> *existing_stats,
4880 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
4881 {
4882 RGWSI_RADOS::Pool index_pool;
4883 // key - bucket index object id
4884 // value - result returned by the bucket index check op for the given bucket index object (shard)
4885 map<int, string> oids;
4886 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
4887
4888 int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &oids, nullptr);
4889 if (ret < 0) {
4890 return ret;
4891 }
4892
4893 for (auto& iter : oids) {
4894 bucket_objs_ret[iter.first] = rgw_cls_check_index_ret();
4895 }
4896
4897 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
4898 if (ret < 0) {
4899 return ret;
4900 }
4901
4902 // Aggregate results (from different shards, if there are any)
4903 map<int, struct rgw_cls_check_index_ret>::iterator iter;
4904 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
4905 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
4906 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
4907 }
4908
4909 return 0;
4910 }
4911
4912 int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
4913 {
4914 RGWSI_RADOS::Pool index_pool;
4915 map<int, string> bucket_objs;
4916
4917 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4918 if (r < 0) {
4919 return r;
4920 }
4921
4922 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
4923 }
4924
4925 int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
4926 {
4927 RGWSI_RADOS::Pool index_pool;
4928 map<int, string> bucket_objs;
4929
4930 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
4931 if (r < 0) {
4932 return r;
4933 }
4934
4935 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
4936 }
4937
4938 int RGWRados::defer_gc(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
4939 {
4940 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
4941 std::string oid, key;
4942 get_obj_bucket_and_oid_loc(obj, oid, key);
4943 if (!rctx)
4944 return 0;
4945
4946 RGWObjState *state = NULL;
4947
4948 int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, false, y);
4949 if (r < 0)
4950 return r;
4951
4952 if (!state->is_atomic) {
4953 ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
4954 return -EINVAL;
4955 }
4956
4957 string tag;
4958
4959 if (state->tail_tag.length() > 0) {
4960 tag = state->tail_tag.c_str();
4961 } else if (state->obj_tag.length() > 0) {
4962 tag = state->obj_tag.c_str();
4963 } else {
4964 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
4965 return -EINVAL;
4966 }
4967
4968 ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
4969
4970 cls_rgw_obj_chain chain;
4971 update_gc_chain(dpp, state->obj, *state->manifest, &chain);
4972 return gc->async_defer_chain(tag, chain);
4973 }
4974
4975 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
4976 {
4977 list<string> prefixes;
4978 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
4979 cls_rgw_remove_obj(op, prefixes);
4980 }
4981
4982 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
4983 {
4984 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
4985 }
4986
4987 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
4988 {
4989 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
4990 }
4991
4992 struct tombstone_entry {
4993 ceph::real_time mtime;
4994 uint32_t zone_short_id;
4995 uint64_t pg_ver;
4996
4997 tombstone_entry() = default;
4998 explicit tombstone_entry(const RGWObjState& state)
4999 : mtime(state.mtime), zone_short_id(state.zone_short_id),
5000 pg_ver(state.pg_ver) {}
5001 };
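// Editor's note: hedged sketch of the tombstone cache round trip implemented
// later in this file, not part of the original source. A successful delete
// records the head object's last known state, so a subsequent stat that hits
// -ENOENT can still report a meaningful mtime/zone/pg_ver to the
// copy-if-newer logic:
//
//   tombstone_entry entry{*state};                 // captured in Delete::delete_obj()
//   obj_tombstone_cache->add(obj, entry);
//   ...
//   if (obj_tombstone_cache->find(obj, entry)) {   // in get_obj_state_impl()
//     s->mtime = entry.mtime;
//     s->zone_short_id = entry.zone_short_id;
//     s->pg_ver = entry.pg_ver;
//   }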
5002
5003 /**
5004 * Delete an object.
5005 * bucket: name of the bucket storing the object
5006 * obj: name of the object to delete
5007 * Returns: 0 on success, -ERR# otherwise.
5008 */
5009 int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
5010 {
5011 RGWRados *store = target->get_store();
5012 rgw_obj& src_obj = target->get_obj();
5013 const string& instance = src_obj.key.instance;
5014 rgw_obj obj = src_obj;
5015
5016 if (instance == "null") {
5017 obj.key.instance.clear();
5018 }
5019
5020 bool explicit_marker_version = (!params.marker_version_id.empty());
5021
5022 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5023 if (instance.empty() || explicit_marker_version) {
5024 rgw_obj marker = obj;
5025
5026 if (!params.marker_version_id.empty()) {
5027 if (params.marker_version_id != "null") {
5028 marker.key.set_instance(params.marker_version_id);
5029 }
5030 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5031 store->gen_rand_obj_instance_name(&marker);
5032 }
5033
5034 result.version_id = marker.key.instance;
5035 if (result.version_id.empty())
5036 result.version_id = "null";
5037 result.delete_marker = true;
5038
5039 struct rgw_bucket_dir_entry_meta meta;
5040
5041 meta.owner = params.obj_owner.get_id().to_str();
5042 meta.owner_display_name = params.obj_owner.get_display_name();
5043
5044 if (real_clock::is_zero(params.mtime)) {
5045 meta.mtime = real_clock::now();
5046 } else {
5047 meta.mtime = params.mtime;
5048 }
5049
5050 int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
5051 if (r < 0) {
5052 return r;
5053 }
5054 } else {
5055 rgw_bucket_dir_entry dirent;
5056
5057 int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
5058 if (r < 0) {
5059 return r;
5060 }
5061 result.delete_marker = dirent.is_delete_marker();
5062 r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
5063 if (r < 0) {
5064 return r;
5065 }
5066 result.version_id = instance;
5067 }
5068
5069 BucketShard *bs;
5070 int r = target->get_bucket_shard(&bs, dpp);
5071 if (r < 0) {
5072 ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
5073 return r;
5074 }
5075
5076 r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
5077 if (r < 0) {
5078 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
5079 return r;
5080 }
5081
5082 return 0;
5083 }
5084
5085 rgw_rados_ref ref;
5086 int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
5087 if (r < 0) {
5088 return r;
5089 }
5090
5091 RGWObjState *state;
5092 r = target->get_state(dpp, &state, false, y);
5093 if (r < 0)
5094 return r;
5095
5096 ObjectWriteOperation op;
5097
5098 if (!real_clock::is_zero(params.unmod_since)) {
5099 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5100 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5101 if (!params.high_precision_time) {
5102 ctime.tv_nsec = 0;
5103 unmod.tv_nsec = 0;
5104 }
5105
5106 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
5107 if (ctime > unmod) {
5108 return -ERR_PRECONDITION_FAILED;
5109 }
5110
5111 /* only delete object if mtime is less than or equal to params.unmod_since */
5112 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5113 }
5114 uint64_t obj_accounted_size = state->accounted_size;
5115
5116 if (params.abortmp) {
5117 obj_accounted_size = params.parts_accounted_size;
5118 }
5119
5120 if (!real_clock::is_zero(params.expiration_time)) {
5121 bufferlist bl;
5122 real_time delete_at;
5123
5124 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5125 try {
5126 auto iter = bl.cbegin();
5127 decode(delete_at, iter);
5128 } catch (buffer::error& err) {
5129 ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
5130 return -EIO;
5131 }
5132
5133 if (params.expiration_time != delete_at) {
5134 return -ERR_PRECONDITION_FAILED;
5135 }
5136 } else {
5137 return -ERR_PRECONDITION_FAILED;
5138 }
5139 }
5140
5141 if (!state->exists) {
5142 target->invalidate_state();
5143 return -ENOENT;
5144 }
5145
5146 r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
5147 if (r < 0)
5148 return r;
5149
5150 RGWBucketInfo& bucket_info = target->get_bucket_info();
5151
5152 RGWRados::Bucket bop(store, bucket_info);
5153 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5154
5155 index_op.set_zones_trace(params.zones_trace);
5156 index_op.set_bilog_flags(params.bilog_flags);
5157
5158 r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
5159 if (r < 0)
5160 return r;
5161
5162 store->remove_rgw_head_obj(op);
5163
5164 auto& ioctx = ref.pool.ioctx();
5165 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
5166
5167 /* raced with another operation, object state is indeterminate */
5168 const bool need_invalidate = (r == -ECANCELED);
5169
5170 int64_t poolid = ioctx.get_id();
5171 if (r >= 0) {
5172 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5173 if (obj_tombstone_cache) {
5174 tombstone_entry entry{*state};
5175 obj_tombstone_cache->add(obj, entry);
5176 }
5177 r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
5178
5179 int ret = target->complete_atomic_modification(dpp);
5180 if (ret < 0) {
5181 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5182 }
5183 /* other than that, no need to propagate error */
5184 } else {
5185 int ret = index_op.cancel(dpp);
5186 if (ret < 0) {
5187 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5188 }
5189 }
5190
5191 if (need_invalidate) {
5192 target->invalidate_state();
5193 }
5194
5195 if (r < 0)
5196 return r;
5197
5198 /* update quota cache */
5199 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
5200
5201 return 0;
5202 }
5203
5204 int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
5205 RGWObjectCtx& obj_ctx,
5206 const RGWBucketInfo& bucket_info,
5207 const rgw_obj& obj,
5208 int versioning_status, // versioning flags in enum RGWBucketFlags
5209 uint16_t bilog_flags,
5210 const real_time& expiration_time,
5211 rgw_zone_set *zones_trace)
5212 {
5213 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5214 RGWRados::Object::Delete del_op(&del_target);
5215
5216 del_op.params.bucket_owner = bucket_info.owner;
5217 del_op.params.versioning_status = versioning_status;
5218 del_op.params.bilog_flags = bilog_flags;
5219 del_op.params.expiration_time = expiration_time;
5220 del_op.params.zones_trace = zones_trace;
5221
5222 return del_op.delete_obj(null_yield, dpp);
5223 }
5224
5225 int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
5226 {
5227 rgw_rados_ref ref;
5228 int r = get_raw_obj_ref(dpp, obj, &ref);
5229 if (r < 0) {
5230 return r;
5231 }
5232
5233 ObjectWriteOperation op;
5234
5235 op.remove();
5236 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
5237 if (r < 0)
5238 return r;
5239
5240 return 0;
5241 }
5242
5243 int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
5244 {
5245 std::string oid, key;
5246 get_obj_bucket_and_oid_loc(obj, oid, key);
5247
5248 auto obj_ctx = svc.sysobj->init_obj_ctx();
5249
5250 RGWBucketInfo bucket_info;
5251 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
5252 if (ret < 0) {
5253 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5254 return ret;
5255 }
5256
5257 RGWRados::Bucket bop(this, bucket_info);
5258 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5259
5260 return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
5261 }
5262
5263 static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::RGWStore* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5264 {
5265 string tag;
5266
5267 RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
5268 if (mi != manifest.obj_end(dpp)) {
5269 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5270 ++mi;
5271 tag = mi.get_location().get_raw_obj(store).oid;
5272 tag.append("_");
5273 }
5274
5275 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5276 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5277 MD5 hash;
5278 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
5279
5280 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5281 if (iter != attrset.end()) {
5282 bufferlist& bl = iter->second;
5283 hash.Update((const unsigned char *)bl.c_str(), bl.length());
5284 }
5285
5286 hash.Final(md5);
5287 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5288 tag.append(md5_str);
5289
5290 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5291
5292 tag_bl.append(tag.c_str(), tag.size() + 1);
5293 }
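// Editor's note: hedged illustration of the tag shape produced above, not part
// of the original source; the oid and digest are hypothetical:
//
//   tail oid              : "abc123__shadow_.myobj_1"
//   md5(manifest [+ etag]): "9e107d9d372bb6826bd81d3542a419d6"
//   fake tag              : "abc123__shadow_.myobj_1_9e107d9d372bb6826bd81d3542a419d6"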
5294
5295 static bool is_olh(map<string, bufferlist>& attrs)
5296 {
5297 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5298 return (iter != attrs.end());
5299 }
5300
5301 static bool has_olh_tag(map<string, bufferlist>& attrs)
5302 {
5303 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5304 return (iter != attrs.end());
5305 }
5306
5307 int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5308 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
5309 {
5310 ceph_assert(olh_state->is_olh);
5311
5312 rgw_obj target;
5313 int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5314 if (r < 0) {
5315 return r;
5316 }
5317 r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state, false, y);
5318 if (r < 0) {
5319 return r;
5320 }
5321
5322 return 0;
5323 }
5324
5325 int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5326 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
5327 {
5328 if (obj.empty()) {
5329 return -EINVAL;
5330 }
5331
5332 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5333
5334 RGWObjState *s = rctx->get_state(obj);
5335 ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5336 *state = s;
5337 if (s->has_attrs) {
5338 if (s->is_olh && need_follow_olh) {
5339 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
5340 }
5341 return 0;
5342 }
5343
5344 s->obj = obj;
5345
5346 rgw_raw_obj raw_obj;
5347 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5348
5349 int r = -ENOENT;
5350
5351 if (!assume_noent) {
5352 r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
5353 }
5354
5355 if (r == -ENOENT) {
5356 s->exists = false;
5357 s->has_attrs = true;
5358 tombstone_entry entry;
5359 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5360 s->mtime = entry.mtime;
5361 s->zone_short_id = entry.zone_short_id;
5362 s->pg_ver = entry.pg_ver;
5363 ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5364 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5365 } else {
5366 s->mtime = real_time();
5367 }
5368 return 0;
5369 }
5370 if (r < 0)
5371 return r;
5372
5373 s->exists = true;
5374 s->has_attrs = true;
5375 s->accounted_size = s->size;
5376
5377 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5378 if (iter != s->attrset.end()) {
5379 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5380 bufferlist& bletag = iter->second;
5381 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5382 bufferlist newbl;
5383 bletag.splice(0, bletag.length() - 1, &newbl);
5384 bletag = std::move(newbl);
5385 }
5386 }
5387
5388 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
5389 const bool compressed = (iter != s->attrset.end());
5390 if (compressed) {
5391 // use uncompressed size for accounted_size
5392 try {
5393 RGWCompressionInfo info;
5394 auto p = iter->second.cbegin();
5395 decode(info, p);
5396 s->accounted_size = info.orig_size;
5397 } catch (buffer::error&) {
5398 ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5399 return -EIO;
5400 }
5401 }
5402
5403 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5404 if (iter != s->attrset.end()) {
5405 bufferlist bl = iter->second;
5406 bufferlist::iterator it = bl.begin();
5407 it.copy(bl.length(), s->shadow_obj);
5408 s->shadow_obj[bl.length()] = '\0';
5409 }
5410 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
5411 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5412 if (ttiter != s->attrset.end()) {
5413 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5414 }
5415
5416 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5417 if (manifest_bl.length()) {
5418 auto miter = manifest_bl.cbegin();
5419 try {
5420 s->manifest.emplace();
5421 decode(*s->manifest, miter);
5422 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
5423 broken due to old bugs */
5424 s->size = s->manifest->get_obj_size();
5425 if (!compressed)
5426 s->accounted_size = s->size;
5427 } catch (buffer::error& err) {
5428 ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
5429 return -EIO;
5430 }
5431 ldpp_dout(dpp, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
5432 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
5433 s->manifest->has_explicit_objs()) {
5434 RGWObjManifest::obj_iterator mi;
5435 for (mi = s->manifest->obj_begin(dpp); mi != s->manifest->obj_end(dpp); ++mi) {
5436 ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(store) << dendl;
5437 }
5438 }
5439
5440 if (!s->obj_tag.length()) {
5441 /*
5442 * Uh oh, something's wrong: an object with a manifest should have a tag. Let's
5443 * create one out of the manifest; it should be unique.
5444 */
5445 generate_fake_tag(dpp, store, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
5446 s->fake_tag = true;
5447 }
5448 }
5449 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5450 if (aiter != s->attrset.end()) {
5451 bufferlist& pg_ver_bl = aiter->second;
5452 if (pg_ver_bl.length()) {
5453 auto pgbl = pg_ver_bl.cbegin();
5454 try {
5455 decode(s->pg_ver, pgbl);
5456 } catch (buffer::error& err) {
5457 ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5458 }
5459 }
5460 }
5461 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5462 if (aiter != s->attrset.end()) {
5463 bufferlist& zone_short_id_bl = aiter->second;
5464 if (zone_short_id_bl.length()) {
5465 auto zbl = zone_short_id_bl.cbegin();
5466 try {
5467 decode(s->zone_short_id, zbl);
5468 } catch (buffer::error& err) {
5469 ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5470 }
5471 }
5472 }
5473 if (s->obj_tag.length()) {
5474 ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5475 } else {
5476 ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5477 }
5478
5479 /* an object might not be an olh yet, but it could already have an olh id tag, so we should set it
5480 * anyway if it exists, and not only if is_olh() returns true
5481 */
5482 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5483 if (iter != s->attrset.end()) {
5484 s->olh_tag = iter->second;
5485 }
5486
5487 if (is_olh(s->attrset)) {
5488 s->is_olh = true;
5489
5490 ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5491
5492 if (need_follow_olh) {
5493 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
5494 } else if (obj.key.have_null_instance() && !s->manifest) {
5495 // reading the null version, but the head object only has olh info
5496 s->exists = false;
5497 return -ENOENT;
5498 }
5499 }
5500
5501 return 0;
5502 }
5503
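// Retry wrapper: get_obj_state_impl() may report -EAGAIN (for example while racing
// with concurrent OLH/versioning updates), so keep calling it until it settles.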
5504 int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
5505 bool follow_olh, optional_yield y, bool assume_noent)
5506 {
5507 int ret;
5508
5509 do {
5510 ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
5511 } while (ret == -EAGAIN);
5512
5513 return ret;
5514 }
5515
5516 int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
5517 {
5518 RGWObjState *astate;
5519 int r = get_state(dpp, &astate, true, y);
5520 if (r < 0) {
5521 return r;
5522 }
5523
5524 *pmanifest = &(*astate->manifest);
5525
5526 return 0;
5527 }
5528
5529 int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
5530 {
5531 RGWObjState *state;
5532 int r = source->get_state(dpp, &state, true, y);
5533 if (r < 0)
5534 return r;
5535 if (!state->exists)
5536 return -ENOENT;
5537 if (!state->get_attr(name, dest))
5538 return -ENODATA;
5539
5540 return 0;
5541 }
5542
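// Async stat: if the cached RGWObjState already holds the attributes, answer from the
// cache; otherwise issue a non-blocking stat2()+getxattrs() on the head object and let
// wait()/finish() collect and decode the result.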
5543 int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
5544 {
5545 RGWObjectCtx& ctx = source->get_ctx();
5546 rgw_obj& obj = source->get_obj();
5547 RGWRados *store = source->get_store();
5548
5549 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
5550 result.obj = obj;
5551 if (s->has_attrs) {
5552 state.ret = 0;
5553 result.size = s->size;
5554 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5555 result.attrs = s->attrset;
5556 result.manifest = s->manifest;
5557 return 0;
5558 }
5559
5560 string oid;
5561 string loc;
5562 get_obj_bucket_and_oid_loc(obj, oid, loc);
5563
5564 int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
5565 if (r < 0) {
5566 return r;
5567 }
5568
5569 librados::ObjectReadOperation op;
5570 op.stat2(&result.size, &result.mtime, NULL);
5571 op.getxattrs(&result.attrs, NULL);
5572 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
5573 state.io_ctx.locator_set_key(loc);
5574 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5575 if (r < 0) {
5576 ldpp_dout(dpp, 5) << __func__
5577 << ": ERROR: aio_operate() returned ret=" << r
5578 << dendl;
5579 return r;
5580 }
5581
5582 return 0;
5583 }
5584
5585
5586 int RGWRados::Object::Stat::wait()
5587 {
5588 if (!state.completion) {
5589 return state.ret;
5590 }
5591
5592 state.completion->wait_for_complete();
5593 state.ret = state.completion->get_return_value();
5594 state.completion->release();
5595
5596 if (state.ret != 0) {
5597 return state.ret;
5598 }
5599
5600 return finish();
5601 }
5602
5603 int RGWRados::Object::Stat::finish()
5604 {
5605 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5606 if (iter != result.attrs.end()) {
5607 bufferlist& bl = iter->second;
5608 auto biter = bl.cbegin();
5609 try {
5610 result.manifest.emplace();
5611 decode(*result.manifest, biter);
5612 } catch (buffer::error& err) {
5613 RGWRados *store = source->get_store();
5614 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5615 return -EIO;
5616 }
5617 }
5618
5619 return 0;
5620 }
5621
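// Convenience overload: load (or reuse) the object state from the context, then append
// the atomic-guard xattr comparison to the operation.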
5622 int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
5623 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5624 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
5625 {
5626 if (!rctx)
5627 return 0;
5628
5629 int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, false, y);
5630 if (r < 0)
5631 return r;
5632
5633 return append_atomic_test(dpp, *pstate, op);
5634 }
5635
5636 int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
5637 const RGWObjState* state,
5638 librados::ObjectOperation& op)
5639 {
5640 if (!state->is_atomic) {
5641 ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
5642 return 0;
5643 }
5644
5645 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5646 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5647 } else {
5648 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
5649 }
5650 return 0;
5651 }
5652
5653 int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
5654 {
5655 return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
5656 }
5657
5658 void RGWRados::Object::invalidate_state()
5659 {
5660 ctx.invalidate(obj);
5661 }
5662
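// Prepares a write/removal of the head object: installs a cmpxattr guard on the ID tag
// against racing writers, evaluates If-Match/If-None-Match preconditions against the
// cached state, optionally resets the head object, and stamps a fresh write tag unless
// this is a removal.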
5663 int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
5664 ObjectWriteOperation& op, bool reset_obj, const string *ptag,
5665 const char *if_match, const char *if_nomatch, bool removal_op,
5666 bool modify_tail, optional_yield y)
5667 {
5668 int r = get_state(dpp, &state, false, y);
5669 if (r < 0)
5670 return r;
5671
5672 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
5673 if_match != NULL || if_nomatch != NULL) &&
5674 (!state->fake_tag);
5675
5676 if (!state->is_atomic) {
5677 ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
5678
5679 if (reset_obj) {
5680 op.create(false);
5681 store->remove_rgw_head_obj(op); // we're not dropping a reference here, we're actually removing the object
5682 }
5683
5684 return 0;
5685 }
5686
5687 if (need_guard) {
5688 /* first verify that the object wasn't replaced under us */
5689 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5690 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5691 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5692 }
5693
5694 if (if_match) {
5695 if (strcmp(if_match, "*") == 0) {
5696 // test that the object exists
5697 if (!state->exists) {
5698 return -ERR_PRECONDITION_FAILED;
5699 }
5700 } else {
5701 bufferlist bl;
5702 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5703 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5704 return -ERR_PRECONDITION_FAILED;
5705 }
5706 }
5707 }
5708
5709 if (if_nomatch) {
5710 if (strcmp(if_nomatch, "*") == 0) {
5711 // test that the object does NOT exist
5712 if (state->exists) {
5713 return -ERR_PRECONDITION_FAILED;
5714 }
5715 } else {
5716 bufferlist bl;
5717 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5718 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5719 return -ERR_PRECONDITION_FAILED;
5720 }
5721 }
5722 }
5723 }
5724
5725 if (reset_obj) {
5726 if (state->exists) {
5727 op.create(false);
5728 store->remove_rgw_head_obj(op);
5729 } else {
5730 op.create(true);
5731 }
5732 }
5733
5734 if (removal_op) {
5735 /* the object is being removed, no need to update its tag */
5736 return 0;
5737 }
5738
5739 if (ptag) {
5740 state->write_tag = *ptag;
5741 } else {
5742 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5743 }
5744 bufferlist bl;
5745 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5746
5747 ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
5748
5749 op.setxattr(RGW_ATTR_ID_TAG, bl);
5750 if (modify_tail) {
5751 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5752 }
5753
5754 return 0;
5755 }
5756
5757 /**
5758 * Set an attr on an object.
5759 * bucket_info: info of the bucket holding the object
5760 * obj: the object to set the attr on
5761 * name: the attr to set
5762 * bl: the contents of the attr
5763 * Returns: 0 on success, -ERR# otherwise.
5764 */
5765 int RGWRados::set_attr(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
5766 {
5767 map<string, bufferlist> attrs;
5768 attrs[name] = bl;
5769 return set_attrs(dpp, ctx, bucket_info, obj, attrs, NULL, null_yield);
5770 }
5771
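// Sets/removes xattrs on the head object while keeping the bucket index in sync: when
// object state is available, the rados write is bracketed by index_op.prepare() and
// index_op.complete()/cancel(), and a new ID tag is written alongside the attributes.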
5772 int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
5773 map<string, bufferlist>& attrs,
5774 map<string, bufferlist>* rmattrs,
5775 optional_yield y)
5776 {
5777 rgw_obj obj = src_obj;
5778 if (obj.key.instance == "null") {
5779 obj.key.instance.clear();
5780 }
5781
5782 rgw_rados_ref ref;
5783 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
5784 if (r < 0) {
5785 return r;
5786 }
5787 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5788
5789 ObjectWriteOperation op;
5790 RGWObjState *state = NULL;
5791
5792 r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, y);
5793 if (r < 0)
5794 return r;
5795
5796 // ensure the null version object exists
5797 if (src_obj.key.instance == "null" && !state->manifest) {
5798 return -ENOENT;
5799 }
5800
5801 map<string, bufferlist>::iterator iter;
5802 if (rmattrs) {
5803 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5804 const string& name = iter->first;
5805 op.rmxattr(name.c_str());
5806 }
5807 }
5808
5809 const rgw_bucket& bucket = obj.bucket;
5810
5811 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5812 const string& name = iter->first;
5813 bufferlist& bl = iter->second;
5814
5815 if (!bl.length())
5816 continue;
5817
5818 op.setxattr(name.c_str(), bl);
5819
5820 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5821 real_time ts;
5822 try {
5823 decode(ts, bl);
5824
5825 rgw_obj_index_key obj_key;
5826 obj.key.get_index_key(&obj_key);
5827
5828 obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
5829 } catch (buffer::error& err) {
5830 ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
5831 }
5832 }
5833 }
5834
5835 if (!op.size())
5836 return 0;
5837
5838 RGWObjectCtx obj_ctx(this->store);
5839
5840 bufferlist bl;
5841 RGWRados::Bucket bop(this, bucket_info);
5842 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5843
5844 if (state) {
5845 string tag;
5846 append_rand_alpha(cct, tag, tag, 32);
5847 state->write_tag = tag;
5848 r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
5849
5850 if (r < 0)
5851 return r;
5852
5853 bl.append(tag.c_str(), tag.size() + 1);
5854 op.setxattr(RGW_ATTR_ID_TAG, bl);
5855 }
5856
5857
5858 real_time mtime = real_clock::now();
5859 struct timespec mtime_ts = real_clock::to_timespec(mtime);
5860 op.mtime2(&mtime_ts);
5861 auto& ioctx = ref.pool.ioctx();
5862 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
5863 if (state) {
5864 if (r >= 0) {
5865 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
5866 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
5867 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
5868 string etag = rgw_bl_str(etag_bl);
5869 string content_type = rgw_bl_str(content_type_bl);
5870 string storage_class;
5871 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
5872 if (iter != attrs.end()) {
5873 storage_class = rgw_bl_str(iter->second);
5874 }
5875 uint64_t epoch = ioctx.get_last_version();
5876 int64_t poolid = ioctx.get_id();
5877 r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
5878 mtime, etag, content_type, storage_class, &acl_bl,
5879 RGWObjCategory::Main, NULL);
5880 } else {
5881 int ret = index_op.cancel(dpp);
5882 if (ret < 0) {
5883 ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
5884 }
5885 }
5886 }
5887 if (r < 0)
5888 return r;
5889
5890 if (state) {
5891 state->obj_tag.swap(bl);
5892 if (rmattrs) {
5893 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5894 state->attrset.erase(iter->first);
5895 }
5896 }
5897
5898 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5899 state->attrset[iter->first] = iter->second;
5900 }
5901
5902 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
5903 if (iter != state->attrset.end()) {
5904 iter->second = state->obj_tag;
5905 }
5906 }
5907
5908 return 0;
5909 }
5910
5911 int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
5912 {
5913 RGWRados *store = source->get_store();
5914 CephContext *cct = store->ctx();
5915
5916 bufferlist etag;
5917
5918 map<string, bufferlist>::iterator iter;
5919
5920 RGWObjState *astate;
5921 int r = source->get_state(dpp, &astate, true, y);
5922 if (r < 0)
5923 return r;
5924
5925 if (!astate->exists) {
5926 return -ENOENT;
5927 }
5928
5929 const RGWBucketInfo& bucket_info = source->get_bucket_info();
5930
5931 state.obj = astate->obj;
5932 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
5933
5934 state.cur_pool = state.head_obj.pool;
5935 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
5936
5937 r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
5938 if (r < 0) {
5939 return r;
5940 }
5941 if (params.target_obj) {
5942 *params.target_obj = state.obj;
5943 }
5944 if (params.attrs) {
5945 *params.attrs = astate->attrset;
5946 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
5947 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
5948 ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
5949 }
5950 }
5951 }
5952
5953 /* Convert all times to GMT to make them compatible */
5954 if (conds.mod_ptr || conds.unmod_ptr) {
5955 obj_time_weight src_weight;
5956 src_weight.init(astate);
5957 src_weight.high_precision = conds.high_precision_time;
5958
5959 obj_time_weight dest_weight;
5960 dest_weight.high_precision = conds.high_precision_time;
5961
5962 if (conds.mod_ptr && !conds.if_nomatch) {
5963 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5964 ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5965 if (!(dest_weight < src_weight)) {
5966 return -ERR_NOT_MODIFIED;
5967 }
5968 }
5969
5970 if (conds.unmod_ptr && !conds.if_match) {
5971 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5972 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5973 if (dest_weight < src_weight) {
5974 return -ERR_PRECONDITION_FAILED;
5975 }
5976 }
5977 }
5978 if (conds.if_match || conds.if_nomatch) {
5979 r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
5980 if (r < 0)
5981 return r;
5982
5983 if (conds.if_match) {
5984 string if_match_str = rgw_string_unquote(conds.if_match);
5985 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
5986 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
5987 return -ERR_PRECONDITION_FAILED;
5988 }
5989 }
5990
5991 if (conds.if_nomatch) {
5992 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
5993 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
5994 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
5995 return -ERR_NOT_MODIFIED;
5996 }
5997 }
5998 }
5999
6000 if (params.obj_size)
6001 *params.obj_size = astate->size;
6002 if (params.lastmod)
6003 *params.lastmod = astate->mtime;
6004
6005 return 0;
6006 }
6007
6008 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6009 {
6010 if (ofs < 0) {
6011 ofs += obj_size;
6012 if (ofs < 0)
6013 ofs = 0;
6014 end = obj_size - 1;
6015 } else if (end < 0) {
6016 end = obj_size - 1;
6017 }
6018
6019 if (obj_size > 0) {
6020 if (ofs >= (off_t)obj_size) {
6021 return -ERANGE;
6022 }
6023 if (end >= (off_t)obj_size) {
6024 end = obj_size - 1;
6025 }
6026 }
6027 return 0;
6028 }
6029
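// Runs a bucket-index call, retrying while the shard reports -ERR_BUSY_RESHARDING: it
// blocks until resharding finishes, switches to the new bucket id, and retries against
// the freshly initialized shard.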
6030 int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, BucketShard **pbs, std::function<int(BucketShard *)> call)
6031 {
6032 RGWRados *store = target->get_store();
6033 BucketShard *bs;
6034 int r;
6035
6036 #define NUM_RESHARD_RETRIES 10
6037 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
6038 int ret = get_bucket_shard(&bs, dpp);
6039 if (ret < 0) {
6040 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6041 return ret;
6042 }
6043 r = call(bs);
6044 if (r != -ERR_BUSY_RESHARDING) {
6045 break;
6046 }
6047 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6048 string new_bucket_id;
6049 r = store->block_while_resharding(bs, &new_bucket_id,
6050 target->bucket_info, null_yield, dpp);
6051 if (r == -ERR_BUSY_RESHARDING) {
6052 continue;
6053 }
6054 if (r < 0) {
6055 return r;
6056 }
6057 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6058 i = 0; /* resharding is finished, make sure we can retry */
6059 r = target->update_bucket_id(new_bucket_id, dpp);
6060 if (r < 0) {
6061 ldpp_dout(dpp, 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
6062 return r;
6063 }
6064 invalidate_bs();
6065 } // for loop
6066
6067 if (r < 0) {
6068 return r;
6069 }
6070
6071 if (pbs) {
6072 *pbs = bs;
6073 }
6074
6075 return 0;
6076 }
6077
6078 int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
6079 {
6080 if (blind) {
6081 return 0;
6082 }
6083 RGWRados *store = target->get_store();
6084
6085 if (write_tag && write_tag->length()) {
6086 optag = string(write_tag->c_str(), write_tag->length());
6087 } else {
6088 if (optag.empty()) {
6089 append_rand_alpha(store->ctx(), optag, optag, 32);
6090 }
6091 }
6092
6093 int r = guard_reshard(dpp, nullptr, [&](BucketShard *bs) -> int {
6094 return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
6095 });
6096
6097 if (r < 0) {
6098 return r;
6099 }
6100 prepared = true;
6101
6102 return 0;
6103 }
6104
6105 int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
6106 uint64_t size, uint64_t accounted_size,
6107 ceph::real_time& ut, const string& etag,
6108 const string& content_type, const string& storage_class,
6109 bufferlist *acl_bl,
6110 RGWObjCategory category,
6111 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6112 bool appendable)
6113 {
6114 if (blind) {
6115 return 0;
6116 }
6117 RGWRados *store = target->get_store();
6118 BucketShard *bs;
6119
6120 int ret = get_bucket_shard(&bs, dpp);
6121 if (ret < 0) {
6122 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6123 return ret;
6124 }
6125
6126 rgw_bucket_dir_entry ent;
6127 obj.key.get_index_key(&ent.key);
6128 ent.meta.size = size;
6129 ent.meta.accounted_size = accounted_size;
6130 ent.meta.mtime = ut;
6131 ent.meta.etag = etag;
6132 ent.meta.storage_class = storage_class;
6133 if (user_data)
6134 ent.meta.user_data = *user_data;
6135
6136 ACLOwner owner;
6137 if (acl_bl && acl_bl->length()) {
6138 int ret = store->decode_policy(*acl_bl, &owner);
6139 if (ret < 0) {
6140 ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6141 }
6142 }
6143 ent.meta.owner = owner.get_id().to_str();
6144 ent.meta.owner_display_name = owner.get_display_name();
6145 ent.meta.content_type = content_type;
6146 ent.meta.appendable = appendable;
6147
6148 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
6149
6150 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
6151 if (r < 0) {
6152 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
6153 }
6154
6155 return ret;
6156 }
6157
6158 int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
6159 int64_t poolid, uint64_t epoch,
6160 real_time& removed_mtime,
6161 list<rgw_obj_index_key> *remove_objs)
6162 {
6163 if (blind) {
6164 return 0;
6165 }
6166 RGWRados *store = target->get_store();
6167 BucketShard *bs;
6168
6169 int ret = get_bucket_shard(&bs, dpp);
6170 if (ret < 0) {
6171 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6172 return ret;
6173 }
6174
6175 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
6176
6177 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
6178 if (r < 0) {
6179 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
6180 }
6181
6182 return ret;
6183 }
6184
6185
6186 int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp)
6187 {
6188 if (blind) {
6189 return 0;
6190 }
6191 RGWRados *store = target->get_store();
6192 BucketShard *bs;
6193
6194 int ret = guard_reshard(dpp, &bs, [&](BucketShard *bs) -> int {
6195 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6196 });
6197
6198 /*
6199 * we need to update the data log anyhow, so that whoever follows knows to update its internal
6200 * markers for the specific bucket shard log. Otherwise they end up staying behind, and users
6201 * have no way to tell that they're all caught up
6202 */
6203 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
6204 if (r < 0) {
6205 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
6206 }
6207
6208 return ret;
6209 }
6210
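// Synchronous ranged read: resolves the stripe containing 'ofs' via the manifest (or
// falls back to the head object), clips the request to the pool's max chunk size,
// serves any prefetched head data from memory, and issues a single rados read for the
// remainder.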
6211 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp)
6212 {
6213 RGWRados *store = source->get_store();
6214
6215 rgw_raw_obj read_obj;
6216 uint64_t read_ofs = ofs;
6217 uint64_t len, read_len;
6218 bool reading_from_head = true;
6219 ObjectReadOperation op;
6220
6221 bool merge_bl = false;
6222 bufferlist *pbl = &bl;
6223 bufferlist read_bl;
6224 uint64_t max_chunk_size;
6225
6226 RGWObjState *astate;
6227 int r = source->get_state(dpp, &astate, true, y);
6228 if (r < 0)
6229 return r;
6230
6231 if (astate->size == 0) {
6232 end = 0;
6233 } else if (end >= (int64_t)astate->size) {
6234 end = astate->size - 1;
6235 }
6236
6237 if (end < 0)
6238 len = 0;
6239 else
6240 len = end - ofs + 1;
6241
6242 if (astate->manifest && astate->manifest->has_tail()) {
6243 /* now get the relevant object part */
6244 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
6245
6246 uint64_t stripe_ofs = iter.get_stripe_ofs();
6247 read_obj = iter.get_location().get_raw_obj(store->store);
6248 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6249 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6250 reading_from_head = (read_obj == state.head_obj);
6251 } else {
6252 read_obj = state.head_obj;
6253 }
6254
6255 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
6256 if (r < 0) {
6257 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6258 return r;
6259 }
6260
6261 if (len > max_chunk_size)
6262 len = max_chunk_size;
6263
6264
6265 read_len = len;
6266
6267 if (reading_from_head) {
6268 /* only when reading from the head object do we need to do the atomic test */
6269 r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
6270 if (r < 0)
6271 return r;
6272
6273 if (astate && astate->prefetch_data) {
6274 if (!ofs && astate->data.length() >= len) {
6275 bl = astate->data;
6276 return bl.length();
6277 }
6278
6279 if (ofs < astate->data.length()) {
6280 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
6281 astate->data.begin(ofs).copy(copy_len, bl);
6282 read_len -= copy_len;
6283 read_ofs += copy_len;
6284 if (!read_len)
6285 return bl.length();
6286
6287 merge_bl = true;
6288 pbl = &read_bl;
6289 }
6290 }
6291 }
6292
6293 ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6294 op.read(read_ofs, read_len, pbl, NULL);
6295
6296 if (state.cur_pool != read_obj.pool) {
6297 auto iter = state.io_ctxs.find(read_obj.pool);
6298 if (iter == state.io_ctxs.end()) {
6299 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
6300 r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
6301 if (r < 0) {
6302 ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6303 return r;
6304 }
6305 } else {
6306 state.cur_ioctx = &iter->second;
6307 }
6308 state.cur_pool = read_obj.pool;
6309 }
6310
6311 state.cur_ioctx->locator_set_key(read_obj.loc);
6312
6313 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6314 ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
6315
6316 if (r < 0) {
6317 return r;
6318 }
6319
6320 if (merge_bl) {
6321 bl.append(read_bl);
6322 }
6323
6324 return bl.length();
6325 }
6326
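// Helper state for the async GET path: reads are issued through an AIO throttle, and
// flush() hands completed buffers to the client callback strictly in offset order,
// using the logical object offset as the completion id.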
6327 struct get_obj_data {
6328 RGWRados* store;
6329 RGWGetDataCB* client_cb;
6330 rgw::Aio* aio;
6331 uint64_t offset; // next offset to write to client
6332 rgw::AioResultList completed; // completed read results, sorted by offset
6333 optional_yield yield;
6334
6335 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio,
6336 uint64_t offset, optional_yield yield)
6337 : store(store), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
6338
6339 int flush(rgw::AioResultList&& results) {
6340 int r = rgw::check_for_errors(results);
6341 if (r < 0) {
6342 return r;
6343 }
6344
6345 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6346 results.sort(cmp); // merge() requires results to be sorted first
6347 completed.merge(results, cmp); // merge results in sorted order
6348
6349 while (!completed.empty() && completed.front().id == offset) {
6350 auto bl = std::move(completed.front().data);
6351 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
6352
6353 offset += bl.length();
6354 int r = client_cb->handle_data(bl, 0, bl.length());
6355 if (r < 0) {
6356 return r;
6357 }
6358 }
6359 return 0;
6360 }
6361
6362 void cancel() {
6363 // wait for all completions to drain and ignore the results
6364 aio->drain();
6365 }
6366
6367 int drain() {
6368 auto c = aio->wait();
6369 while (!c.empty()) {
6370 int r = flush(std::move(c));
6371 if (r < 0) {
6372 cancel();
6373 return r;
6374 }
6375 c = aio->wait();
6376 }
6377 return flush(std::move(c));
6378 }
6379 };
6380
6381 static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6382 const rgw_raw_obj& read_obj, off_t obj_ofs,
6383 off_t read_ofs, off_t len, bool is_head_obj,
6384 RGWObjState *astate, void *arg)
6385 {
6386 struct get_obj_data *d = (struct get_obj_data *)arg;
6387
6388 return d->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
6389 is_head_obj, astate, arg);
6390 }
6391
6392 int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6393 const rgw_raw_obj& read_obj, off_t obj_ofs,
6394 off_t read_ofs, off_t len, bool is_head_obj,
6395 RGWObjState *astate, void *arg)
6396 {
6397 ObjectReadOperation op;
6398 struct get_obj_data *d = (struct get_obj_data *)arg;
6399 string oid, key;
6400
6401 if (is_head_obj) {
6402 /* only when reading from the head object do we need to do the atomic test */
6403 int r = append_atomic_test(dpp, astate, op);
6404 if (r < 0)
6405 return r;
6406
6407 if (astate &&
6408 obj_ofs < astate->data.length()) {
6409 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
6410
6411 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
6412 if (r < 0)
6413 return r;
6414
6415 len -= chunk_len;
6416 d->offset += chunk_len;
6417 read_ofs += chunk_len;
6418 obj_ofs += chunk_len;
6419 if (!len)
6420 return 0;
6421 }
6422 }
6423
6424 auto obj = d->store->svc.rados->obj(read_obj);
6425 int r = obj.open(dpp);
6426 if (r < 0) {
6427 ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
6428 return r;
6429 }
6430
6431 ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6432 op.read(read_ofs, len, nullptr, nullptr);
6433
6434 const uint64_t cost = len;
6435 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
6436
6437 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
6438
6439 return d->flush(std::move(completed));
6440 }
6441
6442 int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
6443 optional_yield y)
6444 {
6445 RGWRados *store = source->get_store();
6446 CephContext *cct = store->ctx();
6447 RGWObjectCtx& obj_ctx = source->get_ctx();
6448 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6449 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
6450
6451 auto aio = rgw::make_throttle(window_size, y);
6452 get_obj_data data(store, cb, &*aio, ofs, y);
6453
6454 int r = store->iterate_obj(dpp, obj_ctx, source->get_bucket_info(), state.obj,
6455 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
6456 if (r < 0) {
6457 ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
6458 data.cancel(); // drain completions without writing back to client
6459 return r;
6460 }
6461
6462 return data.drain();
6463 }
6464
6465 int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
6466 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6467 off_t ofs, off_t end, uint64_t max_chunk_size,
6468 iterate_obj_cb cb, void *arg, optional_yield y)
6469 {
6470 rgw_raw_obj head_obj;
6471 rgw_raw_obj read_obj;
6472 uint64_t read_ofs = ofs;
6473 uint64_t len;
6474 bool reading_from_head = true;
6475 RGWObjState *astate = NULL;
6476
6477 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6478
6479 int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, false, y);
6480 if (r < 0) {
6481 return r;
6482 }
6483
6484 if (end < 0)
6485 len = 0;
6486 else
6487 len = end - ofs + 1;
6488
6489 if (astate->manifest) {
6490 /* now get the relevant object stripe */
6491 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
6492
6493 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end(dpp);
6494
6495 for (; iter != obj_end && ofs <= end; ++iter) {
6496 off_t stripe_ofs = iter.get_stripe_ofs();
6497 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6498
6499 while (ofs < next_stripe_ofs && ofs <= end) {
6500 read_obj = iter.get_location().get_raw_obj(store);
6501 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6502 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6503
6504 if (read_len > max_chunk_size) {
6505 read_len = max_chunk_size;
6506 }
6507
6508 reading_from_head = (read_obj == head_obj);
6509 r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
6510 if (r < 0) {
6511 return r;
6512 }
6513
6514 len -= read_len;
6515 ofs += read_len;
6516 }
6517 }
6518 } else {
6519 while (ofs <= end) {
6520 read_obj = head_obj;
6521 uint64_t read_len = std::min(len, max_chunk_size);
6522
6523 r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
6524 if (r < 0) {
6525 return r;
6526 }
6527
6528 len -= read_len;
6529 ofs += read_len;
6530 }
6531 }
6532
6533 return 0;
6534 }
6535
6536 int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6537 {
6538 rgw_rados_ref ref;
6539 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
6540 if (r < 0) {
6541 return r;
6542 }
6543
6544 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
6545 }
6546
6547 int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6548 {
6549 rgw_rados_ref ref;
6550 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
6551 if (r < 0) {
6552 return r;
6553 }
6554
6555 bufferlist outbl;
6556
6557 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
6558 }
6559
6560 int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6561 {
6562 ObjectWriteOperation op;
6563
6564 ceph_assert(olh_obj.key.instance.empty());
6565
6566 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6567
6568 if (!state.exists) {
6569 op.create(true);
6570 } else {
6571 op.assert_exists();
6572 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6573 op.mtime2(&mtime_ts);
6574 }
6575
6576 /*
6577 * 3 possible cases: the olh object doesn't exist, it exists as an olh, or it exists as a regular object.
6578 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6579 * steps: first change its tag and set the olh pending attrs. Once the write is done we'll need to
6580 * truncate it, remove extra attrs, and send it to garbage collection. The bucket index olh
6581 * log will reflect that.
6582 *
6583 * We need to generate separate olh and obj tags, as the olh can be colocated with object data. obj_tag
6584 * is used for the object data instance, olh_tag for the olh instance.
6585 */
6586 if (has_tag) {
6587 /* guard against racing writes */
6588 bucket_index_guard_olh_op(dpp, state, op);
6589 }
6590
6591 if (!has_tag) {
6592 /* obj tag */
6593 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
6594
6595 bufferlist bl;
6596 bl.append(obj_tag.c_str(), obj_tag.size());
6597 op.setxattr(RGW_ATTR_ID_TAG, bl);
6598
6599 state.attrset[RGW_ATTR_ID_TAG] = bl;
6600 state.obj_tag = bl;
6601
6602 /* olh tag */
6603 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
6604
6605 bufferlist olh_bl;
6606 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6607 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6608
6609 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6610 state.olh_tag = olh_bl;
6611 state.is_olh = true;
6612
6613 bufferlist verbl;
6614 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6615 }
6616
6617 bufferlist bl;
6618 RGWOLHPendingInfo pending_info;
6619 pending_info.time = real_clock::now();
6620 encode(pending_info, bl);
6621
6622 #define OLH_PENDING_TAG_LEN 32
6623 /* the tag starts with the current time epoch, so that entries are sorted by time */
6624 char buf[32];
6625 utime_t ut(pending_info.time);
6626 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6627 *op_tag = buf;
6628
6629 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
6630
6631 op_tag->append(s);
6632
6633 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6634 attr_name.append(*op_tag);
6635
6636 op.setxattr(attr_name.c_str(), bl);
6637
6638 int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
6639 if (ret < 0) {
6640 return ret;
6641 }
6642
6643 state.exists = true;
6644 state.attrset[attr_name] = bl;
6645
6646 return 0;
6647 }
6648
6649 int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
6650 {
6651 int ret;
6652
6653 ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
6654 if (ret == -EEXIST) {
6655 ret = -ECANCELED;
6656 }
6657
6658 return ret;
6659 }
6660
6661 int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
6662 BucketShard *bs,
6663 const rgw_obj& obj_instance,
6664 const RGWBucketInfo& bucket_info,
6665 std::function<int(BucketShard *)> call)
6666 {
6667 rgw_obj obj;
6668 const rgw_obj *pobj = &obj_instance;
6669 int r;
6670
6671 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
6672 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
6673 if (r < 0) {
6674 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
6675 return r;
6676 }
6677 r = call(bs);
6678 if (r != -ERR_BUSY_RESHARDING) {
6679 break;
6680 }
6681 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6682 string new_bucket_id;
6683 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield, dpp);
6684 if (r == -ERR_BUSY_RESHARDING) {
6685 continue;
6686 }
6687 if (r < 0) {
6688 return r;
6689 }
6690 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6691 i = 0; /* resharding is finished, make sure we can retry */
6692
6693 obj = *pobj;
6694 obj.bucket.update_bucket_id(new_bucket_id);
6695 pobj = &obj;
6696 } // for loop
6697
6698 if (r < 0) {
6699 return r;
6700 }
6701
6702 return 0;
6703 }
6704
6705 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
6706 string *new_bucket_id,
6707 const RGWBucketInfo& bucket_info,
6708 optional_yield y,
6709 const DoutPrefixProvider *dpp)
6710 {
6711 int ret = 0;
6712 cls_rgw_bucket_instance_entry entry;
6713
6714 // since we want to run this recovery code from two distinct places,
6715 // let's just put it in a lambda so we can easily re-use it; if the
6716 // lambda successfully fetches a new bucket id, it sets
6717 // new_bucket_id and returns 0, otherwise it returns a negative
6718 // error code
6719 auto fetch_new_bucket_id =
6720 [this, &bucket_info, dpp](const std::string& log_tag,
6721 std::string* new_bucket_id) -> int {
6722 RGWBucketInfo fresh_bucket_info = bucket_info;
6723 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr, dpp);
6724 if (ret < 0) {
6725 ldpp_dout(dpp, 0) << __func__ <<
6726 " ERROR: failed to refresh bucket info after reshard at " <<
6727 log_tag << ": " << cpp_strerror(-ret) << dendl;
6728 return ret;
6729 }
6730 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
6731 return 0;
6732 };
6733
6734 constexpr int num_retries = 10;
6735 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
6736 auto& ref = bs->bucket_obj.get_ref();
6737 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
6738 if (ret == -ENOENT) {
6739 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
6740 } else if (ret < 0) {
6741 ldpp_dout(dpp, 0) << __func__ <<
6742 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6743 dendl;
6744 return ret;
6745 }
6746
6747 if (!entry.resharding_in_progress()) {
6748 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6749 new_bucket_id);
6750 }
6751
6752 ldpp_dout(dpp, 20) << "NOTICE: reshard still in progress; " <<
6753 (i < num_retries ? "retrying" : "too many retries") << dendl;
6754
6755 if (i == num_retries) {
6756 break;
6757 }
6758
6759 // If bucket is erroneously marked as resharding (e.g., crash or
6760 // other error) then fix it. If we can take the bucket reshard
6761 // lock then it means no other resharding should be taking place,
6762 // and we're free to clear the flags.
6763 {
6764 // since we expect to do this rarely, we'll do our work in a
6765 // block and erase our work after each try
6766
6767 RGWObjectCtx obj_ctx(this->store);
6768 const rgw_bucket& b = bs->bucket;
6769 std::string bucket_id = b.get_key();
6770 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
6771 ret = reshard_lock.lock();
6772 if (ret < 0) {
6773 ldpp_dout(dpp, 20) << __func__ <<
6774 " INFO: failed to take reshard lock for bucket " <<
6775 bucket_id << "; expected if resharding underway" << dendl;
6776 } else {
6777 ldpp_dout(dpp, 10) << __func__ <<
6778 " INFO: was able to take reshard lock for bucket " <<
6779 bucket_id << dendl;
6780 ret = RGWBucketReshard::clear_resharding(dpp, this->store, bucket_info);
6781 if (ret < 0) {
6782 reshard_lock.unlock();
6783 ldpp_dout(dpp, 0) << __func__ <<
6784 " ERROR: failed to clear resharding flags for bucket " <<
6785 bucket_id << dendl;
6786 } else {
6787 reshard_lock.unlock();
6788 ldpp_dout(dpp, 5) << __func__ <<
6789 " INFO: apparently successfully cleared resharding flags for "
6790 "bucket " << bucket_id << dendl;
6791 continue; // if we apparently succeeded, immediately test again
6792 } // if clear resharding succeeded
6793 } // if taking of lock succeeded
6794 } // block to encapsulate recovery from incomplete reshard
6795
6796 ret = reshard_wait->wait(y);
6797 if (ret < 0) {
6798 ldpp_dout(dpp, 0) << __func__ <<
6799 " ERROR: bucket is still resharding, please retry" << dendl;
6800 return ret;
6801 }
6802 } // for loop
6803
6804 ldpp_dout(dpp, 0) << __func__ <<
6805 " ERROR: bucket is still resharding, please retry" << dendl;
6806 return -ERR_BUSY_RESHARDING;
6807 }
6808
6809 int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
6810 bool delete_marker,
6811 const string& op_tag,
6812 struct rgw_bucket_dir_entry_meta *meta,
6813 uint64_t olh_epoch,
6814 real_time unmod_since, bool high_precision_time,
6815 rgw_zone_set *_zones_trace, bool log_data_change)
6816 {
6817 rgw_rados_ref ref;
6818 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
6819 if (r < 0) {
6820 return r;
6821 }
6822
6823 rgw_zone_set zones_trace;
6824 if (_zones_trace) {
6825 zones_trace = *_zones_trace;
6826 }
6827 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
6828
6829 BucketShard bs(this);
6830
6831 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
6832 [&](BucketShard *bs) -> int {
6833 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6834 auto& ref = bs->bucket_obj.get_ref();
6835 librados::ObjectWriteOperation op;
6836 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6837 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
6838 delete_marker, op_tag, meta, olh_epoch,
6839 unmod_since, high_precision_time,
6840 svc.zone->get_zone().log_data, zones_trace);
6841 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
6842 });
6843 if (r < 0) {
6844 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
6845 return r;
6846 }
6847
6848 r = svc.datalog_rados->add_entry(dpp, bucket_info, bs.shard_id);
6849 if (r < 0) {
6850 ldpp_dout(dpp, 0) << "ERROR: failed writing data log" << dendl;
6851 }
6852
6853 return 0;
6854 }
6855
6856 void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
6857 {
6858 ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
6859 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
6860 }
6861
6862 int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
6863 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
6864 {
6865 rgw_rados_ref ref;
6866 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
6867 if (r < 0) {
6868 return r;
6869 }
6870
6871 rgw_zone_set zones_trace;
6872 if (_zones_trace) {
6873 zones_trace = *_zones_trace;
6874 }
6875 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
6876
6877 BucketShard bs(this);
6878
6879 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6880 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
6881 [&](BucketShard *bs) -> int {
6882 auto& ref = bs->bucket_obj.get_ref();
6883 librados::ObjectWriteOperation op;
6884 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6885 cls_rgw_bucket_unlink_instance(op, key, op_tag,
6886 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
6887 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
6888 });
6889 if (r < 0) {
6890 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
6891 return r;
6892 }
6893
6894 return 0;
6895 }
6896
6897 int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
6898 const RGWBucketInfo& bucket_info, RGWObjState& state,
6899 const rgw_obj& obj_instance, uint64_t ver_marker,
6900 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
6901 bool *is_truncated)
6902 {
6903 rgw_rados_ref ref;
6904 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
6905 if (r < 0) {
6906 return r;
6907 }
6908
6909 BucketShard bs(this);
6910 int ret =
6911 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
6912 if (ret < 0) {
6913 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
6914 return ret;
6915 }
6916
6917 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6918
6919 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6920
6921 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
6922 [&](BucketShard *bs) -> int {
6923 auto& ref = bs->bucket_obj.get_ref();
6924 ObjectReadOperation op;
6925 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6926
6927 rgw_cls_read_olh_log_ret log_ret;
6928 int op_ret = 0;
6929 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
6930 bufferlist outbl;
6931 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
6932 if (r < 0) {
6933 return r;
6934 }
6935 if (op_ret < 0) {
6936 return op_ret;
6937 }
6938
6939 *log = std::move(log_ret.log);
6940 *is_truncated = log_ret.is_truncated;
6941 return r;
6942 });
6943 if (ret < 0) {
6944 ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
6945 return ret;
6946 }
6947
6948 return 0;
6949 }
6950
6951 // a multisite sync bug resulted in the OLH head attributes being overwritten by
6952 // the attributes from another zone, causing link_olh() to fail endlessly due to
6953 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
6954 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
6955 int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
6956 const rgw_obj& obj)
6957 {
6958 // fetch the current olh entry from the bucket index
6959 rgw_bucket_olh_entry olh;
6960 int r = bi_get_olh(dpp, bucket_info, obj, &olh);
6961 if (r < 0) {
6962 ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
6963 return r;
6964 }
6965 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
6966 return 0;
6967 }
6968
6969 ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
6970 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
6971
6972 // rewrite OLH_ID_TAG and OLH_INFO from current olh
6973 ObjectWriteOperation op;
6974 // assert this is the same olh tag we think we're fixing
6975 bucket_index_guard_olh_op(dpp, *state, op);
6976 // preserve existing mtime
6977 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
6978 op.mtime2(&mtime_ts);
6979 {
6980 bufferlist bl;
6981 bl.append(olh.tag.c_str(), olh.tag.size());
6982 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
6983 }
6984 {
6985 RGWOLHInfo info;
6986 info.target = rgw_obj(bucket_info.bucket, olh.key);
6987 info.removed = olh.delete_marker;
6988 bufferlist bl;
6989 encode(info, bl);
6990 op.setxattr(RGW_ATTR_OLH_INFO, bl);
6991 }
6992 rgw_rados_ref ref;
6993 r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
6994 if (r < 0) {
6995 return r;
6996 }
6997 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
6998 if (r < 0) {
6999 ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
7000 << cpp_strerror(r) << dendl;
7001 return r;
7002 }
7003 return 0;
7004 }
7005
7006 int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7007 {
7008 rgw_rados_ref ref;
7009 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7010 if (r < 0) {
7011 return r;
7012 }
7013
7014 BucketShard bs(this);
7015 int ret =
7016 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7017 if (ret < 0) {
7018 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7019 return ret;
7020 }
7021
7022 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7023
7024 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7025
7026 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
7027 [&](BucketShard *pbs) -> int {
7028 ObjectWriteOperation op;
7029 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7030 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
7031 return pbs->bucket_obj.operate(dpp, &op, null_yield);
7032 });
7033 if (ret < 0) {
7034 ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7035 return ret;
7036 }
7037
7038 return 0;
7039 }
7040
7041 int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7042 {
7043 rgw_rados_ref ref;
7044 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7045 if (r < 0) {
7046 return r;
7047 }
7048
7049 BucketShard bs(this);
7050
7051 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7052
7053 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7054
7055 int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
7056 [&](BucketShard *pbs) -> int {
7057 ObjectWriteOperation op;
7058 auto& ref = pbs->bucket_obj.get_ref();
7059 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7060 cls_rgw_clear_olh(op, key, olh_tag);
7061 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7062 });
7063 if (ret < 0) {
7064 ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7065 return ret;
7066 }
7067
7068 return 0;
7069 }
7070
7071 static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
7072 {
7073 try {
7074 auto biter = bl.cbegin();
7075 decode(*olh, biter);
7076 return 0;
7077 } catch (buffer::error& err) {
7078 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7079 return -EIO;
7080 }
7081 }
7082
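// Replays pending bucket-index OLH log entries onto the head (olh) object: re-links the
// winning version, deletes unlinked instances, clears the processed pending xattrs,
// trims the log, and removes the olh object itself when the last entry unlinked it.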
7083 int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
7084 RGWObjectCtx& obj_ctx,
7085 RGWObjState& state,
7086 const RGWBucketInfo& bucket_info,
7087 const rgw_obj& obj,
7088 bufferlist& olh_tag,
7089 std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
7090 uint64_t *plast_ver,
7091 rgw_zone_set* zones_trace)
7092 {
7093 if (log.empty()) {
7094 return 0;
7095 }
7096
7097 librados::ObjectWriteOperation op;
7098
7099 uint64_t last_ver = log.rbegin()->first;
7100 *plast_ver = last_ver;
7101
7102 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7103
7104 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7105 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7106
7107 bufferlist ver_bl;
7108 string last_ver_s = to_string(last_ver);
7109 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7110 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7111
7112 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7113 op.mtime2(&mtime_ts);
7114
7115 bool need_to_link = false;
7116 uint64_t link_epoch = 0;
7117 cls_rgw_obj_key key;
7118 bool delete_marker = false;
7119 list<cls_rgw_obj_key> remove_instances;
7120 bool need_to_remove = false;
7121
7122 // decode current epoch and instance
7123 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
7124 if (olh_ver != state.attrset.end()) {
7125 std::string str = olh_ver->second.to_str();
7126 std::string err;
7127 link_epoch = strict_strtoll(str.c_str(), 10, &err);
7128 }
7129 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
7130 if (olh_info != state.attrset.end()) {
7131 RGWOLHInfo info;
7132 int r = decode_olh_info(cct, olh_info->second, &info);
7133 if (r < 0) {
7134 return r;
7135 }
7136 info.target.key.get_index_key(&key);
7137 delete_marker = info.removed;
7138 }
7139
7140 for (iter = log.begin(); iter != log.end(); ++iter) {
7141 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7142 for (; viter != iter->second.end(); ++viter) {
7143 rgw_bucket_olh_log_entry& entry = *viter;
7144
7145 ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7146 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7147 << (entry.delete_marker ? "(delete)" : "") << dendl;
7148 switch (entry.op) {
7149 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7150 remove_instances.push_back(entry.key);
7151 break;
7152 case CLS_RGW_OLH_OP_LINK_OLH:
7153 // only overwrite a link of the same epoch if its key sorts before
7154 if (link_epoch < iter->first || key.instance.empty() ||
7155 key.instance > entry.key.instance) {
7156 ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7157 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7158 need_to_link = true;
7159 need_to_remove = false;
7160 key = entry.key;
7161 delete_marker = entry.delete_marker;
7162 } else {
7163 ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7164 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7165 }
7166 break;
7167 case CLS_RGW_OLH_OP_UNLINK_OLH:
7168 need_to_remove = true;
7169 need_to_link = false;
7170 break;
7171 default:
7172 ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7173 return -EIO;
7174 }
7175 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7176 attr_name.append(entry.op_tag);
7177 op.rmxattr(attr_name.c_str());
7178 }
7179 }
7180
7181 rgw_rados_ref ref;
7182 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7183 if (r < 0) {
7184 return r;
7185 }
7186
7187 const rgw_bucket& bucket = obj.bucket;
7188
7189 if (need_to_link) {
7190 rgw_obj target(bucket, key);
7191 RGWOLHInfo info;
7192 info.target = target;
7193 info.removed = delete_marker;
7194 bufferlist bl;
7195 encode(info, bl);
7196 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7197 }
7198
7199 /* first remove object instances */
7200 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7201 liter != remove_instances.end(); ++liter) {
7202 cls_rgw_obj_key& key = *liter;
7203 rgw_obj obj_instance(bucket, key);
7204 int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7205 if (ret < 0 && ret != -ENOENT) {
7206 ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7207 return ret;
7208 }
7209 }
7210
7211 /* update olh object */
7212 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7213 if (r == -ECANCELED) {
7214 r = 0;
7215 }
7216 if (r < 0) {
7217 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7218 return r;
7219 }
7220
7221 r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
7222 if (r < 0) {
7223 ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7224 return r;
7225 }
7226
7227 if (need_to_remove) {
7228 ObjectWriteOperation rm_op;
7229
7230 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7231 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7232 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if any of these exist -- a pending modification */
7233 rm_op.remove();
7234
7235 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7236 if (r == -ECANCELED) {
7237 return 0; /* someone else won this race */
7238 } else {
7239 /*
7240 * only clear if it was successful, otherwise we might clobber pending operations on this object
7241 */
7242 r = bucket_index_clear_olh(dpp, bucket_info, state, obj);
7243 if (r < 0) {
7244 ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7245 return r;
7246 }
7247 }
7248 }
7249
7250 return 0;
7251 }
7252
7253 /*
7254 * read olh log and apply it
7255 */
7256 int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7257 {
7258 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7259 bool is_truncated;
7260 uint64_t ver_marker = 0;
7261
7262 do {
7263 int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7264 if (ret < 0) {
7265 return ret;
7266 }
7267 ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7268 if (ret < 0) {
7269 return ret;
7270 }
7271 } while (is_truncated);
7272
7273 return 0;
7274 }
7275
7276 int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
7277 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
7278 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7279 {
7280 string op_tag;
7281
7282 rgw_obj olh_obj = target_obj;
7283 olh_obj.key.instance.clear();
7284
7285 RGWObjState *state = NULL;
7286
7287 int ret = 0;
7288 int i;
7289
7290 #define MAX_ECANCELED_RETRY 100
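// A note on the retry loop below: -ECANCELED from olh_init_modification() or
// bucket_index_link_olh() indicates we raced with another writer on the OLH;
// the cached object state is invalidated and the whole modification is
// re-attempted, up to MAX_ECANCELED_RETRY times before giving up with -EIO.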
7291 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7292 if (ret == -ECANCELED) {
7293 obj_ctx.invalidate(olh_obj);
7294 }
7295
7296 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7297 if (ret < 0) {
7298 return ret;
7299 }
7300
7301 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7302 if (ret < 0) {
7303 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7304 if (ret == -ECANCELED) {
7305 continue;
7306 }
7307 return ret;
7308 }
7309 ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj, delete_marker,
7310 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7311 zones_trace, log_data_change);
7312 if (ret < 0) {
7313 ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7314 if (ret == -ECANCELED) {
7315 // the bucket index rejected the link_olh() due to olh tag mismatch;
7316 // attempt to reconstruct olh head attributes based on the bucket index
7317 int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
7318 if (r2 < 0 && r2 != -ECANCELED) {
7319 return r2;
7320 }
7321 continue;
7322 }
7323 return ret;
7324 }
7325 break;
7326 }
7327
7328 if (i == MAX_ECANCELED_RETRY) {
7329 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7330 return -EIO;
7331 }
7332
7333 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7334 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7335 ret = 0;
7336 }
7337 if (ret < 0) {
7338 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7339 return ret;
7340 }
7341
7342 return 0;
7343 }
7344
7345 int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
7346 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7347 {
7348 string op_tag;
7349
7350 rgw_obj olh_obj = target_obj;
7351 olh_obj.key.instance.clear();
7352
7353 RGWObjState *state = NULL;
7354
7355 int ret = 0;
7356 int i;
7357
7358 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7359 if (ret == -ECANCELED) {
7360 obj_ctx.invalidate(olh_obj);
7361 }
7362
7363 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7364 if (ret < 0)
7365 return ret;
7366
7367 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7368 if (ret < 0) {
7369 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7370 if (ret == -ECANCELED) {
7371 continue;
7372 }
7373 return ret;
7374 }
7375
7376 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7377
7378 ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7379 if (ret < 0) {
7380 ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7381 if (ret == -ECANCELED) {
7382 continue;
7383 }
7384 return ret;
7385 }
7386 break;
7387 }
7388
7389 if (i == MAX_ECANCELED_RETRY) {
7390 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7391 return -EIO;
7392 }
7393
7394 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
7395 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7396 return 0;
7397 }
7398 if (ret < 0) {
7399 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7400 return ret;
7401 }
7402
7403 return 0;
7404 }
7405
7406 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7407 {
7408 #define OBJ_INSTANCE_LEN 32
7409 char buf[OBJ_INSTANCE_LEN + 1];
7410
7411 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7412 no underscore for instance name due to the way we encode the raw keys */
7413
7414 target_key->set_instance(buf);
7415 }
7416
7417 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7418 {
7419 gen_rand_obj_instance_name(&target_obj->key);
7420 }
7421
7422 int RGWRados::get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7423 {
7424 map<string, bufferlist> attrset;
7425
7426 ObjectReadOperation op;
7427 op.getxattrs(&attrset, NULL);
7428
7429 int r = obj_operate(dpp, bucket_info, obj, &op);
7430 if (r < 0) {
7431 return r;
7432 }
7433
7434 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7435 if (iter == attrset.end()) { /* not an olh */
7436 return -EINVAL;
7437 }
7438
7439 return decode_olh_info(cct, iter->second, olh);
7440 }
7441
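// check_pending_olh_entries(): split the pending-OLH xattrs into those that have
// exceeded rgw_olh_pending_timeout_sec (moved into rm_pending_entries for removal)
// and those still fresh; since entry names sort by second-granularity time, the
// scan stops at the first entry that has not yet timed out.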
7442 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7443 map<string, bufferlist> *rm_pending_entries)
7444 {
7445 map<string, bufferlist>::iterator iter = pending_entries.begin();
7446
7447 real_time now = real_clock::now();
7448
7449 while (iter != pending_entries.end()) {
7450 auto biter = iter->second.cbegin();
7451 RGWOLHPendingInfo pending_info;
7452 try {
7453 decode(pending_info, biter);
7454 } catch (buffer::error& err) {
7455 /* skipping bad entry, we could remove it but it might hide a bug */
7456 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7457 ++iter;
7458 continue;
7459 }
7460
7461 map<string, bufferlist>::iterator cur_iter = iter;
7462 ++iter;
7463 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7464 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7465 pending_entries.erase(cur_iter);
7466 } else {
7467 /* entry names are sorted by time (rounded to a second) */
7468 break;
7469 }
7470 }
7471 }
7472
7473 int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7474 {
7475 rgw_rados_ref ref;
7476 int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
7477 if (r < 0) {
7478 return r;
7479 }
7480
7481 // trim no more than 1000 entries per osd op
7482 constexpr int max_entries = 1000;
7483
7484 auto i = pending_attrs.begin();
7485 while (i != pending_attrs.end()) {
7486 ObjectWriteOperation op;
7487 bucket_index_guard_olh_op(dpp, state, op);
7488
7489 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7490 op.rmxattr(i->first.c_str());
7491 }
7492
7493 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7494 if (r == -ENOENT || r == -ECANCELED) {
7495 /* raced with some other change, shouldn't sweat about it */
7496 return 0;
7497 }
7498 if (r < 0) {
7499 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7500 return r;
7501 }
7502 }
7503 return 0;
7504 }
7505
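// follow_olh(): resolve an OLH (the logical head of a versioned object) to its
// current target. Stale pending entries are removed, any remaining pending
// entries trigger update_olh() to apply the log, and RGW_ATTR_OLH_INFO is then
// decoded to yield the target key (-ENOENT if the OLH is flagged removed, i.e.
// the latest version is a delete marker).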
7506 int RGWRados::follow_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7507 {
7508 map<string, bufferlist> pending_entries;
7509 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7510
7511 map<string, bufferlist> rm_pending_entries;
7512 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7513
7514 if (!rm_pending_entries.empty()) {
7515 int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
7516 if (ret < 0) {
7517 ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7518 return ret;
7519 }
7520 }
7521 if (!pending_entries.empty()) {
7522 ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7523
7524 int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7525 if (ret < 0) {
7526 return ret;
7527 }
7528 }
7529
7530 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7531 if (iter == state->attrset.end()) {
7532 return -EINVAL;
7533 }
7534
7535 RGWOLHInfo olh;
7536 int ret = decode_olh_info(cct, iter->second, &olh);
7537 if (ret < 0) {
7538 return ret;
7539 }
7540
7541 if (olh.removed) {
7542 return -ENOENT;
7543 }
7544
7545 *target = olh.target;
7546
7547 return 0;
7548 }
7549
7550 int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
7551 rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7552 map<string, bufferlist> *attrs, bufferlist *first_chunk,
7553 RGWObjVersionTracker *objv_tracker, optional_yield y)
7554 {
7555 rgw_rados_ref ref;
7556 int r = get_raw_obj_ref(dpp, obj, &ref);
7557 if (r < 0) {
7558 return r;
7559 }
7560
7561 map<string, bufferlist> unfiltered_attrset;
7562 uint64_t size = 0;
7563 struct timespec mtime_ts;
7564
7565 ObjectReadOperation op;
7566 if (objv_tracker) {
7567 objv_tracker->prepare_op_for_read(&op);
7568 }
7569 if (attrs) {
7570 op.getxattrs(&unfiltered_attrset, NULL);
7571 }
7572 if (psize || pmtime) {
7573 op.stat2(&size, &mtime_ts, NULL);
7574 }
7575 if (first_chunk) {
7576 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7577 }
7578 bufferlist outbl;
7579 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7580
7581 if (epoch) {
7582 *epoch = ref.pool.ioctx().get_last_version();
7583 }
7584
7585 if (r < 0)
7586 return r;
7587
7588 if (psize)
7589 *psize = size;
7590 if (pmtime)
7591 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7592 if (attrs) {
7593 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7594 }
7595
7596 return 0;
7597 }
7598
7599 int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
7600 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7601 {
7602 vector<rgw_bucket_dir_header> headers;
7603 map<int, string> bucket_instance_ids;
7604 int r = cls_bucket_head(dpp, bucket_info, shard_id, headers, &bucket_instance_ids);
7605 if (r < 0) {
7606 return r;
7607 }
7608
7609 ceph_assert(headers.size() == bucket_instance_ids.size());
7610
7611 auto iter = headers.begin();
7612 map<int, string>::iterator viter = bucket_instance_ids.begin();
7613 BucketIndexShardsManager ver_mgr;
7614 BucketIndexShardsManager master_ver_mgr;
7615 BucketIndexShardsManager marker_mgr;
7616 char buf[64];
7617 for(; iter != headers.end(); ++iter, ++viter) {
7618 accumulate_raw_stats(*iter, stats);
7619 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7620 ver_mgr.add(viter->first, string(buf));
7621 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7622 master_ver_mgr.add(viter->first, string(buf));
7623 if (shard_id >= 0) {
7624 *max_marker = iter->max_marker;
7625 } else {
7626 marker_mgr.add(viter->first, iter->max_marker);
7627 }
7628 if (syncstopped != NULL)
7629 *syncstopped = iter->syncstopped;
7630 }
7631 ver_mgr.to_string(bucket_ver);
7632 master_ver_mgr.to_string(master_ver);
7633 if (shard_id < 0) {
7634 marker_mgr.to_string(max_marker);
7635 }
7636 return 0;
7637 }
7638
7639 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7640 RGWGetBucketStats_CB *cb;
7641 uint32_t pendings;
7642 map<RGWObjCategory, RGWStorageStats> stats;
7643 int ret_code;
7644 bool should_cb;
7645 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7646
7647 public:
7648 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
7649 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7650 {}
7651
7652 void handle_response(int r, rgw_bucket_dir_header& header) override {
7653 std::lock_guard l{lock};
7654 if (should_cb) {
7655 if ( r >= 0) {
7656 accumulate_raw_stats(header, stats);
7657 } else {
7658 ret_code = r;
7659 }
7660
7661 // Are we all done?
7662 if (--pendings == 0) {
7663 if (!ret_code) {
7664 cb->set_response(&stats);
7665 }
7666 cb->handle_response(ret_code);
7667 cb->put();
7668 }
7669 }
7670 }
7671
7672 void unset_cb() {
7673 std::lock_guard l{lock};
7674 should_cb = false;
7675 }
7676 };
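// The context above is handed to cls_bucket_head_async() with `pendings` set to
// the number of bucket index shards queried (or 1; see get_bucket_stats_async()
// below). Each shard's callback accumulates into `stats`, records any error in
// `ret_code`, and the last shard to complete delivers the result to the user
// callback exactly once.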
7677
7678 int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7679 {
7680 int num_aio = 0;
7681 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
7682 ceph_assert(get_ctx);
7683 int r = cls_bucket_head_async(dpp, bucket_info, shard_id, get_ctx, &num_aio);
7684 if (r < 0) {
7685 ctx->put();
7686 if (num_aio) {
7687 get_ctx->unset_cb();
7688 }
7689 }
7690 get_ctx->put();
7691 return r;
7692 }
7693
7694 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7695 const string& meta_key,
7696 RGWBucketInfo& info,
7697 real_time *pmtime,
7698 map<string, bufferlist> *pattrs,
7699 optional_yield y,
7700 const DoutPrefixProvider *dpp)
7701 {
7702 rgw_bucket bucket;
7703 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7704
7705 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y, dpp);
7706 }
7707
7708 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
7709 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
7710 const DoutPrefixProvider *dpp)
7711 {
7712 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7713 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7714 y,
7715 dpp,
7716 RGWBucketCtl::BucketInstance::GetParams()
7717 .set_mtime(pmtime)
7718 .set_attrs(pattrs)
7719 .set_bectx_params(bectx_params));
7720 }
7721
7722 int RGWRados::get_bucket_info(RGWServices *svc,
7723 const string& tenant, const string& bucket_name,
7724 RGWBucketInfo& info,
7725 real_time *pmtime,
7726 optional_yield y,
7727 const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
7728 {
7729 auto obj_ctx = svc->sysobj->init_obj_ctx();
7730 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7731 rgw_bucket bucket;
7732 bucket.tenant = tenant;
7733 bucket.name = bucket_name;
7734 return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
7735 RGWBucketCtl::BucketInstance::GetParams()
7736 .set_mtime(pmtime)
7737 .set_attrs(pattrs)
7738 .set_bectx_params(bectx_params));
7739 }
7740
7741 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7742 ceph::real_time *pmtime,
7743 const DoutPrefixProvider *dpp,
7744 map<string, bufferlist> *pattrs)
7745 {
7746 rgw_bucket bucket = info.bucket;
7747 bucket.bucket_id.clear();
7748
7749 auto rv = info.objv_tracker.read_version;
7750
7751 return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
7752 RGWBucketCtl::BucketInstance::GetParams()
7753 .set_mtime(pmtime)
7754 .set_attrs(pattrs)
7755 .set_refresh_version(rv));
7756 }
7757
7758 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
7759 real_time mtime, map<string, bufferlist> *pattrs,
7760 const DoutPrefixProvider *dpp)
7761 {
7762 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
7763 RGWBucketCtl::BucketInstance::PutParams()
7764 .set_exclusive(exclusive)
7765 .set_mtime(mtime)
7766 .set_attrs(pattrs));
7767 }
7768
7769 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
7770 map<string, bufferlist> *pattrs, bool create_entry_point,
7771 const DoutPrefixProvider *dpp)
7772 {
7773 bool create_head = !info.has_instance_obj || create_entry_point;
7774
7775 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
7776 if (ret < 0) {
7777 return ret;
7778 }
7779
7780 if (!create_head)
7781 return 0; /* done! */
7782
7783 RGWBucketEntryPoint entry_point;
7784 entry_point.bucket = info.bucket;
7785 entry_point.owner = info.owner;
7786 entry_point.creation_time = info.creation_time;
7787 entry_point.linked = true;
7788 RGWObjVersionTracker ot;
7789 if (pep_objv && !pep_objv->tag.empty()) {
7790 ot.write_version = *pep_objv;
7791 } else {
7792 ot.generate_new_write_ver(cct);
7793 if (pep_objv) {
7794 *pep_objv = ot.write_version;
7795 }
7796 }
7797 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
7798 .set_exclusive(exclusive)
7799 .set_objv_tracker(&ot)
7800 .set_mtime(mtime));
7801 if (ret < 0)
7802 return ret;
7803
7804 return 0;
7805 }
7806
7807 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
7808 {
7809 auto obj_ctx = svc.sysobj->init_obj_ctx();
7810
7811 map<string, RGWBucketEnt>::iterator iter;
7812 for (iter = m.begin(); iter != m.end(); ++iter) {
7813 RGWBucketEnt& ent = iter->second;
7814 rgw_bucket& bucket = ent.bucket;
7815 ent.count = 0;
7816 ent.size = 0;
7817 ent.size_rounded = 0;
7818
7819 vector<rgw_bucket_dir_header> headers;
7820
7821 RGWBucketInfo bucket_info;
7822 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield, dpp);
7823 if (ret < 0) {
7824 return ret;
7825 }
7826
7827 int r = cls_bucket_head(dpp, bucket_info, RGW_NO_SHARD, headers);
7828 if (r < 0)
7829 return r;
7830
7831 auto hiter = headers.begin();
7832 for (; hiter != headers.end(); ++hiter) {
7833 RGWObjCategory category = main_category;
7834 auto iter = (hiter->stats).find(category);
7835 if (iter != hiter->stats.end()) {
7836 struct rgw_bucket_category_stats& stats = iter->second;
7837 ent.count += stats.num_entries;
7838 ent.size += stats.total_size;
7839 ent.size_rounded += stats.total_size_rounded;
7840 }
7841 }
7842
7843 // fill in placement_rule from the bucket instance for use in swift's
7844 // per-storage policy statistics
7845 ent.placement_rule = std::move(bucket_info.placement_rule);
7846 }
7847
7848 return m.size();
7849 }
7850
7851 int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
7852 {
7853 rgw_rados_ref ref;
7854 int r = get_raw_obj_ref(dpp, obj, &ref);
7855 if (r < 0) {
7856 return r;
7857 }
7858 librados::Rados *rad = get_rados_handle();
7859 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7860
7861 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7862 completion->release();
7863 return r;
7864 }
7865
7866 int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
7867 {
7868 librados::IoCtx& io_ctx = ctx.io_ctx;
7869 librados::NObjectIterator& iter = ctx.iter;
7870
7871 int r = open_pool_ctx(dpp, pool, io_ctx, false);
7872 if (r < 0)
7873 return r;
7874
7875 iter = io_ctx.nobjects_begin();
7876
7877 return 0;
7878 }
7879
7880 int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
7881 {
7882 librados::IoCtx& io_ctx = ctx.io_ctx;
7883 librados::NObjectIterator& iter = ctx.iter;
7884
7885 int r = open_pool_ctx(dpp, pool, io_ctx, false);
7886 if (r < 0)
7887 return r;
7888
7889 librados::ObjectCursor oc;
7890 if (!oc.from_str(cursor)) {
7891 ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
7892 return -EINVAL;
7893 }
7894
7895 try {
7896 iter = io_ctx.nobjects_begin(oc);
7897 return 0;
7898 } catch (const std::system_error& e) {
7899 r = -e.code().value();
7900 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
7901 << ", returning " << r << dendl;
7902 return r;
7903 } catch (const std::exception& e) {
7904 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
7905 << ", returning -5" << dendl;
7906 return -EIO;
7907 }
7908 }
7909
7910 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
7911 {
7912 return ctx.iter.get_cursor().to_str();
7913 }
7914
7915 static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
7916 vector<rgw_bucket_dir_entry>& objs,
7917 bool *is_truncated, RGWAccessListFilter *filter)
7918 {
7919 librados::IoCtx& io_ctx = ctx.io_ctx;
7920 librados::NObjectIterator& iter = ctx.iter;
7921
7922 if (iter == io_ctx.nobjects_end())
7923 return -ENOENT;
7924
7925 uint32_t i;
7926
7927 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
7928 rgw_bucket_dir_entry e;
7929
7930 string oid = iter->get_oid();
7931 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7932
7933 // fill it in with initial values; we may correct later
7934 if (filter && !filter->filter(oid, oid))
7935 continue;
7936
7937 e.key = oid;
7938 objs.push_back(e);
7939 }
7940
7941 if (is_truncated)
7942 *is_truncated = (iter != io_ctx.nobjects_end());
7943
7944 return objs.size();
7945 }
7946
7947 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
7948 bool *is_truncated, RGWAccessListFilter *filter)
7949 {
7950 // catch exceptions from NObjectIterator::operator++()
7951 try {
7952 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
7953 } catch (const std::system_error& e) {
7954 int r = -e.code().value();
7955 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7956 << ", returning " << r << dendl;
7957 return r;
7958 } catch (const std::exception& e) {
7959 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7960 << ", returning -5" << dendl;
7961 return -EIO;
7962 }
7963 }
7964
7965 int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7966 {
7967 if (!ctx->initialized) {
7968 int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
7969 if (r < 0) {
7970 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7971 return r;
7972 }
7973 ctx->initialized = true;
7974 }
7975 return 0;
7976 }
7977
7978 int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
7979 RGWListRawObjsCtx& ctx, list<string>& oids,
7980 bool *is_truncated)
7981 {
7982 if (!ctx.initialized) {
7983 return -EINVAL;
7984 }
7985 RGWAccessListFilterPrefix filter(prefix_filter);
7986 vector<rgw_bucket_dir_entry> objs;
7987 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
7988 if (r < 0) {
7989 if(r != -ENOENT)
7990 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7991 return r;
7992 }
7993
7994 vector<rgw_bucket_dir_entry>::iterator iter;
7995 for (iter = objs.begin(); iter != objs.end(); ++iter) {
7996 oids.push_back(iter->key.name);
7997 }
7998
7999 return oids.size();
8000 }
8001
8002 int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
8003 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8004 bool *is_truncated)
8005 {
8006 if (!ctx.initialized) {
8007 int r = list_raw_objects_init(dpp, pool, string(), &ctx);
8008 if (r < 0) {
8009 return r;
8010 }
8011 }
8012
8013 return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
8014 }
8015
8016 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8017 {
8018 return pool_iterate_get_cursor(ctx.iter_ctx);
8019 }
8020
8021 int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8022 rgw_bucket_dir_entry *dirent)
8023 {
8024 rgw_cls_bi_entry bi_entry;
8025 int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
8026 if (r < 0 && r != -ENOENT) {
8027 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8028 }
8029 if (r < 0) {
8030 return r;
8031 }
8032 auto iter = bi_entry.data.cbegin();
8033 try {
8034 decode(*dirent, iter);
8035 } catch (buffer::error& err) {
8036 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8037 return -EIO;
8038 }
8039
8040 return 0;
8041 }
8042
8043 int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8044 rgw_bucket_olh_entry *olh)
8045 {
8046 rgw_cls_bi_entry bi_entry;
8047 int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
8048 if (r < 0 && r != -ENOENT) {
8049 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8050 }
8051 if (r < 0) {
8052 return r;
8053 }
8054 auto iter = bi_entry.data.cbegin();
8055 try {
8056 decode(*olh, iter);
8057 } catch (buffer::error& err) {
8058 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8059 return -EIO;
8060 }
8061
8062 return 0;
8063 }
8064
8065 int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8066 BIIndexType index_type, rgw_cls_bi_entry *entry)
8067 {
8068 BucketShard bs(this);
8069 int ret = bs.init(dpp, bucket_info, obj);
8070 if (ret < 0) {
8071 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
8072 return ret;
8073 }
8074
8075 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
8076
8077 auto& ref = bs.bucket_obj.get_ref();
8078
8079 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
8080 }
8081
8082 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8083 {
8084 auto& ref = bs.bucket_obj.get_ref();
8085 cls_rgw_bi_put(op, ref.obj.oid, entry);
8086 }
8087
8088 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8089 {
8090 auto& ref = bs.bucket_obj.get_ref();
8091 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
8092 if (ret < 0)
8093 return ret;
8094
8095 return 0;
8096 }
8097
8098 int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
8099 {
8100 BucketShard bs(this);
8101 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
8102 if (ret < 0) {
8103 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
8104 return ret;
8105 }
8106
8107 return bi_put(bs, entry);
8108 }
8109
8110 int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8111 {
8112 rgw_obj obj(bucket, obj_name);
8113 BucketShard bs(this);
8114 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
8115 if (ret < 0) {
8116 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
8117 return ret;
8118 }
8119
8120 auto& ref = bs.bucket_obj.get_ref();
8121 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name, marker, max, entries, is_truncated);
8122 if (ret == -ENOENT) {
8123 *is_truncated = false;
8124 }
8125 if (ret < 0)
8126 return ret;
8127
8128 return 0;
8129 }
8130
8131 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8132 {
8133 auto& ref = bs.bucket_obj.get_ref();
8134 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, filter_obj, marker, max, entries, is_truncated);
8135 if (ret < 0)
8136 return ret;
8137
8138 return 0;
8139 }
8140
8141 int RGWRados::bi_remove(BucketShard& bs)
8142 {
8143 auto& ref = bs.bucket_obj.get_ref();
8144 int ret = ref.pool.ioctx().remove(ref.obj.oid);
8145 if (ret == -ENOENT) {
8146 ret = 0;
8147 }
8148 if (ret < 0) {
8149 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8150 return ret;
8151 }
8152
8153 return 0;
8154 }
8155
8156 int RGWRados::bi_list(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8157 {
8158 BucketShard bs(this);
8159 int ret = bs.init(bucket_info.bucket, shard_id, bucket_info.layout.current_index, nullptr /* no RGWBucketInfo */, dpp);
8160 if (ret < 0) {
8161 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
8162 return ret;
8163 }
8164
8165 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8166 }
8167
8168 int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
8169 {
8170 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
8171 }
8172
8173 int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8174 librados::ObjectWriteOperation *op)
8175 {
8176 return gc_pool_ctx.aio_operate(oid, c, op);
8177 }
8178
8179 int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8180 {
8181 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
8182 }
8183
8184 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
8185 {
8186 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
8187 }
8188
8189 int RGWRados::process_gc(bool expired_only)
8190 {
8191 return gc->process(expired_only);
8192 }
8193
8194 int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
8195 vector<rgw::sal::Lifecycle::LCEntry>& progress_map,
8196 int& index)
8197 {
8198 return lc->list_lc_progress(marker, max_entries, progress_map, index);
8199 }
8200
8201 int RGWRados::process_lc()
8202 {
8203 RGWLC lc;
8204 lc.initialize(cct, this->store);
8205 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
8206 auto ret = lc.process(&worker, true /* once */);
8207 lc.stop_processor(); // sets down_flag, but returns immediately
8208 return ret;
8209 }
8210
8211 bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
8212 {
8213 return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
8214 }
8215
8216 int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
8217 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
8218 {
8219 rgw_zone_set zones_trace;
8220 if (_zones_trace) {
8221 zones_trace = *_zones_trace;
8222 }
8223 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
8224
8225 ObjectWriteOperation o;
8226 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
8227 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8228 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
8229 return bs.bucket_obj.operate(dpp, &o, y);
8230 }
8231
8232 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
8233 int64_t pool, uint64_t epoch,
8234 rgw_bucket_dir_entry& ent, RGWObjCategory category,
8235 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
8236 {
8237 ObjectWriteOperation o;
8238 rgw_bucket_dir_entry_meta dir_meta;
8239 dir_meta = ent.meta;
8240 dir_meta.category = category;
8241
8242 rgw_zone_set zones_trace;
8243 if (_zones_trace) {
8244 zones_trace = *_zones_trace;
8245 }
8246 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
8247
8248 rgw_bucket_entry_ver ver;
8249 ver.pool = pool;
8250 ver.epoch = epoch;
8251 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
8252 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8253 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
8254 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
8255 complete_op_data *arg;
8256 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
8257 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
8258 librados::AioCompletion *completion = arg->rados_completion;
8259 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
8260 completion->release(); /* can't reference arg here, as it might have already been released */
8261 return ret;
8262 }
8263
8264 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
8265 int64_t pool, uint64_t epoch,
8266 rgw_bucket_dir_entry& ent, RGWObjCategory category,
8267 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
8268 {
8269 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
8270 }
8271
8272 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8273 int64_t pool, uint64_t epoch,
8274 rgw_obj& obj,
8275 real_time& removed_mtime,
8276 list<rgw_obj_index_key> *remove_objs,
8277 uint16_t bilog_flags,
8278 rgw_zone_set *zones_trace)
8279 {
8280 rgw_bucket_dir_entry ent;
8281 ent.meta.mtime = removed_mtime;
8282 obj.key.get_index_key(&ent.key);
8283 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8284 ent, RGWObjCategory::None, remove_objs,
8285 bilog_flags, zones_trace);
8286 }
8287
8288 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
8289 {
8290 rgw_bucket_dir_entry ent;
8291 obj.key.get_index_key(&ent.key);
8292 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8293 -1 /* pool id */, 0, ent,
8294 RGWObjCategory::None, NULL, bilog_flags,
8295 zones_trace);
8296 }
8297
8298 int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
8299 {
8300 RGWSI_RADOS::Pool index_pool;
8301 map<int, string> bucket_objs;
8302 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
8303 if (r < 0)
8304 return r;
8305
8306 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8307 }
8308
8309
8310 uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8311 uint32_t num_shards)
8312 {
8313 // We want to minimize the chances that when num_shards >>
8314 // num_entries that we return much fewer than num_entries to the
8315 // client. Given all the overhead of making a cls call to the osd,
8316 // returning a few entries is not much more work than returning one
8317 // entry. This minimum might be better tuned based on future
8318 // experiments where num_shards >> num_entries. (Note: ">>" should
8319 // be interpreted as "much greater than".)
8320 constexpr uint32_t min_read = 8;
8321
8322 // The following is based on _"Balls into Bins" -- A Simple and
8323 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8324 // cases when num_shards >> num_entries (it almost serves as a
8325 // ceiling calculation). We also assume alpha is 1.0 and extract it
8326 // from the calculation. Future work could involve memoizing some of
8327 // the transcendental functions to minimize repeatedly re-calling
8328 // them with the same parameters, which we expect to be the case the
8329 // majority of the time.
8330 uint32_t calc_read =
8331 1 +
8332 static_cast<uint32_t>((num_entries / num_shards) +
8333 sqrt((2 * num_entries) *
8334 log(num_shards) / num_shards));
8335
8336 return std::max(min_read, calc_read);
8337 }
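// Worked example of the above (approximate, for illustration only): with
// num_entries=1000 and num_shards=128,
//   calc_read = 1 + (1000/128) + sqrt(2*1000 * ln(128) / 128) ~= 1 + 7 + 8.7 -> 16
// so each shard is asked for 16 entries rather than the naive 1000/128 ~= 8,
// which lowers the odds of returning far fewer than num_entries to the client.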
8338
8339
8340 int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
8341 RGWBucketInfo& bucket_info,
8342 const int shard_id,
8343 const rgw_obj_index_key& start_after,
8344 const string& prefix,
8345 const string& delimiter,
8346 const uint32_t num_entries,
8347 const bool list_versions,
8348 const uint16_t expansion_factor,
8349 ent_map_t& m,
8350 bool* is_truncated,
8351 bool* cls_filtered,
8352 rgw_obj_index_key *last_entry,
8353 optional_yield y,
8354 check_filter_t force_check_filter)
8355 {
8356 /* expansion_factor allows the number of entries to read to grow
8357 * exponentially; this is used when earlier reads are producing too
8358 * few results, perhaps due to filtering or to a series of
8359 * namespaced entries */
8360
8361 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
8362 " start_after=\"" << start_after.name <<
8363 "[" << start_after.instance <<
8364 "]\", prefix=\"" << prefix <<
8365 "\" num_entries=" << num_entries <<
8366 ", list_versions=" << list_versions <<
8367 ", expansion_factor=" << expansion_factor << dendl;
8368
8369 m.clear();
8370
8371 RGWSI_RADOS::Pool index_pool;
8372 // key - oid (for different shards if there is any)
8373 // value - list result for the corresponding oid (shard), it is filled by
8374 // the AIO callback
8375 map<int, string> shard_oids;
8376 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id,
8377 &index_pool, &shard_oids,
8378 nullptr);
8379 if (r < 0) {
8380 return r;
8381 }
8382
8383 const uint32_t shard_count = shard_oids.size();
8384 uint32_t num_entries_per_shard;
8385 if (expansion_factor == 0) {
8386 num_entries_per_shard =
8387 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8388 } else if (expansion_factor <= 11) {
8389 // we'll max out the exponential multiplication factor at 1024 (1 << 10)
8390 num_entries_per_shard =
8391 std::min(num_entries,
8392 (uint32_t(1 << (expansion_factor - 1)) *
8393 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8394 } else {
8395 num_entries_per_shard = num_entries;
8396 }
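// Illustrative progression: with a base per-shard read of 16, expansion_factor
// 0 or 1 requests 16 per shard, 2 requests 32, 3 requests 64, and so on,
// clamped to num_entries; factors above 11 simply request num_entries per shard.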
8397
8398 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ <<
8399 " request from each of " << shard_count <<
8400 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8401 num_entries << " total entries" << dendl;
8402
8403 auto& ioctx = index_pool.ioctx();
8404 map<int, rgw_cls_list_ret> shard_list_results;
8405 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8406 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8407 num_entries_per_shard,
8408 list_versions, shard_oids, shard_list_results,
8409 cct->_conf->rgw_bucket_index_max_aio)();
8410 if (r < 0) {
8411 return r;
8412 }
8413
8414 // to manage the iterators through each shard's list results
8415 struct ShardTracker {
8416 const size_t shard_idx;
8417 rgw_cls_list_ret& result;
8418 const std::string& oid_name;
8419 RGWRados::ent_map_t::iterator cursor;
8420 RGWRados::ent_map_t::iterator end;
8421
8422 // manages an iterator through a shard and provides other
8423 // accessors
8424 ShardTracker(size_t _shard_idx,
8425 rgw_cls_list_ret& _result,
8426 const std::string& _oid_name):
8427 shard_idx(_shard_idx),
8428 result(_result),
8429 oid_name(_oid_name),
8430 cursor(_result.dir.m.begin()),
8431 end(_result.dir.m.end())
8432 {}
8433
8434 inline const std::string& entry_name() const {
8435 return cursor->first;
8436 }
8437 rgw_bucket_dir_entry& dir_entry() const {
8438 return cursor->second;
8439 }
8440 inline bool is_truncated() const {
8441 return result.is_truncated;
8442 }
8443 inline ShardTracker& advance() {
8444 ++cursor;
8445 // return a self-reference to allow for chaining of calls, such
8446 // as x.advance().at_end()
8447 return *this;
8448 }
8449 inline bool at_end() const {
8450 return cursor == end;
8451 }
8452 }; // ShardTracker
8453
8454 // add the next unique candidate, or return false if we reach the end
8455 auto next_candidate = [] (CephContext *cct, ShardTracker& t,
8456 std::map<std::string, size_t>& candidates,
8457 size_t tracker_idx) {
8458 while (!t.at_end()) {
8459 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8460 return;
8461 }
8462 t.advance(); // skip duplicate common prefixes
8463 }
8464 };
8465
8466 // one tracker per shard requested (may not be all shards)
8467 std::vector<ShardTracker> results_trackers;
8468 results_trackers.reserve(shard_list_results.size());
8469 for (auto& r : shard_list_results) {
8470 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8471
8472 // if any *one* shard's result is truncated, the entire result is
8473 // truncated
8474 *is_truncated = *is_truncated || r.second.is_truncated;
8475
8476 // unless *all* shards are cls_filtered, the entire result is
8477 // not filtered
8478 *cls_filtered = *cls_filtered && r.second.cls_filtered;
8479 }
8480
8481 // create a map to track the next candidate entry from ShardTracker
8482 // (key=candidate, value=index into results_trackers); as we consume
8483 // entries from shards, we replace them with the next entries in the
8484 // shards until we run out
8485 map<string, size_t> candidates;
8486 size_t tracker_idx = 0;
8487 for (auto& t : results_trackers) {
8488 // it's important that the values in the map refer to the index
8489 // into the results_trackers vector, which may not be the same
8490 // as the shard number (i.e., when not all shards are requested)
8491 next_candidate(cct, t, candidates, tracker_idx);
8492 ++tracker_idx;
8493 }
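// From here on this is essentially a k-way merge across the shard listings:
// repeatedly take the lexicographically smallest candidate (candidates.begin()),
// consume that entry from its ShardTracker, then re-seed the candidates map with
// that tracker's next unique entry, until num_entries are gathered or we run out.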
8494
8495 rgw_bucket_dir_entry*
8496 last_entry_visited = nullptr; // to set last_entry (marker)
8497 map<string, bufferlist> updates;
8498 uint32_t count = 0;
8499 while (count < num_entries && !candidates.empty()) {
8500 r = 0;
8501 // select the next entry in lexical order (first key in map);
8502 // again tracker_idx is not necessarily shard number, but is index
8503 // into results_trackers vector
8504 tracker_idx = candidates.begin()->second;
8505 auto& tracker = results_trackers.at(tracker_idx);
8506
8507 const string& name = tracker.entry_name();
8508 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8509
8510 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ << " currently processing " <<
8511 dirent.key << " from shard " << tracker.shard_idx << dendl;
8512
8513 const bool force_check =
8514 force_check_filter && force_check_filter(dirent.key.name);
8515
8516 if ((!dirent.exists &&
8517 !dirent.is_delete_marker() &&
8518 !dirent.is_common_prefix()) ||
8519 !dirent.pending_map.empty() ||
8520 force_check) {
8521 /* there are uncommitted ops. We need to check the current
8522 * state, and if the tags are old we need to do clean-up as
8523 * well. */
8524 librados::IoCtx sub_ctx;
8525 sub_ctx.dup(ioctx);
8526 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
8527 updates[tracker.oid_name], y);
8528 if (r < 0 && r != -ENOENT) {
8529 return r;
8530 }
8531 } else {
8532 r = 0;
8533 }
8534
8535 if (r >= 0) {
8536 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": got " <<
8537 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8538 m[name] = std::move(dirent);
8539 last_entry_visited = &(m[name]);
8540 ++count;
8541 } else {
8542 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": skipping " <<
8543 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8544 last_entry_visited = &tracker.dir_entry();
8545 }
8546
8547 // refresh the candidates map
8548 candidates.erase(candidates.begin());
8549 tracker.advance();
8550
8551 next_candidate(cct, tracker, candidates, tracker_idx);
8552
8553 if (tracker.at_end() && tracker.is_truncated()) {
8554 // once we exhaust one shard that is truncated, we need to stop,
8555 // as we cannot be certain that one of the next entries needs to
8556 // come from that shard; S3 and swift protocols allow returning
8557 // fewer than what was requested
8558 break;
8559 }
8560 } // while we haven't provided requested # of result entries
8561
8562 // suggest updates if there are any
8563 for (auto& miter : updates) {
8564 if (miter.second.length()) {
8565 ObjectWriteOperation o;
8566 cls_rgw_suggest_changes(o, miter.second);
8567 // we don't care if we lose suggested updates, send them off blindly
8568 AioCompletion *c =
8569 librados::Rados::aio_create_completion(nullptr, nullptr);
8570 ioctx.aio_operate(miter.first, c, &o);
8571 c->release();
8572 }
8573 } // updates loop
8574
8575 // determine truncation by checking if all the returned entries are
8576 // consumed or not
8577 *is_truncated = false;
8578 for (const auto& t : results_trackers) {
8579 if (!t.at_end() || t.is_truncated()) {
8580 *is_truncated = true;
8581 break;
8582 }
8583 }
8584
8585 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
8586 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8587 dendl;
8588
8589 if (*is_truncated && count < num_entries) {
8590 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ <<
8591 ": INFO requested " << num_entries << " entries but returning " <<
8592 count << ", which is truncated" << dendl;
8593 }
8594
8595 if (last_entry_visited != nullptr && last_entry) {
8596 *last_entry = last_entry_visited->key;
8597 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
8598 ": returning, last_entry=" << *last_entry << dendl;
8599 } else {
8600 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
8601 ": returning, last_entry NOT SET" << dendl;
8602 }
8603
8604 return 0;
8605 }
8606
8607
8608 // A helper function to retrieve the hash source from an incomplete multipart entry
8609 // by removing everything from the second-to-last dot onward.
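// Illustrative example (the upload id shown here is made up): for an entry named
// "mydoc.pdf.2~J2ZDnYhkIiG5jdF.meta" the extracted hash source is "mydoc.pdf".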
8610 static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
8611 std::size_t found = oid_wo_ns.rfind('.');
8612 if (found == std::string::npos || found < 1) {
8613 return -EINVAL;
8614 }
8615 found = oid_wo_ns.rfind('.', found - 1);
8616 if (found == std::string::npos || found < 1) {
8617 return -EINVAL;
8618 }
8619 *index_hash_source = oid_wo_ns.substr(0, found);
8620 return 0;
8621 }
8622
8623
8624 int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
8625 RGWBucketInfo& bucket_info,
8626 int shard_id,
8627 const rgw_obj_index_key& start_after,
8628 const string& prefix,
8629 uint32_t num_entries,
8630 bool list_versions,
8631 std::vector<rgw_bucket_dir_entry>& ent_list,
8632 bool *is_truncated,
8633 rgw_obj_index_key *last_entry,
8634 optional_yield y,
8635 check_filter_t force_check_filter) {
8636 ldpp_dout(dpp, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
8637 " start_after " << start_after.name << "[" << start_after.instance <<
8638 "] num_entries " << num_entries << dendl;
8639
8640 ent_list.clear();
8641 static MultipartMetaFilter multipart_meta_filter;
8642
8643 *is_truncated = false;
8644 RGWSI_RADOS::Pool index_pool;
8645
8646 map<int, string> oids;
8647 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, nullptr);
8648 if (r < 0)
8649 return r;
8650
8651 auto& ioctx = index_pool.ioctx();
8652
8653 const uint32_t num_shards = oids.size();
8654
8655 rgw_obj_index_key marker = start_after;
8656 uint32_t current_shard;
8657 if (shard_id >= 0) {
8658 current_shard = shard_id;
8659 } else if (start_after.empty()) {
8660 current_shard = 0u;
8661 } else {
8662 // at this point we have a marker (start_after) that has something
8663 // in it, so we need to get to the bucket shard index, so we can
8664 // start reading from there
8665
8666
8667 // now convert the key (oid) to an rgw_obj_key since that will
8668 // separate out the namespace, name, and instance
8669 rgw_obj_key obj_key;
8670 bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
8671 if (!parsed) {
8672 ldpp_dout(dpp, 0) <<
8673 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
8674 "start marker: '" << start_after << "'" << dendl;
8675 return -EINVAL;
8676 } else if (obj_key.name.empty()) {
8677 // if the name is empty that means the object name came in with
8678 // a namespace only, and therefore we need to start our scan at
8679 // the first bucket index shard
8680 current_shard = 0u;
8681 } else {
8682 // so now we have the key used to compute the bucket index shard
8683 // and can extract the specific shard from it
8684 if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
8685 // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
8686 // the implementation relying on MultipartMetaFilter
8687 // because MultipartMetaFilter only checks .meta suffix, which may
8688 // exclude data multiparts but include some regular objects with .meta suffix
8689 // by mistake.
8690 string index_hash_source;
8691 r = parse_index_hash_source(obj_key.name, &index_hash_source);
8692 if (r < 0) {
8693 return r;
8694 }
8695 current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
8696 } else {
8697 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
8698 }
8699 }
8700 }
8701
8702 uint32_t count = 0u;
8703 map<string, bufferlist> updates;
8704 rgw_obj_index_key last_added_entry;
8705 while (count <= num_entries &&
8706 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8707 current_shard < num_shards)) {
8708 const std::string& oid = oids[current_shard];
8709 rgw_cls_list_ret result;
8710
8711 librados::ObjectReadOperation op;
8712 string empty_delimiter;
8713 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8714 num_entries,
8715 list_versions, &result);
8716 r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
8717 if (r < 0)
8718 return r;
8719
8720 for (auto& entry : result.dir.m) {
8721 rgw_bucket_dir_entry& dirent = entry.second;
8722
8723 bool force_check = force_check_filter &&
8724 force_check_filter(dirent.key.name);
8725 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8726 !dirent.pending_map.empty() ||
8727 force_check) {
8728 /* there are uncommitted ops. We need to check the current state,
8729 * and if the tags are old we need to do cleanup as well. */
8730 librados::IoCtx sub_ctx;
8731 sub_ctx.dup(ioctx);
8732 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
8733 if (r < 0 && r != -ENOENT) {
8734 return r;
8735 }
8736 } else {
8737 r = 0;
8738 }
8739
8740 // at this point either r >=0 or r == -ENOENT
8741 if (r >= 0) { // i.e., if r != -ENOENT
8742 ldpp_dout(dpp, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
8743 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8744
8745 if (count < num_entries) {
8746 marker = last_added_entry = dirent.key; // double assign
8747 ent_list.emplace_back(std::move(dirent));
8748 ++count;
8749 } else {
8750 *is_truncated = true;
8751 goto check_updates;
8752 }
8753 } else { // r == -ENOENT
8754 // in the case of -ENOENT, make sure we're advancing the marker
8755 // for a possible next list call on this shard
8756 marker = dirent.key;
8757 }
8758 } // entry for loop
8759
8760 if (!result.is_truncated) {
8761 // if we reached the end of the shard read next shard
8762 ++current_shard;
8763 marker = rgw_obj_index_key();
8764 }
8765 } // shard loop
8766
8767 check_updates:
8768
8769 // suggest updates if there are any
8770 map<string, bufferlist>::iterator miter = updates.begin();
8771 for (; miter != updates.end(); ++miter) {
8772 if (miter->second.length()) {
8773 ObjectWriteOperation o;
8774 cls_rgw_suggest_changes(o, miter->second);
8775 // we don't care if we lose suggested updates, send them off blindly
8776 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8777 ioctx.aio_operate(miter->first, c, &o);
8778 c->release();
8779 }
8780 }
8781
8782 if (last_entry && !ent_list.empty()) {
8783 *last_entry = last_added_entry;
8784 }
8785
8786 return 0;
8787 } // RGWRados::cls_bucket_list_unordered
8788
8789
8790 int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
8791 rgw_usage_log_info& info)
8792 {
8793 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8794
8795 rgw_rados_ref ref;
8796 int r = get_raw_obj_ref(dpp, obj, &ref);
8797 if (r < 0) {
8798 return r;
8799 }
8800
8801 ObjectWriteOperation op;
8802 cls_rgw_usage_log_add(op, info);
8803
8804 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8805 return r;
8806 }
8807
8808 int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
8809 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
8810 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
8811 bool *is_truncated)
8812 {
8813 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8814
8815 rgw_rados_ref ref;
8816 int r = get_raw_obj_ref(dpp, obj, &ref);
8817 if (r < 0) {
8818 return r;
8819 }
8820
8821 *is_truncated = false;
8822
8823 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
8824 max_entries, read_iter, usage, is_truncated);
8825
8826 return r;
8827 }
8828
8829 static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
8830 {
8831 bool done = false;
8832 do {
8833 librados::ObjectWriteOperation op;
8834 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
8835 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8836 if (r == -ENODATA)
8837 done = true;
8838 else if (r < 0)
8839 return r;
8840 } while (!done);
8841
8842 return 0;
8843 }
8844
8845 int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
8846 uint64_t start_epoch, uint64_t end_epoch)
8847 {
8848 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8849
8850 rgw_rados_ref ref;
8851 int r = get_raw_obj_ref(dpp, obj, &ref);
8852 if (r < 0) {
8853 return r;
8854 }
8855
8856 r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
8857 return r;
8858 }
8859
8860 int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
8861 {
8862 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8863
8864 rgw_rados_ref ref;
8865 int r = get_raw_obj_ref(dpp, obj, &ref);
8866 if (r < 0) {
8867 return r;
8868 }
8869 librados::ObjectWriteOperation op;
8870 cls_rgw_usage_log_clear(op);
8871 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8872 return r;
8873 }
8874
8875
8876 int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
8877 {
8878 RGWSI_RADOS::Pool index_pool;
8879 string dir_oid;
8880
8881 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
8882
8883 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, &index_pool, &dir_oid);
8884 if (r < 0)
8885 return r;
8886
8887 bufferlist updates;
8888
8889 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
8890 rgw_bucket_dir_entry entry;
8891 entry.key = *iter;
8892 ldpp_dout(dpp, 2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
8893 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed so that the objclass doesn't skip our request
8894 updates.append(CEPH_RGW_REMOVE | suggest_flag);
8895 encode(entry, updates);
8896 }
8897
8898 bufferlist out;
8899
8900 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
8901
8902 return r;
8903 }
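// Illustrative sketch (not part of the original code): building the key list
// for remove_objs_from_index().  The object name is a placeholder; in practice
// the keys come from a bucket listing whose head objects were found to be
// missing.
[[maybe_unused]] static int example_remove_stale_index_entries(const DoutPrefixProvider *dpp,
                                                               RGWRados *store,
                                                               RGWBucketInfo& bucket_info)
{
  std::list<rgw_obj_index_key> stale_keys;

  rgw_obj_index_key key;
  key.name = "leftover-object";   // hypothetical plain (non-versioned) entry
  stale_keys.push_back(key);

  // issues one RGW_DIR_SUGGEST_CHANGES call carrying a CEPH_RGW_REMOVE
  // suggestion per key, as implemented above
  return store->remove_objs_from_index(dpp, bucket_info, stale_keys);
}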
8904
8905 int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
8906 librados::IoCtx io_ctx,
8907 const RGWBucketInfo& bucket_info,
8908 rgw_bucket_dir_entry& list_state,
8909 rgw_bucket_dir_entry& object,
8910 bufferlist& suggested_updates,
8911 optional_yield y)
8912 {
8913 const rgw_bucket& bucket = bucket_info.bucket;
8914 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
8915
8916 std::string loc;
8917
8918 rgw_obj obj(bucket, list_state.key);
8919
8920 string oid;
8921 get_obj_bucket_and_oid_loc(obj, oid, loc);
8922
8923 if (loc != list_state.locator) {
8924 ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
8925 }
8926
8927 io_ctx.locator_set_key(list_state.locator);
8928
8929 RGWObjState *astate = NULL;
8930 RGWObjectCtx rctx(this->store);
8931 int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
8932 if (r < 0)
8933 return r;
8934
8935 list_state.pending_map.clear(); // we don't need this and it inflates size
8936 if (!list_state.is_delete_marker() && !astate->exists) {
8937 /* object doesn't exist right now -- hopefully because it's
8938 * marked as !exists and got deleted */
8939 if (list_state.exists) {
8940 /* FIXME: what should happen now? Work out if there are any
8941 * non-bad ways this could happen (there probably are, but annoying
8942 * to handle!) */
8943 }
8944 // encode a suggested removal of that key
8945 list_state.ver.epoch = io_ctx.get_last_version();
8946 list_state.ver.pool = io_ctx.get_id();
8947 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
8948 return -ENOENT;
8949 }
8950
8951 string etag;
8952 string content_type;
8953 ACLOwner owner;
8954
8955 object.meta.size = astate->size;
8956 object.meta.accounted_size = astate->accounted_size;
8957 object.meta.mtime = astate->mtime;
8958
8959 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
8960 if (iter != astate->attrset.end()) {
8961 etag = rgw_bl_str(iter->second);
8962 }
8963 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
8964 if (iter != astate->attrset.end()) {
8965 content_type = rgw_bl_str(iter->second);
8966 }
8967 iter = astate->attrset.find(RGW_ATTR_ACL);
8968 if (iter != astate->attrset.end()) {
8969 r = decode_policy(iter->second, &owner);
8970 if (r < 0) {
8971 ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
8972 }
8973 }
8974
8975 if (astate->manifest) {
8976 RGWObjManifest::obj_iterator miter;
8977 RGWObjManifest& manifest = *astate->manifest;
8978 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
8979 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(store);
8980 rgw_obj loc;
8981 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
8982
8983 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
8984 ldpp_dout(dpp, 0) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
8985 r = delete_obj_index(loc, astate->mtime, dpp);
8986 if (r < 0) {
8987 ldpp_dout(dpp, 0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
8988 }
8989 }
8990 }
8991 }
8992
8993 object.meta.etag = etag;
8994 object.meta.content_type = content_type;
8995 object.meta.owner = owner.get_id().to_str();
8996 object.meta.owner_display_name = owner.get_display_name();
8997
8998 // encode suggested updates
8999 list_state.ver.pool = io_ctx.get_id();
9000 list_state.ver.epoch = astate->epoch;
9001 list_state.meta.size = object.meta.size;
9002 list_state.meta.accounted_size = object.meta.accounted_size;
9003 list_state.meta.mtime = object.meta.mtime;
9004 list_state.meta.category = main_category;
9005 list_state.meta.etag = etag;
9006 list_state.meta.content_type = content_type;
9007 if (astate->obj_tag.length() > 0)
9008 list_state.tag = astate->obj_tag.c_str();
9009 list_state.meta.owner = owner.get_id().to_str();
9010 list_state.meta.owner_display_name = owner.get_display_name();
9011
9012 list_state.exists = true;
9013 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9014 return 0;
9015 }
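// Illustrative sketch (not part of the original code): how the suggested
// updates accumulated by check_disk_state() are handed back to the index
// shard.  The ioctx/oid parameters are placeholders for the index object that
// produced the listing; in the real listing path the suggestions are submitted
// asynchronously (see the aio_operate calls in the listing code above), shown
// synchronously here for brevity.
[[maybe_unused]] static int example_submit_suggested_changes(librados::IoCtx& index_ioctx,
                                                             const std::string& index_shard_oid,
                                                             bufferlist& suggested_updates)
{
  if (suggested_updates.length() == 0) {
    return 0;   // nothing to reconcile
  }
  bufferlist out;
  return index_ioctx.exec(index_shard_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES,
                          suggested_updates, out);
}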
9016
9017 int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
9018 {
9019 RGWSI_RADOS::Pool index_pool;
9020 map<int, string> oids;
9021 map<int, struct rgw_cls_list_ret> list_results;
9022 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
9023 if (r < 0) {
9024 ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
9025 << r << dendl;
9026 return r;
9027 }
9028
9029 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9030 if (r < 0) {
9031 ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
9032 << r << dendl;
9033 return r;
9034 }
9035
9036 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9037 for(; iter != list_results.end(); ++iter) {
9038 headers.push_back(std::move(iter->second.dir.header));
9039 }
9040 return 0;
9041 }
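// Illustrative sketch (not part of the original code): fetching the per-shard
// index headers for a bucket.  Passing -1 as shard_id is assumed to mean "all
// shards", matching how the headers vector is filled above; aggregating the
// per-category stats out of each header is omitted since it depends on
// rgw_bucket_dir_header internals.
[[maybe_unused]] static int example_bucket_head(const DoutPrefixProvider *dpp,
                                                RGWRados *store,
                                                const RGWBucketInfo& bucket_info)
{
  std::vector<rgw_bucket_dir_header> headers;
  int r = store->cls_bucket_head(dpp, bucket_info, -1 /* all shards */, headers, nullptr);
  if (r < 0) {
    return r;
  }
  ldpp_dout(dpp, 20) << "got " << headers.size() << " index shard header(s)" << dendl;
  return 0;
}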
9042
9043 int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
9044 {
9045 RGWSI_RADOS::Pool index_pool;
9046 map<int, string> bucket_objs;
9047 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
9048 if (r < 0)
9049 return r;
9050
9051 map<int, string>::iterator iter = bucket_objs.begin();
9052 for (; iter != bucket_objs.end(); ++iter) {
9053 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
9054 if (r < 0) {
9055 ctx->put();
9056 break;
9057 } else {
9058 (*num_aio)++;
9059 }
9060 }
9061 return r;
9062 }
9063
9064 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
9065 const rgw_bucket& bucket,
9066 uint64_t num_objs,
9067 const DoutPrefixProvider *dpp)
9068 {
9069 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
9070 return 0;
9071 }
9072
9073 bool need_resharding = false;
9074 uint32_t num_source_shards =
9075 (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
9076 const uint32_t max_dynamic_shards =
9077 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
9078
9079 if (num_source_shards >= max_dynamic_shards) {
9080 return 0;
9081 }
9082
9083 uint32_t suggested_num_shards = 0;
9084 const uint64_t max_objs_per_shard =
9085 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9086
9087 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
9088 num_objs, need_resharding, &suggested_num_shards);
9089 if (! need_resharding) {
9090 return 0;
9091 }
9092
9093 const uint32_t final_num_shards =
9094 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
9095 max_dynamic_shards);
9096 // final verification, so we don't reduce the number of shards
9097 if (final_num_shards <= num_source_shards) {
9098 return 0;
9099 }
9100
9101 ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
9102 " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
9103 "; new num shards " << final_num_shards << " (suggested " <<
9104 suggested_num_shards << ")" << dendl;
9105
9106 return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
9107 }
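// Illustrative sketch (not part of the original code): a write path would
// typically call check_bucket_shards() with the bucket's current object count
// after updating the index, letting the quota handler decide whether the
// bucket should be queued for dynamic resharding.  The object count below is
// a placeholder.
[[maybe_unused]] static int example_maybe_reshard(const DoutPrefixProvider *dpp,
                                                  RGWRados *store,
                                                  const RGWBucketInfo& bucket_info,
                                                  uint64_t num_objs_after_write)
{
  // no-op unless rgw_dynamic_resharding is enabled and the suggested shard
  // count exceeds the current one
  return store->check_bucket_shards(bucket_info, bucket_info.bucket, num_objs_after_write, dpp);
}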
9108
9109 int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
9110 {
9111 RGWReshard reshard(this->store, dpp);
9112
9113 uint32_t num_source_shards = (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
9114
9115 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
9116 if (new_num_shards <= num_source_shards) {
9117 ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
9118 return 0;
9119 }
9120
9121 cls_rgw_reshard_entry entry;
9122 entry.time = real_clock::now();
9123 entry.tenant = bucket_info.owner.tenant;
9124 entry.bucket_name = bucket_info.bucket.name;
9125 entry.bucket_id = bucket_info.bucket.bucket_id;
9126 entry.old_num_shards = num_source_shards;
9127 entry.new_num_shards = new_num_shards;
9128
9129 return reshard.add(dpp, entry);
9130 }
9131
9132 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
9133 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
9134 uint64_t obj_size, optional_yield y,
9135 bool check_size_only)
9136 {
9137 // if we only check size, then num_objs is passed as 0
9138 if(check_size_only)
9139 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size, y);
9140
9141 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size, y);
9142 }
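// Illustrative sketch (not part of the original code): the two ways the quota
// check above is used.  A size-only check is appropriate when a write only
// grows an existing object (e.g. a multipart part), while a regular PUT also
// accounts for one new object.  All arguments are placeholders supplied by the
// request path.
[[maybe_unused]] static int example_quota_checks(RGWRados *store, const rgw_user& owner,
                                                 rgw_bucket& bucket,
                                                 RGWQuotaInfo& user_quota,
                                                 RGWQuotaInfo& bucket_quota,
                                                 uint64_t obj_size, optional_yield y)
{
  // new object: counts one object plus its size against both quotas
  int r = store->check_quota(owner, bucket, user_quota, bucket_quota, obj_size, y, false);
  if (r < 0) {
    return r;
  }
  // size-only check: num_objs is passed as 0 internally
  return store->check_quota(owner, bucket, user_quota, bucket_quota, obj_size, y, true);
}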
9143
9144 int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
9145 int *shard_id)
9146 {
9147 int r = 0;
9148 switch (layout.hash_type) {
9149 case rgw::BucketHashType::Mod:
9150 if (!layout.num_shards) {
9151 if (shard_id) {
9152 *shard_id = -1;
9153 }
9154 } else {
9155 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
9156 if (shard_id) {
9157 *shard_id = (int)sid;
9158 }
9159 }
9160 break;
9161 default:
9162 r = -ENOTSUP;
9163 }
9164 return r;
9165 }
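// Illustrative sketch (not part of the original code): mapping an object key
// to its bucket index shard for a MOD-hashed layout.  The layout and key are
// supplied by the caller; with num_shards == 0 the function reports -1,
// meaning the bucket uses a single, unsharded index object.
[[maybe_unused]] static int example_shard_for_key(RGWRados *store,
                                                  const rgw::bucket_index_normal_layout& layout,
                                                  const std::string& obj_key)
{
  int shard_id = 0;
  int r = store->get_target_shard_id(layout, obj_key, &shard_id);
  if (r < 0) {
    return r;        // e.g. -ENOTSUP for an unknown hash type
  }
  return shard_id;   // >= 0, or -1 for an unsharded index
}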
9166
9167 uint64_t RGWRados::instance_id()
9168 {
9169 return get_rados_handle()->get_instance_id();
9170 }
9171
9172 uint64_t RGWRados::next_bucket_id()
9173 {
9174 std::lock_guard l{bucket_id_lock};
9175 return ++max_bucket_id;
9176 }
9177
9178 librados::Rados* RGWRados::get_rados_handle()
9179 {
9180 return &rados;
9181 }
9182
9183 int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
9184 {
9185 rgw_rados_ref ref;
9186 int ret = get_raw_obj_ref(dpp, obj, &ref);
9187 if (ret < 0) {
9188 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9189 return ret;
9190 }
9191
9192 ObjectWriteOperation op;
9193 list<string> prefixes;
9194 cls_rgw_remove_obj(op, prefixes);
9195
9196 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9197 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
9198 if (ret < 0) {
9199 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9200 c->release();
9201 return ret;
9202 }
9203
9204 handles.push_back(c);
9205
9206 return 0;
9207 }
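// Illustrative sketch (not part of the original code): issuing several raw
// object deletions in flight and then draining the completions.  Callers of
// delete_raw_obj_aio()/delete_obj_aio() own the AioCompletion handles they
// collect and are responsible for waiting on and releasing them, roughly as
// below.
[[maybe_unused]] static int example_drain_delete_handles(std::list<librados::AioCompletion *>& handles)
{
  int ret = 0;
  for (auto c : handles) {
    c->wait_for_complete();
    int r = c->get_return_value();
    if (r < 0 && r != -ENOENT && ret == 0) {
      ret = r;       // keep the first real error, ignore already-gone objects
    }
    c->release();
  }
  handles.clear();
  return ret;
}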
9208
9209 int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
9210 RGWBucketInfo& bucket_info, RGWObjState *astate,
9211 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9212 optional_yield y)
9213 {
9214 rgw_rados_ref ref;
9215 int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
9216 if (ret < 0) {
9217 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9218 return ret;
9219 }
9220
9221 if (keep_index_consistent) {
9222 RGWRados::Bucket bop(this, bucket_info);
9223 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9224
9225 ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
9226 if (ret < 0) {
9227 ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
9228 return ret;
9229 }
9230 }
9231
9232 ObjectWriteOperation op;
9233 list<string> prefixes;
9234 cls_rgw_remove_obj(op, prefixes);
9235
9236 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9237 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
9238 if (ret < 0) {
9239 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9240 c->release();
9241 return ret;
9242 }
9243
9244 handles.push_back(c);
9245
9246 if (keep_index_consistent) {
9247 ret = delete_obj_index(obj, astate->mtime, dpp);
9248 if (ret < 0) {
9249 ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
9250 return ret;
9251 }
9252 }
9253 return ret;
9254 }