1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab ft=cpp
4 #include "include/compat.h"
10 #include <boost/algorithm/string.hpp>
11 #include <string_view>
13 #include <boost/container/flat_set.hpp>
14 #include <boost/format.hpp>
15 #include <boost/optional.hpp>
16 #include <boost/utility/in_place_factory.hpp>
18 #include "common/ceph_json.h"
20 #include "common/errno.h"
21 #include "common/Formatter.h"
22 #include "common/Throttle.h"
26 #include "rgw_cache.h"
28 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
29 #include "rgw_aio_throttle.h"
30 #include "rgw_bucket.h"
31 #include "rgw_rest_conn.h"
32 #include "rgw_cr_rados.h"
33 #include "rgw_cr_rest.h"
34 #include "rgw_putobj_processor.h"
36 #include "cls/rgw/cls_rgw_ops.h"
37 #include "cls/rgw/cls_rgw_client.h"
38 #include "cls/rgw/cls_rgw_const.h"
39 #include "cls/refcount/cls_refcount_client.h"
40 #include "cls/version/cls_version_client.h"
41 #include "osd/osd_types.h"
43 #include "rgw_tools.h"
44 #include "rgw_coroutine.h"
45 #include "rgw_compression.h"
46 #include "rgw_worker.h"
48 #undef fork // fails to compile RGWPeriod::fork() below
50 #include "common/Clock.h"
52 using namespace librados
;
60 #include "include/random.h"
65 #include "rgw_object_expirer_core.h"
67 #include "rgw_sync_counters.h"
68 #include "rgw_sync_trace.h"
69 #include "rgw_trim_datalog.h"
70 #include "rgw_trim_mdlog.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
81 #include "services/svc_bucket.h"
82 #include "services/svc_mdlog.h"
83 #include "services/svc_datalog_rados.h"
85 #include "compressor/Compressor.h"
88 #define TRACEPOINT_DEFINE
89 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
90 #include "tracing/rgw_rados.h"
91 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92 #undef TRACEPOINT_DEFINE
94 #define tracepoint(...)
97 #define dout_context g_ceph_context
98 #define dout_subsys ceph_subsys_rgw
101 static string shadow_ns
= "shadow";
102 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
103 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
105 static RGWObjCategory main_category
= RGWObjCategory::Main
;
106 #define RGW_USAGE_OBJ_PREFIX "usage."
108 #define dout_subsys ceph_subsys_rgw
111 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
112 const rgw_placement_rule
& head_placement_rule
,
113 const rgw_obj
& obj
, rgw_pool
*pool
)
115 if (!zone_params
.get_head_data_pool(head_placement_rule
, obj
, pool
)) {
116 RGWZonePlacementInfo placement
;
117 if (!zone_params
.get_placement(zonegroup
.default_placement
.name
, &placement
)) {
121 if (!obj
.in_extra_data
) {
122 *pool
= placement
.get_data_pool(zonegroup
.default_placement
.storage_class
);
124 *pool
= placement
.get_data_extra_pool();
131 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
132 const rgw_placement_rule
& head_placement_rule
,
133 const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
135 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
137 return rgw_get_obj_data_pool(zonegroup
, zone_params
, head_placement_rule
, obj
, &raw_obj
->pool
);
140 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
144 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
150 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
154 store
->obj_to_raw(placement_rule
, obj
, &r
);
160 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
162 obj_version
*check_objv
= version_for_check();
165 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
168 cls_version_read(*op
, &read_version
);
171 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
173 obj_version
*check_objv
= version_for_check();
174 obj_version
*modify_version
= version_for_write();
177 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
180 if (modify_version
) {
181 cls_version_set(*op
, *modify_version
);
183 cls_version_inc(*op
);
187 RGWObjState::RGWObjState() {
190 RGWObjState::~RGWObjState() {
193 RGWObjState::RGWObjState(const RGWObjState
& rhs
) : obj (rhs
.obj
) {
194 is_atomic
= rhs
.is_atomic
;
195 has_attrs
= rhs
.has_attrs
;
198 accounted_size
= rhs
.accounted_size
;
201 if (rhs
.obj_tag
.length()) {
202 obj_tag
= rhs
.obj_tag
;
204 if (rhs
.tail_tag
.length()) {
205 tail_tag
= rhs
.tail_tag
;
207 write_tag
= rhs
.write_tag
;
208 fake_tag
= rhs
.fake_tag
;
209 manifest
= rhs
.manifest
;
210 shadow_obj
= rhs
.shadow_obj
;
211 has_data
= rhs
.has_data
;
212 if (rhs
.data
.length()) {
215 prefetch_data
= rhs
.prefetch_data
;
216 keep_tail
= rhs
.keep_tail
;
218 objv_tracker
= rhs
.objv_tracker
;
222 RGWObjState
*RGWObjectCtx::get_state(const rgw_obj
& obj
) {
224 typename
std::map
<rgw_obj
, RGWObjState
>::iterator iter
;
226 assert (!obj
.empty());
227 iter
= objs_state
.find(obj
);
228 if (iter
!= objs_state
.end()) {
229 result
= &iter
->second
;
230 lock
.unlock_shared();
232 lock
.unlock_shared();
234 result
= &objs_state
[obj
];
240 void RGWObjectCtx::set_atomic(rgw_obj
& obj
) {
241 std::unique_lock wl
{lock
};
242 assert (!obj
.empty());
243 objs_state
[obj
].is_atomic
= true;
245 void RGWObjectCtx::set_prefetch_data(const rgw_obj
& obj
) {
246 std::unique_lock wl
{lock
};
247 assert (!obj
.empty());
248 objs_state
[obj
].prefetch_data
= true;
251 void RGWObjectCtx::invalidate(const rgw_obj
& obj
) {
252 std::unique_lock wl
{lock
};
253 auto iter
= objs_state
.find(obj
);
254 if (iter
== objs_state
.end()) {
257 bool is_atomic
= iter
->second
.is_atomic
;
258 bool prefetch_data
= iter
->second
.prefetch_data
;
260 objs_state
.erase(iter
);
262 if (is_atomic
|| prefetch_data
) {
263 auto& state
= objs_state
[obj
];
264 state
.is_atomic
= is_atomic
;
265 state
.prefetch_data
= prefetch_data
;
269 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
271 write_version
.ver
= 1;
274 write_version
.tag
.clear();
275 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
278 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
280 RGWHTTPManager http_manager
;
283 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
284 http_manager(store
->ctx(), completion_mgr
) {
285 http_manager
.start();
288 int notify_all(map
<rgw_zone_id
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
289 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
293 list
<RGWCoroutinesStack
*> stacks
;
294 for (auto iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
295 RGWRESTConn
*conn
= iter
->second
;
296 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
297 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
299 stacks
.push_back(stack
);
305 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
307 RGWHTTPManager http_manager
;
310 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
311 http_manager(store
->ctx(), completion_mgr
) {
312 http_manager
.start();
315 int notify_all(map
<rgw_zone_id
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
316 rgw_http_param_pair pairs
[] = { { "type", "data" },
318 { "source-zone", store
->svc
.zone
->get_zone_params().get_id().c_str() },
321 list
<RGWCoroutinesStack
*> stacks
;
322 for (auto iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
323 RGWRESTConn
*conn
= iter
->second
;
324 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
325 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
327 stacks
.push_back(stack
);
333 /* class RGWRadosThread */
335 void RGWRadosThread::start()
337 worker
= new Worker(cct
, this);
338 worker
->create(thread_name
.c_str());
341 void RGWRadosThread::stop()
353 void *RGWRadosThread::Worker::entry() {
354 uint64_t msec
= processor
->interval_msec();
355 auto interval
= std::chrono::milliseconds(msec
);
358 auto start
= ceph::real_clock::now();
359 int r
= processor
->process();
361 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
364 if (processor
->going_down())
367 auto end
= ceph::real_clock::now() - start
;
369 uint64_t cur_msec
= processor
->interval_msec();
370 if (cur_msec
!= msec
) { /* was it reconfigured? */
372 interval
= std::chrono::milliseconds(msec
);
377 continue; // next round
379 auto wait_time
= interval
- end
;
380 wait_interval(wait_time
);
384 } while (!processor
->going_down());
389 class RGWMetaNotifier
: public RGWRadosThread
{
390 RGWMetaNotifierManager notify_mgr
;
391 RGWMetadataLog
*const log
;
393 uint64_t interval_msec() override
{
394 return cct
->_conf
->rgw_md_notify_interval_msec
;
396 void stop_process() override
{
400 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
401 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
403 int process() override
;
406 int RGWMetaNotifier::process()
410 log
->read_clear_modified(shards
);
412 if (shards
.empty()) {
416 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
417 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
420 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_conn_map(), shards
);
425 class RGWDataNotifier
: public RGWRadosThread
{
426 RGWDataNotifierManager notify_mgr
;
428 uint64_t interval_msec() override
{
429 return cct
->_conf
.get_val
<int64_t>("rgw_data_notify_interval_msec");
431 void stop_process() override
{
435 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
437 int process() override
;
440 int RGWDataNotifier::process()
442 auto data_log
= store
->svc
.datalog_rados
->get_log();
447 map
<int, set
<string
> > shards
;
449 data_log
->read_clear_modified(shards
);
451 if (shards
.empty()) {
455 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
456 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
459 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_data_notify_to_map(), shards
);
464 class RGWSyncProcessorThread
: public RGWRadosThread
{
466 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
467 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
468 ~RGWSyncProcessorThread() override
{}
469 int init() override
= 0 ;
470 int process() override
= 0;
473 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
475 RGWMetaSyncStatusManager sync
;
477 uint64_t interval_msec() override
{
478 return 0; /* no interval associated, it'll run once until stopped */
480 void stop_process() override
{
484 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore
*_store
, RGWAsyncRadosProcessor
*async_rados
)
485 : RGWSyncProcessorThread(_store
->getRados(), "meta-sync"), sync(_store
, async_rados
) {}
487 void wakeup_sync_shards(set
<int>& shard_ids
) {
488 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
492 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
494 int init() override
{
495 int ret
= sync
.init();
497 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
503 int process() override
{
509 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
511 PerfCountersRef counters
;
512 RGWDataSyncStatusManager sync
;
515 uint64_t interval_msec() override
{
517 return 0; /* no interval associated, it'll run once until stopped */
519 #define DATA_SYNC_INIT_WAIT_SEC 20
520 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
523 void stop_process() override
{
527 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore
*_store
, RGWAsyncRadosProcessor
*async_rados
,
528 const RGWZone
* source_zone
)
529 : RGWSyncProcessorThread(_store
->getRados(), "data-sync"),
530 counters(sync_counters::build(store
->ctx(), std::string("data-sync-from-") + source_zone
->name
)),
531 sync(_store
, async_rados
, source_zone
->id
, counters
.get()),
532 initialized(false) {}
534 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
535 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
536 sync
.wakeup(iter
->first
, iter
->second
);
539 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
541 int init() override
{
545 int process() override
{
546 while (!initialized
) {
550 int ret
= sync
.init();
563 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
, DoutPrefixProvider
565 RGWCoroutinesManager crs
;
566 rgw::sal::RGWRadosStore
*store
;
567 rgw::BucketTrimManager
*bucket_trim
;
569 const utime_t trim_interval
;
571 uint64_t interval_msec() override
{ return 0; }
572 void stop_process() override
{ crs
.stop(); }
574 RGWSyncLogTrimThread(rgw::sal::RGWRadosStore
*store
, rgw::BucketTrimManager
*bucket_trim
,
576 : RGWSyncProcessorThread(store
->getRados(), "sync-log-trim"),
577 crs(store
->ctx(), store
->getRados()->get_cr_registry()), store(store
),
578 bucket_trim(bucket_trim
),
579 http(store
->ctx(), crs
.get_completion_mgr()),
580 trim_interval(interval
, 0)
583 int init() override
{
586 int process() override
{
587 list
<RGWCoroutinesStack
*> stacks
;
588 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
589 meta
->call(create_meta_log_trim_cr(this, store
, &http
,
590 cct
->_conf
->rgw_md_log_max_shards
,
592 stacks
.push_back(meta
);
594 if (store
->svc()->zone
->sync_module_exports_data()) {
595 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
596 data
->call(create_data_log_trim_cr(store
, &http
,
597 cct
->_conf
->rgw_data_log_num_shards
,
599 stacks
.push_back(data
);
601 auto bucket
= new RGWCoroutinesStack(store
->ctx(), &crs
);
602 bucket
->call(bucket_trim
->create_bucket_trim_cr(&http
));
603 stacks
.push_back(bucket
);
610 // implements DoutPrefixProvider
611 CephContext
*get_cct() const override
{ return store
->ctx(); }
612 unsigned get_subsys() const override
617 std::ostream
& gen_prefix(std::ostream
& out
) const override
619 return out
<< "sync log trim: ";
624 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
626 std::lock_guard l
{meta_sync_thread_lock
};
627 if (meta_sync_processor_thread
) {
628 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
632 void RGWRados::wakeup_data_sync_shards(const rgw_zone_id
& source_zone
, map
<int, set
<string
> >& shard_ids
)
634 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
635 std::lock_guard l
{data_sync_thread_lock
};
636 auto iter
= data_sync_processor_threads
.find(source_zone
);
637 if (iter
== data_sync_processor_threads
.end()) {
638 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
642 RGWDataSyncProcessorThread
*thread
= iter
->second
;
644 thread
->wakeup_sync_shards(shard_ids
);
647 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
649 std::lock_guard l
{meta_sync_thread_lock
};
650 if (meta_sync_processor_thread
) {
651 return meta_sync_processor_thread
->get_manager();
656 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const rgw_zone_id
& source_zone
)
658 std::lock_guard l
{data_sync_thread_lock
};
659 auto thread
= data_sync_processor_threads
.find(source_zone
);
660 if (thread
== data_sync_processor_threads
.end()) {
663 return thread
->second
->get_manager();
666 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
669 int r
= open_pool_ctx(pool
, ioctx
, false);
671 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
676 r
= ioctx
.pool_requires_alignment2(&requires
);
678 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
689 r
= ioctx
.pool_required_alignment2(&align
);
691 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
696 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
702 void RGWRados::get_max_aligned_size(uint64_t size
, uint64_t alignment
, uint64_t *max_size
)
704 if (alignment
== 0) {
709 if (size
<= alignment
) {
710 *max_size
= alignment
;
714 *max_size
= size
- (size
% alignment
);
717 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
, uint64_t *palignment
)
720 int r
= get_required_alignment(pool
, &alignment
);
726 *palignment
= alignment
;
729 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
731 get_max_aligned_size(config_chunk_size
, alignment
, max_chunk_size
);
733 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
738 int RGWRados::get_max_chunk_size(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
,
739 uint64_t *max_chunk_size
, uint64_t *palignment
)
742 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
743 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
746 return get_max_chunk_size(pool
, max_chunk_size
, palignment
);
749 class RGWIndexCompletionManager
;
751 struct complete_op_data
{
752 ceph::mutex lock
= ceph::make_mutex("complete_op_data");
753 AioCompletion
*rados_completion
{nullptr};
754 int manager_shard_id
{-1};
755 RGWIndexCompletionManager
*manager
{nullptr};
759 rgw_bucket_entry_ver ver
;
761 rgw_bucket_dir_entry_meta dir_meta
;
762 list
<cls_rgw_obj_key
> remove_objs
;
765 rgw_zone_set zones_trace
;
770 std::lock_guard l
{lock
};
775 class RGWIndexCompletionThread
: public RGWRadosThread
{
778 uint64_t interval_msec() override
{
782 list
<complete_op_data
*> completions
;
784 ceph::mutex completions_lock
=
785 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
787 RGWIndexCompletionThread(RGWRados
*_store
)
788 : RGWRadosThread(_store
, "index-complete"), store(_store
) {}
790 int process() override
;
792 void add_completion(complete_op_data
*completion
) {
794 std::lock_guard l
{completions_lock
};
795 completions
.push_back(completion
);
802 int RGWIndexCompletionThread::process()
804 list
<complete_op_data
*> comps
;
807 std::lock_guard l
{completions_lock
};
808 completions
.swap(comps
);
811 for (auto c
: comps
) {
812 std::unique_ptr
<complete_op_data
> up
{c
};
817 ldout(store
->ctx(), 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
819 RGWRados::BucketShard
bs(store
);
820 RGWBucketInfo bucket_info
;
822 int r
= bs
.init(c
->obj
.bucket
, c
->obj
, &bucket_info
);
824 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
829 r
= store
->guard_reshard(&bs
, c
->obj
, bucket_info
,
830 [&](RGWRados::BucketShard
*bs
) -> int {
831 librados::ObjectWriteOperation o
;
832 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
833 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
834 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
835 return bs
->bucket_obj
.operate(&o
, null_yield
);
838 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
839 /* ignoring error, can't do anything about it */
842 r
= store
->svc
.datalog_rados
->add_entry(bucket_info
, bs
.shard_id
);
844 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
851 class RGWIndexCompletionManager
{
852 RGWRados
*store
{nullptr};
853 ceph::containers::tiny_vector
<ceph::mutex
> locks
;
854 vector
<set
<complete_op_data
*> > completions
;
856 RGWIndexCompletionThread
*completion_thread
{nullptr};
860 std::atomic
<int> cur_shard
{0};
864 RGWIndexCompletionManager(RGWRados
*_store
) :
866 locks
{ceph::make_lock_container
<ceph::mutex
>(
867 store
->ctx()->_conf
->rgw_thread_pool_size
,
869 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
873 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
874 completions
.resize(num_shards
);
876 ~RGWIndexCompletionManager() {
881 int result
= cur_shard
% num_shards
;
886 void create_completion(const rgw_obj
& obj
,
887 RGWModifyOp op
, string
& tag
,
888 rgw_bucket_entry_ver
& ver
,
889 const cls_rgw_obj_key
& key
,
890 rgw_bucket_dir_entry_meta
& dir_meta
,
891 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
893 rgw_zone_set
*zones_trace
,
894 complete_op_data
**result
);
895 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
898 completion_thread
= new RGWIndexCompletionThread(store
);
899 int ret
= completion_thread
->init();
903 completion_thread
->start();
907 if (completion_thread
) {
908 completion_thread
->stop();
909 delete completion_thread
;
912 for (int i
= 0; i
< num_shards
; ++i
) {
913 std::lock_guard l
{locks
[i
]};
914 for (auto c
: completions
[i
]) {
922 static void obj_complete_cb(completion_t cb
, void *arg
)
924 complete_op_data
*completion
= (complete_op_data
*)arg
;
925 completion
->lock
.lock();
926 if (completion
->stopped
) {
927 completion
->lock
.unlock(); /* can drop lock, no one else is referencing us */
931 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
932 completion
->lock
.unlock();
939 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
940 RGWModifyOp op
, string
& tag
,
941 rgw_bucket_entry_ver
& ver
,
942 const cls_rgw_obj_key
& key
,
943 rgw_bucket_dir_entry_meta
& dir_meta
,
944 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
946 rgw_zone_set
*zones_trace
,
947 complete_op_data
**result
)
949 complete_op_data
*entry
= new complete_op_data
;
951 int shard_id
= next_shard();
953 entry
->manager_shard_id
= shard_id
;
954 entry
->manager
= this;
960 entry
->dir_meta
= dir_meta
;
961 entry
->log_op
= log_op
;
962 entry
->bilog_op
= bilog_op
;
965 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
966 entry
->remove_objs
.push_back(*iter
);
971 entry
->zones_trace
= *zones_trace
;
973 entry
->zones_trace
.insert(store
->svc
.zone
->get_zone().id
, obj
.bucket
.get_key());
978 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, obj_complete_cb
);
980 std::lock_guard l
{locks
[shard_id
]};
981 completions
[shard_id
].insert(entry
);
984 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
986 int shard_id
= arg
->manager_shard_id
;
988 std::lock_guard l
{locks
[shard_id
]};
990 auto& comps
= completions
[shard_id
];
992 auto iter
= comps
.find(arg
);
993 if (iter
== comps
.end()) {
1000 int r
= rados_aio_get_return_value(cb
);
1001 if (r
!= -ERR_BUSY_RESHARDING
) {
1004 completion_thread
->add_completion(arg
);
1008 void RGWRados::finalize()
1010 if (run_sync_thread
) {
1011 std::lock_guard l
{meta_sync_thread_lock
};
1012 meta_sync_processor_thread
->stop();
1014 std::lock_guard dl
{data_sync_thread_lock
};
1015 for (auto iter
: data_sync_processor_threads
) {
1016 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1019 if (sync_log_trimmer
) {
1020 sync_log_trimmer
->stop();
1023 if (run_sync_thread
) {
1024 delete meta_sync_processor_thread
;
1025 meta_sync_processor_thread
= NULL
;
1026 std::lock_guard dl
{data_sync_thread_lock
};
1027 for (auto iter
: data_sync_processor_threads
) {
1028 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1031 data_sync_processor_threads
.clear();
1032 delete sync_log_trimmer
;
1033 sync_log_trimmer
= nullptr;
1034 bucket_trim
= boost::none
;
1036 if (meta_notifier
) {
1037 meta_notifier
->stop();
1038 delete meta_notifier
;
1040 if (data_notifier
) {
1041 data_notifier
->stop();
1042 delete data_notifier
;
1055 RGWQuotaHandler::free_handler(quota_handler
);
1063 delete obj_tombstone_cache
;
1065 if (reshard_wait
.get()) {
1066 reshard_wait
->stop();
1067 reshard_wait
.reset();
1070 if (run_reshard_thread
) {
1071 reshard
->stop_processor();
1074 delete index_completion_manager
;
1078 * Initialize the RADOS instance and prepare to do other ops
1079 * Returns 0 on success, -ERR# on failure.
1081 int RGWRados::init_rados()
1085 ret
= rados
.init_with_context(cct
);
1089 ret
= rados
.connect();
1094 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
1095 new RGWCoroutinesManagerRegistry(cct
)};
1096 ret
= crs
->hook_to_admin_command("cr dump");
1101 cr_registry
= crs
.release();
1105 int RGWRados::register_to_service_map(const string
& daemon_type
, const map
<string
, string
>& meta
)
1107 map
<string
,string
> metadata
= meta
;
1108 metadata
["num_handles"] = "1"s
;
1109 metadata
["zonegroup_id"] = svc
.zone
->get_zonegroup().get_id();
1110 metadata
["zonegroup_name"] = svc
.zone
->get_zonegroup().get_name();
1111 metadata
["zone_name"] = svc
.zone
->zone_name();
1112 metadata
["zone_id"] = svc
.zone
->zone_id().id
;
1113 string name
= cct
->_conf
->name
.get_id();
1114 if (name
.compare(0, 4, "rgw.") == 0) {
1115 name
= name
.substr(4);
1117 int ret
= rados
.service_daemon_register(daemon_type
, name
, metadata
);
1119 ldout(cct
, 0) << "ERROR: service_daemon_register() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1126 int RGWRados::update_service_map(std::map
<std::string
, std::string
>&& status
)
1128 int ret
= rados
.service_daemon_update_status(move(status
));
1130 ldout(cct
, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1138 * Initialize the RADOS instance and prepare to do other ops
1139 * Returns 0 on success, -ERR# on failure.
1141 int RGWRados::init_complete()
1146 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1148 sync_module
= svc
.sync_modules
->get_sync_module();
1150 ret
= open_root_pool_ctx();
1154 ret
= open_gc_pool_ctx();
1158 ret
= open_lc_pool_ctx();
1162 ret
= open_objexp_pool_ctx();
1166 ret
= open_reshard_pool_ctx();
1170 pools_initialized
= true;
1173 gc
->initialize(cct
, this);
1175 obj_expirer
= new RGWObjectExpirer(this->store
);
1177 if (use_gc_thread
) {
1178 gc
->start_processor();
1179 obj_expirer
->start_processor();
1182 auto& current_period
= svc
.zone
->get_current_period();
1183 auto& zonegroup
= svc
.zone
->get_zonegroup();
1184 auto& zone_params
= svc
.zone
->get_zone_params();
1185 auto& zone
= svc
.zone
->get_zone();
1187 /* no point of running sync thread if we don't have a master zone configured
1188 or there is no rest_master_conn */
1189 if (!svc
.zone
->need_to_sync()) {
1190 run_sync_thread
= false;
1193 if (svc
.zone
->is_meta_master()) {
1194 auto md_log
= svc
.mdlog
->get_log(current_period
.get_id());
1195 meta_notifier
= new RGWMetaNotifier(this, md_log
);
1196 meta_notifier
->start();
1199 /* init it anyway, might run sync through radosgw-admin explicitly */
1200 sync_tracer
= new RGWSyncTraceManager(cct
, cct
->_conf
->rgw_sync_trace_history_size
);
1201 sync_tracer
->init(this);
1202 ret
= sync_tracer
->hook_to_admin_command();
1207 if (run_sync_thread
) {
1208 for (const auto &pt
: zonegroup
.placement_targets
) {
1209 if (zone_params
.placement_pools
.find(pt
.second
.name
)
1210 == zone_params
.placement_pools
.end()){
1211 ldout(cct
, 0) << "WARNING: This zone does not contain the placement target "
1212 << pt
.second
.name
<< " present in zonegroup" << dendl
;
1215 auto async_processor
= svc
.rados
->get_async_processor();
1216 std::lock_guard l
{meta_sync_thread_lock
};
1217 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this->store
, async_processor
);
1218 ret
= meta_sync_processor_thread
->init();
1220 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
1223 meta_sync_processor_thread
->start();
1225 // configure the bucket trim manager
1226 rgw::BucketTrimConfig config
;
1227 rgw::configure_bucket_trim(cct
, config
);
1229 bucket_trim
.emplace(this->store
, config
);
1230 ret
= bucket_trim
->init();
1232 ldout(cct
, 0) << "ERROR: failed to start bucket trim manager" << dendl
;
1235 svc
.datalog_rados
->set_observer(&*bucket_trim
);
1237 std::lock_guard dl
{data_sync_thread_lock
};
1238 for (auto source_zone
: svc
.zone
->get_data_sync_source_zones()) {
1239 ldout(cct
, 5) << "starting data sync thread for zone " << source_zone
->name
<< dendl
;
1240 auto *thread
= new RGWDataSyncProcessorThread(this->store
, svc
.rados
->get_async_processor(), source_zone
);
1241 ret
= thread
->init();
1243 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
1247 data_sync_processor_threads
[rgw_zone_id(source_zone
->id
)] = thread
;
1249 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
1251 sync_log_trimmer
= new RGWSyncLogTrimThread(this->store
, &*bucket_trim
, interval
);
1252 ret
= sync_log_trimmer
->init();
1254 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
1257 sync_log_trimmer
->start();
1260 data_notifier
= new RGWDataNotifier(this);
1261 data_notifier
->start();
1263 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
1264 binfo_cache
->init(svc
.cache
);
1267 lc
->initialize(cct
, this->store
);
1270 lc
->start_processor();
1272 quota_handler
= RGWQuotaHandler::generate_handler(this->store
, quota_threads
);
1274 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
1275 zone
.bucket_index_max_shards
);
1276 if (bucket_index_max_shards
> get_max_bucket_shards()) {
1277 bucket_index_max_shards
= get_max_bucket_shards();
1278 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
1279 << get_max_bucket_shards() << dendl
;
1281 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
1283 bool need_tombstone_cache
= !svc
.zone
->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1285 if (need_tombstone_cache
) {
1286 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
1289 reshard_wait
= std::make_shared
<RGWReshardWait
>();
1291 reshard
= new RGWReshard(this->store
);
1293 /* only the master zone in the zonegroup reshards buckets */
1294 run_reshard_thread
= run_reshard_thread
&& (zonegroup
.master_zone
== zone
.id
);
1295 if (run_reshard_thread
) {
1296 reshard
->start_processor();
1299 index_completion_manager
= new RGWIndexCompletionManager(this);
1300 ret
= index_completion_manager
->start();
1305 int RGWRados::init_svc(bool raw
)
1308 return svc
.init_raw(cct
, use_cache
);
1311 return svc
.init(cct
, use_cache
, run_sync_thread
);
1314 int RGWRados::init_ctl()
1316 return ctl
.init(&svc
);
1320 * Initialize the RADOS instance and prepare to do other ops
1321 * Returns 0 on success, -ERR# on failure.
1323 int RGWRados::initialize()
1327 inject_notify_timeout_probability
=
1328 cct
->_conf
.get_val
<double>("rgw_inject_notify_timeout_probability");
1329 max_notify_retries
= cct
->_conf
.get_val
<uint64_t>("rgw_max_notify_retries");
1331 ret
= init_svc(false);
1333 ldout(cct
, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1339 ldout(cct
, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1343 host_id
= svc
.zone_utils
->gen_host_id();
1349 return init_complete();
1353 * Open the pool used as root for this gateway
1354 * Returns: 0 on success, -ERR# otherwise.
1356 int RGWRados::open_root_pool_ctx()
1358 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().domain_root
, root_pool_ctx
, true, true);
1361 int RGWRados::open_gc_pool_ctx()
1363 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().gc_pool
, gc_pool_ctx
, true, true);
1366 int RGWRados::open_lc_pool_ctx()
1368 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().lc_pool
, lc_pool_ctx
, true, true);
1371 int RGWRados::open_objexp_pool_ctx()
1373 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, objexp_pool_ctx
, true, true);
1376 int RGWRados::open_reshard_pool_ctx()
1378 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().reshard_pool
, reshard_pool_ctx
, true, true);
1381 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
,
1384 constexpr bool create
= true; // create the pool if it doesn't exist
1385 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
, mostly_omap
);
1390 struct log_list_state
{
1392 librados::IoCtx io_ctx
;
1393 librados::NObjectIterator obit
;
1396 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
1398 log_list_state
*state
= new log_list_state
;
1399 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1404 state
->prefix
= prefix
;
1405 state
->obit
= state
->io_ctx
.nobjects_begin();
1406 *handle
= (RGWAccessHandle
)state
;
1410 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
1412 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
1414 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
1418 if (state
->prefix
.length() &&
1419 state
->obit
->get_oid().find(state
->prefix
) != 0) {
1423 *name
= state
->obit
->get_oid();
1430 int RGWRados::log_remove(const string
& name
)
1432 librados::IoCtx io_ctx
;
1433 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
1436 return io_ctx
.remove(name
);
1439 struct log_show_state
{
1440 librados::IoCtx io_ctx
;
1442 bufferlist::const_iterator p
;
1446 log_show_state() : pos(0), eof(false) {}
1449 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
1451 log_show_state
*state
= new log_show_state
;
1452 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1458 *handle
= (RGWAccessHandle
)state
;
1462 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
1464 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
1465 off_t off
= state
->p
.get_off();
1467 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
1469 << " eof " << (int)state
->eof
1472 unsigned chunk
= 1024*1024;
1473 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
1475 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
1481 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
1482 } catch (buffer::error
& err
) {
1486 state
->bl
.claim(old
);
1487 state
->bl
.claim_append(more
);
1488 state
->p
= state
->bl
.cbegin();
1489 if ((unsigned)r
< chunk
)
1491 ldout(cct
, 10) << " read " << r
<< dendl
;
1495 return 0; // end of file
1497 decode(*entry
, state
->p
);
1499 catch (const buffer::error
&e
) {
1506 * usage_log_hash: get usage log key hash, based on name and index
1508 * Get the usage object name. Since a user may have more than 1
1509 * object holding that info (multiple shards), we use index to
1510 * specify that shard number. Once index exceeds max shards it
1512 * If name is not being set, results for all users will be returned
1513 * and index will wrap only after total shards number.
1515 * @param cct [in] ceph context
1516 * @param name [in] user name
1517 * @param hash [out] hash value
1518 * @param index [in] shard index number
1520 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
1522 uint32_t val
= index
;
1524 if (!name
.empty()) {
1525 int max_user_shards
= cct
->_conf
->rgw_usage_max_user_shards
;
1526 val
%= max_user_shards
;
1527 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
1530 int max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1531 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
1535 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
1539 map
<string
, rgw_usage_log_info
> log_objs
;
1544 /* restructure usage map, zone by object hash */
1545 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
1546 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
1547 const rgw_user_bucket
& ub
= iter
->first
;
1548 RGWUsageBatch
& info
= iter
->second
;
1550 if (ub
.user
.empty()) {
1551 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
1555 if (ub
.user
!= last_user
) {
1556 /* index *should* be random, but why waste extra cycles
1557 in most cases max user shards is not going to exceed 1,
1558 so just incrementing it */
1559 usage_log_hash(cct
, ub
.user
, hash
, index
++);
1561 last_user
= ub
.user
;
1562 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
1564 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
1565 v
.push_back(miter
->second
);
1569 map
<string
, rgw_usage_log_info
>::iterator liter
;
1571 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
1572 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
1579 int RGWRados::read_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
,
1580 uint32_t max_entries
, bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
,
1581 rgw_usage_log_entry
>& usage
)
1583 uint32_t num
= max_entries
;
1584 string hash
, first_hash
;
1585 string user_str
= user
.to_str();
1586 usage_log_hash(cct
, user_str
, first_hash
, 0);
1588 if (usage_iter
.index
) {
1589 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
1597 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
1598 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
1600 int ret
= cls_obj_usage_log_read(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
, num
,
1601 usage_iter
.read_iter
, ret_usage
, is_truncated
);
1608 num
-= ret_usage
.size();
1610 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
1611 usage
[iter
->first
].aggregate(iter
->second
);
1615 if (!*is_truncated
) {
1616 usage_iter
.read_iter
.clear();
1617 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
1619 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
1623 int RGWRados::trim_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
)
1626 string hash
, first_hash
;
1627 string user_str
= user
.to_str();
1628 usage_log_hash(cct
, user_str
, first_hash
, index
);
1632 int ret
= cls_obj_usage_log_trim(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
);
1634 if (ret
< 0 && ret
!= -ENOENT
)
1637 usage_log_hash(cct
, user_str
, hash
, ++index
);
1638 } while (hash
!= first_hash
);
1644 int RGWRados::clear_usage()
1646 auto max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1648 for (unsigned i
=0; i
< max_shards
; i
++){
1649 string oid
= RGW_USAGE_OBJ_PREFIX
+ to_string(i
);
1650 ret
= cls_obj_usage_log_clear(oid
);
1652 ldout(cct
,0) << "usage clear on oid="<< oid
<< "failed with ret=" << ret
<< dendl
;
1659 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
1661 auto i
= bl
.cbegin();
1662 RGWAccessControlPolicy
policy(cct
);
1664 policy
.decode_owner(i
);
1665 } catch (buffer::error
& err
) {
1666 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
1669 *owner
= policy
.get_owner();
1673 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
1675 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
1676 if (aiter
== attrset
.end())
1679 bufferlist
& bl
= aiter
->second
;
1680 auto iter
= bl
.cbegin();
1682 policy
->decode(iter
);
1683 } catch (buffer::error
& err
) {
1684 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
1687 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 15>()) {
1688 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
1689 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
1690 s3policy
->to_xml(*_dout
);
1697 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
)
1699 rgw_bucket bucket
= bucket_info
.bucket
;
1700 bucket
.update_bucket_id(new_bucket_id
);
1702 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
1704 bucket_info
.objv_tracker
.clear();
1705 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr, null_yield
);
/**
 * Get ordered listing of the objects in a bucket.
 *
 * max_p: maximum number of results to return
 * bucket: bucket to list contents of
 * prefix: only return results that match this prefix
 * delim: do not include results that match this string.
 *     Any skipped results will have the matching portion of their name
 *     inserted in common_prefixes with a "true" mark.
 * marker: if filled in, begin the listing with this object.
 * end_marker: if filled in, end the listing with this object.
 * result: the objects are put in here.
 * common_prefixes: if delim is filled in, any matching prefixes are
 *     placed here.
 * is_truncated: if number of objects in the bucket is bigger than
 *     max, then truncated.
 */
1731 int RGWRados::Bucket::List::list_objects_ordered(
1733 vector
<rgw_bucket_dir_entry
> *result
,
1734 map
<string
, bool> *common_prefixes
,
1738 RGWRados
*store
= target
->get_store();
1739 CephContext
*cct
= store
->ctx();
1740 int shard_id
= target
->get_shard_id();
1743 bool truncated
= true;
1744 bool cls_filtered
= false;
1745 const int64_t max
= // protect against memory issues and negative vals
1746 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
1747 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
, max
);
1751 // use a local marker; either the marker will have a previous entry
1752 // or it will be empty; either way it's OK to copy
1753 rgw_obj_key
marker_obj(params
.marker
.name
,
1754 params
.marker
.instance
,
1756 rgw_obj_index_key cur_marker
;
1757 marker_obj
.get_index_key(&cur_marker
);
1759 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
1760 params
.end_marker
.instance
,
1761 params
.end_marker
.ns
);
1762 rgw_obj_index_key cur_end_marker
;
1763 end_marker_obj
.get_index_key(&cur_end_marker
);
1764 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
1766 rgw_obj_key
prefix_obj(params
.prefix
);
1767 prefix_obj
.set_ns(params
.ns
);
1768 string cur_prefix
= prefix_obj
.get_index_key_name();
1769 string after_delim_s
; /* needed in !params.delim.empty() AND later */
1771 if (!params
.delim
.empty()) {
1772 after_delim_s
= cls_rgw_after_delim(params
.delim
);
1773 /* if marker points at a common prefix, fast forward it into its
1774 * upper bound string */
1775 int delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
1776 if (delim_pos
>= 0) {
1777 string s
= cur_marker
.name
.substr(0, delim_pos
);
1778 s
.append(after_delim_s
);
1783 rgw_obj_index_key prev_marker
;
1784 uint16_t attempt
= 0;
1786 ldout(cct
, 20) << "RGWRados::Bucket::List::" << __func__
<<
1787 " beginning attempt=" << ++attempt
<< dendl
;
1789 // this loop is generally expected only to have a single
1790 // iteration; the standard exit is at the bottom of the loop, but
1791 // there's an error condition emergency exit as well
1793 if (attempt
> 1 && !(prev_marker
< cur_marker
)) {
1794 // we've failed to make forward progress
1795 ldout(cct
, 0) << "RGWRados::Bucket::List::" << __func__
<<
1796 ": ERROR marker failed to make forward progress; attempt=" << attempt
<<
1797 ", prev_marker=" << prev_marker
<<
1798 ", cur_marker=" << cur_marker
<< dendl
;
1801 prev_marker
= cur_marker
;
1804 ent_map
.reserve(read_ahead
);
1805 int r
= store
->cls_bucket_list_ordered(target
->get_bucket_info(),
1810 read_ahead
+ 1 - count
,
1811 params
.list_versions
,
1822 for (auto eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
1823 rgw_bucket_dir_entry
& entry
= eiter
->second
;
1824 rgw_obj_index_key index_key
= entry
.key
;
1825 rgw_obj_key
obj(index_key
);
1827 ldout(cct
, 20) << "RGWRados::Bucket::List::" << __func__
<<
1828 " considering entry " << entry
.key
<< dendl
;
1830 /* note that parse_raw_oid() here will not set the correct
1831 * object's instance, as rgw_obj_index_key encodes that
1832 * separately. We don't need to set the instance because it's
1833 * not needed for the checks here and we end up using the raw
1834 * entry for the return vector
1836 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
1838 ldout(cct
, 0) << "ERROR: could not parse object name: " <<
1843 bool matched_ns
= (obj
.ns
== params
.ns
);
1844 if (!params
.list_versions
&& !entry
.is_visible()) {
1848 if (params
.enforce_ns
&& !matched_ns
) {
1849 if (!params
.ns
.empty()) {
1850 /* we've iterated past the namespace we're searching -- done now */
1855 /* we're not looking at the namespace this object is in, next! */
1859 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
1865 params
.marker
= index_key
;
1866 next_marker
= index_key
;
1869 if (params
.filter
&&
1870 ! params
.filter
->filter(obj
.name
, index_key
.name
)) {
1874 if (params
.prefix
.size() &&
1875 0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)) {
1879 if (!params
.delim
.empty()) {
1880 const int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
1881 if (delim_pos
>= 0) {
1882 // run either the code where delimiter filtering is done a)
1883 // in the OSD/CLS or b) here.
1885 // NOTE: this condition is for the newer versions of the
1886 // OSD that does filtering on the CLS side
1888 // should only find one delimiter at the end if it finds any
1891 int(obj
.name
.length() - params
.delim
.length())) {
1893 "WARNING: found delimiter in place other than the end of "
1894 "the prefix; obj.name=" << obj
.name
<<
1895 ", prefix=" << params
.prefix
<< dendl
;
1897 if (common_prefixes
) {
1903 (*common_prefixes
)[obj
.name
] = true;
1909 // NOTE: this condition is for older versions of the OSD
1910 // that do not filter on the CLS side, so the following code
1911 // must do the filtering; once we reach version 16 of ceph,
1912 // this code can be removed along with the conditional that
1913 // can lead this way
1915 /* extract key -with trailing delimiter- for CommonPrefix */
1917 obj
.name
.substr(0, delim_pos
+ params
.delim
.length());
1919 if (common_prefixes
&&
1920 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
1925 next_marker
= prefix_key
;
1926 (*common_prefixes
)[prefix_key
] = true;
1932 } // if we're running an older OSD version
1933 } // if a delimiter was found after prefix
1934 } // if a delimiter was passed in
1941 ldout(cct
, 20) << "RGWRados::Bucket::List::" << __func__
<<
1942 " adding entry " << entry
.key
<< " to result" << dendl
;
1944 result
->emplace_back(std::move(entry
));
1948 // NOTE: the following conditional is needed by older versions of
1949 // the OSD that don't do delimiter filtering on the CLS side; once
1950 // we reach version 16 of ceph, the following conditional and the
1951 // code within can be removed
1952 if (!cls_filtered
&& !params
.delim
.empty()) {
1953 int marker_delim_pos
=
1954 cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
1955 if (marker_delim_pos
>= 0) {
1956 std::string skip_after_delim
=
1957 cur_marker
.name
.substr(0, marker_delim_pos
);
1958 skip_after_delim
.append(after_delim_s
);
1960 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
1962 if (skip_after_delim
> cur_marker
.name
) {
1963 cur_marker
= skip_after_delim
;
1964 ldout(cct
, 20) << "setting cur_marker="
1966 << "[" << cur_marker
.instance
<< "]"
1970 } // if older osd didn't do delimiter filtering
1972 ldout(cct
, 20) << "RGWRados::Bucket::List::" << __func__
<<
1973 " INFO end of outer loop, truncated=" << truncated
<<
1974 ", count=" << count
<< ", attempt=" << attempt
<< dendl
;
1976 if (!truncated
|| count
>= (max
+ 1) / 2) {
1977 // if we finished listing, or if we're returning at least half the
1978 // requested entries, that's enough; S3 and swift protocols allow
1979 // returning fewer than max entries
1981 } else if (attempt
> 8 && count
>= 1) {
1982 // if we've made at least 8 attempts and we have some, but very
1983 // few, results, return with what we have
1987 ldout(cct
, 1) << "RGWRados::Bucket::List::" << __func__
<<
1988 " INFO ordered bucket listing requires read #" << (1 + attempt
) <<
1990 } // read attempt loop
1995 *is_truncated
= truncated
;
1999 } // list_objects_ordered
/**
 * Get listing of the objects in a bucket and allow the results to be out
 * of order.
 *
 * Even though there are key differences with the ordered counterpart,
 * the parameters are the same to maintain some compatability.
 *
 * max: maximum number of results to return
 * bucket: bucket to list contents of
 * prefix: only return results that match this prefix
 * delim: should not be set; if it is we should have indicated an error
 * marker: if filled in, begin the listing with this object.
 * end_marker: if filled in, end the listing with this object.
 * result: the objects are put in here.
 * common_prefixes: this is never filled with an unordered list; the param
 *     is maintained for compatibility
 * is_truncated: if number of objects in the bucket is bigger than max, then
 *     truncated.
 */
2021 int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p
,
2022 vector
<rgw_bucket_dir_entry
> *result
,
2023 map
<string
, bool> *common_prefixes
,
2027 RGWRados
*store
= target
->get_store();
2028 CephContext
*cct
= store
->ctx();
2029 int shard_id
= target
->get_shard_id();
2032 bool truncated
= true;
2034 const int64_t max
= // protect against memory issues and negative vals
2035 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
2037 // read a few extra in each call to cls_bucket_list_unordered in
2038 // case some are filtered out due to namespace matching, versioning,
2040 const int64_t max_read_ahead
= 100;
2041 const uint32_t read_ahead
= uint32_t(max
+ std::min(max
, max_read_ahead
));
2045 // use a local marker; either the marker will have a previous entry
2046 // or it will be empty; either way it's OK to copy
2047 rgw_obj_key
marker_obj(params
.marker
.name
,
2048 params
.marker
.instance
,
2050 rgw_obj_index_key cur_marker
;
2051 marker_obj
.get_index_key(&cur_marker
);
2053 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
2054 params
.end_marker
.instance
,
2055 params
.end_marker
.ns
);
2056 rgw_obj_index_key cur_end_marker
;
2057 end_marker_obj
.get_index_key(&cur_end_marker
);
2058 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2060 rgw_obj_key
prefix_obj(params
.prefix
);
2061 prefix_obj
.set_ns(params
.ns
);
2062 string cur_prefix
= prefix_obj
.get_index_key_name();
2064 while (truncated
&& count
<= max
) {
2065 std::vector
<rgw_bucket_dir_entry
> ent_list
;
2066 ent_list
.reserve(read_ahead
);
2068 int r
= store
->cls_bucket_list_unordered(target
->get_bucket_info(),
2073 params
.list_versions
,
2081 // NB: while regions of ent_list will be sorted, we have no
2082 // guarantee that all items will be sorted since they can cross
2085 for (auto& entry
: ent_list
) {
2086 rgw_obj_index_key index_key
= entry
.key
;
2087 rgw_obj_key
obj(index_key
);
2090 params
.marker
.set(index_key
);
2091 next_marker
.set(index_key
);
2094 /* note that parse_raw_oid() here will not set the correct
2095 * object's instance, as rgw_obj_index_key encodes that
2096 * separately. We don't need to set the instance because it's
2097 * not needed for the checks here and we end up using the raw
2098 * entry for the return vector
2100 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2102 ldout(cct
, 0) << "ERROR: could not parse object name: " <<
2107 if (!params
.list_versions
&& !entry
.is_visible()) {
2111 if (params
.enforce_ns
&& obj
.ns
!= params
.ns
) {
2115 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2116 // we're not guaranteed items will come in order, so we have
2117 // to loop through all
2121 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
2124 if (params
.prefix
.size() &&
2125 (0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)))
2133 result
->emplace_back(std::move(entry
));
2135 } // for (auto& entry : ent_list)
2136 } // while (truncated && count <= max)
2140 *is_truncated
= truncated
;
2143 } // list_objects_unordered
2147 * create a rados pool, associated meta info
2148 * returns 0 on success, -ERR# otherwise.
2150 int RGWRados::create_pool(const rgw_pool
& pool
)
2152 librados::IoCtx io_ctx
;
2153 constexpr bool create
= true;
2154 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
2157 void RGWRados::create_bucket_id(string
*bucket_id
)
2159 uint64_t iid
= instance_id();
2160 uint64_t bid
= next_bucket_id();
2161 char buf
[svc
.zone
->get_zone_params().get_id().size() + 48];
2162 snprintf(buf
, sizeof(buf
), "%s.%" PRIu64
".%" PRIu64
,
2163 svc
.zone
->get_zone_params().get_id().c_str(), iid
, bid
);
2167 int RGWRados::create_bucket(const RGWUserInfo
& owner
, rgw_bucket
& bucket
,
2168 const string
& zonegroup_id
,
2169 const rgw_placement_rule
& placement_rule
,
2170 const string
& swift_ver_location
,
2171 const RGWQuotaInfo
* pquota_info
,
2172 map
<std::string
, bufferlist
>& attrs
,
2173 RGWBucketInfo
& info
,
2175 obj_version
*pep_objv
,
2176 real_time creation_time
,
2177 rgw_bucket
*pmaster_bucket
,
2178 uint32_t *pmaster_num_shards
,
2181 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2182 rgw_placement_rule selected_placement_rule
;
2183 RGWZonePlacementInfo rule_info
;
2185 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
2187 ret
= svc
.zone
->select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
2188 &selected_placement_rule
, &rule_info
);
2192 if (!pmaster_bucket
) {
2193 create_bucket_id(&bucket
.marker
);
2194 bucket
.bucket_id
= bucket
.marker
;
2196 bucket
.marker
= pmaster_bucket
->marker
;
2197 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
2200 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
2202 objv_tracker
.read_version
.clear();
2205 objv_tracker
.write_version
= *pobjv
;
2207 objv_tracker
.generate_new_write_ver(cct
);
2210 info
.bucket
= bucket
;
2211 info
.owner
= owner
.user_id
;
2212 info
.zonegroup
= zonegroup_id
;
2213 info
.placement_rule
= selected_placement_rule
;
2214 info
.index_type
= rule_info
.index_type
;
2215 info
.swift_ver_location
= swift_ver_location
;
2216 info
.swift_versioning
= (!swift_ver_location
.empty());
2217 if (pmaster_num_shards
) {
2218 info
.num_shards
= *pmaster_num_shards
;
2220 info
.num_shards
= bucket_index_max_shards
;
2222 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
2223 info
.requester_pays
= false;
2224 if (real_clock::is_zero(creation_time
)) {
2225 info
.creation_time
= ceph::real_clock::now();
2227 info
.creation_time
= creation_time
;
2230 info
.quota
= *pquota_info
;
2233 int r
= svc
.bi
->init_index(info
);
2238 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
2239 if (ret
== -ECANCELED
) {
2242 if (ret
== -EEXIST
) {
2243 /* we need to reread the info and return it, caller will have a use for it */
2244 RGWBucketInfo orig_info
;
2245 r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, orig_info
, NULL
, null_yield
, NULL
);
2250 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
2254 /* only remove it if it's a different bucket instance */
2255 if (orig_info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
2256 int r
= svc
.bi
->clean_index(info
);
2258 ldout(cct
, 0) << "WARNING: could not remove bucket index (r=" << r
<< ")" << dendl
;
2260 r
= ctl
.bucket
->remove_bucket_instance_info(info
.bucket
, info
, null_yield
);
2262 ldout(cct
, 0) << "WARNING: " << __func__
<< "(): failed to remove bucket instance info: bucket instance=" << info
.bucket
.get_key() << ": r=" << r
<< dendl
;
2263 /* continue anyway */
2267 info
= std::move(orig_info
);
2268 /* ret == -EEXIST here */
2273 /* this is highly unlikely */
2274 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
2278 bool RGWRados::get_obj_data_pool(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
2280 return rgw_get_obj_data_pool(svc
.zone
->get_zonegroup(), svc
.zone
->get_zone_params(), placement_rule
, obj
, pool
);
2283 bool RGWRados::obj_to_raw(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
2285 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
2287 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
2290 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
2293 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
2296 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2297 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2301 int r
= open_pool_ctx(pool
, *ioctx
, false);
2306 ioctx
->locator_set_key(key
);
2311 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
2313 get_obj_bucket_and_oid_loc(obj
, ref
->obj
.oid
, ref
->obj
.loc
);
2316 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2317 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2321 ref
->pool
= svc
.rados
->pool(pool
);
2323 int r
= ref
->pool
.open(RGWSI_RADOS::OpenParams()
2324 .set_mostly_omap(false));
2326 ldout(cct
, 0) << "ERROR: failed opening data pool (pool=" << pool
<< "); r=" << r
<< dendl
;
2330 ref
->pool
.ioctx().locator_set_key(ref
->obj
.loc
);
2335 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2339 if (ref
->obj
.oid
.empty()) {
2340 ref
->obj
.oid
= obj
.pool
.to_str();
2341 ref
->obj
.pool
= svc
.zone
->get_zone_params().domain_root
;
2343 ref
->pool
= svc
.rados
->pool(obj
.pool
);
2344 int r
= ref
->pool
.open(RGWSI_RADOS::OpenParams()
2345 .set_mostly_omap(false));
2347 ldout(cct
, 0) << "ERROR: failed opening pool (pool=" << obj
.pool
<< "); r=" << r
<< dendl
;
2351 ref
->pool
.ioctx().locator_set_key(ref
->obj
.loc
);
2356 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2358 return get_raw_obj_ref(obj
, ref
);
2362 * fixes an issue where head objects were supposed to have a locator created, but ended
2365 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
2367 const rgw_bucket
& bucket
= bucket_info
.bucket
;
2371 rgw_obj
obj(bucket
, key
);
2373 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
2375 if (locator
.empty()) {
2376 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
2380 librados::IoCtx ioctx
;
2382 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
2384 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
2387 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
2392 struct timespec mtime_ts
;
2393 map
<string
, bufferlist
> attrs
;
2394 librados::ObjectReadOperation op
;
2395 op
.getxattrs(&attrs
, NULL
);
2396 op
.stat2(&size
, &mtime_ts
, NULL
);
2397 #define HEAD_SIZE 512 * 1024
2398 op
.read(0, HEAD_SIZE
, &data
, NULL
);
2400 ret
= rgw_rados_operate(ioctx
, oid
, &op
, &data
, null_yield
);
2402 lderr(cct
) << "ERROR: rgw_rados_operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
2406 if (size
> HEAD_SIZE
) {
2407 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
2411 if (size
!= data
.length()) {
2412 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
2417 librados::ObjectWriteOperation wop
;
2419 wop
.mtime2(&mtime_ts
);
2421 map
<string
, bufferlist
>::iterator iter
;
2422 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
2423 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
2428 ioctx
.locator_set_key(locator
);
2429 rgw_rados_operate(ioctx
, oid
, &wop
, null_yield
);
2433 ioctx
.locator_set_key(string());
2435 ret
= ioctx
.remove(oid
);
2437 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
2445 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
2446 const string
& src_oid
, const string
& src_locator
,
2447 librados::IoCtx
& dst_ioctx
,
2448 const string
& dst_oid
, const string
& dst_locator
)
2451 #define COPY_BUF_SIZE (4 * 1024 * 1024)
2453 uint64_t chunk_size
= COPY_BUF_SIZE
;
2457 struct timespec mtime_ts
;
2460 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
2464 src_ioctx
.locator_set_key(src_locator
);
2465 dst_ioctx
.locator_set_key(dst_locator
);
2469 ObjectReadOperation rop
;
2470 ObjectWriteOperation wop
;
2473 rop
.stat2(&size
, &mtime_ts
, NULL
);
2474 mtime
= real_clock::from_timespec(mtime_ts
);
2476 rop
.read(ofs
, chunk_size
, &data
, NULL
);
2477 ret
= rgw_rados_operate(src_ioctx
, src_oid
, &rop
, &data
, null_yield
);
2482 if (data
.length() == 0) {
2487 wop
.create(true); /* make it exclusive */
2488 wop
.mtime2(&mtime_ts
);
2489 mtime
= real_clock::from_timespec(mtime_ts
);
2491 wop
.write(ofs
, data
);
2492 ret
= rgw_rados_operate(dst_ioctx
, dst_oid
, &wop
, null_yield
);
2496 ofs
+= data
.length();
2497 done
= data
.length() != chunk_size
;
2501 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
2502 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
2507 src_ioctx
.remove(src_oid
);
2512 // TODO: clean up dst_oid if we created it
2513 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
2518 * fixes an issue where head objects were supposed to have a locator created, but ended
2521 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
, optional_yield y
)
2523 const rgw_bucket
& bucket
= bucket_info
.bucket
;
2524 rgw_obj
obj(bucket
, key
);
2531 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
2536 RGWObjState
*astate
= NULL
;
2537 RGWObjectCtx
rctx(this->store
);
2538 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false, y
);
2542 if (astate
->manifest
) {
2543 RGWObjManifest::obj_iterator miter
;
2544 RGWObjManifest
& manifest
= *astate
->manifest
;
2545 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
2546 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
2551 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
2553 if (loc
.key
.ns
.empty()) {
2554 /* continue, we're only interested in tail objects */
2558 auto& ioctx
= ref
.pool
.ioctx();
2560 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
2561 ref
.pool
.ioctx().locator_set_key(locator
);
2563 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
2565 r
= ioctx
.stat(oid
, NULL
, NULL
);
2571 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
2573 /* create a new ioctx with the bad locator */
2574 librados::IoCtx src_ioctx
;
2575 src_ioctx
.dup(ioctx
);
2576 src_ioctx
.locator_set_key(bad_loc
);
2578 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
2580 /* cannot find a broken part */
2583 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
2588 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ioctx
, oid
, locator
);
2590 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
2599 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
2601 RGWBucketInfo
* bucket_info_out
)
2605 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2607 RGWBucketInfo bucket_info
;
2608 RGWBucketInfo
* bucket_info_p
=
2609 bucket_info_out
? bucket_info_out
: &bucket_info
;
2611 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
, null_yield
);
2618 ret
= store
->svc
.bi_rados
->open_bucket_index_shard(*bucket_info_p
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
2620 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2623 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
.get_raw_obj() << dendl
;
2628 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
2630 RGWBucketInfo
* bucket_info_out
)
2635 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2637 RGWBucketInfo bucket_info
;
2638 RGWBucketInfo
* bucket_info_p
=
2639 bucket_info_out
? bucket_info_out
: &bucket_info
;
2640 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
, null_yield
);
2647 ret
= store
->svc
.bi_rados
->open_bucket_index_shard(*bucket_info_p
, shard_id
, &bucket_obj
);
2649 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2652 ldout(store
->ctx(), 20) << " bucket index oid: " << bucket_obj
.get_raw_obj() << dendl
;
2657 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
,
2660 bucket
= bucket_info
.bucket
;
2662 int ret
= store
->svc
.bi_rados
->open_bucket_index_shard(bucket_info
,
2663 obj
.get_hash_object(),
2667 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2670 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
2675 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
, int sid
)
2677 bucket
= bucket_info
.bucket
;
2680 int ret
= store
->svc
.bi_rados
->open_bucket_index_shard(bucket_info
, shard_id
, &bucket_obj
);
2682 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2685 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
2691 /* Execute @handler on last item in bucket listing for bucket specified
2692 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2693 * to objects matching these criterias. */
2694 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
2695 const std::string
& obj_prefix
,
2696 const std::string
& obj_delim
,
2697 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
2699 RGWRados::Bucket
target(this, bucket_info
);
2700 RGWRados::Bucket::List
list_op(&target
);
2702 list_op
.params
.prefix
= obj_prefix
;
2703 list_op
.params
.delim
= obj_delim
;
2705 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
2706 << ", obj_prefix=" << obj_prefix
2707 << ", obj_delim=" << obj_delim
2710 bool is_truncated
= false;
2712 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
2713 /* We need to rewind to the last object in a listing. */
2715 /* List bucket entries in chunks. */
2716 static constexpr int MAX_LIST_OBJS
= 100;
2717 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
2719 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
2720 &is_truncated
, null_yield
);
2723 } else if (!entries
.empty()) {
2724 last_entry
= entries
.back();
2726 } while (is_truncated
);
2729 return handler(*last_entry
);
2732 /* Empty listing - no items we can run handler on. */
2737 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
2738 const rgw_user
& user
,
2739 RGWBucketInfo
& bucket_info
,
2741 const DoutPrefixProvider
*dpp
,
2744 if (! swift_versioning_enabled(bucket_info
)) {
2748 obj_ctx
.set_atomic(obj
);
2750 RGWObjState
* state
= nullptr;
2751 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false, y
);
2756 if (!state
->exists
) {
2760 const string
& src_name
= obj
.get_oid();
2761 char buf
[src_name
.size() + 32];
2762 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
2763 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
2764 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
2766 RGWBucketInfo dest_bucket_info
;
2768 r
= get_bucket_info(&svc
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, null_yield
, NULL
);
2770 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
2772 return -ERR_PRECONDITION_FAILED
;
2777 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
2778 return -ERR_PRECONDITION_FAILED
;
2781 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
2783 if (dest_bucket_info
.versioning_enabled()){
2784 gen_rand_obj_instance_name(&dest_obj
);
2787 obj_ctx
.set_atomic(dest_obj
);
2789 rgw_zone_id no_zone
;
2791 r
= copy_obj(obj_ctx
,
2793 NULL
, /* req_info *info */
2799 bucket_info
.placement_rule
,
2800 NULL
, /* time_t *src_mtime */
2801 NULL
, /* time_t *mtime */
2802 NULL
, /* const time_t *mod_ptr */
2803 NULL
, /* const time_t *unmod_ptr */
2804 false, /* bool high_precision_time */
2805 NULL
, /* const char *if_match */
2806 NULL
, /* const char *if_nomatch */
2807 RGWRados::ATTRSMOD_NONE
,
2808 true, /* bool copy_if_newer */
2810 RGWObjCategory::Main
,
2811 0, /* uint64_t olh_epoch */
2812 real_time(), /* time_t delete_at */
2813 NULL
, /* string *version_id */
2814 NULL
, /* string *ptag */
2815 NULL
, /* string *petag */
2816 NULL
, /* void (*progress_cb)(off_t, void *) */
2817 NULL
, /* void *progress_data */
2820 if (r
== -ECANCELED
|| r
== -ENOENT
) {
2821 /* Has already been overwritten, meaning another rgw process already
2829 int RGWRados::swift_versioning_restore(RGWObjectCtx
& obj_ctx
,
2830 const rgw_user
& user
,
2831 RGWBucketInfo
& bucket_info
,
2833 bool& restored
, /* out */
2834 const DoutPrefixProvider
*dpp
)
2836 if (! swift_versioning_enabled(bucket_info
)) {
2840 /* Bucket info of the bucket that stores previous versions of our object. */
2841 RGWBucketInfo archive_binfo
;
2843 int ret
= get_bucket_info(&svc
, bucket_info
.bucket
.tenant
,
2844 bucket_info
.swift_ver_location
, archive_binfo
,
2845 nullptr, null_yield
, nullptr);
2850 /* Abort the operation if the bucket storing our archive belongs to someone
2851 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
2852 * into consideration. For we can live with that.
2854 * TODO: delegate this check to un upper layer and compare with ACLs. */
2855 if (bucket_info
.owner
!= archive_binfo
.owner
) {
2859 /* This code will be executed on latest version of the object. */
2860 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
2861 rgw_zone_id no_zone
;
2863 /* We don't support object versioning of Swift API on those buckets that
2864 * are already versioned using the S3 mechanism. This affects also bucket
2865 * storing archived objects. Otherwise the delete operation would create
2866 * a deletion marker. */
2867 if (archive_binfo
.versioned()) {
2869 return -ERR_PRECONDITION_FAILED
;
2872 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
2873 * irrelevant and may be safely skipped. */
2874 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
2876 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
2878 if (bucket_info
.versioning_enabled()){
2879 gen_rand_obj_instance_name(&obj
);
2882 obj_ctx
.set_atomic(archive_obj
);
2883 obj_ctx
.set_atomic(obj
);
2885 int ret
= copy_obj(obj_ctx
,
2887 nullptr, /* req_info *info */
2890 archive_obj
, /* src obj */
2891 bucket_info
, /* dest bucket info */
2892 archive_binfo
, /* src bucket info */
2893 bucket_info
.placement_rule
, /* placement_rule */
2894 nullptr, /* time_t *src_mtime */
2895 nullptr, /* time_t *mtime */
2896 nullptr, /* const time_t *mod_ptr */
2897 nullptr, /* const time_t *unmod_ptr */
2898 false, /* bool high_precision_time */
2899 nullptr, /* const char *if_match */
2900 nullptr, /* const char *if_nomatch */
2901 RGWRados::ATTRSMOD_NONE
,
2902 true, /* bool copy_if_newer */
2904 RGWObjCategory::Main
,
2905 0, /* uint64_t olh_epoch */
2906 real_time(), /* time_t delete_at */
2907 nullptr, /* string *version_id */
2908 nullptr, /* string *ptag */
2909 nullptr, /* string *petag */
2910 nullptr, /* void (*progress_cb)(off_t, void *) */
2911 nullptr, /* void *progress_data */
2914 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
2915 /* Has already been overwritten, meaning another rgw process already
2918 } else if (ret
< 0) {
2924 /* Need to remove the archived copy. */
2925 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
2926 archive_binfo
.versioning_status());
2931 const std::string
& obj_name
= obj
.get_oid();
2932 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
2935 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
2939 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
2940 map
<string
, bufferlist
>& attrs
,
2941 bool assume_noent
, bool modify_tail
,
2942 void *_index_op
, optional_yield y
)
2944 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
2945 RGWRados
*store
= target
->get_store();
2947 ObjectWriteOperation op
;
2949 const struct req_state
* s
= get_req_state();
2953 req_id
= store
->svc
.zone_utils
->unique_id(store
->get_new_req_id());
2960 int r
= target
->get_state(&state
, false, y
, assume_noent
);
2964 rgw_obj
& obj
= target
->get_obj();
2966 if (obj
.get_oid().empty()) {
2967 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
2972 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
2976 bool is_olh
= state
->is_olh
;
2978 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
2980 const string
*ptag
= meta
.ptag
;
2981 if (!ptag
&& !index_op
->get_optag()->empty()) {
2982 ptag
= index_op
->get_optag();
2984 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false, modify_tail
, y
);
2988 if (real_clock::is_zero(meta
.set_mtime
)) {
2989 meta
.set_mtime
= real_clock::now();
2992 if (target
->bucket_info
.obj_lock_enabled() && target
->bucket_info
.obj_lock
.has_rule() && meta
.flags
== PUT_OBJ_CREATE
) {
2993 auto iter
= attrs
.find(RGW_ATTR_OBJECT_RETENTION
);
2994 if (iter
== attrs
.end()) {
2995 real_time lock_until_date
= target
->bucket_info
.obj_lock
.get_lock_until_date(meta
.set_mtime
);
2996 string mode
= target
->bucket_info
.obj_lock
.get_mode();
2997 RGWObjectRetention
obj_retention(mode
, lock_until_date
);
2999 obj_retention
.encode(bl
);
3000 op
.setxattr(RGW_ATTR_OBJECT_RETENTION
, bl
);
3004 if (state
->is_olh
) {
3005 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
3008 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
3009 op
.mtime2(&mtime_ts
);
3012 /* if we want to overwrite the data, we also want to overwrite the
3013 xattrs, so just remove the object */
3014 op
.write_full(*meta
.data
);
3018 string content_type
;
3020 string storage_class
;
3022 map
<string
, bufferlist
>::iterator iter
;
3024 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
3025 const string
& name
= iter
->first
;
3026 op
.rmxattr(name
.c_str());
3030 if (meta
.manifest
) {
3031 storage_class
= meta
.manifest
->get_tail_placement().placement_rule
.storage_class
;
3033 /* remove existing manifest attr */
3034 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
3035 if (iter
!= attrs
.end())
3039 encode(*meta
.manifest
, bl
);
3040 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
3043 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3044 const string
& name
= iter
->first
;
3045 bufferlist
& bl
= iter
->second
;
3050 op
.setxattr(name
.c_str(), bl
);
3052 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
3053 etag
= rgw_bl_str(bl
);
3054 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
3055 content_type
= rgw_bl_str(bl
);
3056 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
3060 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
3061 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
3064 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
3066 encode(store
->svc
.zone
->get_zone_short_id(), bl
);
3067 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
3070 if (!storage_class
.empty()) {
3072 bl
.append(storage_class
);
3073 op
.setxattr(RGW_ATTR_STORAGE_CLASS
, bl
);
3084 if (!reset_obj
) { //Multipart upload, it has immutable head.
3085 orig_exists
= false;
3088 orig_exists
= state
->exists
;
3089 orig_size
= state
->accounted_size
;
3092 bool versioned_target
= (meta
.olh_epoch
&& *meta
.olh_epoch
> 0) ||
3093 !obj
.key
.instance
.empty();
3095 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
3098 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
3101 if (!index_op
->is_prepared()) {
3102 tracepoint(rgw_rados
, prepare_enter
, req_id
.c_str());
3103 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
, y
);
3104 tracepoint(rgw_rados
, prepare_exit
, req_id
.c_str());
3109 auto& ioctx
= ref
.pool
.ioctx();
3111 tracepoint(rgw_rados
, operate_enter
, req_id
.c_str());
3112 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
3113 tracepoint(rgw_rados
, operate_exit
, req_id
.c_str());
3114 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
3115 or -ENOENT if was removed, or -EEXIST if it did not exist
3116 before and now it does */
3117 if (r
== -EEXIST
&& assume_noent
) {
3118 target
->invalidate_state();
3124 epoch
= ioctx
.get_last_version();
3125 poolid
= ioctx
.get_id();
3127 r
= target
->complete_atomic_modification();
3129 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
3132 tracepoint(rgw_rados
, complete_enter
, req_id
.c_str());
3133 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
3134 meta
.set_mtime
, etag
, content_type
,
3135 storage_class
, &acl_bl
,
3136 meta
.category
, meta
.remove_objs
, meta
.user_data
, meta
.appendable
);
3137 tracepoint(rgw_rados
, complete_exit
, req_id
.c_str());
3142 *meta
.mtime
= meta
.set_mtime
;
3145 /* note that index_op was using state so we couldn't invalidate it earlier */
3146 target
->invalidate_state();
3149 if (versioned_op
&& meta
.olh_epoch
) {
3150 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, *meta
.olh_epoch
, real_time(), false, y
, meta
.zones_trace
);
3156 if (!real_clock::is_zero(meta
.delete_at
)) {
3157 rgw_obj_index_key obj_key
;
3158 obj
.key
.get_index_key(&obj_key
);
3160 r
= store
->obj_expirer
->hint_add(meta
.delete_at
, obj
.bucket
.tenant
, obj
.bucket
.name
,
3161 obj
.bucket
.bucket_id
, obj_key
);
3163 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
3164 /* ignoring error, nothing we can do at this point */
3167 meta
.canceled
= false;
3169 /* update quota cache */
3170 if (meta
.completeMultipart
){
3171 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3175 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3176 accounted_size
, orig_size
);
3181 int ret
= index_op
->cancel();
3183 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
3186 meta
.canceled
= true;
3188 /* we lost in a race. There are a few options:
3189 * - existing object was rewritten (ECANCELED)
3190 * - non existing object was created (EEXIST)
3191 * - object was removed (ENOENT)
3192 * should treat it as a success
3194 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
3195 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
3199 if (meta
.if_match
!= NULL
) {
3200 // only overwrite existing object
3201 if (strcmp(meta
.if_match
, "*") == 0) {
3203 r
= -ERR_PRECONDITION_FAILED
;
3204 } else if (r
== -ECANCELED
) {
3210 if (meta
.if_nomatch
!= NULL
) {
3211 // only create a new object
3212 if (strcmp(meta
.if_nomatch
, "*") == 0) {
3214 r
= -ERR_PRECONDITION_FAILED
;
3215 } else if (r
== -ENOENT
) {
3225 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
3226 map
<string
, bufferlist
>& attrs
, optional_yield y
)
3228 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
3230 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
3231 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
3232 index_op
.set_zones_trace(meta
.zones_trace
);
3234 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
3237 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
, y
);
3239 assume_noent
= false;
3242 if (!assume_noent
) {
3243 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
, y
);
3248 class RGWRadosPutObj
: public RGWHTTPStreamRWRequest::ReceiveCB
3252 rgw::putobj::DataProcessor
*filter
;
3253 boost::optional
<RGWPutObj_Compress
>& compressor
;
3254 boost::optional
<rgw::putobj::ChunkProcessor
> buffering
;
3255 CompressorRef
& plugin
;
3256 rgw::putobj::ObjectProcessor
*processor
;
3257 void (*progress_cb
)(off_t
, void *);
3258 void *progress_data
;
3259 bufferlist extra_data_bl
;
3260 uint64_t extra_data_left
{0};
3261 bool need_to_process_attrs
{true};
3262 uint64_t data_len
{0};
3263 map
<string
, bufferlist
> src_attrs
;
3265 uint64_t lofs
{0}; /* logical ofs */
3266 std::function
<int(map
<string
, bufferlist
>&)> attrs_handler
;
3268 RGWRadosPutObj(CephContext
* cct
,
3269 CompressorRef
& plugin
,
3270 boost::optional
<RGWPutObj_Compress
>& compressor
,
3271 rgw::putobj::ObjectProcessor
*p
,
3272 void (*_progress_cb
)(off_t
, void *),
3273 void *_progress_data
,
3274 std::function
<int(map
<string
, bufferlist
>&)> _attrs_handler
) :
3277 compressor(compressor
),
3280 progress_cb(_progress_cb
),
3281 progress_data(_progress_data
),
3282 attrs_handler(_attrs_handler
) {}
3284 int process_attrs(void) {
3285 if (extra_data_bl
.length()) {
3287 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3288 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3292 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3294 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
3295 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
3297 // filter out olh attributes
3298 auto iter
= src_attrs
.lower_bound(RGW_ATTR_OLH_PREFIX
);
3299 while (iter
!= src_attrs
.end()) {
3300 if (!boost::algorithm::starts_with(iter
->first
, RGW_ATTR_OLH_PREFIX
)) {
3303 iter
= src_attrs
.erase(iter
);
3307 int ret
= attrs_handler(src_attrs
);
3312 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
3313 //do not compress if object is encrypted
3314 compressor
= boost::in_place(cct
, plugin
, filter
);
3315 // add a filter that buffers data so we don't try to compress tiny blocks.
3316 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3317 // compression ratio
3318 constexpr unsigned buffer_size
= 512 * 1024;
3319 buffering
= boost::in_place(&*compressor
, buffer_size
);
3320 filter
= &*buffering
;
3323 need_to_process_attrs
= false;
3328 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3330 progress_cb(data_len
, progress_data
);
3332 if (extra_data_left
) {
3333 uint64_t extra_len
= bl
.length();
3334 if (extra_len
> extra_data_left
)
3335 extra_len
= extra_data_left
;
3338 bl
.splice(0, extra_len
, &extra
);
3339 extra_data_bl
.append(extra
);
3341 extra_data_left
-= extra_len
;
3342 if (extra_data_left
== 0) {
3343 int res
= process_attrs();
3348 if (bl
.length() == 0) {
3352 if (need_to_process_attrs
) {
3353 /* need to call process_attrs() even if we don't get any attrs,
3354 * need it to call attrs_handler().
3356 int res
= process_attrs();
3362 ceph_assert(uint64_t(ofs
) >= extra_data_len
);
3364 uint64_t size
= bl
.length();
3367 const uint64_t lofs
= data_len
;
3370 return filter
->process(std::move(bl
), lofs
);
3374 return filter
->process({}, data_len
);
3377 bufferlist
& get_extra_data() { return extra_data_bl
; }
3379 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
3381 void set_extra_data_len(uint64_t len
) override
{
3382 extra_data_left
= len
;
3383 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len
);
3386 uint64_t get_data_len() {
3392 * prepare attrset depending on attrs_mod.
3394 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
3395 map
<string
, bufferlist
>& attrs
,
3396 RGWRados::AttrsMod attrs_mod
)
3398 switch (attrs_mod
) {
3399 case RGWRados::ATTRSMOD_NONE
:
3402 case RGWRados::ATTRSMOD_REPLACE
:
3403 if (!attrs
[RGW_ATTR_ETAG
].length()) {
3404 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
3406 if (!attrs
[RGW_ATTR_TAIL_TAG
].length()) {
3407 auto ttiter
= src_attrs
.find(RGW_ATTR_TAIL_TAG
);
3408 if (ttiter
!= src_attrs
.end()) {
3409 attrs
[RGW_ATTR_TAIL_TAG
] = src_attrs
[RGW_ATTR_TAIL_TAG
];
3413 case RGWRados::ATTRSMOD_MERGE
:
3414 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
3415 if (attrs
.find(it
->first
) == attrs
.end()) {
3416 attrs
[it
->first
] = it
->second
;
3423 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, const rgw_obj
& obj
, const DoutPrefixProvider
*dpp
, optional_yield y
)
3425 map
<string
, bufferlist
> attrset
;
3429 RGWObjectCtx
rctx(this->store
);
3431 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
3432 RGWRados::Object::Read
read_op(&op_target
);
3434 read_op
.params
.attrs
= &attrset
;
3435 read_op
.params
.lastmod
= &mtime
;
3436 read_op
.params
.obj_size
= &obj_size
;
3438 int ret
= read_op
.prepare(y
);
3442 attrset
.erase(RGW_ATTR_ID_TAG
);
3443 attrset
.erase(RGW_ATTR_TAIL_TAG
);
3445 return copy_obj_data(rctx
, dest_bucket_info
, dest_bucket_info
.placement_rule
,
3446 read_op
, obj_size
- 1, obj
, NULL
, mtime
, attrset
,
3447 0, real_time(), NULL
, dpp
, y
);
3450 struct obj_time_weight
{
3452 uint32_t zone_short_id
;
3454 bool high_precision
;
3456 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3458 bool compare_low_precision(const obj_time_weight
& rhs
) {
3459 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
3460 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
3469 if (!zone_short_id
|| !rhs
.zone_short_id
) {
3470 /* don't compare zone ids, if one wasn't provided */
3473 if (zone_short_id
!= rhs
.zone_short_id
) {
3474 return (zone_short_id
< rhs
.zone_short_id
);
3476 return (pg_ver
< rhs
.pg_ver
);
3480 bool operator<(const obj_time_weight
& rhs
) {
3481 if (!high_precision
|| !rhs
.high_precision
) {
3482 return compare_low_precision(rhs
);
3484 if (mtime
> rhs
.mtime
) {
3487 if (mtime
< rhs
.mtime
) {
3490 if (!zone_short_id
|| !rhs
.zone_short_id
) {
3491 /* don't compare zone ids, if one wasn't provided */
3494 if (zone_short_id
!= rhs
.zone_short_id
) {
3495 return (zone_short_id
< rhs
.zone_short_id
);
3497 return (pg_ver
< rhs
.pg_ver
);
3500 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
3502 zone_short_id
= _short_id
;
3506 void init(RGWObjState
*state
) {
3507 mtime
= state
->mtime
;
3508 zone_short_id
= state
->zone_short_id
;
3509 pg_ver
= state
->pg_ver
;
3513 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
3516 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
3517 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
3523 class RGWGetExtraDataCB
: public RGWHTTPStreamRWRequest::ReceiveCB
{
3524 bufferlist extra_data
;
3526 RGWGetExtraDataCB() {}
3527 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3528 int bl_len
= (int)bl
.length();
3529 if (extra_data
.length() < extra_data_len
) {
3530 off_t max
= extra_data_len
- extra_data
.length();
3534 bl
.splice(0, max
, &extra_data
);
3539 bufferlist
& get_extra_data() {
3544 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
3545 const rgw_user
& user_id
,
3547 const rgw_zone_id
& source_zone
,
3549 const RGWBucketInfo
*src_bucket_info
,
3550 real_time
*src_mtime
,
3552 const real_time
*mod_ptr
,
3553 const real_time
*unmod_ptr
,
3554 bool high_precision_time
,
3555 const char *if_match
,
3556 const char *if_nomatch
,
3557 map
<string
, bufferlist
> *pattrs
,
3558 map
<string
, string
> *pheaders
,
3563 /* source is in a different zonegroup, copy from there */
3565 RGWRESTStreamRWRequest
*in_stream_req
;
3567 map
<string
, bufferlist
> src_attrs
;
3568 append_rand_alpha(cct
, tag
, tag
, 32);
3569 obj_time_weight set_mtime_weight
;
3570 set_mtime_weight
.high_precision
= high_precision_time
;
3573 if (source_zone
.empty()) {
3574 if (!src_bucket_info
|| src_bucket_info
->zonegroup
.empty()) {
3575 /* source is in the master zonegroup */
3576 conn
= svc
.zone
->get_master_conn();
3578 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
3579 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
->zonegroup
);
3580 if (iter
== zonegroup_conn_map
.end()) {
3581 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
3584 conn
= iter
->second
;
3587 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
3588 auto iter
= zone_conn_map
.find(source_zone
);
3589 if (iter
== zone_conn_map
.end()) {
3590 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
3593 conn
= iter
->second
;
3596 RGWGetExtraDataCB cb
;
3597 map
<string
, string
> req_headers
;
3598 real_time set_mtime
;
3600 const real_time
*pmod
= mod_ptr
;
3602 obj_time_weight dest_mtime_weight
;
3604 constexpr bool prepend_meta
= true;
3605 constexpr bool get_op
= true;
3606 constexpr bool rgwx_stat
= true;
3607 constexpr bool sync_manifest
= true;
3608 constexpr bool skip_decrypt
= true;
3609 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
3610 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
3611 prepend_meta
, get_op
, rgwx_stat
,
3612 sync_manifest
, skip_decrypt
,
3613 true, &cb
, &in_stream_req
);
3618 ret
= conn
->complete_request(in_stream_req
, nullptr, &set_mtime
, psize
, nullptr, pheaders
);
3623 bufferlist
& extra_data_bl
= cb
.get_extra_data();
3624 if (extra_data_bl
.length()) {
3626 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3627 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3631 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3633 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
3637 *src_mtime
= set_mtime
;
3641 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
3642 if (iter
!= src_attrs
.end()) {
3643 bufferlist
& etagbl
= iter
->second
;
3644 *petag
= etagbl
.to_str();
3645 while (petag
->size() > 0 && (*petag
)[petag
->size() - 1] == '\0') {
3646 *petag
= petag
->substr(0, petag
->size() - 1);
3652 *pattrs
= std::move(src_attrs
);
3658 int RGWFetchObjFilter_Default::filter(CephContext
*cct
,
3659 const rgw_obj_key
& source_key
,
3660 const RGWBucketInfo
& dest_bucket_info
,
3661 std::optional
<rgw_placement_rule
> dest_placement_rule
,
3662 const map
<string
, bufferlist
>& obj_attrs
,
3663 std::optional
<rgw_user
> *poverride_owner
,
3664 const rgw_placement_rule
**prule
)
3666 const rgw_placement_rule
*ptail_rule
= (dest_placement_rule
? &(*dest_placement_rule
) : nullptr);
3668 auto iter
= obj_attrs
.find(RGW_ATTR_STORAGE_CLASS
);
3669 if (iter
!= obj_attrs
.end()) {
3670 dest_rule
.storage_class
= iter
->second
.to_str();
3671 dest_rule
.inherit_from(dest_bucket_info
.placement_rule
);
3672 ptail_rule
= &dest_rule
;
3674 ptail_rule
= &dest_bucket_info
.placement_rule
;
3677 *prule
= ptail_rule
;
3681 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
3682 const rgw_user
& user_id
,
3684 const rgw_zone_id
& source_zone
,
3685 const rgw_obj
& dest_obj
,
3686 const rgw_obj
& src_obj
,
3687 const RGWBucketInfo
& dest_bucket_info
,
3688 const RGWBucketInfo
*src_bucket_info
,
3689 std::optional
<rgw_placement_rule
> dest_placement_rule
,
3690 real_time
*src_mtime
,
3692 const real_time
*mod_ptr
,
3693 const real_time
*unmod_ptr
,
3694 bool high_precision_time
,
3695 const char *if_match
,
3696 const char *if_nomatch
,
3699 map
<string
, bufferlist
>& attrs
,
3700 RGWObjCategory category
,
3701 std::optional
<uint64_t> olh_epoch
,
3702 real_time delete_at
,
3705 void (*progress_cb
)(off_t
, void *),
3706 void *progress_data
,
3707 const DoutPrefixProvider
*dpp
,
3708 RGWFetchObjFilter
*filter
,
3709 rgw_zone_set
*zones_trace
,
3710 std::optional
<uint64_t>* bytes_transferred
)
3712 /* source is in a different zonegroup, copy from there */
3714 RGWRESTStreamRWRequest
*in_stream_req
;
3717 append_rand_alpha(cct
, tag
, tag
, 32);
3718 obj_time_weight set_mtime_weight
;
3719 set_mtime_weight
.high_precision
= high_precision_time
;
3722 rgw::BlockingAioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
3723 using namespace rgw::putobj
;
3724 AtomicObjectProcessor
processor(&aio
, this->store
, dest_bucket_info
, nullptr, user_id
,
3725 obj_ctx
, dest_obj
, olh_epoch
, tag
, dpp
, null_yield
);
3727 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
3728 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
3729 if (source_zone
.empty()) {
3730 if (!src_bucket_info
|| src_bucket_info
->zonegroup
.empty()) {
3731 /* source is in the master zonegroup */
3732 conn
= svc
.zone
->get_master_conn();
3734 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
->zonegroup
);
3735 if (iter
== zonegroup_conn_map
.end()) {
3736 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
3739 conn
= iter
->second
;
3742 auto iter
= zone_conn_map
.find(source_zone
);
3743 if (iter
== zone_conn_map
.end()) {
3744 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
3747 conn
= iter
->second
;
3750 boost::optional
<RGWPutObj_Compress
> compressor
;
3751 CompressorRef plugin
;
3753 RGWFetchObjFilter_Default source_filter
;
3755 filter
= &source_filter
;
3758 std::optional
<rgw_user
> override_owner
;
3760 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, progress_cb
, progress_data
,
3761 [&](map
<string
, bufferlist
>& obj_attrs
) {
3762 const rgw_placement_rule
*ptail_rule
;
3764 int ret
= filter
->filter(cct
,
3767 dest_placement_rule
,
3772 ldout(cct
, 5) << "Aborting fetch: source object filter returned ret=" << ret
<< dendl
;
3776 processor
.set_tail_placement(*ptail_rule
);
3778 const auto& compression_type
= svc
.zone
->get_zone_params().get_compression_type(*ptail_rule
);
3779 if (compression_type
!= "none") {
3780 plugin
= Compressor::create(cct
, compression_type
);
3782 ldout(cct
, 1) << "Cannot load plugin for compression type "
3783 << compression_type
<< dendl
;
3787 ret
= processor
.prepare(null_yield
);
3795 real_time set_mtime
;
3796 uint64_t expected_size
= 0;
3798 RGWObjState
*dest_state
= NULL
;
3800 const real_time
*pmod
= mod_ptr
;
3802 obj_time_weight dest_mtime_weight
;
3804 if (copy_if_newer
) {
3805 /* need to get mtime for destination */
3806 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false, null_yield
);
3810 if (!real_clock::is_zero(dest_state
->mtime
)) {
3811 dest_mtime_weight
.init(dest_state
);
3812 pmod
= &dest_mtime_weight
.mtime
;
3816 static constexpr bool prepend_meta
= true;
3817 static constexpr bool get_op
= true;
3818 static constexpr bool rgwx_stat
= false;
3819 static constexpr bool sync_manifest
= true;
3820 static constexpr bool skip_decrypt
= true;
3821 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
3822 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
3823 prepend_meta
, get_op
, rgwx_stat
,
3824 sync_manifest
, skip_decrypt
,
3826 &cb
, &in_stream_req
);
3831 ret
= conn
->complete_request(in_stream_req
, &etag
, &set_mtime
,
3832 &expected_size
, nullptr, nullptr);
3840 if (cb
.get_data_len() != expected_size
) {
3842 ldout(cct
, 0) << "ERROR: object truncated during fetching, expected "
3843 << expected_size
<< " bytes but received " << cb
.get_data_len() << dendl
;
3846 if (compressor
&& compressor
->is_compressed()) {
3848 RGWCompressionInfo cs_info
;
3849 cs_info
.compression_type
= plugin
->get_type_name();
3850 cs_info
.orig_size
= cb
.get_data_len();
3851 cs_info
.blocks
= move(compressor
->get_compression_blocks());
3852 encode(cs_info
, tmp
);
3853 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
3856 if (override_owner
) {
3857 processor
.set_owner(*override_owner
);
3859 auto& obj_attrs
= cb
.get_attrs();
3861 RGWUserInfo owner_info
;
3862 if (ctl
.user
->get_info_by_uid(*override_owner
, &owner_info
, null_yield
) < 0) {
3863 ldout(cct
, 10) << "owner info does not exist" << dendl
;
3867 RGWAccessControlPolicy acl
;
3869 auto aiter
= obj_attrs
.find(RGW_ATTR_ACL
);
3870 if (aiter
== obj_attrs
.end()) {
3871 ldout(cct
, 0) << "WARNING: " << __func__
<< "(): object doesn't have ACL attribute, setting default ACLs" << dendl
;
3872 acl
.create_default(owner_info
.user_id
, owner_info
.display_name
);
3874 auto iter
= aiter
->second
.cbegin();
3877 } catch (buffer::error
& err
) {
3878 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): could not decode policy, caught buffer::error" << dendl
;
3884 new_owner
.set_id(*override_owner
);
3885 new_owner
.set_name(owner_info
.display_name
);
3887 acl
.set_owner(new_owner
);
3891 obj_attrs
[RGW_ATTR_ACL
] = std::move(bl
);
3894 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
3895 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
3897 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
3898 if (iter
!= cb
.get_attrs().end()) {
3900 decode(delete_at
, iter
->second
);
3901 } catch (buffer::error
& err
) {
3902 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
3908 *src_mtime
= set_mtime
;
3912 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
3913 if (iter
!= cb
.get_attrs().end()) {
3914 *petag
= iter
->second
.to_str();
3918 //erase the append attr
3919 cb
.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM
);
3921 if (source_zone
.empty()) {
3922 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
3924 attrs
= cb
.get_attrs();
3927 if (copy_if_newer
) {
3928 uint64_t pg_ver
= 0;
3929 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
3930 if (i
!= attrs
.end() && i
->second
.length() > 0) {
3931 auto iter
= i
->second
.cbegin();
3933 decode(pg_ver
, iter
);
3934 } catch (buffer::error
& err
) {
3935 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
3936 /* non critical error */
3939 set_mtime_weight
.init(set_mtime
, svc
.zone
->get_zone_short_id(), pg_ver
);
3942 #define MAX_COMPLETE_RETRY 100
3943 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
3944 bool canceled
= false;
3945 ret
= processor
.complete(cb
.get_data_len(), etag
, mtime
, set_mtime
,
3946 attrs
, delete_at
, nullptr, nullptr, nullptr,
3947 zones_trace
, &canceled
, null_yield
);
3951 if (copy_if_newer
&& canceled
) {
3952 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
3953 obj_ctx
.invalidate(dest_obj
); /* object was overwritten */
3954 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false, null_yield
);
3956 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
3959 dest_mtime_weight
.init(dest_state
);
3960 dest_mtime_weight
.high_precision
= high_precision_time
;
3961 if (!dest_state
->exists
||
3962 dest_mtime_weight
< set_mtime_weight
) {
3963 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
3966 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
3972 if (i
== MAX_COMPLETE_RETRY
) {
3973 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
3978 if (bytes_transferred
) {
3979 *bytes_transferred
= cb
.get_data_len();
3983 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
3984 // we may have already fetched during sync of OP_ADD, but were waiting
3985 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
3986 if (olh_epoch
&& *olh_epoch
> 0) {
3987 constexpr bool log_data_change
= true;
3988 ret
= set_olh(obj_ctx
, dest_bucket_info
, dest_obj
, false, nullptr,
3989 *olh_epoch
, real_time(), false, null_yield
, zones_trace
, log_data_change
);
3991 // we already have the latest copy
3999 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
4000 map
<string
, bufferlist
>& src_attrs
,
4001 RGWRados::Object::Read
& read_op
,
4002 const rgw_user
& user_id
,
4008 RGWRESTStreamS3PutObj
*out_stream_req
;
4010 auto rest_master_conn
= svc
.zone
->get_master_conn();
4012 int ret
= rest_master_conn
->put_obj_async(user_id
, dest_obj
, astate
->size
, src_attrs
, true, &out_stream_req
);
4017 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb(), null_yield
);
4019 delete out_stream_req
;
4023 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
4032 * dest_obj: the object to copy into
4033 * src_obj: the object to copy from
4034 * attrs: usage depends on attrs_mod parameter
4035 * attrs_mod: the modification mode of the attrs, may have the following values:
4036 * ATTRSMOD_NONE - the attributes of the source object will be
4037 * copied without modifications, attrs parameter is ignored;
4038 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4039 * parameter, source object attributes are not copied;
4040 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4041 * are overwritten by values contained in attrs parameter.
4042 * err: stores any errors resulting from the get of the original object
4043 * Returns: 0 on success, -ERR# otherwise.
4045 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
4046 const rgw_user
& user_id
,
4048 const rgw_zone_id
& source_zone
,
4051 RGWBucketInfo
& dest_bucket_info
,
4052 RGWBucketInfo
& src_bucket_info
,
4053 const rgw_placement_rule
& dest_placement
,
4054 real_time
*src_mtime
,
4056 const real_time
*mod_ptr
,
4057 const real_time
*unmod_ptr
,
4058 bool high_precision_time
,
4059 const char *if_match
,
4060 const char *if_nomatch
,
4063 map
<string
, bufferlist
>& attrs
,
4064 RGWObjCategory category
,
4066 real_time delete_at
,
4070 void (*progress_cb
)(off_t
, void *),
4071 void *progress_data
,
4072 const DoutPrefixProvider
*dpp
,
4077 rgw_obj shadow_obj
= dest_obj
;
4083 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
4084 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
4086 auto& zonegroup
= svc
.zone
->get_zonegroup();
4088 remote_dest
= !zonegroup
.equals(dest_bucket_info
.zonegroup
);
4089 remote_src
= !zonegroup
.equals(src_bucket_info
.zonegroup
);
4091 if (remote_src
&& remote_dest
) {
4092 ldpp_dout(dpp
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
4096 ldpp_dout(dpp
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
4098 if (remote_src
|| !source_zone
.empty()) {
4099 return fetch_remote_obj(obj_ctx
, user_id
, info
, source_zone
,
4100 dest_obj
, src_obj
, dest_bucket_info
, &src_bucket_info
,
4101 dest_placement
, src_mtime
, mtime
, mod_ptr
,
4102 unmod_ptr
, high_precision_time
,
4103 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
4104 olh_epoch
, delete_at
, ptag
, petag
, progress_cb
, progress_data
, dpp
,
4105 nullptr /* filter */);
4108 map
<string
, bufferlist
> src_attrs
;
4109 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
4110 RGWRados::Object::Read
read_op(&src_op_target
);
4112 read_op
.conds
.mod_ptr
= mod_ptr
;
4113 read_op
.conds
.unmod_ptr
= unmod_ptr
;
4114 read_op
.conds
.high_precision_time
= high_precision_time
;
4115 read_op
.conds
.if_match
= if_match
;
4116 read_op
.conds
.if_nomatch
= if_nomatch
;
4117 read_op
.params
.attrs
= &src_attrs
;
4118 read_op
.params
.lastmod
= src_mtime
;
4119 read_op
.params
.obj_size
= &obj_size
;
4121 ret
= read_op
.prepare(y
);
4125 if (src_attrs
.count(RGW_ATTR_CRYPT_MODE
)) {
4126 // Current implementation does not follow S3 spec and even
4127 // may result in data corruption silently when copying
4128 // multipart objects acorss pools. So reject COPY operations
4129 //on encrypted objects before it is fully functional.
4130 ldpp_dout(dpp
, 0) << "ERROR: copy op for encrypted object " << src_obj
4131 << " has not been implemented." << dendl
;
4132 return -ERR_NOT_IMPLEMENTED
;
4135 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
4136 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
4138 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
4139 attrs
.erase(RGW_ATTR_ID_TAG
);
4140 attrs
.erase(RGW_ATTR_PG_VER
);
4141 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
4142 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
4143 if (cmp
!= src_attrs
.end())
4144 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
4146 RGWObjManifest manifest
;
4147 RGWObjState
*astate
= NULL
;
4149 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
, y
);
4154 vector
<rgw_raw_obj
> ref_objs
;
4157 /* dest is in a different zonegroup, copy it there */
4158 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
4160 uint64_t max_chunk_size
;
4162 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
4164 ldpp_dout(dpp
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
4171 const rgw_placement_rule
*src_rule
{nullptr};
4173 if (astate
->manifest
) {
4174 src_rule
= &astate
->manifest
->get_tail_placement().placement_rule
;
4175 ldpp_dout(dpp
, 20) << __func__
<< "(): manifest src_rule=" << src_rule
->to_str() << dendl
;
4178 if (!src_rule
|| src_rule
->empty()) {
4179 src_rule
= &src_bucket_info
.placement_rule
;
4182 if (!get_obj_data_pool(*src_rule
, src_obj
, &src_pool
)) {
4183 ldpp_dout(dpp
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
4187 if (!get_obj_data_pool(dest_placement
, dest_obj
, &dest_pool
)) {
4188 ldpp_dout(dpp
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
4192 ldpp_dout(dpp
, 20) << __func__
<< "(): src_rule=" << src_rule
->to_str() << " src_pool=" << src_pool
4193 << " dest_rule=" << dest_placement
.to_str() << " dest_pool=" << dest_pool
<< dendl
;
4195 bool copy_data
= (!astate
->manifest
) ||
4196 (*src_rule
!= dest_placement
) ||
4197 (src_pool
!= dest_pool
);
4199 bool copy_first
= false;
4200 if (astate
->manifest
) {
4201 if (!astate
->manifest
->has_tail()) {
4204 uint64_t head_size
= astate
->manifest
->get_head_size();
4206 if (head_size
> 0) {
4207 if (head_size
> max_chunk_size
) {
4217 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4218 if (iter
!= attrs
.end()) {
4219 *petag
= iter
->second
.to_str();
4223 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
4224 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4225 return copy_obj_data(obj_ctx
, dest_bucket_info
, dest_placement
, read_op
, obj_size
- 1, dest_obj
,
4226 mtime
, real_time(), attrs
, olh_epoch
, delete_at
, petag
, dpp
, y
);
4229 RGWObjManifest::obj_iterator miter
= astate
->manifest
->obj_begin();
4231 if (copy_first
) { // we need to copy first chunk, not increase refcount
4236 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
4241 bufferlist first_chunk
;
4243 bool copy_itself
= (dest_obj
== src_obj
);
4244 RGWObjManifest
*pmanifest
;
4245 ldpp_dout(dpp
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
4247 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
4248 RGWRados::Object::Write
write_op(&dest_op_target
);
4257 append_rand_alpha(cct
, tag
, tag
, 32);
4261 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4262 manifest
= *astate
->manifest
;
4263 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
4264 if (tail_placement
.bucket
.name
.empty()) {
4265 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
4268 for (; miter
!= astate
->manifest
->obj_end(); ++miter
) {
4269 ObjectWriteOperation op
;
4270 ref_tag
= tag
+ '\0';
4271 cls_refcount_get(op
, ref_tag
, true);
4272 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
4274 auto& ioctx
= ref
.pool
.ioctx();
4275 ioctx
.locator_set_key(loc
.loc
);
4277 ret
= rgw_rados_operate(ioctx
, loc
.oid
, &op
, null_yield
);
4282 ref_objs
.push_back(loc
);
4285 pmanifest
= &manifest
;
4287 pmanifest
= &(*astate
->manifest
);
4288 /* don't send the object's tail for garbage collection */
4289 astate
->keep_tail
= true;
4293 ret
= read_op
.read(0, max_chunk_size
, first_chunk
, y
);
4298 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
4300 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
4303 write_op
.meta
.data
= &first_chunk
;
4304 write_op
.meta
.manifest
= pmanifest
;
4305 write_op
.meta
.ptag
= &tag
;
4306 write_op
.meta
.owner
= dest_bucket_info
.owner
;
4307 write_op
.meta
.mtime
= mtime
;
4308 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
4309 write_op
.meta
.category
= category
;
4310 write_op
.meta
.olh_epoch
= olh_epoch
;
4311 write_op
.meta
.delete_at
= delete_at
;
4312 write_op
.meta
.modify_tail
= !copy_itself
;
4314 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
, y
);
4323 vector
<rgw_raw_obj
>::iterator riter
;
4325 /* rollback reference */
4326 string ref_tag
= tag
+ '\0';
4327 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
4328 ObjectWriteOperation op
;
4329 cls_refcount_put(op
, ref_tag
, true);
4331 ref
.pool
.ioctx().locator_set_key(riter
->loc
);
4333 int r
= rgw_rados_operate(ref
.pool
.ioctx(), riter
->oid
, &op
, null_yield
);
4335 ldpp_dout(dpp
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
4343 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
4344 RGWBucketInfo
& dest_bucket_info
,
4345 const rgw_placement_rule
& dest_placement
,
4346 RGWRados::Object::Read
& read_op
, off_t end
,
4347 const rgw_obj
& dest_obj
,
4349 real_time set_mtime
,
4350 map
<string
, bufferlist
>& attrs
,
4352 real_time delete_at
,
4354 const DoutPrefixProvider
*dpp
,
4358 append_rand_alpha(cct
, tag
, tag
, 32);
4360 rgw::BlockingAioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4361 using namespace rgw::putobj
;
4362 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4363 // it causes crashes in the ragweed tests
4364 AtomicObjectProcessor
processor(&aio
, this->store
, dest_bucket_info
, &dest_placement
,
4365 dest_bucket_info
.owner
, obj_ctx
,
4366 dest_obj
, olh_epoch
, tag
, dpp
, null_yield
);
4367 int ret
= processor
.prepare(y
);
4375 ret
= read_op
.read(ofs
, end
, bl
, y
);
4377 ldpp_dout(dpp
, 0) << "ERROR: fail to read object data, ret = " << ret
<< dendl
;
4381 uint64_t read_len
= ret
;
4382 ret
= processor
.process(std::move(bl
), ofs
);
4388 } while (ofs
<= end
);
4391 ret
= processor
.process({}, ofs
);
4397 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4398 if (iter
!= attrs
.end()) {
4399 bufferlist
& bl
= iter
->second
;
4406 uint64_t accounted_size
;
4408 bool compressed
{false};
4409 RGWCompressionInfo cs_info
;
4410 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
4412 ldpp_dout(dpp
, 0) << "ERROR: failed to read compression info" << dendl
;
4415 // pass original size if compressed
4416 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
4419 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
,
4420 nullptr, nullptr, nullptr, nullptr, nullptr, y
);
4423 int RGWRados::transition_obj(RGWObjectCtx
& obj_ctx
,
4424 RGWBucketInfo
& bucket_info
,
4426 const rgw_placement_rule
& placement_rule
,
4427 const real_time
& mtime
,
4429 const DoutPrefixProvider
*dpp
,
4432 map
<string
, bufferlist
> attrs
;
4433 real_time read_mtime
;
4436 obj_ctx
.set_atomic(obj
);
4438 RGWRados::Object
op_target(this, bucket_info
, obj_ctx
, obj
);
4439 RGWRados::Object::Read
read_op(&op_target
);
4441 read_op
.params
.attrs
= &attrs
;
4442 read_op
.params
.lastmod
= &read_mtime
;
4443 read_op
.params
.obj_size
= &obj_size
;
4445 int ret
= read_op
.prepare(y
);
4450 if (read_mtime
!= mtime
) {
4455 attrs
.erase(RGW_ATTR_ID_TAG
);
4456 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4458 ret
= copy_obj_data(obj_ctx
,
4464 nullptr /* pmtime */,
4469 nullptr /* petag */,
4479 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
, optional_yield y
)
4481 constexpr uint NUM_ENTRIES
= 1000u;
4483 rgw_obj_index_key marker
;
4488 std::vector
<rgw_bucket_dir_entry
> ent_list
;
4489 ent_list
.reserve(NUM_ENTRIES
);
4491 int r
= cls_bucket_list_unordered(bucket_info
,
4506 for (auto const& dirent
: ent_list
) {
4509 if (rgw_obj_key::oid_to_key_in_ns(dirent
.key
.name
, &obj
, ns
)) {
4513 } while (is_truncated
);
4520 * bucket: the name of the bucket to delete
4521 * Returns 0 on success, -ERR# otherwise.
4523 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, optional_yield y
, bool check_empty
)
4525 const rgw_bucket
& bucket
= bucket_info
.bucket
;
4526 RGWSI_RADOS::Pool index_pool
;
4527 map
<int, string
> bucket_objs
;
4528 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
4533 r
= check_bucket_empty(bucket_info
, y
);
4539 bool remove_ep
= true;
4541 if (objv_tracker
.read_version
.empty()) {
4542 RGWBucketEntryPoint ep
;
4543 r
= ctl
.bucket
->read_bucket_entrypoint_info(bucket_info
.bucket
,
4546 RGWBucketCtl::Bucket::GetParams()
4547 .set_objv_tracker(&objv_tracker
));
4549 (!bucket_info
.bucket
.bucket_id
.empty() &&
4550 ep
.bucket
.bucket_id
!= bucket_info
.bucket
.bucket_id
)) {
4552 ldout(cct
, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info
.bucket
<< " returned error: r=" << r
<< dendl
;
4553 /* we have no idea what caused the error, will not try to remove it */
4556 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4564 r
= ctl
.bucket
->remove_bucket_entrypoint_info(bucket_info
.bucket
, null_yield
,
4565 RGWBucketCtl::Bucket::RemoveParams()
4566 .set_objv_tracker(&objv_tracker
));
4571 /* if the bucket is not synced we can remove the meta file */
4572 if (!svc
.zone
->is_syncing_bucket_meta(bucket
)) {
4573 RGWObjVersionTracker objv_tracker
;
4574 r
= ctl
.bucket
->remove_bucket_instance_info(bucket
, bucket_info
, null_yield
);
4579 /* remove bucket index objects asynchronously by best effort */
4580 (void) CLSRGWIssueBucketIndexClean(index_pool
.ioctx(),
4582 cct
->_conf
->rgw_bucket_index_max_aio
)();
4588 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
4591 map
<string
, bufferlist
> attrs
;
4593 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
4595 if (bucket
.bucket_id
.empty()) {
4596 r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, info
, NULL
, null_yield
, &attrs
);
4598 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
, null_yield
);
4601 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
4605 info
.owner
= owner
.get_id();
4607 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
4609 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
4617 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
4621 vector
<rgw_bucket
>::iterator iter
;
4623 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
4624 rgw_bucket
& bucket
= *iter
;
4626 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
4628 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
4631 map
<string
, bufferlist
> attrs
;
4632 int r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, info
, NULL
, null_yield
, &attrs
);
4634 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
4639 info
.flags
&= ~BUCKET_SUSPENDED
;
4641 info
.flags
|= BUCKET_SUSPENDED
;
4644 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
4646 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
4654 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
4656 RGWBucketInfo bucket_info
;
4657 int ret
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
, null_yield
);
4662 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
4666 int RGWRados::Object::complete_atomic_modification()
4668 if ((!state
->manifest
)|| state
->keep_tail
)
4671 cls_rgw_obj_chain chain
;
4672 store
->update_gc_chain(obj
, *state
->manifest
, &chain
);
4674 if (chain
.empty()) {
4678 string tag
= (state
->tail_tag
.length() > 0 ? state
->tail_tag
.to_str() : state
->obj_tag
.to_str());
4679 auto ret
= store
->gc
->send_chain(chain
, tag
); // do it synchronously
4681 //Delete objects inline if send chain to gc fails
4682 store
->delete_objs_inline(chain
, tag
);
4687 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
4689 RGWObjManifest::obj_iterator iter
;
4690 rgw_raw_obj raw_head
;
4691 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
4692 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
4693 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
4694 if (mobj
== raw_head
)
4696 cls_rgw_obj_key
key(mobj
.oid
);
4697 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
4701 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
)
4703 return gc
->send_chain(chain
, tag
);
4706 void RGWRados::delete_objs_inline(cls_rgw_obj_chain
& chain
, const string
& tag
)
4709 std::unique_ptr
<IoCtx
> ctx(new IoCtx
);
4711 for (auto liter
= chain
.objs
.begin(); liter
!= chain
.objs
.end(); ++liter
) {
4712 cls_rgw_obj
& obj
= *liter
;
4713 if (obj
.pool
!= last_pool
) {
4714 ctx
.reset(new IoCtx
);
4715 ret
= rgw_init_ioctx(get_rados_handle(), obj
.pool
, *ctx
);
4718 ldout(cct
, 0) << "ERROR: failed to create ioctx pool=" <<
4722 last_pool
= obj
.pool
;
4724 ctx
->locator_set_key(obj
.loc
);
4725 const string
& oid
= obj
.key
.name
; /* just stored raw oid there */
4726 ldout(cct
, 5) << "delete_objs_inline: removing " << obj
.pool
<<
4727 ":" << obj
.key
.name
<< dendl
;
4728 ObjectWriteOperation op
;
4729 cls_refcount_put(op
, tag
, true);
4730 ret
= ctx
->operate(oid
, &op
);
4732 ldout(cct
, 5) << "delete_objs_inline: refcount put returned error " << ret
<< dendl
;
4737 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
4738 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
4740 for (const auto& pair
: header
.stats
) {
4741 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
4742 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
4744 RGWStorageStats
& s
= stats
[category
];
4746 s
.category
= category
;
4747 s
.size
+= header_stats
.total_size
;
4748 s
.size_rounded
+= header_stats
.total_size_rounded
;
4749 s
.size_utilized
+= header_stats
.actual_size
;
4750 s
.num_objects
+= header_stats
.num_entries
;
4754 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
4755 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
4756 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
4758 RGWSI_RADOS::Pool index_pool
;
4759 // key - bucket index object id
4760 // value - bucket index check OP returned result with the given bucket index object (shard)
4761 map
<int, string
> oids
;
4762 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
4764 int ret
= svc
.bi_rados
->open_bucket_index(bucket_info
, std::nullopt
, &index_pool
, &oids
, nullptr);
4769 for (auto& iter
: oids
) {
4770 bucket_objs_ret
[iter
.first
] = rgw_cls_check_index_ret();
4773 ret
= CLSRGWIssueBucketCheck(index_pool
.ioctx(), oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
4778 // Aggregate results (from different shards if there is any)
4779 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
4780 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
4781 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
4782 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
4788 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
4790 RGWSI_RADOS::Pool index_pool
;
4791 map
<int, string
> bucket_objs
;
4793 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
4798 return CLSRGWIssueBucketRebuild(index_pool
.ioctx(), bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
4801 int RGWRados::bucket_set_reshard(const RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
4803 RGWSI_RADOS::Pool index_pool
;
4804 map
<int, string
> bucket_objs
;
4806 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
4811 return CLSRGWIssueSetBucketResharding(index_pool
.ioctx(), bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
4814 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, optional_yield y
)
4816 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
4817 std::string oid
, key
;
4818 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
4822 RGWObjState
*state
= NULL
;
4824 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false, y
);
4828 if (!state
->is_atomic
) {
4829 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
4835 if (state
->tail_tag
.length() > 0) {
4836 tag
= state
->tail_tag
.c_str();
4837 } else if (state
->obj_tag
.length() > 0) {
4838 tag
= state
->obj_tag
.c_str();
4840 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
4844 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
4846 cls_rgw_obj_chain chain
;
4847 update_gc_chain(state
->obj
, *state
->manifest
, &chain
);
4848 return gc
->async_defer_chain(tag
, chain
);
4851 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
4853 list
<string
> prefixes
;
4854 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
4855 cls_rgw_remove_obj(op
, prefixes
);
4858 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
4860 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
4863 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
4865 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
4868 struct tombstone_entry
{
4869 ceph::real_time mtime
;
4870 uint32_t zone_short_id
;
4873 tombstone_entry() = default;
4874 explicit tombstone_entry(const RGWObjState
& state
)
4875 : mtime(state
.mtime
), zone_short_id(state
.zone_short_id
),
4876 pg_ver(state
.pg_ver
) {}
4881 * bucket: name of the bucket storing the object
4882 * obj: name of the object to delete
4883 * Returns: 0 on success, -ERR# otherwise.
4885 int RGWRados::Object::Delete::delete_obj(optional_yield y
)
4887 RGWRados
*store
= target
->get_store();
4888 rgw_obj
& src_obj
= target
->get_obj();
4889 const string
& instance
= src_obj
.key
.instance
;
4890 rgw_obj obj
= src_obj
;
4892 if (instance
== "null") {
4893 obj
.key
.instance
.clear();
4896 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
4898 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
4899 if (instance
.empty() || explicit_marker_version
) {
4900 rgw_obj marker
= obj
;
4902 if (!params
.marker_version_id
.empty()) {
4903 if (params
.marker_version_id
!= "null") {
4904 marker
.key
.set_instance(params
.marker_version_id
);
4906 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
4907 store
->gen_rand_obj_instance_name(&marker
);
4910 result
.version_id
= marker
.key
.instance
;
4911 if (result
.version_id
.empty())
4912 result
.version_id
= "null";
4913 result
.delete_marker
= true;
4915 struct rgw_bucket_dir_entry_meta meta
;
4917 meta
.owner
= params
.obj_owner
.get_id().to_str();
4918 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
4920 if (real_clock::is_zero(params
.mtime
)) {
4921 meta
.mtime
= real_clock::now();
4923 meta
.mtime
= params
.mtime
;
4926 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, y
, params
.zones_trace
);
4931 rgw_bucket_dir_entry dirent
;
4933 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
4937 result
.delete_marker
= dirent
.is_delete_marker();
4938 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, y
, params
.zones_trace
);
4942 result
.version_id
= instance
;
4946 int r
= target
->get_bucket_shard(&bs
);
4948 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
4952 r
= store
->svc
.datalog_rados
->add_entry(target
->bucket_info
, bs
->shard_id
);
4954 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
4962 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
4968 r
= target
->get_state(&state
, false, y
);
4972 ObjectWriteOperation op
;
4974 if (!real_clock::is_zero(params
.unmod_since
)) {
4975 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
4976 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
4977 if (!params
.high_precision_time
) {
4982 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
4983 if (ctime
> unmod
) {
4984 return -ERR_PRECONDITION_FAILED
;
4987 /* only delete object if mtime is less than or equal to params.unmod_since */
4988 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
4990 uint64_t obj_accounted_size
= state
->accounted_size
;
4992 if(params
.abortmp
) {
4993 obj_accounted_size
= params
.parts_accounted_size
;
4996 if (!real_clock::is_zero(params
.expiration_time
)) {
4998 real_time delete_at
;
5000 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
5002 auto iter
= bl
.cbegin();
5003 decode(delete_at
, iter
);
5004 } catch (buffer::error
& err
) {
5005 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
5009 if (params
.expiration_time
!= delete_at
) {
5010 return -ERR_PRECONDITION_FAILED
;
5013 return -ERR_PRECONDITION_FAILED
;
5017 if (!state
->exists
) {
5018 target
->invalidate_state();
5022 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true, false, y
);
5026 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
5028 RGWRados::Bucket
bop(store
, bucket_info
);
5029 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5031 index_op
.set_zones_trace(params
.zones_trace
);
5032 index_op
.set_bilog_flags(params
.bilog_flags
);
5034 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
, y
);
5038 store
->remove_rgw_head_obj(op
);
5040 auto& ioctx
= ref
.pool
.ioctx();
5041 r
= rgw_rados_operate(ioctx
, ref
.obj
.oid
, &op
, null_yield
);
5043 /* raced with another operation, object state is indeterminate */
5044 const bool need_invalidate
= (r
== -ECANCELED
);
5046 int64_t poolid
= ioctx
.get_id();
5048 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
5049 if (obj_tombstone_cache
) {
5050 tombstone_entry entry
{*state
};
5051 obj_tombstone_cache
->add(obj
, entry
);
5053 r
= index_op
.complete_del(poolid
, ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
5055 int ret
= target
->complete_atomic_modification();
5057 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
5059 /* other than that, no need to propagate error */
5061 int ret
= index_op
.cancel();
5063 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
5067 if (need_invalidate
) {
5068 target
->invalidate_state();
5074 /* update quota cache */
5075 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_accounted_size
);
5080 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
5081 const RGWBucketInfo
& bucket_info
,
5083 int versioning_status
,
5084 uint16_t bilog_flags
,
5085 const real_time
& expiration_time
,
5086 rgw_zone_set
*zones_trace
)
5088 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
5089 RGWRados::Object::Delete
del_op(&del_target
);
5091 del_op
.params
.bucket_owner
= bucket_info
.owner
;
5092 del_op
.params
.versioning_status
= versioning_status
;
5093 del_op
.params
.bilog_flags
= bilog_flags
;
5094 del_op
.params
.expiration_time
= expiration_time
;
5095 del_op
.params
.zones_trace
= zones_trace
;
5097 return del_op
.delete_obj(null_yield
);
5100 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
5103 int r
= get_raw_obj_ref(obj
, &ref
);
5108 ObjectWriteOperation op
;
5111 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
5118 int RGWRados::delete_obj_index(const rgw_obj
& obj
, ceph::real_time mtime
)
5120 std::string oid
, key
;
5121 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5123 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5125 RGWBucketInfo bucket_info
;
5126 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
, null_yield
);
5128 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
5132 RGWRados::Bucket
bop(this, bucket_info
);
5133 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5135 return index_op
.complete_del(-1 /* pool */, 0, mtime
, NULL
);
5138 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
5142 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
5143 if (mi
!= manifest
.obj_end()) {
5144 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
5146 tag
= mi
.get_location().get_raw_obj(store
).oid
;
5150 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
5151 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
5153 hash
.Update((const unsigned char *)manifest_bl
.c_str(), manifest_bl
.length());
5155 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
5156 if (iter
!= attrset
.end()) {
5157 bufferlist
& bl
= iter
->second
;
5158 hash
.Update((const unsigned char *)bl
.c_str(), bl
.length());
5162 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
5163 tag
.append(md5_str
);
5165 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
5167 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
5170 static bool is_olh(map
<string
, bufferlist
>& attrs
)
5172 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
5173 return (iter
!= attrs
.end());
5176 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
5178 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
5179 return (iter
!= attrs
.end());
5182 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5183 RGWObjState
*olh_state
, RGWObjState
**target_state
, optional_yield y
)
5185 ceph_assert(olh_state
->is_olh
);
5188 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
5192 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false, y
);
5200 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5201 RGWObjState
**state
, bool follow_olh
, optional_yield y
, bool assume_noent
)
5207 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
5209 RGWObjState
*s
= rctx
->get_state(obj
);
5210 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
5213 if (s
->is_olh
&& need_follow_olh
) {
5214 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
, y
);
5221 rgw_raw_obj raw_obj
;
5222 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
5226 if (!assume_noent
) {
5227 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
, y
);
5232 s
->has_attrs
= true;
5233 tombstone_entry entry
;
5234 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
5235 s
->mtime
= entry
.mtime
;
5236 s
->zone_short_id
= entry
.zone_short_id
;
5237 s
->pg_ver
= entry
.pg_ver
;
5238 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
5239 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
5241 s
->mtime
= real_time();
5249 s
->has_attrs
= true;
5250 s
->accounted_size
= s
->size
;
5252 auto iter
= s
->attrset
.find(RGW_ATTR_ETAG
);
5253 if (iter
!= s
->attrset
.end()) {
5254 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5255 bufferlist
& bletag
= iter
->second
;
5256 if (bletag
.length() > 0 && bletag
[bletag
.length() - 1] == '\0') {
5258 bletag
.splice(0, bletag
.length() - 1, &newbl
);
5259 bletag
.claim(newbl
);
5263 iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
5264 const bool compressed
= (iter
!= s
->attrset
.end());
5266 // use uncompressed size for accounted_size
5268 RGWCompressionInfo info
;
5269 auto p
= iter
->second
.cbegin();
5271 s
->accounted_size
= info
.orig_size
;
5272 } catch (buffer::error
&) {
5273 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
5278 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
5279 if (iter
!= s
->attrset
.end()) {
5280 bufferlist bl
= iter
->second
;
5281 bufferlist::iterator it
= bl
.begin();
5282 it
.copy(bl
.length(), s
->shadow_obj
);
5283 s
->shadow_obj
[bl
.length()] = '\0';
5285 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
5286 auto ttiter
= s
->attrset
.find(RGW_ATTR_TAIL_TAG
);
5287 if (ttiter
!= s
->attrset
.end()) {
5288 s
->tail_tag
= s
->attrset
[RGW_ATTR_TAIL_TAG
];
5291 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
5292 if (manifest_bl
.length()) {
5293 auto miter
= manifest_bl
.cbegin();
5295 s
->manifest
.emplace();
5296 decode(*s
->manifest
, miter
);
5297 s
->manifest
->set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
5298 broken due to old bugs */
5299 s
->size
= s
->manifest
->get_obj_size();
5301 s
->accounted_size
= s
->size
;
5302 } catch (buffer::error
& err
) {
5303 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
5306 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
->get_obj_size() << dendl
;
5307 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>() && \
5308 s
->manifest
->has_explicit_objs()) {
5309 RGWObjManifest::obj_iterator mi
;
5310 for (mi
= s
->manifest
->obj_begin(); mi
!= s
->manifest
->obj_end(); ++mi
) {
5311 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
5315 if (!s
->obj_tag
.length()) {
5317 * Uh oh, something's wrong, object with manifest should have tag. Let's
5318 * create one out of the manifest, would be unique
5320 generate_fake_tag(this, s
->attrset
, *s
->manifest
, manifest_bl
, s
->obj_tag
);
5324 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
5325 if (aiter
!= s
->attrset
.end()) {
5326 bufferlist
& pg_ver_bl
= aiter
->second
;
5327 if (pg_ver_bl
.length()) {
5328 auto pgbl
= pg_ver_bl
.cbegin();
5330 decode(s
->pg_ver
, pgbl
);
5331 } catch (buffer::error
& err
) {
5332 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5336 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
5337 if (aiter
!= s
->attrset
.end()) {
5338 bufferlist
& zone_short_id_bl
= aiter
->second
;
5339 if (zone_short_id_bl
.length()) {
5340 auto zbl
= zone_short_id_bl
.cbegin();
5342 decode(s
->zone_short_id
, zbl
);
5343 } catch (buffer::error
& err
) {
5344 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5348 if (s
->obj_tag
.length())
5349 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
5351 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
5353 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5354 * it exist, and not only if is_olh() returns true
5356 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
5357 if (iter
!= s
->attrset
.end()) {
5358 s
->olh_tag
= iter
->second
;
5361 if (is_olh(s
->attrset
)) {
5364 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
5366 if (need_follow_olh
) {
5367 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
, y
);
5368 } else if (obj
.key
.have_null_instance() && !s
->manifest
) {
5369 // read null version, and the head object only have olh info
5378 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
5379 bool follow_olh
, optional_yield y
, bool assume_noent
)
5384 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, y
, assume_noent
);
5385 } while (ret
== -EAGAIN
);
5390 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
, optional_yield y
)
5392 RGWObjState
*astate
;
5393 int r
= get_state(&astate
, true, y
);
5398 *pmanifest
= &(*astate
->manifest
);
5403 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
, optional_yield y
)
5406 int r
= source
->get_state(&state
, true, y
);
5411 if (!state
->get_attr(name
, dest
))
5417 int RGWRados::Object::Stat::stat_async()
5419 RGWObjectCtx
& ctx
= source
->get_ctx();
5420 rgw_obj
& obj
= source
->get_obj();
5421 RGWRados
*store
= source
->get_store();
5423 RGWObjState
*s
= ctx
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
5427 result
.size
= s
->size
;
5428 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
5429 result
.attrs
= s
->attrset
;
5430 result
.manifest
= s
->manifest
;
5436 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
5438 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
5443 librados::ObjectReadOperation op
;
5444 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
5445 op
.getxattrs(&result
.attrs
, NULL
);
5446 state
.completion
= librados::Rados::aio_create_completion(nullptr, nullptr);
5447 state
.io_ctx
.locator_set_key(loc
);
5448 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
5450 ldout(store
->ctx(), 5) << __func__
5451 << ": ERROR: aio_operate() returned ret=" << r
5460 int RGWRados::Object::Stat::wait()
5462 if (!state
.completion
) {
5466 state
.completion
->wait_for_complete();
5467 state
.ret
= state
.completion
->get_return_value();
5468 state
.completion
->release();
5470 if (state
.ret
!= 0) {
5477 int RGWRados::Object::Stat::finish()
5479 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
5480 if (iter
!= result
.attrs
.end()) {
5481 bufferlist
& bl
= iter
->second
;
5482 auto biter
= bl
.cbegin();
5484 result
.manifest
.emplace();
5485 decode(*result
.manifest
, biter
);
5486 } catch (buffer::error
& err
) {
5487 RGWRados
*store
= source
->get_store();
5488 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
5496 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
5497 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5498 ObjectOperation
& op
, RGWObjState
**pstate
, optional_yield y
)
5503 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false, y
);
5507 return append_atomic_test(*pstate
, op
);
5510 int RGWRados::append_atomic_test(const RGWObjState
* state
,
5511 librados::ObjectOperation
& op
)
5513 if (!state
->is_atomic
) {
5514 ldout(cct
, 20) << "state for obj=" << state
->obj
<< " is not atomic, not appending atomic test" << dendl
;
5518 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
5519 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
5521 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
5526 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, optional_yield y
, bool assume_noent
)
5528 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, y
, assume_noent
);
5531 void RGWRados::Object::invalidate_state()
5533 ctx
.invalidate(obj
);
5536 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
5537 const char *if_match
, const char *if_nomatch
, bool removal_op
,
5538 bool modify_tail
, optional_yield y
)
5540 int r
= get_state(&state
, false, y
);
5544 bool need_guard
= ((state
->manifest
) || (state
->obj_tag
.length() != 0) ||
5545 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
5548 if (!state
->is_atomic
) {
5549 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
5553 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
5560 /* first verify that the object wasn't replaced under */
5561 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
5562 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
5563 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5567 if (strcmp(if_match
, "*") == 0) {
5568 // test the object is existing
5569 if (!state
->exists
) {
5570 return -ERR_PRECONDITION_FAILED
;
5574 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
5575 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
5576 return -ERR_PRECONDITION_FAILED
;
5582 if (strcmp(if_nomatch
, "*") == 0) {
5583 // test the object is NOT existing
5584 if (state
->exists
) {
5585 return -ERR_PRECONDITION_FAILED
;
5589 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
5590 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
5591 return -ERR_PRECONDITION_FAILED
;
5598 if (state
->exists
) {
5600 store
->remove_rgw_head_obj(op
);
5607 /* the object is being removed, no need to update its tag */
5612 state
->write_tag
= *ptag
;
5614 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
5617 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
5619 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
5621 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
5623 op
.setxattr(RGW_ATTR_TAIL_TAG
, bl
);
5630 * Set an attr on an object.
5631 * bucket: name of the bucket holding the object
5632 * obj: name of the object to set the attr on
5633 * name: the attr to set
5634 * bl: the contents of the attr
5635 * Returns: 0 on success, -ERR# otherwise.
5637 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
5639 map
<string
, bufferlist
> attrs
;
5641 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
, null_yield
);
5644 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& src_obj
,
5645 map
<string
, bufferlist
>& attrs
,
5646 map
<string
, bufferlist
>* rmattrs
,
5649 rgw_obj obj
= src_obj
;
5650 if (obj
.key
.instance
== "null") {
5651 obj
.key
.instance
.clear();
5655 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
5659 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
5661 ObjectWriteOperation op
;
5662 RGWObjState
*state
= NULL
;
5664 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
, y
);
5668 // ensure null version object exist
5669 if (src_obj
.key
.instance
== "null" && !state
->manifest
) {
5673 map
<string
, bufferlist
>::iterator iter
;
5675 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
5676 const string
& name
= iter
->first
;
5677 op
.rmxattr(name
.c_str());
5681 const rgw_bucket
& bucket
= obj
.bucket
;
5683 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
5684 const string
& name
= iter
->first
;
5685 bufferlist
& bl
= iter
->second
;
5690 op
.setxattr(name
.c_str(), bl
);
5692 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
5697 rgw_obj_index_key obj_key
;
5698 obj
.key
.get_index_key(&obj_key
);
5700 obj_expirer
->hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
5701 } catch (buffer::error
& err
) {
5702 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
5710 RGWObjectCtx
obj_ctx(this->store
);
5713 RGWRados::Bucket
bop(this, bucket_info
);
5714 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5718 append_rand_alpha(cct
, tag
, tag
, 32);
5719 state
->write_tag
= tag
;
5720 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
, y
);
5725 bl
.append(tag
.c_str(), tag
.size() + 1);
5726 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
5730 real_time mtime
= real_clock::now();
5731 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
5732 op
.mtime2(&mtime_ts
);
5733 auto& ioctx
= ref
.pool
.ioctx();
5734 r
= rgw_rados_operate(ioctx
, ref
.obj
.oid
, &op
, null_yield
);
5737 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
5738 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
5739 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
5740 string etag
= rgw_bl_str(etag_bl
);
5741 string content_type
= rgw_bl_str(content_type_bl
);
5742 string storage_class
;
5743 auto iter
= attrs
.find(RGW_ATTR_STORAGE_CLASS
);
5744 if (iter
!= attrs
.end()) {
5745 storage_class
= rgw_bl_str(iter
->second
);
5747 uint64_t epoch
= ioctx
.get_last_version();
5748 int64_t poolid
= ioctx
.get_id();
5749 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
5750 mtime
, etag
, content_type
, storage_class
, &acl_bl
,
5751 RGWObjCategory::Main
, NULL
);
5753 int ret
= index_op
.cancel();
5755 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
5763 state
->obj_tag
.swap(bl
);
5765 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
5766 state
->attrset
.erase(iter
->first
);
5770 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
5771 state
->attrset
[iter
->first
] = iter
->second
;
5774 auto iter
= state
->attrset
.find(RGW_ATTR_ID_TAG
);
5775 if (iter
!= state
->attrset
.end()) {
5776 iter
->second
= state
->obj_tag
;
5783 int RGWRados::Object::Read::prepare(optional_yield y
)
5785 RGWRados
*store
= source
->get_store();
5786 CephContext
*cct
= store
->ctx();
5790 map
<string
, bufferlist
>::iterator iter
;
5792 RGWObjState
*astate
;
5793 int r
= source
->get_state(&astate
, true, y
);
5797 if (!astate
->exists
) {
5801 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
5803 state
.obj
= astate
->obj
;
5804 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
5806 state
.cur_pool
= state
.head_obj
.pool
;
5807 state
.cur_ioctx
= &state
.io_ctxs
[state
.cur_pool
];
5809 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, state
.cur_ioctx
);
5813 if (params
.target_obj
) {
5814 *params
.target_obj
= state
.obj
;
5817 *params
.attrs
= astate
->attrset
;
5818 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>()) {
5819 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
5820 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
5825 /* Convert all times go GMT to make them compatible */
5826 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
5827 obj_time_weight src_weight
;
5828 src_weight
.init(astate
);
5829 src_weight
.high_precision
= conds
.high_precision_time
;
5831 obj_time_weight dest_weight
;
5832 dest_weight
.high_precision
= conds
.high_precision_time
;
5834 if (conds
.mod_ptr
&& !conds
.if_nomatch
) {
5835 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
5836 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
5837 if (!(dest_weight
< src_weight
)) {
5838 return -ERR_NOT_MODIFIED
;
5842 if (conds
.unmod_ptr
&& !conds
.if_match
) {
5843 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
5844 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
5845 if (dest_weight
< src_weight
) {
5846 return -ERR_PRECONDITION_FAILED
;
5850 if (conds
.if_match
|| conds
.if_nomatch
) {
5851 r
= get_attr(RGW_ATTR_ETAG
, etag
, y
);
5855 if (conds
.if_match
) {
5856 string if_match_str
= rgw_string_unquote(conds
.if_match
);
5857 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-Match: " << if_match_str
<< dendl
;
5858 if (if_match_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) != 0) {
5859 return -ERR_PRECONDITION_FAILED
;
5863 if (conds
.if_nomatch
) {
5864 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
5865 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
5866 if (if_nomatch_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) == 0) {
5867 return -ERR_NOT_MODIFIED
;
5872 if (params
.obj_size
)
5873 *params
.obj_size
= astate
->size
;
5875 *params
.lastmod
= astate
->mtime
;
5880 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
5887 } else if (end
< 0) {
5892 if (ofs
>= (off_t
)obj_size
) {
5895 if (end
>= (off_t
)obj_size
) {
5902 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
5904 RGWRados
*store
= target
->get_store();
5908 #define NUM_RESHARD_RETRIES 10
5909 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
5910 int ret
= get_bucket_shard(&bs
);
5912 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
5916 if (r
!= -ERR_BUSY_RESHARDING
) {
5919 ldout(store
->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
5920 string new_bucket_id
;
5921 r
= store
->block_while_resharding(bs
, &new_bucket_id
,
5922 target
->bucket_info
, null_yield
);
5923 if (r
== -ERR_BUSY_RESHARDING
) {
5929 ldout(store
->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
5930 i
= 0; /* resharding is finished, make sure we can retry */
5931 r
= target
->update_bucket_id(new_bucket_id
);
5933 ldout(store
->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
5950 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
, optional_yield y
)
5955 RGWRados
*store
= target
->get_store();
5957 if (write_tag
&& write_tag
->length()) {
5958 optag
= string(write_tag
->c_str(), write_tag
->length());
5960 if (optag
.empty()) {
5961 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
5965 int r
= guard_reshard(nullptr, [&](BucketShard
*bs
) -> int {
5966 return store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
, y
, zones_trace
);
5977 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
5978 uint64_t size
, uint64_t accounted_size
,
5979 ceph::real_time
& ut
, const string
& etag
,
5980 const string
& content_type
, const string
& storage_class
,
5982 RGWObjCategory category
,
5983 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
,
5989 RGWRados
*store
= target
->get_store();
5992 int ret
= get_bucket_shard(&bs
);
5994 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
5998 rgw_bucket_dir_entry ent
;
5999 obj
.key
.get_index_key(&ent
.key
);
6000 ent
.meta
.size
= size
;
6001 ent
.meta
.accounted_size
= accounted_size
;
6002 ent
.meta
.mtime
= ut
;
6003 ent
.meta
.etag
= etag
;
6004 ent
.meta
.storage_class
= storage_class
;
6006 ent
.meta
.user_data
= *user_data
;
6009 if (acl_bl
&& acl_bl
->length()) {
6010 int ret
= store
->decode_policy(*acl_bl
, &owner
);
6012 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
6015 ent
.meta
.owner
= owner
.get_id().to_str();
6016 ent
.meta
.owner_display_name
= owner
.get_display_name();
6017 ent
.meta
.content_type
= content_type
;
6018 ent
.meta
.appendable
= appendable
;
6020 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
6022 int r
= store
->svc
.datalog_rados
->add_entry(target
->bucket_info
, bs
->shard_id
);
6024 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6030 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
6031 real_time
& removed_mtime
,
6032 list
<rgw_obj_index_key
> *remove_objs
)
6037 RGWRados
*store
= target
->get_store();
6040 int ret
= get_bucket_shard(&bs
);
6042 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6046 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
6048 int r
= store
->svc
.datalog_rados
->add_entry(target
->bucket_info
, bs
->shard_id
);
6050 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6057 int RGWRados::Bucket::UpdateIndex::cancel()
6062 RGWRados
*store
= target
->get_store();
6065 int ret
= guard_reshard(&bs
, [&](BucketShard
*bs
) -> int {
6066 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
, zones_trace
);
6070 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6071 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6072 * have no way to tell that they're all caught up
6074 int r
= store
->svc
.datalog_rados
->add_entry(target
->bucket_info
, bs
->shard_id
);
6076 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6082 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
, optional_yield y
)
6084 RGWRados
*store
= source
->get_store();
6085 CephContext
*cct
= store
->ctx();
6087 rgw_raw_obj read_obj
;
6088 uint64_t read_ofs
= ofs
;
6089 uint64_t len
, read_len
;
6090 bool reading_from_head
= true;
6091 ObjectReadOperation op
;
6093 bool merge_bl
= false;
6094 bufferlist
*pbl
= &bl
;
6096 uint64_t max_chunk_size
;
6098 RGWObjState
*astate
;
6099 int r
= source
->get_state(&astate
, true, y
);
6103 if (astate
->size
== 0) {
6105 } else if (end
>= (int64_t)astate
->size
) {
6106 end
= astate
->size
- 1;
6112 len
= end
- ofs
+ 1;
6114 if (astate
->manifest
&& astate
->manifest
->has_tail()) {
6115 /* now get the relevant object part */
6116 RGWObjManifest::obj_iterator iter
= astate
->manifest
->obj_find(ofs
);
6118 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
6119 read_obj
= iter
.get_location().get_raw_obj(store
);
6120 len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6121 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6122 reading_from_head
= (read_obj
== state
.head_obj
);
6124 read_obj
= state
.head_obj
;
6127 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
6129 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
6133 if (len
> max_chunk_size
)
6134 len
= max_chunk_size
;
6139 if (reading_from_head
) {
6140 /* only when reading from the head object do we need to do the atomic test */
6141 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
, y
);
6145 if (astate
&& astate
->prefetch_data
) {
6146 if (!ofs
&& astate
->data
.length() >= len
) {
6151 if (ofs
< astate
->data
.length()) {
6152 unsigned copy_len
= std::min((uint64_t)astate
->data
.length() - ofs
, len
);
6153 astate
->data
.begin(ofs
).copy(copy_len
, bl
);
6154 read_len
-= copy_len
;
6155 read_ofs
+= copy_len
;
6165 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
6166 op
.read(read_ofs
, read_len
, pbl
, NULL
);
6168 if (state
.cur_pool
!= read_obj
.pool
) {
6169 auto iter
= state
.io_ctxs
.find(read_obj
.pool
);
6170 if (iter
== state
.io_ctxs
.end()) {
6171 state
.cur_ioctx
= &state
.io_ctxs
[read_obj
.pool
];
6172 r
= store
->open_pool_ctx(read_obj
.pool
, *state
.cur_ioctx
, false);
6174 ldout(cct
, 20) << "ERROR: failed to open pool context for pool=" << read_obj
.pool
<< " r=" << r
<< dendl
;
6178 state
.cur_ioctx
= &iter
->second
;
6180 state
.cur_pool
= read_obj
.pool
;
6183 state
.cur_ioctx
->locator_set_key(read_obj
.loc
);
6185 r
= state
.cur_ioctx
->operate(read_obj
.oid
, &op
, NULL
);
6186 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
6199 struct get_obj_data
{
6201 RGWGetDataCB
* client_cb
;
6203 uint64_t offset
; // next offset to write to client
6204 rgw::AioResultList completed
; // completed read results, sorted by offset
6205 optional_yield yield
;
6207 get_obj_data(RGWRados
* store
, RGWGetDataCB
* cb
, rgw::Aio
* aio
,
6208 uint64_t offset
, optional_yield yield
)
6209 : store(store
), client_cb(cb
), aio(aio
), offset(offset
), yield(yield
) {}
6211 int flush(rgw::AioResultList
&& results
) {
6212 int r
= rgw::check_for_errors(results
);
6217 auto cmp
= [](const auto& lhs
, const auto& rhs
) { return lhs
.id
< rhs
.id
; };
6218 results
.sort(cmp
); // merge() requires results to be sorted first
6219 completed
.merge(results
, cmp
); // merge results in sorted order
6221 while (!completed
.empty() && completed
.front().id
== offset
) {
6222 auto bl
= std::move(completed
.front().data
);
6223 completed
.pop_front_and_dispose(std::default_delete
<rgw::AioResultEntry
>{});
6225 offset
+= bl
.length();
6226 int r
= client_cb
->handle_data(bl
, 0, bl
.length());
6235 // wait for all completions to drain and ignore the results
6240 auto c
= aio
->wait();
6241 while (!c
.empty()) {
6242 int r
= flush(std::move(c
));
6249 return flush(std::move(c
));
6253 static int _get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6254 off_t read_ofs
, off_t len
, bool is_head_obj
,
6255 RGWObjState
*astate
, void *arg
)
6257 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6259 return d
->store
->get_obj_iterate_cb(read_obj
, obj_ofs
, read_ofs
, len
,
6260 is_head_obj
, astate
, arg
);
6263 int RGWRados::get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6264 off_t read_ofs
, off_t len
, bool is_head_obj
,
6265 RGWObjState
*astate
, void *arg
)
6267 ObjectReadOperation op
;
6268 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6272 /* only when reading from the head object do we need to do the atomic test */
6273 int r
= append_atomic_test(astate
, op
);
6278 obj_ofs
< astate
->data
.length()) {
6279 unsigned chunk_len
= std::min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
6281 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
6286 d
->offset
+= chunk_len
;
6287 read_ofs
+= chunk_len
;
6288 obj_ofs
+= chunk_len
;
6294 auto obj
= d
->store
->svc
.rados
->obj(read_obj
);
6297 ldout(cct
, 4) << "failed to open rados context for " << read_obj
<< dendl
;
6301 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
6302 op
.read(read_ofs
, len
, nullptr, nullptr);
6304 const uint64_t cost
= len
;
6305 const uint64_t id
= obj_ofs
; // use logical object offset for sorting replies
6307 auto completed
= d
->aio
->get(obj
, rgw::Aio::librados_op(std::move(op
), d
->yield
), cost
, id
);
6309 return d
->flush(std::move(completed
));
6312 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
,
6315 RGWRados
*store
= source
->get_store();
6316 CephContext
*cct
= store
->ctx();
6317 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
6318 const uint64_t chunk_size
= cct
->_conf
->rgw_get_obj_max_req_size
;
6319 const uint64_t window_size
= cct
->_conf
->rgw_get_obj_window_size
;
6321 auto aio
= rgw::make_throttle(window_size
, y
);
6322 get_obj_data
data(store
, cb
, &*aio
, ofs
, y
);
6324 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
,
6325 ofs
, end
, chunk_size
, _get_obj_iterate_cb
, &data
, y
);
6327 ldout(cct
, 0) << "iterate_obj() failed with " << r
<< dendl
;
6328 data
.cancel(); // drain completions without writing back to client
6332 return data
.drain();
6335 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
6336 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6337 off_t ofs
, off_t end
, uint64_t max_chunk_size
,
6338 iterate_obj_cb cb
, void *arg
, optional_yield y
)
6340 rgw_raw_obj head_obj
;
6341 rgw_raw_obj read_obj
;
6342 uint64_t read_ofs
= ofs
;
6344 bool reading_from_head
= true;
6345 RGWObjState
*astate
= NULL
;
6347 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
6349 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false, y
);
6357 len
= end
- ofs
+ 1;
6359 if (astate
->manifest
) {
6360 /* now get the relevant object stripe */
6361 RGWObjManifest::obj_iterator iter
= astate
->manifest
->obj_find(ofs
);
6363 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
->obj_end();
6365 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
6366 off_t stripe_ofs
= iter
.get_stripe_ofs();
6367 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
6369 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
6370 read_obj
= iter
.get_location().get_raw_obj(this);
6371 uint64_t read_len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6372 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6374 if (read_len
> max_chunk_size
) {
6375 read_len
= max_chunk_size
;
6378 reading_from_head
= (read_obj
== head_obj
);
6379 r
= cb(read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
6389 while (ofs
<= end
) {
6390 read_obj
= head_obj
;
6391 uint64_t read_len
= std::min(len
, max_chunk_size
);
6393 r
= cb(read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
6406 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
6409 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6414 return rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, op
, null_yield
);
6417 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
6420 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6427 return rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, op
, &outbl
, null_yield
);
6430 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
6432 ObjectWriteOperation op
;
6434 ceph_assert(olh_obj
.key
.instance
.empty());
6436 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
6438 if (!state
.exists
) {
6442 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
6443 op
.mtime2(&mtime_ts
);
6447 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6448 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6449 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6450 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6451 * log will reflect that.
6453 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6454 * is used for object data instance, olh_tag for olh instance.
6457 /* guard against racing writes */
6458 bucket_index_guard_olh_op(state
, op
);
6463 string obj_tag
= gen_rand_alphanumeric_lower(cct
, 32);
6466 bl
.append(obj_tag
.c_str(), obj_tag
.size());
6467 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6469 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
6473 string olh_tag
= gen_rand_alphanumeric_lower(cct
, 32);
6476 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
6477 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
6479 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
6480 state
.olh_tag
= olh_bl
;
6481 state
.is_olh
= true;
6484 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
6488 RGWOLHPendingInfo pending_info
;
6489 pending_info
.time
= real_clock::now();
6490 encode(pending_info
, bl
);
6492 #define OLH_PENDING_TAG_LEN 32
6493 /* tag will start with current time epoch, this so that entries are sorted by time */
6495 utime_t
ut(pending_info
.time
);
6496 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
6499 string s
= gen_rand_alphanumeric_lower(cct
, OLH_PENDING_TAG_LEN
- op_tag
->size());
6503 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
6504 attr_name
.append(*op_tag
);
6506 op
.setxattr(attr_name
.c_str(), bl
);
6508 int ret
= obj_operate(bucket_info
, olh_obj
, &op
);
6513 state
.exists
= true;
6514 state
.attrset
[attr_name
] = bl
;
6519 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
6523 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
6524 if (ret
== -EEXIST
) {
6531 int RGWRados::guard_reshard(BucketShard
*bs
,
6532 const rgw_obj
& obj_instance
,
6533 const RGWBucketInfo
& bucket_info
,
6534 std::function
<int(BucketShard
*)> call
)
6537 const rgw_obj
*pobj
= &obj_instance
;
6540 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
6541 r
= bs
->init(pobj
->bucket
, *pobj
, nullptr /* no RGWBucketInfo */);
6543 ldout(cct
, 5) << "bs.init() returned ret=" << r
<< dendl
;
6547 if (r
!= -ERR_BUSY_RESHARDING
) {
6550 ldout(cct
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
6551 string new_bucket_id
;
6552 r
= block_while_resharding(bs
, &new_bucket_id
, bucket_info
, null_yield
);
6553 if (r
== -ERR_BUSY_RESHARDING
) {
6559 ldout(cct
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
6560 i
= 0; /* resharding is finished, make sure we can retry */
6563 obj
.bucket
.update_bucket_id(new_bucket_id
);
6574 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
,
6575 string
*new_bucket_id
,
6576 const RGWBucketInfo
& bucket_info
,
6580 cls_rgw_bucket_instance_entry entry
;
6582 // since we want to run this recovery code from two distinct places,
6583 // let's just put it in a lambda so we can easily re-use; if the
6584 // lambda successfully fetches a new bucket id, it sets
6585 // new_bucket_id and returns 0, otherwise it returns a negative
6587 auto fetch_new_bucket_id
=
6588 [this, &bucket_info
](const std::string
& log_tag
,
6589 std::string
* new_bucket_id
) -> int {
6590 RGWBucketInfo fresh_bucket_info
= bucket_info
;
6591 int ret
= try_refresh_bucket_info(fresh_bucket_info
, nullptr);
6593 ldout(cct
, 0) << __func__
<<
6594 " ERROR: failed to refresh bucket info after reshard at " <<
6595 log_tag
<< ": " << cpp_strerror(-ret
) << dendl
;
6598 *new_bucket_id
= fresh_bucket_info
.bucket
.bucket_id
;
6602 constexpr int num_retries
= 10;
6603 for (int i
= 1; i
<= num_retries
; i
++) { // nb: 1-based for loop
6604 auto& ref
= bs
->bucket_obj
.get_ref();
6605 ret
= cls_rgw_get_bucket_resharding(ref
.pool
.ioctx(), ref
.obj
.oid
, &entry
);
6606 if (ret
== -ENOENT
) {
6607 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id
);
6608 } else if (ret
< 0) {
6609 ldout(cct
, 0) << __func__
<<
6610 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret
) <<
6615 if (!entry
.resharding_in_progress()) {
6616 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6620 ldout(cct
, 20) << "NOTICE: reshard still in progress; " <<
6621 (i
< num_retries
? "retrying" : "too many retries") << dendl
;
6623 if (i
== num_retries
) {
6627 // If bucket is erroneously marked as resharding (e.g., crash or
6628 // other error) then fix it. If we can take the bucket reshard
6629 // lock then it means no other resharding should be taking place,
6630 // and we're free to clear the flags.
6632 // since we expect to do this rarely, we'll do our work in a
6633 // block and erase our work after each try
6635 RGWObjectCtx
obj_ctx(this->store
);
6636 const rgw_bucket
& b
= bs
->bucket
;
6637 std::string bucket_id
= b
.get_key();
6638 RGWBucketReshardLock
reshard_lock(this->store
, bucket_info
, true);
6639 ret
= reshard_lock
.lock();
6641 ldout(cct
, 20) << __func__
<<
6642 " INFO: failed to take reshard lock for bucket " <<
6643 bucket_id
<< "; expected if resharding underway" << dendl
;
6645 ldout(cct
, 10) << __func__
<<
6646 " INFO: was able to take reshard lock for bucket " <<
6648 ret
= RGWBucketReshard::clear_resharding(this->store
, bucket_info
);
6650 reshard_lock
.unlock();
6651 ldout(cct
, 0) << __func__
<<
6652 " ERROR: failed to clear resharding flags for bucket " <<
6655 reshard_lock
.unlock();
6656 ldout(cct
, 5) << __func__
<<
6657 " INFO: apparently successfully cleared resharding flags for "
6658 "bucket " << bucket_id
<< dendl
;
6659 continue; // if we apparently succeed immediately test again
6660 } // if clear resharding succeeded
6661 } // if taking of lock succeeded
6662 } // block to encapsulate recovery from incomplete reshard
6664 ret
= reshard_wait
->wait(y
);
6666 ldout(cct
, 0) << __func__
<<
6667 " ERROR: bucket is still resharding, please retry" << dendl
;
6672 ldout(cct
, 0) << __func__
<<
6673 " ERROR: bucket is still resharding, please retry" << dendl
;
6674 return -ERR_BUSY_RESHARDING
;
6677 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
6679 const string
& op_tag
,
6680 struct rgw_bucket_dir_entry_meta
*meta
,
6682 real_time unmod_since
, bool high_precision_time
,
6683 rgw_zone_set
*_zones_trace
, bool log_data_change
)
6686 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
6691 rgw_zone_set zones_trace
;
6693 zones_trace
= *_zones_trace
;
6695 zones_trace
.insert(svc
.zone
->get_zone().id
, bucket_info
.bucket
.get_key());
6697 BucketShard
bs(this);
6699 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
6700 [&](BucketShard
*bs
) -> int {
6701 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
6702 auto& ref
= bs
->bucket_obj
.get_ref();
6703 librados::ObjectWriteOperation op
;
6704 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6705 cls_rgw_bucket_link_olh(op
, key
, olh_state
.olh_tag
,
6706 delete_marker
, op_tag
, meta
, olh_epoch
,
6707 unmod_since
, high_precision_time
,
6708 svc
.zone
->get_zone().log_data
, zones_trace
);
6709 return rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
6712 ldout(cct
, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
6716 r
= svc
.datalog_rados
->add_entry(bucket_info
, bs
.shard_id
);
6718 ldout(cct
, 0) << "ERROR: failed writing data log" << dendl
;
6724 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
6726 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
6727 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
6730 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
6731 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
6734 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
6739 rgw_zone_set zones_trace
;
6741 zones_trace
= *_zones_trace
;
6743 zones_trace
.insert(svc
.zone
->get_zone().id
, bucket_info
.bucket
.get_key());
6745 BucketShard
bs(this);
6747 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
6748 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
6749 [&](BucketShard
*bs
) -> int {
6750 auto& ref
= bs
->bucket_obj
.get_ref();
6751 librados::ObjectWriteOperation op
;
6752 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6753 cls_rgw_bucket_unlink_instance(op
, key
, op_tag
,
6754 olh_tag
, olh_epoch
, svc
.zone
->get_zone().log_data
, zones_trace
);
6755 return rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
6758 ldout(cct
, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r
<< dendl
;
6765 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
6766 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
6767 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
6771 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
6776 BucketShard
bs(this);
6778 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
6780 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
6784 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
6786 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
6788 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
6789 [&](BucketShard
*bs
) -> int {
6790 auto& ref
= bs
->bucket_obj
.get_ref();
6791 ObjectReadOperation op
;
6792 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6794 rgw_cls_read_olh_log_ret log_ret
;
6796 cls_rgw_get_olh_log(op
, key
, ver_marker
, olh_tag
, log_ret
, op_ret
);
6798 int r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, &outbl
, null_yield
);
6806 *log
= std::move(log_ret
.log
);
6807 *is_truncated
= log_ret
.is_truncated
;
6811 ldout(cct
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
6818 // a multisite sync bug resulted in the OLH head attributes being overwritten by
6819 // the attributes from another zone, causing link_olh() to fail endlessly due to
6820 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
6821 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
6822 int RGWRados::repair_olh(RGWObjState
* state
, const RGWBucketInfo
& bucket_info
,
6825 // fetch the current olh entry from the bucket index
6826 rgw_bucket_olh_entry olh
;
6827 int r
= bi_get_olh(bucket_info
, obj
, &olh
);
6829 ldout(cct
, 0) << "repair_olh failed to read olh entry for " << obj
<< dendl
;
6832 if (olh
.tag
== rgw_bl_str(state
->olh_tag
)) { // mismatch already resolved?
6836 ldout(cct
, 4) << "repair_olh setting olh_tag=" << olh
.tag
6837 << " key=" << olh
.key
<< " delete_marker=" << olh
.delete_marker
<< dendl
;
6839 // rewrite OLH_ID_TAG and OLH_INFO from current olh
6840 ObjectWriteOperation op
;
6841 // assert this is the same olh tag we think we're fixing
6842 bucket_index_guard_olh_op(*state
, op
);
6843 // preserve existing mtime
6844 struct timespec mtime_ts
= ceph::real_clock::to_timespec(state
->mtime
);
6845 op
.mtime2(&mtime_ts
);
6848 bl
.append(olh
.tag
.c_str(), olh
.tag
.size());
6849 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, bl
);
6853 info
.target
= rgw_obj(bucket_info
.bucket
, olh
.key
);
6854 info
.removed
= olh
.delete_marker
;
6857 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
6860 r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6864 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
6866 ldout(cct
, 0) << "repair_olh failed to write olh attributes with "
6867 << cpp_strerror(r
) << dendl
;
6873 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
6876 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
6881 BucketShard
bs(this);
6883 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
6885 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
6889 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
6891 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
6893 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
6894 [&](BucketShard
*pbs
) -> int {
6895 ObjectWriteOperation op
;
6896 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6897 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
6898 return pbs
->bucket_obj
.operate(&op
, null_yield
);
6901 ldout(cct
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
6908 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
6911 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
6916 BucketShard
bs(this);
6918 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
6920 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
6922 int ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
6923 [&](BucketShard
*pbs
) -> int {
6924 ObjectWriteOperation op
;
6925 auto& ref
= pbs
->bucket_obj
.get_ref();
6926 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6927 cls_rgw_clear_olh(op
, key
, olh_tag
);
6928 return rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
6931 ldout(cct
, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
6938 static int decode_olh_info(CephContext
* cct
, const bufferlist
& bl
, RGWOLHInfo
*olh
)
6941 auto biter
= bl
.cbegin();
6942 decode(*olh
, biter
);
6944 } catch (buffer::error
& err
) {
6945 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
6950 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6951 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
6952 uint64_t *plast_ver
, rgw_zone_set
* zones_trace
)
6958 librados::ObjectWriteOperation op
;
6960 uint64_t last_ver
= log
.rbegin()->first
;
6961 *plast_ver
= last_ver
;
6963 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
6965 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
6966 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GTE
, last_ver
);
6969 string last_ver_s
= to_string(last_ver
);
6970 ver_bl
.append(last_ver_s
.c_str(), last_ver_s
.size());
6971 op
.setxattr(RGW_ATTR_OLH_VER
, ver_bl
);
6973 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
6974 op
.mtime2(&mtime_ts
);
6976 bool need_to_link
= false;
6977 uint64_t link_epoch
= 0;
6978 cls_rgw_obj_key key
;
6979 bool delete_marker
= false;
6980 list
<cls_rgw_obj_key
> remove_instances
;
6981 bool need_to_remove
= false;
6983 // decode current epoch and instance
6984 auto olh_ver
= state
.attrset
.find(RGW_ATTR_OLH_VER
);
6985 if (olh_ver
!= state
.attrset
.end()) {
6986 std::string str
= olh_ver
->second
.to_str();
6988 link_epoch
= strict_strtoll(str
.c_str(), 10, &err
);
6990 auto olh_info
= state
.attrset
.find(RGW_ATTR_OLH_INFO
);
6991 if (olh_info
!= state
.attrset
.end()) {
6993 int r
= decode_olh_info(cct
, olh_info
->second
, &info
);
6997 info
.target
.key
.get_index_key(&key
);
6998 delete_marker
= info
.removed
;
7001 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
7002 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
7003 for (; viter
!= iter
->second
.end(); ++viter
) {
7004 rgw_bucket_olh_log_entry
& entry
= *viter
;
7006 ldout(cct
, 20) << "olh_log_entry: epoch=" << iter
->first
<< " op=" << (int)entry
.op
7007 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
7008 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
7010 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
7011 remove_instances
.push_back(entry
.key
);
7013 case CLS_RGW_OLH_OP_LINK_OLH
:
7014 // only overwrite a link of the same epoch if its key sorts before
7015 if (link_epoch
< iter
->first
|| key
.instance
.empty() ||
7016 key
.instance
> entry
.key
.instance
) {
7017 ldout(cct
, 20) << "apply_olh_log applying key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7018 << " over current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7019 need_to_link
= true;
7020 need_to_remove
= false;
7022 delete_marker
= entry
.delete_marker
;
7024 ldout(cct
, 20) << "apply_olh skipping key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7025 << " before current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7028 case CLS_RGW_OLH_OP_UNLINK_OLH
:
7029 need_to_remove
= true;
7030 need_to_link
= false;
7033 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
7036 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7037 attr_name
.append(entry
.op_tag
);
7038 op
.rmxattr(attr_name
.c_str());
7043 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
7048 const rgw_bucket
& bucket
= obj
.bucket
;
7051 rgw_obj
target(bucket
, key
);
7053 info
.target
= target
;
7054 info
.removed
= delete_marker
;
7057 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7060 /* first remove object instances */
7061 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
7062 liter
!= remove_instances
.end(); ++liter
) {
7063 cls_rgw_obj_key
& key
= *liter
;
7064 rgw_obj
obj_instance(bucket
, key
);
7065 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
7066 if (ret
< 0 && ret
!= -ENOENT
) {
7067 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
7072 /* update olh object */
7073 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7074 if (r
== -ECANCELED
) {
7078 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7082 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
7084 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
7088 if (need_to_remove
) {
7089 ObjectWriteOperation rm_op
;
7091 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7092 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_EQ
, last_ver
);
7093 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
7096 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &rm_op
, null_yield
);
7097 if (r
== -ECANCELED
) {
7098 return 0; /* someone else won this race */
7101 * only clear if was successful, otherwise we might clobber pending operations on this object
7103 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
7105 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
7115 * read olh log and apply it
7117 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
7119 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
7121 uint64_t ver_marker
= 0;
7124 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
7128 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
7132 } while (is_truncated
);
7137 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
7138 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
,
7139 optional_yield y
, rgw_zone_set
*zones_trace
, bool log_data_change
)
7143 rgw_obj olh_obj
= target_obj
;
7144 olh_obj
.key
.instance
.clear();
7146 RGWObjState
*state
= NULL
;
7151 #define MAX_ECANCELED_RETRY 100
7152 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7153 if (ret
== -ECANCELED
) {
7154 obj_ctx
.invalidate(olh_obj
);
7157 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false, y
); /* don't follow olh */
7162 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7164 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7165 if (ret
== -ECANCELED
) {
7170 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
,
7171 op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
,
7172 zones_trace
, log_data_change
);
7174 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7175 if (ret
== -ECANCELED
) {
7176 // the bucket index rejected the link_olh() due to olh tag mismatch;
7177 // attempt to reconstruct olh head attributes based on the bucket index
7178 int r2
= repair_olh(state
, bucket_info
, olh_obj
);
7179 if (r2
< 0 && r2
!= -ECANCELED
) {
7189 if (i
== MAX_ECANCELED_RETRY
) {
7190 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7194 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7195 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7199 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7206 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
7207 uint64_t olh_epoch
, optional_yield y
, rgw_zone_set
*zones_trace
)
7211 rgw_obj olh_obj
= target_obj
;
7212 olh_obj
.key
.instance
.clear();
7214 RGWObjState
*state
= NULL
;
7219 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7220 if (ret
== -ECANCELED
) {
7221 obj_ctx
.invalidate(olh_obj
);
7224 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false, y
); /* don't follow olh */
7228 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7230 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7231 if (ret
== -ECANCELED
) {
7237 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
7239 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
7241 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7242 if (ret
== -ECANCELED
) {
7250 if (i
== MAX_ECANCELED_RETRY
) {
7251 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7255 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
7256 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7260 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7267 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key
*target_key
)
7269 #define OBJ_INSTANCE_LEN 32
7270 char buf
[OBJ_INSTANCE_LEN
+ 1];
7272 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
7273 no underscore for instance name due to the way we encode the raw keys */
7275 target_key
->set_instance(buf
);
7278 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
7280 gen_rand_obj_instance_name(&target_obj
->key
);
7283 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
7285 map
<string
, bufferlist
> attrset
;
7287 ObjectReadOperation op
;
7288 op
.getxattrs(&attrset
, NULL
);
7290 int r
= obj_operate(bucket_info
, obj
, &op
);
7295 auto iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
7296 if (iter
== attrset
.end()) { /* not an olh */
7300 return decode_olh_info(cct
, iter
->second
, olh
);
7303 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
7304 map
<string
, bufferlist
> *rm_pending_entries
)
7306 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
7308 real_time now
= real_clock::now();
7310 while (iter
!= pending_entries
.end()) {
7311 auto biter
= iter
->second
.cbegin();
7312 RGWOLHPendingInfo pending_info
;
7314 decode(pending_info
, biter
);
7315 } catch (buffer::error
& err
) {
7316 /* skipping bad entry, we could remove it but it might hide a bug */
7317 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
7322 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
7324 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
7325 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
7326 pending_entries
.erase(cur_iter
);
7328 /* entries names are sorted by time (rounded to a second) */
7334 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
7337 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
7342 // trim no more than 1000 entries per osd op
7343 constexpr int max_entries
= 1000;
7345 auto i
= pending_attrs
.begin();
7346 while (i
!= pending_attrs
.end()) {
7347 ObjectWriteOperation op
;
7348 bucket_index_guard_olh_op(state
, op
);
7350 for (int n
= 0; n
< max_entries
&& i
!= pending_attrs
.end(); ++n
, ++i
) {
7351 op
.rmxattr(i
->first
.c_str());
7354 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7355 if (r
== -ENOENT
|| r
== -ECANCELED
) {
7356 /* raced with some other change, shouldn't sweat about it */
7360 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7367 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
7369 map
<string
, bufferlist
> pending_entries
;
7370 rgw_filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
7372 map
<string
, bufferlist
> rm_pending_entries
;
7373 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
7375 if (!rm_pending_entries
.empty()) {
7376 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
7378 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
7382 if (!pending_entries
.empty()) {
7383 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
7385 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7391 auto iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
7392 if (iter
== state
->attrset
.end()) {
7397 int ret
= decode_olh_info(cct
, iter
->second
, &olh
);
7406 *target
= olh
.target
;
7411 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
7412 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
7413 RGWObjVersionTracker
*objv_tracker
, optional_yield y
)
7416 int r
= get_raw_obj_ref(obj
, &ref
);
7421 map
<string
, bufferlist
> unfiltered_attrset
;
7423 struct timespec mtime_ts
;
7425 ObjectReadOperation op
;
7427 objv_tracker
->prepare_op_for_read(&op
);
7430 op
.getxattrs(&unfiltered_attrset
, NULL
);
7432 if (psize
|| pmtime
) {
7433 op
.stat2(&size
, &mtime_ts
, NULL
);
7436 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
7439 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, &outbl
, null_yield
);
7442 *epoch
= ref
.pool
.ioctx().get_last_version();
7451 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
7453 rgw_filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
7459 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
7460 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
, bool *syncstopped
)
7462 vector
<rgw_bucket_dir_header
> headers
;
7463 map
<int, string
> bucket_instance_ids
;
7464 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
7469 ceph_assert(headers
.size() == bucket_instance_ids
.size());
7471 auto iter
= headers
.begin();
7472 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
7473 BucketIndexShardsManager ver_mgr
;
7474 BucketIndexShardsManager master_ver_mgr
;
7475 BucketIndexShardsManager marker_mgr
;
7477 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
7478 accumulate_raw_stats(*iter
, stats
);
7479 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->ver
);
7480 ver_mgr
.add(viter
->first
, string(buf
));
7481 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->master_ver
);
7482 master_ver_mgr
.add(viter
->first
, string(buf
));
7483 if (shard_id
>= 0) {
7484 *max_marker
= iter
->max_marker
;
7486 marker_mgr
.add(viter
->first
, iter
->max_marker
);
7488 if (syncstopped
!= NULL
)
7489 *syncstopped
= iter
->syncstopped
;
7491 ver_mgr
.to_string(bucket_ver
);
7492 master_ver_mgr
.to_string(master_ver
);
7494 marker_mgr
.to_string(max_marker
);
7499 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
7500 RGWGetBucketStats_CB
*cb
;
7502 map
<RGWObjCategory
, RGWStorageStats
> stats
;
7505 ceph::mutex lock
= ceph::make_mutex("RGWGetBucketStatsContext");
7508 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
7509 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true)
7512 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
7513 std::lock_guard l
{lock
};
7516 accumulate_raw_stats(header
, stats
);
7522 if (--pendings
== 0) {
7524 cb
->set_response(&stats
);
7526 cb
->handle_response(ret_code
);
7533 std::lock_guard l
{lock
};
7538 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
7541 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
? : 1);
7542 ceph_assert(get_ctx
);
7543 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
7547 get_ctx
->unset_cb();
7554 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
7555 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
, optional_yield y
)
7558 rgw_bucket_parse_bucket_key(cct
, meta_key
, &bucket
, nullptr);
7560 return get_bucket_instance_info(obj_ctx
, bucket
, info
, pmtime
, pattrs
, y
);
7563 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
7564 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
, optional_yield y
)
7566 RGWSI_MetaBackend_CtxParams bectx_params
= RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx
);
7567 return ctl
.bucket
->read_bucket_instance_info(bucket
, &info
,
7569 RGWBucketCtl::BucketInstance::GetParams()
7572 .set_bectx_params(bectx_params
));
7575 int RGWRados::get_bucket_info(RGWServices
*svc
,
7576 const string
& tenant
, const string
& bucket_name
,
7577 RGWBucketInfo
& info
,
7579 optional_yield y
, map
<string
, bufferlist
> *pattrs
)
7581 auto obj_ctx
= svc
->sysobj
->init_obj_ctx();
7582 RGWSI_MetaBackend_CtxParams bectx_params
= RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx
);
7584 bucket
.tenant
= tenant
;
7585 bucket
.name
= bucket_name
;
7586 return ctl
.bucket
->read_bucket_info(bucket
, &info
, y
,
7587 RGWBucketCtl::BucketInstance::GetParams()
7590 .set_bectx_params(bectx_params
));
7593 int RGWRados::try_refresh_bucket_info(RGWBucketInfo
& info
,
7594 ceph::real_time
*pmtime
,
7595 map
<string
, bufferlist
> *pattrs
)
7597 rgw_bucket bucket
= info
.bucket
;
7598 bucket
.bucket_id
.clear();
7600 auto rv
= info
.objv_tracker
.read_version
;
7602 return ctl
.bucket
->read_bucket_info(bucket
, &info
, null_yield
,
7603 RGWBucketCtl::BucketInstance::GetParams()
7606 .set_refresh_version(rv
));
7609 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
7610 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
7612 return ctl
.bucket
->store_bucket_instance_info(info
.bucket
, info
, null_yield
,
7613 RGWBucketCtl::BucketInstance::PutParams()
7614 .set_exclusive(exclusive
)
7616 .set_attrs(pattrs
));
7619 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
7620 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
7622 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
7624 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
7630 return 0; /* done! */
7632 RGWBucketEntryPoint entry_point
;
7633 entry_point
.bucket
= info
.bucket
;
7634 entry_point
.owner
= info
.owner
;
7635 entry_point
.creation_time
= info
.creation_time
;
7636 entry_point
.linked
= true;
7637 RGWObjVersionTracker ot
;
7638 if (pep_objv
&& !pep_objv
->tag
.empty()) {
7639 ot
.write_version
= *pep_objv
;
7641 ot
.generate_new_write_ver(cct
);
7643 *pep_objv
= ot
.write_version
;
7646 ret
= ctl
.bucket
->store_bucket_entrypoint_info(info
.bucket
, entry_point
, null_yield
, RGWBucketCtl::Bucket::PutParams()
7647 .set_exclusive(exclusive
)
7648 .set_objv_tracker(&ot
)
7656 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
7658 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
7660 map
<string
, RGWBucketEnt
>::iterator iter
;
7661 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
7662 RGWBucketEnt
& ent
= iter
->second
;
7663 rgw_bucket
& bucket
= ent
.bucket
;
7666 ent
.size_rounded
= 0;
7668 vector
<rgw_bucket_dir_header
> headers
;
7670 RGWBucketInfo bucket_info
;
7671 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
, null_yield
);
7676 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
7680 auto hiter
= headers
.begin();
7681 for (; hiter
!= headers
.end(); ++hiter
) {
7682 RGWObjCategory category
= main_category
;
7683 auto iter
= (hiter
->stats
).find(category
);
7684 if (iter
!= hiter
->stats
.end()) {
7685 struct rgw_bucket_category_stats
& stats
= iter
->second
;
7686 ent
.count
+= stats
.num_entries
;
7687 ent
.size
+= stats
.total_size
;
7688 ent
.size_rounded
+= stats
.total_size_rounded
;
7692 // fill in placement_rule from the bucket instance for use in swift's
7693 // per-storage policy statistics
7694 ent
.placement_rule
= std::move(bucket_info
.placement_rule
);
7700 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
7703 int r
= get_raw_obj_ref(obj
, &ref
);
7707 librados::Rados
*rad
= get_rados_handle();
7708 librados::AioCompletion
*completion
= rad
->aio_create_completion(nullptr, nullptr);
7710 r
= ref
.pool
.ioctx().aio_append(ref
.obj
.oid
, completion
, bl
, size
);
7711 completion
->release();
7715 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
7717 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
7718 librados::NObjectIterator
& iter
= ctx
.iter
;
7720 int r
= open_pool_ctx(pool
, io_ctx
, false);
7724 iter
= io_ctx
.nobjects_begin();
7729 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, const string
& cursor
, RGWPoolIterCtx
& ctx
)
7731 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
7732 librados::NObjectIterator
& iter
= ctx
.iter
;
7734 int r
= open_pool_ctx(pool
, io_ctx
, false);
7738 librados::ObjectCursor oc
;
7739 if (!oc
.from_str(cursor
)) {
7740 ldout(cct
, 10) << "failed to parse cursor: " << cursor
<< dendl
;
7745 iter
= io_ctx
.nobjects_begin(oc
);
7747 } catch (const std::system_error
& e
) {
7748 r
= -e
.code().value();
7749 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
7750 << ", returning " << r
<< dendl
;
7752 } catch (const std::exception
& e
) {
7753 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
7754 << ", returning -5" << dendl
;
7759 string
RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx
& ctx
)
7761 return ctx
.iter
.get_cursor().to_str();
7764 static int do_pool_iterate(CephContext
* cct
, RGWPoolIterCtx
& ctx
, uint32_t num
,
7765 vector
<rgw_bucket_dir_entry
>& objs
,
7766 bool *is_truncated
, RGWAccessListFilter
*filter
)
7768 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
7769 librados::NObjectIterator
& iter
= ctx
.iter
;
7771 if (iter
== io_ctx
.nobjects_end())
7776 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
7777 rgw_bucket_dir_entry e
;
7779 string oid
= iter
->get_oid();
7780 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
7782 // fill it in with initial values; we may correct later
7783 if (filter
&& !filter
->filter(oid
, oid
))
7791 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
7796 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
7797 bool *is_truncated
, RGWAccessListFilter
*filter
)
7799 // catch exceptions from NObjectIterator::operator++()
7801 return do_pool_iterate(cct
, ctx
, num
, objs
, is_truncated
, filter
);
7802 } catch (const std::system_error
& e
) {
7803 int r
= -e
.code().value();
7804 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
7805 << ", returning " << r
<< dendl
;
7807 } catch (const std::exception
& e
) {
7808 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
7809 << ", returning -5" << dendl
;
7814 int RGWRados::list_raw_objects_init(const rgw_pool
& pool
, const string
& marker
, RGWListRawObjsCtx
*ctx
)
7816 if (!ctx
->initialized
) {
7817 int r
= pool_iterate_begin(pool
, marker
, ctx
->iter_ctx
);
7819 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
7822 ctx
->initialized
= true;
7827 int RGWRados::list_raw_objects_next(const string
& prefix_filter
, int max
,
7828 RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
7831 if (!ctx
.initialized
) {
7834 RGWAccessListFilterPrefix
filter(prefix_filter
);
7835 vector
<rgw_bucket_dir_entry
> objs
;
7836 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
7839 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
7843 vector
<rgw_bucket_dir_entry
>::iterator iter
;
7844 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
7845 oids
.push_back(iter
->key
.name
);
7851 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
7852 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
7855 if (!ctx
.initialized
) {
7856 int r
= list_raw_objects_init(pool
, string(), &ctx
);
7862 return list_raw_objects_next(prefix_filter
, max
, ctx
, oids
, is_truncated
);
7865 string
RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx
& ctx
)
7867 return pool_iterate_get_cursor(ctx
.iter_ctx
);
7870 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
7871 rgw_bucket_dir_entry
*dirent
)
7873 rgw_cls_bi_entry bi_entry
;
7874 int r
= bi_get(bucket_info
, obj
, BIIndexType::Instance
, &bi_entry
);
7875 if (r
< 0 && r
!= -ENOENT
) {
7876 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
7881 auto iter
= bi_entry
.data
.cbegin();
7883 decode(*dirent
, iter
);
7884 } catch (buffer::error
& err
) {
7885 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
7892 int RGWRados::bi_get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
7893 rgw_bucket_olh_entry
*olh
)
7895 rgw_cls_bi_entry bi_entry
;
7896 int r
= bi_get(bucket_info
, obj
, BIIndexType::OLH
, &bi_entry
);
7897 if (r
< 0 && r
!= -ENOENT
) {
7898 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
7903 auto iter
= bi_entry
.data
.cbegin();
7906 } catch (buffer::error
& err
) {
7907 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
7914 int RGWRados::bi_get(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
7915 BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
7917 BucketShard
bs(this);
7918 int ret
= bs
.init(bucket_info
, obj
);
7920 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7924 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
7926 auto& ref
= bs
.bucket_obj
.get_ref();
7928 return cls_rgw_bi_get(ref
.pool
.ioctx(), ref
.obj
.oid
, index_type
, key
, entry
);
7931 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
7933 auto& ref
= bs
.bucket_obj
.get_ref();
7934 cls_rgw_bi_put(op
, ref
.obj
.oid
, entry
);
7937 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
7939 auto& ref
= bs
.bucket_obj
.get_ref();
7940 int ret
= cls_rgw_bi_put(ref
.pool
.ioctx(), ref
.obj
.oid
, entry
);
7947 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
7949 BucketShard
bs(this);
7950 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
7952 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7956 return bi_put(bs
, entry
);
7959 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
7961 rgw_obj
obj(bucket
, obj_name
);
7962 BucketShard
bs(this);
7963 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
7965 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7969 auto& ref
= bs
.bucket_obj
.get_ref();
7970 ret
= cls_rgw_bi_list(ref
.pool
.ioctx(), ref
.obj
.oid
, obj_name
, marker
, max
, entries
, is_truncated
);
7971 if (ret
== -ENOENT
) {
7972 *is_truncated
= false;
7980 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
7982 auto& ref
= bs
.bucket_obj
.get_ref();
7983 int ret
= cls_rgw_bi_list(ref
.pool
.ioctx(), ref
.obj
.oid
, filter_obj
, marker
, max
, entries
, is_truncated
);
7990 int RGWRados::bi_remove(BucketShard
& bs
)
7992 auto& ref
= bs
.bucket_obj
.get_ref();
7993 int ret
= ref
.pool
.ioctx().remove(ref
.obj
.oid
);
7994 if (ret
== -ENOENT
) {
7998 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
8005 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8007 BucketShard
bs(this);
8008 int ret
= bs
.init(bucket
, shard_id
, nullptr /* no RGWBucketInfo */);
8010 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8014 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
8017 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
8019 return rgw_rados_operate(gc_pool_ctx
, oid
, op
, null_yield
);
8022 int RGWRados::gc_aio_operate(const string
& oid
, librados::AioCompletion
*c
,
8023 librados::ObjectWriteOperation
*op
)
8025 return gc_pool_ctx
.aio_operate(oid
, c
, op
);
8028 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
8030 return rgw_rados_operate(gc_pool_ctx
, oid
, op
, pbl
, null_yield
);
8033 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
, bool& processing_queue
)
8035 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
, processing_queue
);
8038 int RGWRados::process_gc(bool expired_only
)
8040 return gc
->process(expired_only
);
8043 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
8045 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
8048 int RGWRados::process_lc()
8050 return lc
->process();
8053 bool RGWRados::process_expire_objects()
8055 return obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
8058 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
8059 rgw_obj
& obj
, uint16_t bilog_flags
, optional_yield y
, rgw_zone_set
*_zones_trace
)
8061 rgw_zone_set zones_trace
;
8063 zones_trace
= *_zones_trace
;
8065 zones_trace
.insert(svc
.zone
->get_zone().id
, bs
.bucket
.get_key());
8067 ObjectWriteOperation o
;
8068 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
8069 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
8070 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), svc
.zone
->get_zone().log_data
, bilog_flags
, zones_trace
);
8071 return bs
.bucket_obj
.operate(&o
, y
);
8074 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
8075 int64_t pool
, uint64_t epoch
,
8076 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
8077 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
8079 ObjectWriteOperation o
;
8080 rgw_bucket_dir_entry_meta dir_meta
;
8081 dir_meta
= ent
.meta
;
8082 dir_meta
.category
= category
;
8084 rgw_zone_set zones_trace
;
8086 zones_trace
= *_zones_trace
;
8088 zones_trace
.insert(svc
.zone
->get_zone().id
, bs
.bucket
.get_key());
8090 rgw_bucket_entry_ver ver
;
8093 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
8094 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
8095 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
8096 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
);
8097 complete_op_data
*arg
;
8098 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
8099 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
, &arg
);
8100 librados::AioCompletion
*completion
= arg
->rados_completion
;
8101 int ret
= bs
.bucket_obj
.aio_operate(arg
->rados_completion
, &o
);
8102 completion
->release(); /* can't reference arg here, as it might have already been released */
8106 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
8107 int64_t pool
, uint64_t epoch
,
8108 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
8109 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
8111 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
8114 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
8115 int64_t pool
, uint64_t epoch
,
8117 real_time
& removed_mtime
,
8118 list
<rgw_obj_index_key
> *remove_objs
,
8119 uint16_t bilog_flags
,
8120 rgw_zone_set
*zones_trace
)
8122 rgw_bucket_dir_entry ent
;
8123 ent
.meta
.mtime
= removed_mtime
;
8124 obj
.key
.get_index_key(&ent
.key
);
8125 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
,
8126 ent
, RGWObjCategory::None
, remove_objs
,
8127 bilog_flags
, zones_trace
);
8130 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
8132 rgw_bucket_dir_entry ent
;
8133 obj
.key
.get_index_key(&ent
.key
);
8134 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
,
8135 -1 /* pool id */, 0, ent
,
8136 RGWObjCategory::None
, NULL
, bilog_flags
,
8140 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
8142 RGWSI_RADOS::Pool index_pool
;
8143 map
<int, string
> bucket_objs
;
8144 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
8148 return CLSRGWIssueSetTagTimeout(index_pool
.ioctx(), bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
8152 uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries
,
8153 uint32_t num_shards
)
8155 // We want to minimize the chances that when num_shards >>
8156 // num_entries that we return much fewer than num_entries to the
8157 // client. Given all the overhead of making a cls call to the osd,
8158 // returning a few entries is not much more work than returning one
8159 // entry. This minimum might be better tuned based on future
8160 // experiments where num_shards >> num_entries. (Note: ">>" should
8161 // be interpreted as "much greater than".)
8162 constexpr uint32_t min_read
= 8;
8164 // The following is based on _"Balls into Bins" -- A Simple and
8165 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8166 // cases when num_shards >> num_entries (it almost serves as a
8167 // ceiling calculation). We also assume alpha is 1.0 and extract it
8168 // from the calculation. Future work could involve memoizing some of
8169 // the transcendental functions to minimize repeatedly re-calling
8170 // them with the same parameters, which we expect to be the case the
8171 // majority of the time.
8172 uint32_t calc_read
=
8174 static_cast<uint32_t>((num_entries
/ num_shards
) +
8175 sqrt((2 * num_entries
) *
8176 log(num_shards
) / num_shards
));
8178 return std::max(min_read
, calc_read
);
8182 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo
& bucket_info
,
8184 const rgw_obj_index_key
& start_after
,
8185 const string
& prefix
,
8186 const string
& delimiter
,
8187 const uint32_t num_entries
,
8188 const bool list_versions
,
8189 const uint16_t expansion_factor
,
8193 rgw_obj_index_key
*last_entry
,
8195 check_filter_t force_check_filter
)
8197 /* expansion_factor allows the number of entries to read to grow
8198 * exponentially; this is used when earlier reads are producing too
8199 * few results, perhaps due to filtering or to a series of
8200 * namespaced entries */
8202 ldout(cct
, 10) << "RGWRados::" << __func__
<< ": " << bucket_info
.bucket
<<
8203 " start_after=\"" << start_after
.name
<<
8204 "[" << start_after
.instance
<<
8205 "]\", prefix=\"" << prefix
<<
8206 "\" num_entries=" << num_entries
<<
8207 ", list_versions=" << list_versions
<<
8208 ", expansion_factor=" << expansion_factor
<< dendl
;
8212 RGWSI_RADOS::Pool index_pool
;
8213 // key - oid (for different shards if there is any)
8214 // value - list result for the corresponding oid (shard), it is filled by
8216 map
<int, string
> shard_oids
;
8217 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, shard_id
,
8218 &index_pool
, &shard_oids
,
8224 const uint32_t shard_count
= shard_oids
.size();
8225 uint32_t num_entries_per_shard
;
8226 if (expansion_factor
== 0) {
8227 num_entries_per_shard
=
8228 calc_ordered_bucket_list_per_shard(num_entries
, shard_count
);
8229 } else if (expansion_factor
<= 11) {
8230 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8231 num_entries_per_shard
=
8232 std::min(num_entries
,
8233 (uint32_t(1 << (expansion_factor
- 1)) *
8234 calc_ordered_bucket_list_per_shard(num_entries
, shard_count
)));
8236 num_entries_per_shard
= num_entries
;
8239 ldout(cct
, 10) << "RGWRados::" << __func__
<<
8240 " request from each of " << shard_count
<<
8241 " shard(s) for " << num_entries_per_shard
<< " entries to get " <<
8242 num_entries
<< " total entries" << dendl
;
8244 auto& ioctx
= index_pool
.ioctx();
8245 map
<int, rgw_cls_list_ret
> shard_list_results
;
8246 cls_rgw_obj_key
start_after_key(start_after
.name
, start_after
.instance
);
8247 r
= CLSRGWIssueBucketList(ioctx
, start_after_key
, prefix
, delimiter
,
8248 num_entries_per_shard
,
8249 list_versions
, shard_oids
, shard_list_results
,
8250 cct
->_conf
->rgw_bucket_index_max_aio
)();
8255 // to manage the iterators through each shard's list results
8256 struct ShardTracker
{
8257 const size_t shard_idx
;
8258 rgw_cls_list_ret
& result
;
8259 const std::string
& oid_name
;
8260 RGWRados::ent_map_t::iterator cursor
;
8261 RGWRados::ent_map_t::iterator end
;
8263 // manages an iterator through a shard and provides other
8265 ShardTracker(size_t _shard_idx
,
8266 rgw_cls_list_ret
& _result
,
8267 const std::string
& _oid_name
):
8268 shard_idx(_shard_idx
),
8270 oid_name(_oid_name
),
8271 cursor(_result
.dir
.m
.begin()),
8272 end(_result
.dir
.m
.end())
8275 inline const std::string
& entry_name() const {
8276 return cursor
->first
;
8278 rgw_bucket_dir_entry
& dir_entry() const {
8279 return cursor
->second
;
8281 inline bool is_truncated() const {
8282 return result
.is_truncated
;
8284 inline ShardTracker
& advance() {
8286 // return a self-reference to allow for chaining of calls, such
8287 // as x.advance().at_end()
8290 inline bool at_end() const {
8291 return cursor
== end
;
8295 // add the next unique candidate, or return false if we reach the end
8296 auto next_candidate
= [] (ShardTracker
& t
,
8297 std::map
<std::string
, size_t>& candidates
,
8298 size_t tracker_idx
) {
8299 while (!t
.at_end()) {
8300 if (candidates
.emplace(t
.entry_name(), tracker_idx
).second
) {
8303 t
.advance(); // skip duplicate common prefixes
8307 // one tracker per shard requested (may not be all shards)
8308 std::vector
<ShardTracker
> results_trackers
;
8309 results_trackers
.reserve(shard_list_results
.size());
8310 for (auto& r
: shard_list_results
) {
8311 results_trackers
.emplace_back(r
.first
, r
.second
, shard_oids
[r
.first
]);
8313 // if any *one* shard's result is trucated, the entire result is
8315 *is_truncated
= *is_truncated
|| r
.second
.is_truncated
;
8317 // unless *all* are shards are cls_filtered, the entire result is
8319 *cls_filtered
= *cls_filtered
&& r
.second
.cls_filtered
;
8322 // create a map to track the next candidate entry from ShardTracker
8323 // (key=candidate, value=index into results_trackers); as we consume
8324 // entries from shards, we replace them with the next entries in the
8325 // shards until we run out
8326 map
<string
, size_t> candidates
;
8327 size_t tracker_idx
= 0;
8328 for (auto& t
: results_trackers
) {
8329 // it's important that the values in the map refer to the index
8330 // into the results_trackers vector, which may not be the same
8331 // as the shard number (i.e., when not all shards are requested)
8332 next_candidate(t
, candidates
, tracker_idx
);
8336 rgw_bucket_dir_entry
*
8337 last_entry_visited
= nullptr; // to set last_entry (marker)
8338 map
<string
, bufferlist
> updates
;
8340 while (count
< num_entries
&& !candidates
.empty()) {
8342 // select the next entry in lexical order (first key in map);
8343 // again tracker_idx is not necessarily shard number, but is index
8344 // into results_trackers vector
8345 tracker_idx
= candidates
.begin()->second
;
8346 auto& tracker
= results_trackers
.at(tracker_idx
);
8347 last_entry_visited
= &tracker
.dir_entry();
8348 const string
& name
= tracker
.entry_name();
8349 rgw_bucket_dir_entry
& dirent
= tracker
.dir_entry();
8351 ldout(cct
, 20) << "RGWRados::" << __func__
<< " currently processing " <<
8352 dirent
.key
<< " from shard " << tracker
.shard_idx
<< dendl
;
8354 const bool force_check
=
8355 force_check_filter
&& force_check_filter(dirent
.key
.name
);
8357 if ((!dirent
.exists
&&
8358 !dirent
.is_delete_marker() &&
8359 !dirent
.is_common_prefix()) ||
8360 !dirent
.pending_map
.empty() ||
8362 /* there are uncommitted ops. We need to check the current
8363 * state, and if the tags are old we need to do clean-up as
8365 librados::IoCtx sub_ctx
;
8367 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
,
8368 updates
[tracker
.oid_name
], y
);
8369 if (r
< 0 && r
!= -ENOENT
) {
8377 ldout(cct
, 10) << "RGWRados::" << __func__
<< ": got " <<
8378 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
8379 m
[name
] = std::move(dirent
);
8382 ldout(cct
, 10) << "RGWRados::" << __func__
<< ": skipping " <<
8383 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
8386 // refresh the candidates map
8387 candidates
.erase(candidates
.begin());
8390 next_candidate(tracker
, candidates
, tracker_idx
);
8392 if (tracker
.at_end() && tracker
.is_truncated()) {
8393 // once we exhaust one shard that is truncated, we need to stop,
8394 // as we cannot be certain that one of the next entries needs to
8395 // come from that shard; S3 and swift protocols allow returning
8396 // fewer than what was requested
8399 } // while we haven't provided requested # of result entries
8401 // suggest updates if there are any
8402 for (auto& miter
: updates
) {
8403 if (miter
.second
.length()) {
8404 ObjectWriteOperation o
;
8405 cls_rgw_suggest_changes(o
, miter
.second
);
8406 // we don't care if we lose suggested updates, send them off blindly
8408 librados::Rados::aio_create_completion(nullptr, nullptr);
8409 ioctx
.aio_operate(miter
.first
, c
, &o
);
8414 // determine truncation by checking if all the returned entries are
8416 *is_truncated
= false;
8417 for (const auto& t
: results_trackers
) {
8418 if (!t
.at_end() || t
.is_truncated()) {
8419 *is_truncated
= true;
8424 ldout(cct
, 20) << "RGWRados::" << __func__
<<
8425 ": returning, count=" << count
<< ", is_truncated=" << *is_truncated
<<
8428 if (*is_truncated
&& count
< num_entries
) {
8429 ldout(cct
, 10) << "RGWRados::" << __func__
<<
8430 ": INFO requested " << num_entries
<< " entries but returning " <<
8431 count
<< ", which is truncated" << dendl
;
8434 if (last_entry_visited
!= nullptr && last_entry
) {
8435 // since we'll not need this any more, might as well move it...
8436 *last_entry
= std::move(last_entry_visited
->key
);
8437 ldout(cct
, 20) << "RGWRados::" << __func__
<<
8438 ": returning, last_entry=" << *last_entry
<< dendl
;
8440 ldout(cct
, 20) << "RGWRados::" << __func__
<<
8441 ": returning, last_entry NOT SET" << dendl
;
8448 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo
& bucket_info
,
8450 const rgw_obj_index_key
& start_after
,
8451 const string
& prefix
,
8452 uint32_t num_entries
,
8454 std::vector
<rgw_bucket_dir_entry
>& ent_list
,
8456 rgw_obj_index_key
*last_entry
,
8458 check_filter_t force_check_filter
) {
8459 ldout(cct
, 10) << "cls_bucket_list_unordered " << bucket_info
.bucket
<<
8460 " start_after " << start_after
.name
<< "[" << start_after
.instance
<<
8461 "] num_entries " << num_entries
<< dendl
;
8464 static MultipartMetaFilter multipart_meta_filter
;
8466 *is_truncated
= false;
8467 RGWSI_RADOS::Pool index_pool
;
8469 map
<int, string
> oids
;
8470 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, shard_id
, &index_pool
, &oids
, nullptr);
8474 auto& ioctx
= index_pool
.ioctx();
8476 const uint32_t num_shards
= oids
.size();
8478 rgw_obj_index_key marker
= start_after
;
8479 uint32_t current_shard
;
8480 if (shard_id
>= 0) {
8481 current_shard
= shard_id
;
8482 } else if (start_after
.empty()) {
8485 // at this point we have a marker (start_after) that has something
8486 // in it, so we need to get to the bucket shard index, so we can
8487 // start reading from there
8490 // test whether object name is a multipart meta name
8491 if(! multipart_meta_filter
.filter(start_after
.name
, key
)) {
8492 // if multipart_meta_filter fails, must be "regular" (i.e.,
8493 // unadorned) and the name is the key
8494 key
= start_after
.name
;
8497 // now convert the key (oid) to an rgw_obj_key since that will
8498 // separate out the namespace, name, and instance
8499 rgw_obj_key obj_key
;
8500 bool parsed
= rgw_obj_key::parse_raw_oid(key
, &obj_key
);
8503 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
8504 "start marker: '" << start_after
<< "'" << dendl
;
8506 } else if (obj_key
.name
.empty()) {
8507 // if the name is empty that means the object name came in with
8508 // a namespace only, and therefore we need to start our scan at
8509 // the first bucket index shard
8512 // so now we have the key used to compute the bucket index shard
8513 // and can extract the specific shard from it
8514 current_shard
= svc
.bi_rados
->bucket_shard_index(obj_key
.name
, num_shards
);
8518 uint32_t count
= 0u;
8519 map
<string
, bufferlist
> updates
;
8520 rgw_obj_index_key last_added_entry
;
8521 while (count
<= num_entries
&&
8522 ((shard_id
>= 0 && current_shard
== uint32_t(shard_id
)) ||
8523 current_shard
< num_shards
)) {
8524 const std::string
& oid
= oids
[current_shard
];
8525 rgw_cls_list_ret result
;
8527 librados::ObjectReadOperation op
;
8528 string empty_delimiter
;
8529 cls_rgw_bucket_list_op(op
, marker
, prefix
, empty_delimiter
,
8531 list_versions
, &result
);
8532 r
= rgw_rados_operate(ioctx
, oid
, &op
, nullptr, null_yield
);
8536 for (auto& entry
: result
.dir
.m
) {
8537 rgw_bucket_dir_entry
& dirent
= entry
.second
;
8539 bool force_check
= force_check_filter
&&
8540 force_check_filter(dirent
.key
.name
);
8541 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
8542 !dirent
.pending_map
.empty() ||
8544 /* there are uncommitted ops. We need to check the current state,
8545 * and if the tags are old we need to do cleanup as well. */
8546 librados::IoCtx sub_ctx
;
8548 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[oid
], y
);
8549 if (r
< 0 && r
!= -ENOENT
) {
8556 // at this point either r >=0 or r == -ENOENT
8557 if (r
>= 0) { // i.e., if r != -ENOENT
8558 ldout(cct
, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
8559 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
8561 if (count
< num_entries
) {
8562 marker
= last_added_entry
= dirent
.key
; // double assign
8563 ent_list
.emplace_back(std::move(dirent
));
8566 *is_truncated
= true;
8569 } else { // r == -ENOENT
8570 // in the case of -ENOENT, make sure we're advancing marker
8571 // for possible next call to CLSRGWIssueBucketList
8572 marker
= dirent
.key
;
8576 if (!result
.is_truncated
) {
8577 // if we reached the end of the shard read next shard
8579 marker
= rgw_obj_index_key();
8585 // suggest updates if there is any
8586 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
8587 for (; miter
!= updates
.end(); ++miter
) {
8588 if (miter
->second
.length()) {
8589 ObjectWriteOperation o
;
8590 cls_rgw_suggest_changes(o
, miter
->second
);
8591 // we don't care if we lose suggested updates, send them off blindly
8592 AioCompletion
*c
= librados::Rados::aio_create_completion(nullptr, nullptr);
8593 ioctx
.aio_operate(miter
->first
, c
, &o
);
8598 if (last_entry
&& !ent_list
.empty()) {
8599 *last_entry
= last_added_entry
;
8603 } // RGWRados::cls_bucket_list_unordered
8606 int RGWRados::cls_obj_usage_log_add(const string
& oid
,
8607 rgw_usage_log_info
& info
)
8609 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
8612 int r
= get_raw_obj_ref(obj
, &ref
);
8617 ObjectWriteOperation op
;
8618 cls_rgw_usage_log_add(op
, info
);
8620 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
8624 int RGWRados::cls_obj_usage_log_read(const string
& oid
, const string
& user
, const string
& bucket
,
8625 uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
8626 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
,
8629 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
8632 int r
= get_raw_obj_ref(obj
, &ref
);
8637 *is_truncated
= false;
8639 r
= cls_rgw_usage_log_read(ref
.pool
.ioctx(), ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
,
8640 max_entries
, read_iter
, usage
, is_truncated
);
8645 static int cls_rgw_usage_log_trim_repeat(rgw_rados_ref ref
, const string
& user
, const string
& bucket
, uint64_t start_epoch
, uint64_t end_epoch
)
8649 librados::ObjectWriteOperation op
;
8650 cls_rgw_usage_log_trim(op
, user
, bucket
, start_epoch
, end_epoch
);
8651 int r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
8661 int RGWRados::cls_obj_usage_log_trim(const string
& oid
, const string
& user
, const string
& bucket
,
8662 uint64_t start_epoch
, uint64_t end_epoch
)
8664 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
8667 int r
= get_raw_obj_ref(obj
, &ref
);
8672 r
= cls_rgw_usage_log_trim_repeat(ref
, user
, bucket
, start_epoch
, end_epoch
);
8676 int RGWRados::cls_obj_usage_log_clear(string
& oid
)
8678 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
8681 int r
= get_raw_obj_ref(obj
, &ref
);
8685 librados::ObjectWriteOperation op
;
8686 cls_rgw_usage_log_clear(op
);
8687 r
= rgw_rados_operate(ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
8692 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
8694 RGWSI_RADOS::Pool index_pool
;
8697 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
8699 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, &index_pool
, &dir_oid
);
8705 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
8706 rgw_bucket_dir_entry entry
;
8708 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
8709 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
8710 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
8711 encode(entry
, updates
);
8716 r
= index_pool
.ioctx().exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
8721 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
8722 const RGWBucketInfo
& bucket_info
,
8723 rgw_bucket_dir_entry
& list_state
,
8724 rgw_bucket_dir_entry
& object
,
8725 bufferlist
& suggested_updates
,
8728 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8729 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
8733 rgw_obj
obj(bucket
, list_state
.key
);
8736 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
8738 if (loc
!= list_state
.locator
) {
8739 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
8742 io_ctx
.locator_set_key(list_state
.locator
);
8744 RGWObjState
*astate
= NULL
;
8745 RGWObjectCtx
rctx(this->store
);
8746 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false, y
);
8750 list_state
.pending_map
.clear(); // we don't need this and it inflates size
8751 if (!list_state
.is_delete_marker() && !astate
->exists
) {
8752 /* object doesn't exist right now -- hopefully because it's
8753 * marked as !exists and got deleted */
8754 if (list_state
.exists
) {
8755 /* FIXME: what should happen now? Work out if there are any
8756 * non-bad ways this could happen (there probably are, but annoying
8759 // encode a suggested removal of that key
8760 list_state
.ver
.epoch
= io_ctx
.get_last_version();
8761 list_state
.ver
.pool
= io_ctx
.get_id();
8762 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
8767 string content_type
;
8770 object
.meta
.size
= astate
->size
;
8771 object
.meta
.accounted_size
= astate
->accounted_size
;
8772 object
.meta
.mtime
= astate
->mtime
;
8774 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
8775 if (iter
!= astate
->attrset
.end()) {
8776 etag
= rgw_bl_str(iter
->second
);
8778 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
8779 if (iter
!= astate
->attrset
.end()) {
8780 content_type
= rgw_bl_str(iter
->second
);
8782 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
8783 if (iter
!= astate
->attrset
.end()) {
8784 r
= decode_policy(iter
->second
, &owner
);
8786 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
8790 if (astate
->manifest
) {
8791 RGWObjManifest::obj_iterator miter
;
8792 RGWObjManifest
& manifest
= *astate
->manifest
;
8793 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
8794 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
8796 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
8798 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
8799 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
8800 r
= delete_obj_index(loc
, astate
->mtime
);
8802 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
8808 object
.meta
.etag
= etag
;
8809 object
.meta
.content_type
= content_type
;
8810 object
.meta
.owner
= owner
.get_id().to_str();
8811 object
.meta
.owner_display_name
= owner
.get_display_name();
8813 // encode suggested updates
8814 list_state
.ver
.pool
= io_ctx
.get_id();
8815 list_state
.ver
.epoch
= astate
->epoch
;
8816 list_state
.meta
.size
= object
.meta
.size
;
8817 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
8818 list_state
.meta
.mtime
= object
.meta
.mtime
;
8819 list_state
.meta
.category
= main_category
;
8820 list_state
.meta
.etag
= etag
;
8821 list_state
.meta
.content_type
= content_type
;
8822 if (astate
->obj_tag
.length() > 0)
8823 list_state
.tag
= astate
->obj_tag
.c_str();
8824 list_state
.meta
.owner
= owner
.get_id().to_str();
8825 list_state
.meta
.owner_display_name
= owner
.get_display_name();
8827 list_state
.exists
= true;
8828 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
8832 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, vector
<rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
8834 RGWSI_RADOS::Pool index_pool
;
8835 map
<int, string
> oids
;
8836 map
<int, struct rgw_cls_list_ret
> list_results
;
8837 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, shard_id
, &index_pool
, &oids
, bucket_instance_ids
);
8839 ldout(cct
, 20) << "cls_bucket_head: open_bucket_index() returned "
8844 r
= CLSRGWIssueGetDirHeader(index_pool
.ioctx(), oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8846 ldout(cct
, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
8851 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
8852 for(; iter
!= list_results
.end(); ++iter
) {
8853 headers
.push_back(std::move(iter
->second
.dir
.header
));
8858 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
8860 RGWSI_RADOS::Pool index_pool
;
8861 map
<int, string
> bucket_objs
;
8862 int r
= svc
.bi_rados
->open_bucket_index(bucket_info
, shard_id
, &index_pool
, &bucket_objs
, nullptr);
8866 map
<int, string
>::iterator iter
= bucket_objs
.begin();
8867 for (; iter
!= bucket_objs
.end(); ++iter
) {
8868 r
= cls_rgw_get_dir_header_async(index_pool
.ioctx(), iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
8879 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
,
8880 const rgw_bucket
& bucket
,
8883 if (! cct
->_conf
.get_val
<bool>("rgw_dynamic_resharding")) {
8887 bool need_resharding
= false;
8888 uint32_t num_source_shards
=
8889 (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
8890 const uint32_t max_dynamic_shards
=
8891 uint32_t(cct
->_conf
.get_val
<uint64_t>("rgw_max_dynamic_shards"));
8893 if (num_source_shards
>= max_dynamic_shards
) {
8897 uint32_t suggested_num_shards
= 0;
8898 const uint64_t max_objs_per_shard
=
8899 cct
->_conf
.get_val
<uint64_t>("rgw_max_objs_per_shard");
8901 quota_handler
->check_bucket_shards(max_objs_per_shard
, num_source_shards
,
8902 num_objs
, need_resharding
, &suggested_num_shards
);
8903 if (! need_resharding
) {
8907 const uint32_t final_num_shards
=
8908 RGWBucketReshard::get_preferred_shards(suggested_num_shards
,
8909 max_dynamic_shards
);
8910 // final verification, so we don't reduce number of shards
8911 if (final_num_shards
<= num_source_shards
) {
8915 ldout(cct
, 20) << "RGWRados::" << __func__
<< " bucket " << bucket
.name
<<
8916 " needs resharding; current num shards " << bucket_info
.num_shards
<<
8917 "; new num shards " << final_num_shards
<< " (suggested " <<
8918 suggested_num_shards
<< ")" << dendl
;
8920 return add_bucket_to_reshard(bucket_info
, final_num_shards
);
8923 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo
& bucket_info
, uint32_t new_num_shards
)
8925 RGWReshard
reshard(this->store
);
8927 uint32_t num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
8929 new_num_shards
= std::min(new_num_shards
, get_max_bucket_shards());
8930 if (new_num_shards
<= num_source_shards
) {
8931 ldout(cct
, 20) << "not resharding bucket name=" << bucket_info
.bucket
.name
<< ", orig_num=" << num_source_shards
<< ", new_num_shards=" << new_num_shards
<< dendl
;
8935 cls_rgw_reshard_entry entry
;
8936 entry
.time
= real_clock::now();
8937 entry
.tenant
= bucket_info
.owner
.tenant
;
8938 entry
.bucket_name
= bucket_info
.bucket
.name
;
8939 entry
.bucket_id
= bucket_info
.bucket
.bucket_id
;
8940 entry
.old_num_shards
= num_source_shards
;
8941 entry
.new_num_shards
= new_num_shards
;
8943 return reshard
.add(entry
);
8946 int RGWRados::check_quota(const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
8947 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
, uint64_t obj_size
, bool check_size_only
)
8949 // if we only check size, then num_objs will set to 0
8951 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 0, obj_size
);
8953 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
);
8956 int RGWRados::get_target_shard_id(const RGWBucketInfo
& bucket_info
, const string
& obj_key
,
8960 switch (bucket_info
.bucket_index_shard_hash_type
) {
8961 case RGWBucketInfo::MOD
:
8962 if (!bucket_info
.num_shards
) {
8967 uint32_t sid
= svc
.bi_rados
->bucket_shard_index(obj_key
, bucket_info
.num_shards
);
8969 *shard_id
= (int)sid
;
8979 uint64_t RGWRados::instance_id()
8981 return get_rados_handle()->get_instance_id();
8984 uint64_t RGWRados::next_bucket_id()
8986 std::lock_guard l
{bucket_id_lock
};
8987 return ++max_bucket_id
;
8990 librados::Rados
* RGWRados::get_rados_handle()
8995 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
8998 int ret
= get_raw_obj_ref(obj
, &ref
);
9000 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
9004 ObjectWriteOperation op
;
9005 list
<string
> prefixes
;
9006 cls_rgw_remove_obj(op
, prefixes
);
9008 AioCompletion
*c
= librados::Rados::aio_create_completion(nullptr, nullptr);
9009 ret
= ref
.pool
.ioctx().aio_operate(ref
.obj
.oid
, c
, &op
);
9011 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
9016 handles
.push_back(c
);
9021 int RGWRados::delete_obj_aio(const rgw_obj
& obj
,
9022 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
9023 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
,
9027 int ret
= get_obj_head_ref(bucket_info
, obj
, &ref
);
9029 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
9033 if (keep_index_consistent
) {
9034 RGWRados::Bucket
bop(this, bucket_info
);
9035 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9037 ret
= index_op
.prepare(CLS_RGW_OP_DEL
, &astate
->write_tag
, y
);
9039 lderr(cct
) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
9044 ObjectWriteOperation op
;
9045 list
<string
> prefixes
;
9046 cls_rgw_remove_obj(op
, prefixes
);
9048 AioCompletion
*c
= librados::Rados::aio_create_completion(nullptr, nullptr);
9049 ret
= ref
.pool
.ioctx().aio_operate(ref
.obj
.oid
, c
, &op
);
9051 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
9056 handles
.push_back(c
);
9058 if (keep_index_consistent
) {
9059 ret
= delete_obj_index(obj
, astate
->mtime
);
9061 lderr(cct
) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;