1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab ft=cpp
4 #include "include/compat.h"
10 #include <boost/algorithm/string.hpp>
11 #include <string_view>
13 #include <boost/container/flat_set.hpp>
14 #include <boost/format.hpp>
15 #include <boost/optional.hpp>
16 #include <boost/utility/in_place_factory.hpp>
18 #include "common/ceph_json.h"
20 #include "common/errno.h"
21 #include "common/Formatter.h"
22 #include "common/Throttle.h"
26 #include "rgw_cache.h"
28 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
29 #include "rgw_aio_throttle.h"
30 #include "rgw_bucket.h"
31 #include "rgw_rest_conn.h"
32 #include "rgw_cr_rados.h"
33 #include "rgw_cr_rest.h"
34 #include "rgw_datalog.h"
35 #include "rgw_putobj_processor.h"
37 #include "cls/rgw/cls_rgw_ops.h"
38 #include "cls/rgw/cls_rgw_client.h"
39 #include "cls/rgw/cls_rgw_const.h"
40 #include "cls/refcount/cls_refcount_client.h"
41 #include "cls/version/cls_version_client.h"
42 #include "osd/osd_types.h"
44 #include "rgw_tools.h"
45 #include "rgw_coroutine.h"
46 #include "rgw_compression.h"
47 #include "rgw_etag_verifier.h"
48 #include "rgw_worker.h"
49 #include "rgw_notify.h"
51 #undef fork // fails to compile RGWPeriod::fork() below
53 #include "common/Clock.h"
55 using namespace librados
;
63 #include "include/random.h"
68 #include "rgw_object_expirer_core.h"
70 #include "rgw_sync_counters.h"
71 #include "rgw_sync_trace.h"
72 #include "rgw_trim_datalog.h"
73 #include "rgw_trim_mdlog.h"
74 #include "rgw_data_sync.h"
75 #include "rgw_realm_watcher.h"
76 #include "rgw_reshard.h"
78 #include "services/svc_zone.h"
79 #include "services/svc_zone_utils.h"
80 #include "services/svc_quota.h"
81 #include "services/svc_sync_modules.h"
82 #include "services/svc_sys_obj.h"
83 #include "services/svc_sys_obj_cache.h"
84 #include "services/svc_bucket.h"
85 #include "services/svc_mdlog.h"
87 #include "compressor/Compressor.h"
89 #include "rgw_d3n_datacache.h"
92 #define TRACEPOINT_DEFINE
93 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
94 #include "tracing/rgw_rados.h"
95 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
96 #undef TRACEPOINT_DEFINE
98 #define tracepoint(...)
101 #define dout_context g_ceph_context
102 #define dout_subsys ceph_subsys_rgw
105 static string shadow_ns
= "shadow";
106 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
107 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
109 static RGWObjCategory main_category
= RGWObjCategory::Main
;
110 #define RGW_USAGE_OBJ_PREFIX "usage."
113 // returns true on success, false on failure
114 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
115 const rgw_placement_rule
& head_placement_rule
,
116 const rgw_obj
& obj
, rgw_pool
*pool
)
118 if (!zone_params
.get_head_data_pool(head_placement_rule
, obj
, pool
)) {
119 RGWZonePlacementInfo placement
;
120 if (!zone_params
.get_placement(zonegroup
.default_placement
.name
, &placement
)) {
124 if (!obj
.in_extra_data
) {
125 *pool
= placement
.get_data_pool(zonegroup
.default_placement
.storage_class
);
127 *pool
= placement
.get_data_extra_pool();
134 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
135 const rgw_placement_rule
& head_placement_rule
,
136 const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
138 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
140 return rgw_get_obj_data_pool(zonegroup
, zone_params
, head_placement_rule
, obj
, &raw_obj
->pool
);
143 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
147 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
153 rgw_raw_obj
rgw_obj_select::get_raw_obj(rgw::sal::RadosStore
* store
) const
157 store
->get_raw_obj(placement_rule
, obj
, &r
);
163 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
165 obj_version
*check_objv
= version_for_check();
168 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
171 cls_version_read(*op
, &read_version
);
174 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
176 obj_version
*check_objv
= version_for_check();
177 obj_version
*modify_version
= version_for_write();
180 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
183 if (modify_version
) {
184 cls_version_set(*op
, *modify_version
);
186 cls_version_inc(*op
);
190 void RGWObjVersionTracker::apply_write()
192 const bool checked
= (read_version
.ver
!= 0);
193 const bool incremented
= (write_version
.ver
== 0);
195 if (checked
&& incremented
) {
196 // apply cls_version_inc() so our next operation can recheck it
199 read_version
= write_version
;
201 write_version
= obj_version();
204 RGWObjState::RGWObjState() {
207 RGWObjState::~RGWObjState() {
210 RGWObjState::RGWObjState(const RGWObjState
& rhs
) : obj (rhs
.obj
) {
211 is_atomic
= rhs
.is_atomic
;
212 has_attrs
= rhs
.has_attrs
;
215 accounted_size
= rhs
.accounted_size
;
218 if (rhs
.obj_tag
.length()) {
219 obj_tag
= rhs
.obj_tag
;
221 if (rhs
.tail_tag
.length()) {
222 tail_tag
= rhs
.tail_tag
;
224 write_tag
= rhs
.write_tag
;
225 fake_tag
= rhs
.fake_tag
;
226 manifest
= rhs
.manifest
;
227 shadow_obj
= rhs
.shadow_obj
;
228 has_data
= rhs
.has_data
;
229 if (rhs
.data
.length()) {
232 prefetch_data
= rhs
.prefetch_data
;
233 keep_tail
= rhs
.keep_tail
;
235 objv_tracker
= rhs
.objv_tracker
;
237 compressed
= rhs
.compressed
;
240 RGWObjState
*RGWObjectCtx::get_state(const rgw_obj
& obj
) {
242 typename
std::map
<rgw_obj
, RGWObjState
>::iterator iter
;
244 assert (!obj
.empty());
245 iter
= objs_state
.find(obj
);
246 if (iter
!= objs_state
.end()) {
247 result
= &iter
->second
;
248 lock
.unlock_shared();
250 lock
.unlock_shared();
252 result
= &objs_state
[obj
];
258 void RGWObjectCtx::set_compressed(const rgw_obj
& obj
) {
259 std::unique_lock wl
{lock
};
260 assert (!obj
.empty());
261 objs_state
[obj
].compressed
= true;
264 void RGWObjectCtx::set_atomic(rgw_obj
& obj
) {
265 std::unique_lock wl
{lock
};
266 assert (!obj
.empty());
267 objs_state
[obj
].is_atomic
= true;
269 void RGWObjectCtx::set_prefetch_data(const rgw_obj
& obj
) {
270 std::unique_lock wl
{lock
};
271 assert (!obj
.empty());
272 objs_state
[obj
].prefetch_data
= true;
275 void RGWObjectCtx::invalidate(const rgw_obj
& obj
) {
276 std::unique_lock wl
{lock
};
277 auto iter
= objs_state
.find(obj
);
278 if (iter
== objs_state
.end()) {
281 bool is_atomic
= iter
->second
.is_atomic
;
282 bool prefetch_data
= iter
->second
.prefetch_data
;
283 bool compressed
= iter
->second
.compressed
;
285 objs_state
.erase(iter
);
287 if (is_atomic
|| prefetch_data
|| compressed
) {
288 auto& state
= objs_state
[obj
];
289 state
.is_atomic
= is_atomic
;
290 state
.prefetch_data
= prefetch_data
;
291 state
.compressed
= compressed
;
295 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
297 write_version
.ver
= 1;
300 write_version
.tag
.clear();
301 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
304 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
306 RGWHTTPManager http_manager
;
309 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
310 http_manager(store
->ctx(), completion_mgr
) {
311 http_manager
.start();
314 int notify_all(const DoutPrefixProvider
*dpp
, map
<rgw_zone_id
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
315 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
319 list
<RGWCoroutinesStack
*> stacks
;
320 for (auto iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
321 RGWRESTConn
*conn
= iter
->second
;
322 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
323 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
325 stacks
.push_back(stack
);
327 return run(dpp
, stacks
);
331 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
333 RGWHTTPManager http_manager
;
336 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
337 http_manager(store
->ctx(), completion_mgr
) {
338 http_manager
.start();
341 int notify_all(const DoutPrefixProvider
*dpp
, map
<rgw_zone_id
, RGWRESTConn
*>& conn_map
,
342 bc::flat_map
<int, bc::flat_set
<string
> >& shards
) {
343 rgw_http_param_pair pairs
[] = { { "type", "data" },
345 { "source-zone", store
->svc
.zone
->get_zone_params().get_id().c_str() },
348 list
<RGWCoroutinesStack
*> stacks
;
349 for (auto iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
350 RGWRESTConn
*conn
= iter
->second
;
351 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
352 stack
->call(new RGWPostRESTResourceCR
<bc::flat_map
<int, bc::flat_set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
354 stacks
.push_back(stack
);
356 return run(dpp
, stacks
);
360 /* class RGWRadosThread */
362 void RGWRadosThread::start()
364 worker
= new Worker(cct
, this);
365 worker
->create(thread_name
.c_str());
368 void RGWRadosThread::stop()
380 void *RGWRadosThread::Worker::entry() {
381 uint64_t msec
= processor
->interval_msec();
382 auto interval
= std::chrono::milliseconds(msec
);
385 auto start
= ceph::real_clock::now();
386 int r
= processor
->process(this);
388 ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
391 if (processor
->going_down())
394 auto end
= ceph::real_clock::now() - start
;
396 uint64_t cur_msec
= processor
->interval_msec();
397 if (cur_msec
!= msec
) { /* was it reconfigured? */
399 interval
= std::chrono::milliseconds(msec
);
404 continue; // next round
406 auto wait_time
= interval
- end
;
407 wait_interval(wait_time
);
411 } while (!processor
->going_down());
416 class RGWMetaNotifier
: public RGWRadosThread
{
417 RGWMetaNotifierManager notify_mgr
;
418 RGWMetadataLog
*const log
;
420 uint64_t interval_msec() override
{
421 return cct
->_conf
->rgw_md_notify_interval_msec
;
423 void stop_process() override
{
427 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
428 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
430 int process(const DoutPrefixProvider
*dpp
) override
;
433 int RGWMetaNotifier::process(const DoutPrefixProvider
*dpp
)
437 log
->read_clear_modified(shards
);
439 if (shards
.empty()) {
443 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
444 ldpp_dout(dpp
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
447 notify_mgr
.notify_all(dpp
, store
->svc
.zone
->get_zone_conn_map(), shards
);
452 class RGWDataNotifier
: public RGWRadosThread
{
453 RGWDataNotifierManager notify_mgr
;
455 uint64_t interval_msec() override
{
456 return cct
->_conf
.get_val
<int64_t>("rgw_data_notify_interval_msec");
458 void stop_process() override
{
462 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
464 int process(const DoutPrefixProvider
*dpp
) override
;
467 int RGWDataNotifier::process(const DoutPrefixProvider
*dpp
)
469 auto data_log
= store
->svc
.datalog_rados
;
474 auto shards
= data_log
->read_clear_modified();
476 if (shards
.empty()) {
480 for (const auto& [shard_id
, keys
] : shards
) {
481 ldpp_dout(dpp
, 20) << __func__
<< "(): notifying datalog change, shard_id="
482 << shard_id
<< ": " << keys
<< dendl
;
485 notify_mgr
.notify_all(dpp
, store
->svc
.zone
->get_zone_data_notify_to_map(), shards
);
490 class RGWSyncProcessorThread
: public RGWRadosThread
{
492 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
493 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
494 ~RGWSyncProcessorThread() override
{}
495 int init(const DoutPrefixProvider
*dpp
) override
= 0 ;
496 int process(const DoutPrefixProvider
*dpp
) override
= 0;
499 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
501 RGWMetaSyncStatusManager sync
;
503 uint64_t interval_msec() override
{
504 return 0; /* no interval associated, it'll run once until stopped */
506 void stop_process() override
{
510 RGWMetaSyncProcessorThread(rgw::sal::RadosStore
* _store
, RGWAsyncRadosProcessor
*async_rados
)
511 : RGWSyncProcessorThread(_store
->getRados(), "meta-sync"), sync(_store
, async_rados
) {}
513 void wakeup_sync_shards(set
<int>& shard_ids
) {
514 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
518 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
520 int init(const DoutPrefixProvider
*dpp
) override
{
521 int ret
= sync
.init(dpp
);
523 ldpp_dout(dpp
, 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
529 int process(const DoutPrefixProvider
*dpp
) override
{
530 sync
.run(dpp
, null_yield
);
535 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
537 PerfCountersRef counters
;
538 RGWDataSyncStatusManager sync
;
541 uint64_t interval_msec() override
{
543 return 0; /* no interval associated, it'll run once until stopped */
545 #define DATA_SYNC_INIT_WAIT_SEC 20
546 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
549 void stop_process() override
{
553 RGWDataSyncProcessorThread(rgw::sal::RadosStore
* _store
, RGWAsyncRadosProcessor
*async_rados
,
554 const RGWZone
* source_zone
)
555 : RGWSyncProcessorThread(_store
->getRados(), "data-sync"),
556 counters(sync_counters::build(store
->ctx(), std::string("data-sync-from-") + source_zone
->name
)),
557 sync(_store
, async_rados
, source_zone
->id
, counters
.get()),
558 initialized(false) {}
560 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
561 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
562 sync
.wakeup(iter
->first
, iter
->second
);
565 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
567 int init(const DoutPrefixProvider
*dpp
) override
{
571 int process(const DoutPrefixProvider
*dpp
) override
{
572 while (!initialized
) {
576 int ret
= sync
.init(dpp
);
589 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
, DoutPrefixProvider
591 RGWCoroutinesManager crs
;
592 rgw::sal::RadosStore
* store
;
593 rgw::BucketTrimManager
*bucket_trim
;
595 const utime_t trim_interval
;
597 uint64_t interval_msec() override
{ return 0; }
598 void stop_process() override
{ crs
.stop(); }
600 RGWSyncLogTrimThread(rgw::sal::RadosStore
* store
, rgw::BucketTrimManager
*bucket_trim
,
602 : RGWSyncProcessorThread(store
->getRados(), "sync-log-trim"),
603 crs(store
->ctx(), store
->getRados()->get_cr_registry()), store(store
),
604 bucket_trim(bucket_trim
),
605 http(store
->ctx(), crs
.get_completion_mgr()),
606 trim_interval(interval
, 0)
609 int init(const DoutPrefixProvider
*dpp
) override
{
612 int process(const DoutPrefixProvider
*dpp
) override
{
613 list
<RGWCoroutinesStack
*> stacks
;
614 auto metatrimcr
= create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore
*>(store
), &http
,
615 cct
->_conf
->rgw_md_log_max_shards
,
618 ldpp_dout(dpp
, -1) << "Bailing out of trim thread!" << dendl
;
621 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
622 meta
->call(metatrimcr
);
624 stacks
.push_back(meta
);
626 if (store
->svc()->zone
->sync_module_exports_data()) {
627 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
628 data
->call(create_data_log_trim_cr(dpp
, static_cast<rgw::sal::RadosStore
*>(store
), &http
,
629 cct
->_conf
->rgw_data_log_num_shards
,
631 stacks
.push_back(data
);
633 auto bucket
= new RGWCoroutinesStack(store
->ctx(), &crs
);
634 bucket
->call(bucket_trim
->create_bucket_trim_cr(&http
));
635 stacks
.push_back(bucket
);
638 crs
.run(dpp
, stacks
);
642 // implements DoutPrefixProvider
643 CephContext
*get_cct() const override
{ return store
->ctx(); }
644 unsigned get_subsys() const override
649 std::ostream
& gen_prefix(std::ostream
& out
) const override
651 return out
<< "sync log trim: ";
656 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
658 std::lock_guard l
{meta_sync_thread_lock
};
659 if (meta_sync_processor_thread
) {
660 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
664 void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider
*dpp
, const rgw_zone_id
& source_zone
, map
<int, set
<string
> >& shard_ids
)
666 ldpp_dout(dpp
, 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
667 std::lock_guard l
{data_sync_thread_lock
};
668 auto iter
= data_sync_processor_threads
.find(source_zone
);
669 if (iter
== data_sync_processor_threads
.end()) {
670 ldpp_dout(dpp
, 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
674 RGWDataSyncProcessorThread
*thread
= iter
->second
;
676 thread
->wakeup_sync_shards(shard_ids
);
679 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
681 std::lock_guard l
{meta_sync_thread_lock
};
682 if (meta_sync_processor_thread
) {
683 return meta_sync_processor_thread
->get_manager();
688 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const rgw_zone_id
& source_zone
)
690 std::lock_guard l
{data_sync_thread_lock
};
691 auto thread
= data_sync_processor_threads
.find(source_zone
);
692 if (thread
== data_sync_processor_threads
.end()) {
695 return thread
->second
->get_manager();
698 int RGWRados::get_required_alignment(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, uint64_t *alignment
)
701 int r
= open_pool_ctx(dpp
, pool
, ioctx
, false);
703 ldpp_dout(dpp
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
708 r
= ioctx
.pool_requires_alignment2(&requires
);
710 ldpp_dout(dpp
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
721 r
= ioctx
.pool_required_alignment2(&align
);
723 ldpp_dout(dpp
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
728 ldpp_dout(dpp
, 20) << "required alignment=" << align
<< dendl
;
734 void RGWRados::get_max_aligned_size(uint64_t size
, uint64_t alignment
, uint64_t *max_size
)
736 if (alignment
== 0) {
741 if (size
<= alignment
) {
742 *max_size
= alignment
;
746 *max_size
= size
- (size
% alignment
);
749 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
, const DoutPrefixProvider
*dpp
, uint64_t *palignment
)
752 int r
= get_required_alignment(dpp
, pool
, &alignment
);
758 *palignment
= alignment
;
761 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
763 get_max_aligned_size(config_chunk_size
, alignment
, max_chunk_size
);
765 ldpp_dout(dpp
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
770 int RGWRados::get_max_chunk_size(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
,
771 uint64_t *max_chunk_size
, const DoutPrefixProvider
*dpp
, uint64_t *palignment
)
774 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
775 ldpp_dout(dpp
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
778 return get_max_chunk_size(pool
, max_chunk_size
, dpp
, palignment
);
781 class RGWIndexCompletionManager
;
783 struct complete_op_data
{
784 ceph::mutex lock
= ceph::make_mutex("complete_op_data");
785 AioCompletion
*rados_completion
{nullptr};
786 int manager_shard_id
{-1};
787 RGWIndexCompletionManager
*manager
{nullptr};
791 rgw_bucket_entry_ver ver
;
793 rgw_bucket_dir_entry_meta dir_meta
;
794 list
<cls_rgw_obj_key
> remove_objs
;
797 rgw_zone_set zones_trace
;
802 std::lock_guard l
{lock
};
807 class RGWIndexCompletionThread
: public RGWRadosThread
, public DoutPrefixProvider
{
810 uint64_t interval_msec() override
{
814 list
<complete_op_data
*> completions
;
816 ceph::mutex completions_lock
=
817 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
819 RGWIndexCompletionThread(RGWRados
*_store
)
820 : RGWRadosThread(_store
, "index-complete"), store(_store
) {}
822 int process(const DoutPrefixProvider
*dpp
) override
;
824 void add_completion(complete_op_data
*completion
) {
826 std::lock_guard l
{completions_lock
};
827 completions
.push_back(completion
);
833 CephContext
*get_cct() const override
{ return store
->ctx(); }
834 unsigned get_subsys() const { return dout_subsys
; }
835 std::ostream
& gen_prefix(std::ostream
& out
) const { return out
<< "rgw index completion thread: "; }
838 int RGWIndexCompletionThread::process(const DoutPrefixProvider
*dpp
)
840 list
<complete_op_data
*> comps
;
843 std::lock_guard l
{completions_lock
};
844 completions
.swap(comps
);
847 for (auto c
: comps
) {
848 std::unique_ptr
<complete_op_data
> up
{c
};
853 ldpp_dout(this, 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
855 RGWRados::BucketShard
bs(store
);
856 RGWBucketInfo bucket_info
;
858 int r
= bs
.init(c
->obj
.bucket
, c
->obj
, &bucket_info
, this);
860 ldpp_dout(this, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
865 r
= store
->guard_reshard(this, &bs
, c
->obj
, bucket_info
,
866 [&](RGWRados::BucketShard
*bs
) -> int {
867 librados::ObjectWriteOperation o
;
868 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
869 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
870 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
871 return bs
->bucket_obj
.operate(this, &o
, null_yield
);
874 ldpp_dout(this, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
875 /* ignoring error, can't do anything about it */
878 r
= store
->svc
.datalog_rados
->add_entry(this, bucket_info
, bs
.shard_id
);
880 ldpp_dout(this, -1) << "ERROR: failed writing data log" << dendl
;
887 class RGWIndexCompletionManager
{
888 RGWRados
*store
{nullptr};
889 ceph::containers::tiny_vector
<ceph::mutex
> locks
;
890 vector
<set
<complete_op_data
*> > completions
;
892 RGWIndexCompletionThread
*completion_thread
{nullptr};
896 std::atomic
<int> cur_shard
{0};
900 RGWIndexCompletionManager(RGWRados
*_store
) :
902 locks
{ceph::make_lock_container
<ceph::mutex
>(
903 store
->ctx()->_conf
->rgw_thread_pool_size
,
905 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
909 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
910 completions
.resize(num_shards
);
912 ~RGWIndexCompletionManager() {
917 int result
= cur_shard
% num_shards
;
922 void create_completion(const rgw_obj
& obj
,
923 RGWModifyOp op
, string
& tag
,
924 rgw_bucket_entry_ver
& ver
,
925 const cls_rgw_obj_key
& key
,
926 rgw_bucket_dir_entry_meta
& dir_meta
,
927 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
929 rgw_zone_set
*zones_trace
,
930 complete_op_data
**result
);
931 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
933 int start(const DoutPrefixProvider
*dpp
) {
934 completion_thread
= new RGWIndexCompletionThread(store
);
935 int ret
= completion_thread
->init(dpp
);
939 completion_thread
->start();
943 if (completion_thread
) {
944 completion_thread
->stop();
945 delete completion_thread
;
948 for (int i
= 0; i
< num_shards
; ++i
) {
949 std::lock_guard l
{locks
[i
]};
950 for (auto c
: completions
[i
]) {
958 static void obj_complete_cb(completion_t cb
, void *arg
)
960 complete_op_data
*completion
= (complete_op_data
*)arg
;
961 completion
->lock
.lock();
962 if (completion
->stopped
) {
963 completion
->lock
.unlock(); /* can drop lock, no one else is referencing us */
967 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
968 completion
->lock
.unlock();
975 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
976 RGWModifyOp op
, string
& tag
,
977 rgw_bucket_entry_ver
& ver
,
978 const cls_rgw_obj_key
& key
,
979 rgw_bucket_dir_entry_meta
& dir_meta
,
980 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
982 rgw_zone_set
*zones_trace
,
983 complete_op_data
**result
)
985 complete_op_data
*entry
= new complete_op_data
;
987 int shard_id
= next_shard();
989 entry
->manager_shard_id
= shard_id
;
990 entry
->manager
= this;
996 entry
->dir_meta
= dir_meta
;
997 entry
->log_op
= log_op
;
998 entry
->bilog_op
= bilog_op
;
1001 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
1002 entry
->remove_objs
.push_back(*iter
);
1007 entry
->zones_trace
= *zones_trace
;
1009 entry
->zones_trace
.insert(store
->svc
.zone
->get_zone().id
, obj
.bucket
.get_key());
1014 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, obj_complete_cb
);
1016 std::lock_guard l
{locks
[shard_id
]};
1017 completions
[shard_id
].insert(entry
);
1020 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
1022 int shard_id
= arg
->manager_shard_id
;
1024 std::lock_guard l
{locks
[shard_id
]};
1026 auto& comps
= completions
[shard_id
];
1028 auto iter
= comps
.find(arg
);
1029 if (iter
== comps
.end()) {
1036 int r
= rados_aio_get_return_value(cb
);
1037 if (r
!= -ERR_BUSY_RESHARDING
) {
1040 completion_thread
->add_completion(arg
);
1044 void RGWRados::finalize()
1046 if (run_sync_thread
) {
1047 std::lock_guard l
{meta_sync_thread_lock
};
1048 meta_sync_processor_thread
->stop();
1050 std::lock_guard dl
{data_sync_thread_lock
};
1051 for (auto iter
: data_sync_processor_threads
) {
1052 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1055 if (sync_log_trimmer
) {
1056 sync_log_trimmer
->stop();
1059 if (run_sync_thread
) {
1060 delete meta_sync_processor_thread
;
1061 meta_sync_processor_thread
= NULL
;
1062 std::lock_guard dl
{data_sync_thread_lock
};
1063 for (auto iter
: data_sync_processor_threads
) {
1064 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1067 data_sync_processor_threads
.clear();
1068 delete sync_log_trimmer
;
1069 sync_log_trimmer
= nullptr;
1070 bucket_trim
= boost::none
;
1072 if (meta_notifier
) {
1073 meta_notifier
->stop();
1074 delete meta_notifier
;
1076 if (data_notifier
) {
1077 data_notifier
->stop();
1078 delete data_notifier
;
1091 RGWQuotaHandler::free_handler(quota_handler
);
1099 delete obj_tombstone_cache
;
1101 delete d3n_data_cache
;
1103 if (reshard_wait
.get()) {
1104 reshard_wait
->stop();
1105 reshard_wait
.reset();
1108 if (run_reshard_thread
) {
1109 reshard
->stop_processor();
1112 delete index_completion_manager
;
1114 rgw::notify::shutdown();
1118 * Initialize the RADOS instance and prepare to do other ops
1119 * Returns 0 on success, -ERR# on failure.
1121 int RGWRados::init_rados()
1125 ret
= rados
.init_with_context(cct
);
1129 ret
= rados
.connect();
1134 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
1135 new RGWCoroutinesManagerRegistry(cct
)};
1136 ret
= crs
->hook_to_admin_command("cr dump");
1141 cr_registry
= crs
.release();
1143 if (use_datacache
) {
1144 d3n_data_cache
= new D3nDataCache();
1145 d3n_data_cache
->init(cct
);
1151 int RGWRados::register_to_service_map(const DoutPrefixProvider
*dpp
, const string
& daemon_type
, const map
<string
, string
>& meta
)
1153 string name
= cct
->_conf
->name
.get_id();
1154 if (name
.compare(0, 4, "rgw.") == 0) {
1155 name
= name
.substr(4);
1157 map
<string
,string
> metadata
= meta
;
1158 metadata
["num_handles"] = "1"s
;
1159 metadata
["zonegroup_id"] = svc
.zone
->get_zonegroup().get_id();
1160 metadata
["zonegroup_name"] = svc
.zone
->get_zonegroup().get_name();
1161 metadata
["zone_name"] = svc
.zone
->zone_name();
1162 metadata
["zone_id"] = svc
.zone
->zone_id().id
;
1163 metadata
["realm_name"] = svc
.zone
->get_realm().get_name();
1164 metadata
["realm_id"] = svc
.zone
->get_realm().get_id();
1165 metadata
["id"] = name
;
1166 int ret
= rados
.service_daemon_register(
1168 stringify(rados
.get_instance_id()),
1171 ldpp_dout(dpp
, 0) << "ERROR: service_daemon_register() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1178 int RGWRados::update_service_map(const DoutPrefixProvider
*dpp
, std::map
<std::string
, std::string
>&& status
)
1180 int ret
= rados
.service_daemon_update_status(move(status
));
1182 ldpp_dout(dpp
, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1190 * Initialize the RADOS instance and prepare to do other ops
1191 * Returns 0 on success, -ERR# on failure.
1193 int RGWRados::init_complete(const DoutPrefixProvider
*dpp
)
1198 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1200 sync_module
= svc
.sync_modules
->get_sync_module();
1202 ret
= open_root_pool_ctx(dpp
);
1206 ret
= open_gc_pool_ctx(dpp
);
1210 ret
= open_lc_pool_ctx(dpp
);
1214 ret
= open_objexp_pool_ctx(dpp
);
1218 ret
= open_reshard_pool_ctx(dpp
);
1222 ret
= open_notif_pool_ctx(dpp
);
1226 pools_initialized
= true;
1230 gc
->initialize(cct
, this);
1232 ldpp_dout(dpp
, 5) << "note: GC not initialized" << dendl
;
1235 obj_expirer
= new RGWObjectExpirer(this->store
);
1237 if (use_gc_thread
&& use_gc
) {
1238 gc
->start_processor();
1239 obj_expirer
->start_processor();
1242 auto& current_period
= svc
.zone
->get_current_period();
1243 auto& zonegroup
= svc
.zone
->get_zonegroup();
1244 auto& zone_params
= svc
.zone
->get_zone_params();
1245 auto& zone
= svc
.zone
->get_zone();
1247 /* no point of running sync thread if we don't have a master zone configured
1248 or there is no rest_master_conn */
1249 if (!svc
.zone
->need_to_sync()) {
1250 run_sync_thread
= false;
1253 if (svc
.zone
->is_meta_master()) {
1254 auto md_log
= svc
.mdlog
->get_log(current_period
.get_id());
1255 meta_notifier
= new RGWMetaNotifier(this, md_log
);
1256 meta_notifier
->start();
1259 /* init it anyway, might run sync through radosgw-admin explicitly */
1260 sync_tracer
= new RGWSyncTraceManager(cct
, cct
->_conf
->rgw_sync_trace_history_size
);
1261 sync_tracer
->init(this);
1262 ret
= sync_tracer
->hook_to_admin_command();
1267 if (run_sync_thread
) {
1268 for (const auto &pt
: zonegroup
.placement_targets
) {
1269 if (zone_params
.placement_pools
.find(pt
.second
.name
)
1270 == zone_params
.placement_pools
.end()){
1271 ldpp_dout(dpp
, 0) << "WARNING: This zone does not contain the placement target "
1272 << pt
.second
.name
<< " present in zonegroup" << dendl
;
1275 auto async_processor
= svc
.rados
->get_async_processor();
1276 std::lock_guard l
{meta_sync_thread_lock
};
1277 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this->store
, async_processor
);
1278 ret
= meta_sync_processor_thread
->init(dpp
);
1280 ldpp_dout(dpp
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
1283 meta_sync_processor_thread
->start();
1285 // configure the bucket trim manager
1286 rgw::BucketTrimConfig config
;
1287 rgw::configure_bucket_trim(cct
, config
);
1289 bucket_trim
.emplace(this->store
, config
);
1290 ret
= bucket_trim
->init();
1292 ldpp_dout(dpp
, 0) << "ERROR: failed to start bucket trim manager" << dendl
;
1295 svc
.datalog_rados
->set_observer(&*bucket_trim
);
1297 std::lock_guard dl
{data_sync_thread_lock
};
1298 for (auto source_zone
: svc
.zone
->get_data_sync_source_zones()) {
1299 ldpp_dout(dpp
, 5) << "starting data sync thread for zone " << source_zone
->name
<< dendl
;
1300 auto *thread
= new RGWDataSyncProcessorThread(this->store
, svc
.rados
->get_async_processor(), source_zone
);
1301 ret
= thread
->init(dpp
);
1303 ldpp_dout(dpp
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
1307 data_sync_processor_threads
[rgw_zone_id(source_zone
->id
)] = thread
;
1309 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
1311 sync_log_trimmer
= new RGWSyncLogTrimThread(this->store
, &*bucket_trim
, interval
);
1312 ret
= sync_log_trimmer
->init(dpp
);
1314 ldpp_dout(dpp
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
1317 sync_log_trimmer
->start();
1320 if (cct
->_conf
->rgw_data_notify_interval_msec
) {
1321 data_notifier
= new RGWDataNotifier(this);
1322 data_notifier
->start();
1325 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
1326 binfo_cache
->init(svc
.cache
);
1329 lc
->initialize(cct
, this->store
);
1332 lc
->start_processor();
1334 quota_handler
= RGWQuotaHandler::generate_handler(dpp
, this->store
, quota_threads
);
1336 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
1337 zone
.bucket_index_max_shards
);
1338 if (bucket_index_max_shards
> get_max_bucket_shards()) {
1339 bucket_index_max_shards
= get_max_bucket_shards();
1340 ldpp_dout(dpp
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
1341 << get_max_bucket_shards() << dendl
;
1343 ldpp_dout(dpp
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
1345 bool need_tombstone_cache
= !svc
.zone
->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1347 if (need_tombstone_cache
) {
1348 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
1351 reshard_wait
= std::make_shared
<RGWReshardWait
>();
1353 reshard
= new RGWReshard(this->store
);
1355 /* only the master zone in the zonegroup reshards buckets */
1356 run_reshard_thread
= run_reshard_thread
&& (zonegroup
.master_zone
== zone
.id
);
1357 if (run_reshard_thread
) {
1358 reshard
->start_processor();
1361 index_completion_manager
= new RGWIndexCompletionManager(this);
1362 ret
= index_completion_manager
->start(dpp
);
1366 ret
= rgw::notify::init(cct
, store
, dpp
);
1368 ldpp_dout(dpp
, 1) << "ERROR: failed to initialize notification manager" << dendl
;
1374 int RGWRados::init_svc(bool raw
, const DoutPrefixProvider
*dpp
)
1377 return svc
.init_raw(cct
, use_cache
, null_yield
, dpp
);
1380 return svc
.init(cct
, use_cache
, run_sync_thread
, null_yield
, dpp
);
1383 int RGWRados::init_ctl(const DoutPrefixProvider
*dpp
)
1385 return ctl
.init(&svc
, dpp
);
1389 * Initialize the RADOS instance and prepare to do other ops
1390 * Returns 0 on success, -ERR# on failure.
1392 int RGWRados::initialize(const DoutPrefixProvider
*dpp
)
1396 inject_notify_timeout_probability
=
1397 cct
->_conf
.get_val
<double>("rgw_inject_notify_timeout_probability");
1398 max_notify_retries
= cct
->_conf
.get_val
<uint64_t>("rgw_max_notify_retries");
1400 ret
= init_svc(false, dpp
);
1402 ldpp_dout(dpp
, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1406 ret
= init_ctl(dpp
);
1408 ldpp_dout(dpp
, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1412 host_id
= svc
.zone_utils
->gen_host_id();
1418 return init_complete(dpp
);
1422 * Open the pool used as root for this gateway
1423 * Returns: 0 on success, -ERR# otherwise.
1425 int RGWRados::open_root_pool_ctx(const DoutPrefixProvider
*dpp
)
1427 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().domain_root
, root_pool_ctx
, true, true);
1430 int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider
*dpp
)
1432 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().gc_pool
, gc_pool_ctx
, true, true);
1435 int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider
*dpp
)
1437 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().lc_pool
, lc_pool_ctx
, true, true);
1440 int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider
*dpp
)
1442 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, objexp_pool_ctx
, true, true);
1445 int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider
*dpp
)
1447 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().reshard_pool
, reshard_pool_ctx
, true, true);
1450 int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider
*dpp
)
1452 return rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().notif_pool
, notif_pool_ctx
, true, true);
1455 int RGWRados::open_pool_ctx(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, librados::IoCtx
& io_ctx
,
1458 constexpr bool create
= true; // create the pool if it doesn't exist
1459 return rgw_init_ioctx(dpp
, get_rados_handle(), pool
, io_ctx
, create
, mostly_omap
);
1464 struct log_list_state
{
1466 librados::IoCtx io_ctx
;
1467 librados::NObjectIterator obit
;
1470 int RGWRados::log_list_init(const DoutPrefixProvider
*dpp
, const string
& prefix
, RGWAccessHandle
*handle
)
1472 log_list_state
*state
= new log_list_state
;
1473 int r
= rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1478 state
->prefix
= prefix
;
1479 state
->obit
= state
->io_ctx
.nobjects_begin();
1480 *handle
= (RGWAccessHandle
)state
;
1484 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
1486 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
1488 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
1492 if (state
->prefix
.length() &&
1493 state
->obit
->get_oid().find(state
->prefix
) != 0) {
1497 *name
= state
->obit
->get_oid();
1504 int RGWRados::log_remove(const DoutPrefixProvider
*dpp
, const string
& name
)
1506 librados::IoCtx io_ctx
;
1507 int r
= rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
1510 return io_ctx
.remove(name
);
1513 struct log_show_state
{
1514 librados::IoCtx io_ctx
;
1516 bufferlist::const_iterator p
;
1520 log_show_state() : pos(0), eof(false) {}
1523 int RGWRados::log_show_init(const DoutPrefixProvider
*dpp
, const string
& name
, RGWAccessHandle
*handle
)
1525 log_show_state
*state
= new log_show_state
;
1526 int r
= rgw_init_ioctx(dpp
, get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1532 *handle
= (RGWAccessHandle
)state
;
1536 int RGWRados::log_show_next(const DoutPrefixProvider
*dpp
, RGWAccessHandle handle
, rgw_log_entry
*entry
)
1538 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
1539 off_t off
= state
->p
.get_off();
1541 ldpp_dout(dpp
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
1543 << " eof " << (int)state
->eof
1546 unsigned chunk
= 1024*1024;
1547 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
1549 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
1555 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
1556 } catch (buffer::error
& err
) {
1559 state
->bl
= std::move(old
);
1560 state
->bl
.claim_append(more
);
1561 state
->p
= state
->bl
.cbegin();
1562 if ((unsigned)r
< chunk
)
1564 ldpp_dout(dpp
, 10) << " read " << r
<< dendl
;
1568 return 0; // end of file
1570 decode(*entry
, state
->p
);
1572 catch (const buffer::error
&e
) {
1579 * usage_log_hash: get usage log key hash, based on name and index
1581 * Get the usage object name. Since a user may have more than 1
1582 * object holding that info (multiple shards), we use index to
1583 * specify that shard number. Once index exceeds max shards it
1585 * If name is not being set, results for all users will be returned
1586 * and index will wrap only after total shards number.
1588 * @param cct [in] ceph context
1589 * @param name [in] user name
1590 * @param hash [out] hash value
1591 * @param index [in] shard index number
1593 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
1595 uint32_t val
= index
;
1597 if (!name
.empty()) {
1598 int max_user_shards
= cct
->_conf
->rgw_usage_max_user_shards
;
1599 val
%= max_user_shards
;
1600 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
1603 int max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1604 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
1608 int RGWRados::log_usage(const DoutPrefixProvider
*dpp
, map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
1612 map
<string
, rgw_usage_log_info
> log_objs
;
1617 /* restructure usage map, zone by object hash */
1618 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
1619 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
1620 const rgw_user_bucket
& ub
= iter
->first
;
1621 RGWUsageBatch
& info
= iter
->second
;
1623 if (ub
.user
.empty()) {
1624 ldpp_dout(dpp
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
1628 if (ub
.user
!= last_user
) {
1629 /* index *should* be random, but why waste extra cycles
1630 in most cases max user shards is not going to exceed 1,
1631 so just incrementing it */
1632 usage_log_hash(cct
, ub
.user
, hash
, index
++);
1634 last_user
= ub
.user
;
1635 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
1637 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
1638 v
.push_back(miter
->second
);
1642 map
<string
, rgw_usage_log_info
>::iterator liter
;
1644 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
1645 int r
= cls_obj_usage_log_add(dpp
, liter
->first
, liter
->second
);
1652 int RGWRados::read_usage(const DoutPrefixProvider
*dpp
, const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
,
1653 uint32_t max_entries
, bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
,
1654 rgw_usage_log_entry
>& usage
)
1656 uint32_t num
= max_entries
;
1657 string hash
, first_hash
;
1658 string user_str
= user
.to_str();
1659 usage_log_hash(cct
, user_str
, first_hash
, 0);
1661 if (usage_iter
.index
) {
1662 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
1670 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
1671 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
1673 int ret
= cls_obj_usage_log_read(dpp
, hash
, user_str
, bucket_name
, start_epoch
, end_epoch
, num
,
1674 usage_iter
.read_iter
, ret_usage
, is_truncated
);
1681 num
-= ret_usage
.size();
1683 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
1684 usage
[iter
->first
].aggregate(iter
->second
);
1688 if (!*is_truncated
) {
1689 usage_iter
.read_iter
.clear();
1690 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
1692 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
1696 int RGWRados::trim_usage(const DoutPrefixProvider
*dpp
, const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
)
1699 string hash
, first_hash
;
1700 string user_str
= user
.to_str();
1701 usage_log_hash(cct
, user_str
, first_hash
, index
);
1705 int ret
= cls_obj_usage_log_trim(dpp
, hash
, user_str
, bucket_name
, start_epoch
, end_epoch
);
1707 if (ret
< 0 && ret
!= -ENOENT
)
1710 usage_log_hash(cct
, user_str
, hash
, ++index
);
1711 } while (hash
!= first_hash
);
1717 int RGWRados::clear_usage(const DoutPrefixProvider
*dpp
)
1719 auto max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1721 for (unsigned i
=0; i
< max_shards
; i
++){
1722 string oid
= RGW_USAGE_OBJ_PREFIX
+ to_string(i
);
1723 ret
= cls_obj_usage_log_clear(dpp
, oid
);
1725 ldpp_dout(dpp
,0) << "usage clear on oid="<< oid
<< "failed with ret=" << ret
<< dendl
;
1732 int RGWRados::decode_policy(const DoutPrefixProvider
*dpp
, bufferlist
& bl
, ACLOwner
*owner
)
1734 auto i
= bl
.cbegin();
1735 RGWAccessControlPolicy
policy(cct
);
1737 policy
.decode_owner(i
);
1738 } catch (buffer::error
& err
) {
1739 ldpp_dout(dpp
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
1742 *owner
= policy
.get_owner();
1746 int rgw_policy_from_attrset(const DoutPrefixProvider
*dpp
, CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
1748 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
1749 if (aiter
== attrset
.end())
1752 bufferlist
& bl
= aiter
->second
;
1753 auto iter
= bl
.cbegin();
1755 policy
->decode(iter
);
1756 } catch (buffer::error
& err
) {
1757 ldpp_dout(dpp
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
1760 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 15>()) {
1761 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
1762 ldpp_dout(dpp
, 15) << __func__
<< " Read AccessControlPolicy";
1763 s3policy
->to_xml(*_dout
);
1770 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
, const DoutPrefixProvider
*dpp
)
1772 rgw_bucket bucket
= bucket_info
.bucket
;
1773 bucket
.update_bucket_id(new_bucket_id
);
1775 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
1777 bucket_info
.objv_tracker
.clear();
1778 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr, null_yield
, dpp
);
1788 * Get ordered listing of the objects in a bucket.
1790 * max_p: maximum number of results to return
1791 * bucket: bucket to list contents of
1792 * prefix: only return results that match this prefix
1793 * delim: do not include results that match this string.
1794 * Any skipped results will have the matching portion of their name
1795 * inserted in common_prefixes with a "true" mark.
1796 * marker: if filled in, begin the listing with this object.
1797 * end_marker: if filled in, end the listing with this object.
1798 * result: the objects are put in here.
1799 * common_prefixes: if delim is filled in, any matching prefixes are
1801 * is_truncated: if number of objects in the bucket is bigger than
1802 * max, then truncated.
1804 int RGWRados::Bucket::List::list_objects_ordered(
1805 const DoutPrefixProvider
*dpp
,
1807 std::vector
<rgw_bucket_dir_entry
> *result
,
1808 std::map
<std::string
, bool> *common_prefixes
,
1812 RGWRados
*store
= target
->get_store();
1813 CephContext
*cct
= store
->ctx();
1814 int shard_id
= target
->get_shard_id();
1817 bool truncated
= true;
1818 bool cls_filtered
= false;
1819 const int64_t max
= // protect against memory issues and negative vals
1820 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
1821 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
, max
);
1825 // use a local marker; either the marker will have a previous entry
1826 // or it will be empty; either way it's OK to copy
1827 rgw_obj_key
marker_obj(params
.marker
.name
,
1828 params
.marker
.instance
,
1829 params
.ns
.empty() ? params
.marker
.ns
: params
.ns
);
1830 rgw_obj_index_key cur_marker
;
1831 marker_obj
.get_index_key(&cur_marker
);
1833 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
1834 params
.end_marker
.instance
,
1835 params
.ns
.empty() ? params
.end_marker
.ns
: params
.ns
);
1836 rgw_obj_index_key cur_end_marker
;
1837 end_marker_obj
.get_index_key(&cur_end_marker
);
1838 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
1840 rgw_obj_key
prefix_obj(params
.prefix
);
1841 prefix_obj
.set_ns(params
.ns
);
1842 std::string cur_prefix
= prefix_obj
.get_index_key_name();
1843 std::string after_delim_s
; /* needed in !params.delim.empty() AND later */
1845 if (!params
.delim
.empty()) {
1846 after_delim_s
= cls_rgw_after_delim(params
.delim
);
1847 /* if marker points at a common prefix, fast forward it into its
1848 * upper bound string */
1849 int delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
1850 if (delim_pos
>= 0) {
1851 string s
= cur_marker
.name
.substr(0, delim_pos
);
1852 s
.append(after_delim_s
);
1857 // we'll stop after this many attempts as long we return at least
1858 // one entry; but we will also go beyond this number of attempts
1859 // until we return at least one entry
1860 constexpr uint16_t SOFT_MAX_ATTEMPTS
= 8;
1862 rgw_obj_index_key prev_marker
;
1863 for (uint16_t attempt
= 1; /* empty */; ++attempt
) {
1864 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
1865 ": starting attempt " << attempt
<< dendl
;
1867 if (attempt
> 1 && !(prev_marker
< cur_marker
)) {
1868 // we've failed to make forward progress
1869 ldpp_dout(dpp
, 0) << "ERROR: " << __PRETTY_FUNCTION__
<<
1870 " marker failed to make forward progress; attempt=" << attempt
<<
1871 ", prev_marker=" << prev_marker
<<
1872 ", cur_marker=" << cur_marker
<< dendl
;
1875 prev_marker
= cur_marker
;
1878 ent_map
.reserve(read_ahead
);
1879 int r
= store
->cls_bucket_list_ordered(dpp
,
1880 target
->get_bucket_info(),
1885 read_ahead
+ 1 - count
,
1886 params
.list_versions
,
1893 params
.force_check_filter
);
1898 for (auto eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
1899 rgw_bucket_dir_entry
& entry
= eiter
->second
;
1900 rgw_obj_index_key index_key
= entry
.key
;
1901 rgw_obj_key
obj(index_key
);
1903 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
1904 ": considering entry " << entry
.key
<< dendl
;
1906 /* note that parse_raw_oid() here will not set the correct
1907 * object's instance, as rgw_obj_index_key encodes that
1908 * separately. We don't need to set the instance because it's
1909 * not needed for the checks here and we end up using the raw
1910 * entry for the return vector
1912 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
1914 ldpp_dout(dpp
, 0) << "ERROR: " << __PRETTY_FUNCTION__
<<
1915 " could not parse object name: " << obj
.name
<< dendl
;
1919 bool matched_ns
= (obj
.ns
== params
.ns
);
1920 if (!params
.list_versions
&& !entry
.is_visible()) {
1921 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
1922 ": skipping not visible entry \"" << entry
.key
<< "\"" << dendl
;
1926 if (params
.enforce_ns
&& !matched_ns
) {
1927 if (!params
.ns
.empty()) {
1928 /* we've iterated past the namespace we're searching -- done now */
1930 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
1931 ": finished due to getting past requested namespace \"" <<
1932 params
.ns
<< "\"" << dendl
;
1936 /* we're skipping past namespaced objects */
1937 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
1938 ": skipping past namespaced objects, including \"" << entry
.key
<<
1943 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
1945 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
1946 ": finished due to gitting end marker of \"" << cur_end_marker
<<
1947 "\" with \"" << entry
.key
<< "\"" << dendl
;
1952 params
.marker
= index_key
;
1953 next_marker
= index_key
;
1956 if (params
.access_list_filter
&&
1957 ! params
.access_list_filter
->filter(obj
.name
, index_key
.name
)) {
1958 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
1959 ": skipping past namespaced objects, including \"" << entry
.key
<<
1964 if (params
.prefix
.size() &&
1965 0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)) {
1966 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
1967 ": skipping object \"" << entry
.key
<<
1968 "\" that doesn't match prefix \"" << params
.prefix
<< "\"" << dendl
;
1972 if (!params
.delim
.empty()) {
1973 const int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
1974 if (delim_pos
>= 0) {
1975 // run either the code where delimiter filtering is done a)
1976 // in the OSD/CLS or b) here.
1978 // NOTE: this condition is for the newer versions of the
1979 // OSD that does filtering on the CLS side should only
1980 // find one delimiter at the end if it finds any after the
1983 int(obj
.name
.length() - params
.delim
.length())) {
1984 ldpp_dout(dpp
, 0) << "WARNING: " << __PRETTY_FUNCTION__
<<
1985 " found delimiter in place other than the end of "
1986 "the prefix; obj.name=" << obj
.name
<<
1987 ", prefix=" << params
.prefix
<< dendl
;
1989 if (common_prefixes
) {
1992 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
1993 ": stopping early with common prefix \"" << entry
.key
<<
1994 "\" because requested number (" << max
<<
1995 ") reached (cls filtered)" << dendl
;
1999 (*common_prefixes
)[obj
.name
] = true;
2003 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2004 ": finished entry with common prefix \"" << entry
.key
<<
2005 "\" so continuing loop (cls filtered)" << dendl
;
2008 // NOTE: this condition is for older versions of the OSD
2009 // that do not filter on the CLS side, so the following code
2010 // must do the filtering; once we reach version 16 of ceph,
2011 // this code can be removed along with the conditional that
2012 // can lead this way
2014 /* extract key -with trailing delimiter- for CommonPrefix */
2016 obj
.name
.substr(0, delim_pos
+ params
.delim
.length());
2018 if (common_prefixes
&&
2019 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
2022 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
2023 ": stopping early with common prefix \"" << entry
.key
<<
2024 "\" because requested number (" << max
<<
2025 ") reached (not cls filtered)" << dendl
;
2028 next_marker
= prefix_key
;
2029 (*common_prefixes
)[prefix_key
] = true;
2034 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2035 ": finished entry with common prefix \"" << entry
.key
<<
2036 "\" so continuing loop (not cls filtered)" << dendl
;
2038 } // if we're running an older OSD version
2039 } // if a delimiter was found after prefix
2040 } // if a delimiter was passed in
2044 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
2045 ": stopping early with entry \"" << entry
.key
<<
2046 "\" because requested number (" << max
<<
2047 ") reached" << dendl
;
2051 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2052 ": adding entry " << entry
.key
<< " to result" << dendl
;
2054 result
->emplace_back(std::move(entry
));
2058 // NOTE: the following conditional is needed by older versions of
2059 // the OSD that don't do delimiter filtering on the CLS side; once
2060 // we reach version 16 of ceph, the following conditional and the
2061 // code within can be removed
2062 if (!cls_filtered
&& !params
.delim
.empty()) {
2063 int marker_delim_pos
=
2064 cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
2065 if (marker_delim_pos
>= 0) {
2066 std::string skip_after_delim
=
2067 cur_marker
.name
.substr(0, marker_delim_pos
);
2068 skip_after_delim
.append(after_delim_s
);
2070 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2071 ": skip_after_delim=" << skip_after_delim
<< dendl
;
2073 if (skip_after_delim
> cur_marker
.name
) {
2074 cur_marker
= skip_after_delim
;
2075 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2076 ": setting cur_marker=" << cur_marker
.name
<<
2077 "[" << cur_marker
.instance
<< "]" << dendl
;
2080 } // if older osd didn't do delimiter filtering
2082 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
2083 ": end of outer loop, truncated=" << truncated
<<
2084 ", count=" << count
<< ", attempt=" << attempt
<< dendl
;
2086 if (!truncated
|| count
>= (max
+ 1) / 2) {
2087 // if we finished listing, or if we're returning at least half the
2088 // requested entries, that's enough; S3 and swift protocols allow
2089 // returning fewer than max entries
2090 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
2091 ": exiting attempt loop because we reached end (" << truncated
<<
2092 ") or we're returning half the requested entries (" << count
<<
2093 " of " << max
<< ")" << dendl
;
2095 } else if (attempt
> SOFT_MAX_ATTEMPTS
&& count
>= 1) {
2096 // if we've made at least 8 attempts and we have some, but very
2097 // few, results, return with what we have
2098 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
2099 ": exiting attempt loop because we made " << attempt
<<
2100 " attempts and we're returning " << count
<< " entries" << dendl
;
2103 } // for (uint16_t attempt...
2108 *is_truncated
= truncated
;
2112 } // list_objects_ordered
2116 * Get listing of the objects in a bucket and allow the results to be out
2119 * Even though there are key differences with the ordered counterpart,
2120 * the parameters are the same to maintain some compatability.
2122 * max: maximum number of results to return
2123 * bucket: bucket to list contents of
2124 * prefix: only return results that match this prefix
2125 * delim: should not be set; if it is we should have indicated an error
2126 * marker: if filled in, begin the listing with this object.
2127 * end_marker: if filled in, end the listing with this object.
2128 * result: the objects are put in here.
2129 * common_prefixes: this is never filled with an unordered list; the param
2130 * is maintained for compatibility
2131 * is_truncated: if number of objects in the bucket is bigger than max, then
2134 int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider
*dpp
,
2136 std::vector
<rgw_bucket_dir_entry
>* result
,
2137 std::map
<std::string
, bool>* common_prefixes
,
2141 RGWRados
*store
= target
->get_store();
2142 int shard_id
= target
->get_shard_id();
2145 bool truncated
= true;
2147 const int64_t max
= // protect against memory issues and negative vals
2148 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
2150 // read a few extra in each call to cls_bucket_list_unordered in
2151 // case some are filtered out due to namespace matching, versioning,
2153 const int64_t max_read_ahead
= 100;
2154 const uint32_t read_ahead
= uint32_t(max
+ std::min(max
, max_read_ahead
));
2158 // use a local marker; either the marker will have a previous entry
2159 // or it will be empty; either way it's OK to copy
2160 rgw_obj_key
marker_obj(params
.marker
.name
,
2161 params
.marker
.instance
,
2162 params
.ns
.empty() ? params
.marker
.ns
: params
.ns
);
2163 rgw_obj_index_key cur_marker
;
2164 marker_obj
.get_index_key(&cur_marker
);
2166 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
2167 params
.end_marker
.instance
,
2168 params
.ns
.empty() ? params
.end_marker
.ns
: params
.ns
);
2169 rgw_obj_index_key cur_end_marker
;
2170 end_marker_obj
.get_index_key(&cur_end_marker
);
2171 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2173 rgw_obj_key
prefix_obj(params
.prefix
);
2174 prefix_obj
.set_ns(params
.ns
);
2175 std::string cur_prefix
= prefix_obj
.get_index_key_name();
2177 while (truncated
&& count
<= max
) {
2178 std::vector
<rgw_bucket_dir_entry
> ent_list
;
2179 ent_list
.reserve(read_ahead
);
2181 int r
= store
->cls_bucket_list_unordered(dpp
,
2182 target
->get_bucket_info(),
2187 params
.list_versions
,
2193 ldpp_dout(dpp
, 0) << "ERROR: " << __PRETTY_FUNCTION__
<<
2194 " cls_bucket_list_unordered returned " << r
<< " for " <<
2195 target
->get_bucket_info().bucket
<< dendl
;
2199 // NB: while regions of ent_list will be sorted, we have no
2200 // guarantee that all items will be sorted since they can cross
2203 for (auto& entry
: ent_list
) {
2204 rgw_obj_index_key index_key
= entry
.key
;
2205 rgw_obj_key
obj(index_key
);
2208 params
.marker
.set(index_key
);
2209 next_marker
.set(index_key
);
2212 /* note that parse_raw_oid() here will not set the correct
2213 * object's instance, as rgw_obj_index_key encodes that
2214 * separately. We don't need to set the instance because it's
2215 * not needed for the checks here and we end up using the raw
2216 * entry for the return vector
2218 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2220 ldpp_dout(dpp
, 0) << "ERROR: " << __PRETTY_FUNCTION__
<<
2221 " could not parse object name: " << obj
.name
<< dendl
;
2225 if (!params
.list_versions
&& !entry
.is_visible()) {
2226 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2227 ": skippping \"" << index_key
<<
2228 "\" because not listing versions and entry not visibile" << dendl
;
2232 if (params
.enforce_ns
&& obj
.ns
!= params
.ns
) {
2233 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2234 ": skippping \"" << index_key
<<
2235 "\" because namespace does not match" << dendl
;
2239 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2240 // we're not guaranteed items will come in order, so we have
2241 // to loop through all
2242 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2243 ": skippping \"" << index_key
<<
2244 "\" because after end_marker" << dendl
;
2248 if (params
.access_list_filter
&&
2249 !params
.access_list_filter
->filter(obj
.name
, index_key
.name
)) {
2250 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2251 ": skippping \"" << index_key
<<
2252 "\" because doesn't match filter" << dendl
;
2256 if (params
.prefix
.size() &&
2257 (0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
))) {
2258 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
2259 ": skippping \"" << index_key
<<
2260 "\" because doesn't match prefix" << dendl
;
2269 result
->emplace_back(std::move(entry
));
2271 } // for (auto& entry : ent_list)
2272 } // while (truncated && count <= max)
2277 *is_truncated
= truncated
;
2281 } // list_objects_unordered
2285 * create a rados pool, associated meta info
2286 * returns 0 on success, -ERR# otherwise.
2288 int RGWRados::create_pool(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
)
2290 librados::IoCtx io_ctx
;
2291 constexpr bool create
= true;
2292 return rgw_init_ioctx(dpp
, get_rados_handle(), pool
, io_ctx
, create
);
2295 void RGWRados::create_bucket_id(string
*bucket_id
)
2297 uint64_t iid
= instance_id();
2298 uint64_t bid
= next_bucket_id();
2299 char buf
[svc
.zone
->get_zone_params().get_id().size() + 48];
2300 snprintf(buf
, sizeof(buf
), "%s.%" PRIu64
".%" PRIu64
,
2301 svc
.zone
->get_zone_params().get_id().c_str(), iid
, bid
);
2305 int RGWRados::create_bucket(const RGWUserInfo
& owner
, rgw_bucket
& bucket
,
2306 const string
& zonegroup_id
,
2307 const rgw_placement_rule
& placement_rule
,
2308 const string
& swift_ver_location
,
2309 const RGWQuotaInfo
* pquota_info
,
2310 map
<std::string
, bufferlist
>& attrs
,
2311 RGWBucketInfo
& info
,
2313 obj_version
*pep_objv
,
2314 real_time creation_time
,
2315 rgw_bucket
*pmaster_bucket
,
2316 uint32_t *pmaster_num_shards
,
2318 const DoutPrefixProvider
*dpp
,
2321 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2322 rgw_placement_rule selected_placement_rule
;
2323 RGWZonePlacementInfo rule_info
;
2325 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
2327 ret
= svc
.zone
->select_bucket_placement(dpp
, owner
, zonegroup_id
, placement_rule
,
2328 &selected_placement_rule
, &rule_info
, y
);
2332 if (!pmaster_bucket
) {
2333 create_bucket_id(&bucket
.marker
);
2334 bucket
.bucket_id
= bucket
.marker
;
2336 bucket
.marker
= pmaster_bucket
->marker
;
2337 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
2340 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
2342 objv_tracker
.read_version
.clear();
2345 objv_tracker
.write_version
= *pobjv
;
2347 objv_tracker
.generate_new_write_ver(cct
);
2350 info
.bucket
= bucket
;
2351 info
.owner
= owner
.user_id
;
2352 info
.zonegroup
= zonegroup_id
;
2353 info
.placement_rule
= selected_placement_rule
;
2354 info
.swift_ver_location
= swift_ver_location
;
2355 info
.swift_versioning
= (!swift_ver_location
.empty());
2357 init_default_bucket_layout(cct
, info
.layout
, svc
.zone
->get_zone(),
2358 pmaster_num_shards
?
2359 std::optional
{*pmaster_num_shards
} :
2361 rule_info
.index_type
);
2363 info
.requester_pays
= false;
2364 if (real_clock::is_zero(creation_time
)) {
2365 info
.creation_time
= ceph::real_clock::now();
2367 info
.creation_time
= creation_time
;
2370 info
.quota
= *pquota_info
;
2373 int r
= svc
.bi
->init_index(dpp
, info
);
2378 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true, dpp
);
2379 if (ret
== -ECANCELED
) {
2382 if (ret
== -EEXIST
) {
2383 /* we need to reread the info and return it, caller will have a use for it */
2384 RGWBucketInfo orig_info
;
2385 r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, orig_info
, NULL
, null_yield
, NULL
);
2390 ldpp_dout(dpp
, 0) << "get_bucket_info returned " << r
<< dendl
;
2394 /* only remove it if it's a different bucket instance */
2395 if (orig_info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
2396 int r
= svc
.bi
->clean_index(dpp
, info
);
2398 ldpp_dout(dpp
, 0) << "WARNING: could not remove bucket index (r=" << r
<< ")" << dendl
;
2400 r
= ctl
.bucket
->remove_bucket_instance_info(info
.bucket
, info
, null_yield
, dpp
);
2402 ldpp_dout(dpp
, 0) << "WARNING: " << __func__
<< "(): failed to remove bucket instance info: bucket instance=" << info
.bucket
.get_key() << ": r=" << r
<< dendl
;
2403 /* continue anyway */
2407 info
= std::move(orig_info
);
2408 /* ret == -EEXIST here */
2413 /* this is highly unlikely */
2414 ldpp_dout(dpp
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
2418 // returns true on success, false on failure
2419 bool RGWRados::get_obj_data_pool(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
2421 return rgw_get_obj_data_pool(svc
.zone
->get_zonegroup(), svc
.zone
->get_zone_params(), placement_rule
, obj
, pool
);
2424 bool RGWRados::obj_to_raw(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
2426 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
2428 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
2431 std::string
RGWRados::get_cluster_fsid(const DoutPrefixProvider
*dpp
, optional_yield y
)
2433 return svc
.rados
->cluster_fsid();
// Open an IoCtx on the data pool that holds this object's head (resolved
// from the bucket's placement rule) and set the ioctx locator key so that
// subsequent operations address the head object correctly.
// NOTE(review): interior lines (local oid/key/pool declarations and
// error-return paths) were elided in this chunk; code below is untouched.
2436 int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
2439 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
2442 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2443 ldpp_dout(dpp
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2447 int r
= open_pool_ctx(dpp
, pool
, *ioctx
, false);
2452 ioctx
->locator_set_key(key
);
// Fill a rgw_rados_ref for the object's head: compute oid/locator, resolve
// the data pool from the target placement rule, open the pool handle (not
// mostly-omap), and set the locator key on the resulting ioctx.
// NOTE(review): some signature/param and error-return lines were elided in
// this chunk; code below is untouched.
2457 int RGWRados::get_obj_head_ref(const DoutPrefixProvider
*dpp
,
2458 const rgw_placement_rule
& target_placement_rule
,
2462 get_obj_bucket_and_oid_loc(obj
, ref
->obj
.oid
, ref
->obj
.loc
);
2465 if (!get_obj_data_pool(target_placement_rule
, obj
, &pool
)) {
2466 ldpp_dout(dpp
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2470 ref
->pool
= svc
.rados
->pool(pool
);
2472 int r
= ref
->pool
.open(dpp
, RGWSI_RADOS::OpenParams()
2473 .set_mostly_omap(false));
2475 ldpp_dout(dpp
, 0) << "ERROR: failed opening data pool (pool=" << pool
<< "); r=" << r
<< dendl
;
2479 ref
->pool
.ioctx().locator_set_key(ref
->obj
.loc
);
// Convenience overload: resolve the head ref using the bucket's own
// placement rule (delegates to the placement-rule overload above).
2484 int RGWRados::get_obj_head_ref(const DoutPrefixProvider
*dpp
,
2485 const RGWBucketInfo
& bucket_info
,
2489 return get_obj_head_ref(dpp
, bucket_info
.placement_rule
, obj
, ref
);
// Build a rados ref for a raw object. If the caller passed an empty oid,
// fall back to using the pool name as the oid inside the zone's domain_root
// pool (presumably for pool-level objects — TODO confirm with callers).
// Opens the pool handle and sets the locator key on its ioctx.
2492 int RGWRados::get_raw_obj_ref(const DoutPrefixProvider
*dpp
, const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2496 if (ref
->obj
.oid
.empty()) {
2497 ref
->obj
.oid
= obj
.pool
.to_str();
2498 ref
->obj
.pool
= svc
.zone
->get_zone_params().domain_root
;
2500 ref
->pool
= svc
.rados
->pool(obj
.pool
);
2501 int r
= ref
->pool
.open(dpp
, RGWSI_RADOS::OpenParams()
2502 .set_mostly_omap(false));
2504 ldpp_dout(dpp
, 0) << "ERROR: failed opening pool (pool=" << obj
.pool
<< "); r=" << r
<< dendl
;
2508 ref
->pool
.ioctx().locator_set_key(ref
->obj
.loc
);
2513 int RGWRados::get_system_obj_ref(const DoutPrefixProvider
*dpp
, const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2515 return get_raw_obj_ref(dpp
, obj
, ref
);
// Repair a head object that was written without its expected locator:
// read the object's data and xattrs via an empty locator, rewrite them
// under the correct locator, and (per the remove_bad/copy_obj flags)
// remove the badly-located original.
// NOTE(review): `#define HEAD_SIZE 512 * 1024` is unparenthesized — safe
// in the visible uses, but fragile if the macro ever appears inside a
// larger arithmetic expression; consider (512 * 1024).
// NOTE(review): interior lines (early returns, declarations of oid,
// locator, size, data) were elided in this chunk; code below is untouched.
2519 * fixes an issue where head objects were supposed to have a locator created, but ended
2522 int RGWRados::fix_head_obj_locator(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
2524 const rgw_bucket
& bucket
= bucket_info
.bucket
;
2528 rgw_obj
obj(bucket
, key
);
2530 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
2532 if (locator
.empty()) {
2533 ldpp_dout(dpp
, 20) << "object does not have a locator, nothing to fix" << dendl
;
2537 librados::IoCtx ioctx
;
2539 int ret
= get_obj_head_ioctx(dpp
, bucket_info
, obj
, &ioctx
);
2541 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
2544 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
2549 struct timespec mtime_ts
;
2550 map
<string
, bufferlist
> attrs
;
2551 librados::ObjectReadOperation op
;
2552 op
.getxattrs(&attrs
, NULL
);
2553 op
.stat2(&size
, &mtime_ts
, NULL
);
2554 #define HEAD_SIZE 512 * 1024
2555 op
.read(0, HEAD_SIZE
, &data
, NULL
);
2557 ret
= rgw_rados_operate(dpp
, ioctx
, oid
, &op
, &data
, null_yield
);
2559 ldpp_dout(dpp
, -1) << "ERROR: rgw_rados_operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
2563 if (size
> HEAD_SIZE
) {
2564 ldpp_dout(dpp
, -1) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
2568 if (size
!= data
.length()) {
2569 ldpp_dout(dpp
, -1) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
2574 librados::ObjectWriteOperation wop
;
2576 wop
.mtime2(&mtime_ts
);
2578 map
<string
, bufferlist
>::iterator iter
;
2579 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
2580 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
2585 ioctx
.locator_set_key(locator
);
2586 rgw_rados_operate(dpp
, ioctx
, oid
, &wop
, null_yield
);
2590 ioctx
.locator_set_key(string());
2592 ret
= ioctx
.remove(oid
);
2594 ldpp_dout(dpp
, -1) << "ERROR: failed to remove original bad object" << dendl
;
// Copy a rados object from (src_oid, src_locator) to (dst_oid, dst_locator)
// in COPY_BUF_SIZE chunks (preserving mtime, exclusive-create on the dest),
// then remove the source. A copy onto itself (same oid AND same locator) is
// rejected up front. On a short copy the destination is left in place (see
// the TODO below about cleaning it up).
// NOTE(review): interior lines (loop header, error checks, declarations of
// size/ofs/mtime/data/done/ret) were elided in this chunk; code untouched.
2602 int RGWRados::move_rados_obj(const DoutPrefixProvider
*dpp
,
2603 librados::IoCtx
& src_ioctx
,
2604 const string
& src_oid
, const string
& src_locator
,
2605 librados::IoCtx
& dst_ioctx
,
2606 const string
& dst_oid
, const string
& dst_locator
)
2609 #define COPY_BUF_SIZE (4 * 1024 * 1024)
2611 uint64_t chunk_size
= COPY_BUF_SIZE
;
2615 struct timespec mtime_ts
;
2618 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
2622 src_ioctx
.locator_set_key(src_locator
);
2623 dst_ioctx
.locator_set_key(dst_locator
);
2627 ObjectReadOperation rop
;
2628 ObjectWriteOperation wop
;
2631 rop
.stat2(&size
, &mtime_ts
, NULL
);
2632 mtime
= real_clock::from_timespec(mtime_ts
);
2634 rop
.read(ofs
, chunk_size
, &data
, NULL
);
2635 ret
= rgw_rados_operate(dpp
, src_ioctx
, src_oid
, &rop
, &data
, null_yield
);
2640 if (data
.length() == 0) {
2645 wop
.create(true); /* make it exclusive */
2646 wop
.mtime2(&mtime_ts
);
2647 mtime
= real_clock::from_timespec(mtime_ts
);
2649 wop
.write(ofs
, data
);
2650 ret
= rgw_rados_operate(dpp
, dst_ioctx
, dst_oid
, &wop
, null_yield
);
2654 ofs
+= data
.length();
2655 done
= data
.length() != chunk_size
;
2659 ldpp_dout(dpp
, -1) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
2660 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
2665 src_ioctx
.remove(src_oid
);
2670 // TODO: clean up dst_oid if we created it
2671 ldpp_dout(dpp
, -1) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
// Walk the object's manifest and, for each tail object, check whether it is
// only reachable under a "bad" locator (the key name prefixed with the
// bucket marker). If so — and if `fix` is requested — move it to the
// correct locator via move_rados_obj(). Head-namespace entries (empty ns)
// are skipped; only tail objects are of interest here.
// NOTE(review): interior lines (error checks, declarations of ref, loc,
// oid, locator, bad_loc, the fix/need_fix handling) were elided in this
// chunk; code below is untouched.
2676 * fixes an issue where head objects were supposed to have a locator created, but ended
2679 int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
, optional_yield y
)
2681 const rgw_bucket
& bucket
= bucket_info
.bucket
;
2682 rgw_obj
obj(bucket
, key
);
2689 int r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
2694 RGWObjState
*astate
= NULL
;
2695 RGWObjectCtx
rctx(this->store
);
2696 r
= get_obj_state(dpp
, &rctx
, bucket_info
, obj
, &astate
, false, y
);
2700 if (astate
->manifest
) {
2701 RGWObjManifest::obj_iterator miter
;
2702 RGWObjManifest
& manifest
= *astate
->manifest
;
2703 for (miter
= manifest
.obj_begin(dpp
); miter
!= manifest
.obj_end(dpp
); ++miter
) {
2704 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(store
);
2709 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
2711 if (loc
.key
.ns
.empty()) {
2712 /* continue, we're only interested in tail objects */
2716 auto& ioctx
= ref
.pool
.ioctx();
2718 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
2719 ref
.pool
.ioctx().locator_set_key(locator
);
2721 ldpp_dout(dpp
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
2723 r
= ioctx
.stat(oid
, NULL
, NULL
);
2729 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
2731 /* create a new ioctx with the bad locator */
2732 librados::IoCtx src_ioctx
;
2733 src_ioctx
.dup(ioctx
);
2734 src_ioctx
.locator_set_key(bad_loc
);
2736 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
2738 /* cannot find a broken part */
2741 ldpp_dout(dpp
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
2746 r
= move_rados_obj(dpp
, src_ioctx
, oid
, bad_loc
, ioctx
, oid
, locator
);
2748 ldpp_dout(dpp
, -1) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
// Initialize this BucketShard from a bucket + object: load the bucket
// instance info (into the caller's buffer when provided, else a local one)
// and open the index shard chosen by the object's hash, recording both the
// shard object and the shard id.
// NOTE(review): interior lines (early returns, obj setup) were elided in
// this chunk; code below is untouched.
2757 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
2759 RGWBucketInfo
* bucket_info_out
,
2760 const DoutPrefixProvider
*dpp
)
2764 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2766 RGWBucketInfo bucket_info
;
2767 RGWBucketInfo
* bucket_info_p
=
2768 bucket_info_out
? bucket_info_out
: &bucket_info
;
2770 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
, null_yield
, dpp
);
2777 ret
= store
->svc
.bi_rados
->open_bucket_index_shard(dpp
, *bucket_info_p
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
2779 ldpp_dout(dpp
, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2782 ldpp_dout(dpp
, 20) << " bucket index object: " << bucket_obj
.get_raw_obj() << dendl
;
// Initialize this BucketShard from a bucket + explicit shard id and index
// layout generation: load the bucket instance info, then open the indicated
// index shard object.
// NOTE(review): interior lines (early returns, shard_id assignment) were
// elided in this chunk; code below is untouched.
2787 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
2788 int sid
, const rgw::bucket_index_layout_generation
& idx_layout
,
2789 RGWBucketInfo
* bucket_info_out
,
2790 const DoutPrefixProvider
*dpp
)
2795 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2798 RGWBucketInfo bucket_info
;
2799 RGWBucketInfo
* bucket_info_p
=
2800 bucket_info_out
? bucket_info_out
: &bucket_info
;
2801 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
, null_yield
, dpp
);
2808 ret
= store
->svc
.bi_rados
->open_bucket_index_shard(dpp
, *bucket_info_p
, shard_id
, idx_layout
, &bucket_obj
);
2810 ldpp_dout(dpp
, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2813 ldpp_dout(dpp
, 20) << " bucket index oid: " << bucket_obj
.get_raw_obj() << dendl
;
// Initialize this BucketShard from an already-loaded RGWBucketInfo and an
// object: open the index shard chosen by the object's hash.
// NOTE(review): some parameter and early-return lines were elided in this
// chunk; code below is untouched.
2818 int RGWRados::BucketShard::init(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
,
2821 bucket
= bucket_info
.bucket
;
2823 int ret
= store
->svc
.bi_rados
->open_bucket_index_shard(dpp
, bucket_info
,
2824 obj
.get_hash_object(),
2828 ldpp_dout(dpp
, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2831 ldpp_dout(dpp
, 20) << " bucket index object: " << bucket_obj
<< dendl
;
// Initialize this BucketShard from an already-loaded RGWBucketInfo, an
// index layout generation and an explicit shard id.
// NOTE(review): interior lines (shard_id assignment, early returns) were
// elided in this chunk; code below is untouched.
2836 int RGWRados::BucketShard::init(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw::bucket_index_layout_generation
& idx_layout
, int sid
)
2838 bucket
= bucket_info
.bucket
;
2841 int ret
= store
->svc
.bi_rados
->open_bucket_index_shard(dpp
, bucket_info
, shard_id
, idx_layout
, &bucket_obj
);
2843 ldpp_dout(dpp
, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
2846 ldpp_dout(dpp
, 20) << " bucket index object: " << bucket_obj
<< dendl
;
// Page through the bucket listing (prefix/delimiter-filtered, MAX_LIST_OBJS
// entries per round) remembering the last entry of each page; once the
// listing is exhausted, invoke `handler` on that final entry. An empty
// listing means there is nothing to run the handler on.
// NOTE(review): interior lines (the do-loop header, error handling, the
// empty-listing return) were elided in this chunk; code below is untouched.
2852 /* Execute @handler on last item in bucket listing for bucket specified
2853 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2854 * to objects matching these criterias. */
2855 int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider
*dpp
,
2856 RGWBucketInfo
& bucket_info
,
2857 const std::string
& obj_prefix
,
2858 const std::string
& obj_delim
,
2859 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
2861 RGWRados::Bucket
target(this, bucket_info
);
2862 RGWRados::Bucket::List
list_op(&target
);
2864 list_op
.params
.prefix
= obj_prefix
;
2865 list_op
.params
.delim
= obj_delim
;
2867 ldpp_dout(dpp
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
2868 << ", obj_prefix=" << obj_prefix
2869 << ", obj_delim=" << obj_delim
2872 bool is_truncated
= false;
2874 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
2875 /* We need to rewind to the last object in a listing. */
2877 /* List bucket entries in chunks. */
2878 static constexpr int MAX_LIST_OBJS
= 100;
2879 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
2881 int ret
= list_op
.list_objects(dpp
, MAX_LIST_OBJS
, &entries
, nullptr,
2882 &is_truncated
, null_yield
);
2885 } else if (!entries
.empty()) {
2886 last_entry
= entries
.back();
2888 } while (is_truncated
);
2891 return handler(*last_entry
);
2894 /* Empty listing - no items we can run handler on. */
2898 bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket
* bucket
) const
2900 return bucket
->get_info().has_swift_versioning() &&
2901 bucket
->get_info().swift_ver_location
.size();
// Before an overwrite, archive the current version of `obj` into the
// bucket named by swift_ver_location: stat the object, build the archive
// key "%03x<name>/<sec>.<usec>", verify the destination bucket exists and
// is owned by the same owner (else -ERR_PRECONDITION_FAILED), then copy
// the object there. -ECANCELED/-ENOENT from the copy means another rgw
// already raced us, which is tolerated.
// NOTE(review): `char buf[src_name.size() + 32]` is a variable-length
// array — a compiler extension, not standard C++; size is bounded by the
// object name length.
// NOTE(review): interior lines (early returns, several copy_obj arguments)
// were elided in this chunk; code below is untouched.
2904 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
2905 const rgw_user
& user
,
2906 rgw::sal::Bucket
* bucket
,
2907 rgw::sal::Object
* obj
,
2908 const DoutPrefixProvider
*dpp
,
2911 if (! swift_versioning_enabled(bucket
)) {
2915 obj
->set_atomic(&obj_ctx
);
2917 RGWObjState
* state
= nullptr;
2918 int r
= get_obj_state(dpp
, &obj_ctx
, bucket
->get_info(), obj
->get_obj(), &state
, false, y
);
2923 if (!state
->exists
) {
2927 const string
& src_name
= obj
->get_oid();
2928 char buf
[src_name
.size() + 32];
2929 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
2930 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
2931 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
2933 RGWBucketInfo dest_bucket_info
;
2935 r
= get_bucket_info(&svc
, bucket
->get_tenant(), bucket
->get_info().swift_ver_location
, dest_bucket_info
, NULL
, null_yield
, NULL
);
2937 ldpp_dout(dpp
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
2939 return -ERR_PRECONDITION_FAILED
;
2944 if (dest_bucket_info
.owner
!= bucket
->get_info().owner
) {
2945 return -ERR_PRECONDITION_FAILED
;
2948 rgw::sal::RadosBucket
dest_bucket(store
, dest_bucket_info
);
2949 rgw::sal::RadosObject
dest_obj(store
, rgw_obj_key(buf
), &dest_bucket
);
2951 if (dest_bucket_info
.versioning_enabled()){
2952 dest_obj
.gen_rand_obj_instance_name();
2955 dest_obj
.set_atomic(&obj_ctx
);
2957 rgw_zone_id no_zone
;
2959 r
= copy_obj(obj_ctx
,
2961 NULL
, /* req_info *info */
2967 bucket
->get_placement_rule(),
2968 NULL
, /* time_t *src_mtime */
2969 NULL
, /* time_t *mtime */
2970 NULL
, /* const time_t *mod_ptr */
2971 NULL
, /* const time_t *unmod_ptr */
2972 false, /* bool high_precision_time */
2973 NULL
, /* const char *if_match */
2974 NULL
, /* const char *if_nomatch */
2975 RGWRados::ATTRSMOD_NONE
,
2976 true, /* bool copy_if_newer */
2978 RGWObjCategory::Main
,
2979 0, /* uint64_t olh_epoch */
2980 real_time(), /* time_t delete_at */
2981 NULL
, /* string *version_id */
2982 NULL
, /* string *ptag */
2983 NULL
, /* string *petag */
2984 NULL
, /* void (*progress_cb)(off_t, void *) */
2985 NULL
, /* void *progress_data */
2988 if (r
== -ECANCELED
|| r
== -ENOENT
) {
2989 /* Has already been overwritten, meaning another rgw process already
// Restore the most recent archived version of `obj` from the
// swift_ver_location bucket: find the last entry under the
// "%03x<name>" prefix via on_last_entry_in_listing(), copy it back over
// `obj`, and delete the archived copy on success. Refuses to operate when
// the archive bucket belongs to a different owner, or when the archive
// bucket is S3-versioned (-ERR_PRECONDITION_FAILED).
// NOTE(review): interior lines (early returns, several copy_obj arguments,
// the restored-flag handling) were elided in this chunk; code untouched.
2997 int RGWRados::swift_versioning_restore(RGWObjectCtx
& obj_ctx
,
2998 const rgw_user
& user
,
2999 rgw::sal::Bucket
* bucket
,
3000 rgw::sal::Object
* obj
,
3001 bool& restored
, /* out */
3002 const DoutPrefixProvider
*dpp
)
3004 if (! swift_versioning_enabled(bucket
)) {
3008 /* Bucket info of the bucket that stores previous versions of our object. */
3009 RGWBucketInfo archive_binfo
;
3011 int ret
= get_bucket_info(&svc
, bucket
->get_tenant(),
3012 bucket
->get_info().swift_ver_location
,
3013 archive_binfo
, nullptr, null_yield
, nullptr);
3018 /* Abort the operation if the bucket storing our archive belongs to someone
3019 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3020 * into consideration. For we can live with that.
3022 * TODO: delegate this check to un upper layer and compare with ACLs. */
3023 if (bucket
->get_info().owner
!= archive_binfo
.owner
) {
3027 /* This code will be executed on latest version of the object. */
3028 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
3029 rgw_zone_id no_zone
;
3031 /* We don't support object versioning of Swift API on those buckets that
3032 * are already versioned using the S3 mechanism. This affects also bucket
3033 * storing archived objects. Otherwise the delete operation would create
3034 * a deletion marker. */
3035 if (archive_binfo
.versioned()) {
3037 return -ERR_PRECONDITION_FAILED
;
3040 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3041 * irrelevant and may be safely skipped. */
3042 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
3044 rgw::sal::RadosBucket
archive_bucket(store
, archive_binfo
);
3045 rgw::sal::RadosObject
archive_obj(store
, entry
.key
, &archive_bucket
);
3047 if (bucket
->versioning_enabled()){
3048 obj
->gen_rand_obj_instance_name();
3051 archive_obj
.set_atomic(&obj_ctx
);
3052 obj
->set_atomic(&obj_ctx
);
3054 int ret
= copy_obj(obj_ctx
,
3056 nullptr, /* req_info *info */
3059 &archive_obj
, /* src obj */
3060 bucket
, /* dest bucket info */
3061 &archive_bucket
, /* src bucket info */
3062 bucket
->get_placement_rule(), /* placement_rule */
3063 nullptr, /* time_t *src_mtime */
3064 nullptr, /* time_t *mtime */
3065 nullptr, /* const time_t *mod_ptr */
3066 nullptr, /* const time_t *unmod_ptr */
3067 false, /* bool high_precision_time */
3068 nullptr, /* const char *if_match */
3069 nullptr, /* const char *if_nomatch */
3070 RGWRados::ATTRSMOD_NONE
,
3071 true, /* bool copy_if_newer */
3073 RGWObjCategory::Main
,
3074 0, /* uint64_t olh_epoch */
3075 real_time(), /* time_t delete_at */
3076 nullptr, /* string *version_id */
3077 nullptr, /* string *ptag */
3078 nullptr, /* string *petag */
3079 nullptr, /* void (*progress_cb)(off_t, void *) */
3080 nullptr, /* void *progress_data */
3083 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
3084 /* Has already been overwritten, meaning another rgw process already
3087 } else if (ret
< 0) {
3093 /* Need to remove the archived copy. */
3094 ret
= delete_obj(dpp
, obj_ctx
, archive_binfo
, archive_obj
.get_obj(),
3095 archive_binfo
.versioning_status());
3100 const std::string
& obj_name
= obj
->get_oid();
3101 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
3104 return on_last_entry_in_listing(dpp
, archive_binfo
, prefix
, std::string(),
// Core head-object write path: builds one compound ObjectWriteOperation
// (atomic-modification guards, mtime, optional full data write, xattrs:
// retention, OLH id tag, manifest, etag/content-type/acl/pg-ver/source-zone/
// storage-class), brackets it with bucket-index prepare/complete, then
// handles OLH linkage, expiration hints and quota-stat updates. On rados
// failure the index op is cancelled and, depending on if-match/if-nomatch
// preconditions, -ECANCELED/-ENOENT/-EEXIST races are either treated as
// success or mapped to -ERR_PRECONDITION_FAILED.
// NOTE(review): this function's statement order is load-bearing (prepare ->
// operate -> complete, with cancel on the error path); many interior lines
// (declarations of state/ref/etag/acl_bl/epoch/poolid, gotos/early returns)
// were elided in this chunk; code below is untouched.
3108 int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider
*dpp
,
3109 uint64_t size
, uint64_t accounted_size
,
3110 map
<string
, bufferlist
>& attrs
,
3111 bool assume_noent
, bool modify_tail
,
3112 void *_index_op
, optional_yield y
)
3114 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
3115 RGWRados
*store
= target
->get_store();
3117 ObjectWriteOperation op
;
3119 const struct req_state
* s
= get_req_state();
3123 req_id
= store
->svc
.zone_utils
->unique_id(store
->get_new_req_id());
3130 int r
= target
->get_state(dpp
, &state
, false, y
, assume_noent
);
3134 rgw_obj
& obj
= target
->get_obj();
3136 if (obj
.get_oid().empty()) {
3137 ldpp_dout(dpp
, 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
3142 r
= store
->get_obj_head_ref(dpp
, target
->get_meta_placement_rule(), obj
, &ref
);
3146 bool is_olh
= state
->is_olh
;
3148 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
3150 const string
*ptag
= meta
.ptag
;
3151 if (!ptag
&& !index_op
->get_optag()->empty()) {
3152 ptag
= index_op
->get_optag();
3154 r
= target
->prepare_atomic_modification(dpp
, op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false, modify_tail
, y
);
3158 if (real_clock::is_zero(meta
.set_mtime
)) {
3159 meta
.set_mtime
= real_clock::now();
3162 if (target
->bucket_info
.obj_lock_enabled() && target
->bucket_info
.obj_lock
.has_rule() && meta
.flags
== PUT_OBJ_CREATE
) {
3163 auto iter
= attrs
.find(RGW_ATTR_OBJECT_RETENTION
);
3164 if (iter
== attrs
.end()) {
3165 real_time lock_until_date
= target
->bucket_info
.obj_lock
.get_lock_until_date(meta
.set_mtime
);
3166 string mode
= target
->bucket_info
.obj_lock
.get_mode();
3167 RGWObjectRetention
obj_retention(mode
, lock_until_date
);
3169 obj_retention
.encode(bl
);
3170 op
.setxattr(RGW_ATTR_OBJECT_RETENTION
, bl
);
3174 if (state
->is_olh
) {
3175 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
3178 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
3179 op
.mtime2(&mtime_ts
);
3182 /* if we want to overwrite the data, we also want to overwrite the
3183 xattrs, so just remove the object */
3184 op
.write_full(*meta
.data
);
3185 if (state
->compressed
) {
3186 uint32_t alloc_hint_flags
= librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE
;
3187 op
.set_alloc_hint2(0, 0, alloc_hint_flags
);
3192 string content_type
;
3194 string storage_class
;
3196 map
<string
, bufferlist
>::iterator iter
;
3198 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
3199 const string
& name
= iter
->first
;
3200 op
.rmxattr(name
.c_str());
3204 if (meta
.manifest
) {
3205 storage_class
= meta
.manifest
->get_tail_placement().placement_rule
.storage_class
;
3207 /* remove existing manifest attr */
3208 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
3209 if (iter
!= attrs
.end())
3213 encode(*meta
.manifest
, bl
);
3214 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
3217 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3218 const string
& name
= iter
->first
;
3219 bufferlist
& bl
= iter
->second
;
3224 op
.setxattr(name
.c_str(), bl
);
3226 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
3227 etag
= rgw_bl_str(bl
);
3228 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
3229 content_type
= rgw_bl_str(bl
);
3230 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
3234 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
3235 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
3238 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
3240 encode(store
->svc
.zone
->get_zone_short_id(), bl
);
3241 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
3244 if (!storage_class
.empty()) {
3246 bl
.append(storage_class
);
3247 op
.setxattr(RGW_ATTR_STORAGE_CLASS
, bl
);
3258 if (!reset_obj
) { //Multipart upload, it has immutable head.
3259 orig_exists
= false;
3262 orig_exists
= state
->exists
;
3263 orig_size
= state
->accounted_size
;
3266 bool versioned_target
= (meta
.olh_epoch
&& *meta
.olh_epoch
> 0) ||
3267 !obj
.key
.instance
.empty();
3269 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
3272 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
3275 if (!index_op
->is_prepared()) {
3276 tracepoint(rgw_rados
, prepare_enter
, req_id
.c_str());
3277 r
= index_op
->prepare(dpp
, CLS_RGW_OP_ADD
, &state
->write_tag
, y
);
3278 tracepoint(rgw_rados
, prepare_exit
, req_id
.c_str());
3283 auto& ioctx
= ref
.pool
.ioctx();
3285 tracepoint(rgw_rados
, operate_enter
, req_id
.c_str());
3286 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
3287 tracepoint(rgw_rados
, operate_exit
, req_id
.c_str());
3288 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
3289 or -ENOENT if was removed, or -EEXIST if it did not exist
3290 before and now it does */
3291 if (r
== -EEXIST
&& assume_noent
) {
3292 target
->invalidate_state();
3298 epoch
= ioctx
.get_last_version();
3299 poolid
= ioctx
.get_id();
3301 r
= target
->complete_atomic_modification(dpp
);
3303 ldpp_dout(dpp
, 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
3306 tracepoint(rgw_rados
, complete_enter
, req_id
.c_str());
3307 r
= index_op
->complete(dpp
, poolid
, epoch
, size
, accounted_size
,
3308 meta
.set_mtime
, etag
, content_type
,
3309 storage_class
, &acl_bl
,
3310 meta
.category
, meta
.remove_objs
, meta
.user_data
, meta
.appendable
);
3311 tracepoint(rgw_rados
, complete_exit
, req_id
.c_str());
3316 *meta
.mtime
= meta
.set_mtime
;
3319 /* note that index_op was using state so we couldn't invalidate it earlier */
3320 target
->invalidate_state();
3323 if (versioned_op
&& meta
.olh_epoch
) {
3324 r
= store
->set_olh(dpp
, target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, *meta
.olh_epoch
, real_time(), false, y
, meta
.zones_trace
);
3330 if (!real_clock::is_zero(meta
.delete_at
)) {
3331 rgw_obj_index_key obj_key
;
3332 obj
.key
.get_index_key(&obj_key
);
3334 r
= store
->obj_expirer
->hint_add(dpp
, meta
.delete_at
, obj
.bucket
.tenant
, obj
.bucket
.name
,
3335 obj
.bucket
.bucket_id
, obj_key
);
3337 ldpp_dout(dpp
, 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
3338 /* ignoring error, nothing we can do at this point */
3341 meta
.canceled
= false;
3343 /* update quota cache */
3344 if (meta
.completeMultipart
){
3345 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3349 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3350 accounted_size
, orig_size
);
3355 int ret
= index_op
->cancel(dpp
, meta
.remove_objs
);
3357 ldpp_dout(dpp
, 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
3360 meta
.canceled
= true;
3362 /* we lost in a race. There are a few options:
3363 * - existing object was rewritten (ECANCELED)
3364 * - non existing object was created (EEXIST)
3365 * - object was removed (ENOENT)
3366 * should treat it as a success
3368 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
3369 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
3373 if (meta
.if_match
!= NULL
) {
3374 // only overwrite existing object
3375 if (strcmp(meta
.if_match
, "*") == 0) {
3377 r
= -ERR_PRECONDITION_FAILED
;
3378 } else if (r
== -ECANCELED
) {
3384 if (meta
.if_nomatch
!= NULL
) {
3385 // only create a new object
3386 if (strcmp(meta
.if_nomatch
, "*") == 0) {
3388 r
= -ERR_PRECONDITION_FAILED
;
3389 } else if (r
== -ENOENT
) {
// Public wrapper around _do_write_meta(): sets up the bucket-index update
// op, and when no if-match/if-nomatch preconditions are present first tries
// the write with assume_noent=true, retrying with assume_noent=false when
// that optimistic attempt fails (visible via the retry branch below).
// NOTE(review): interior lines (the first _do_write_meta call's error check
// and the trailing return) were elided in this chunk; code is untouched.
3399 int RGWRados::Object::Write::write_meta(const DoutPrefixProvider
*dpp
, uint64_t size
, uint64_t accounted_size
,
3400 map
<string
, bufferlist
>& attrs
, optional_yield y
)
3402 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
3404 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
3405 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
3406 index_op
.set_zones_trace(meta
.zones_trace
);
3408 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
3411 r
= _do_write_meta(dpp
, size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
, y
);
3413 assume_noent
= false;
3416 if (!assume_noent
) {
3417 r
= _do_write_meta(dpp
, size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
, y
);
// Receive callback for streaming a fetched (sync/copy) object into the
// local store. The first extra_data_left bytes of the stream are JSON
// "attrs" metadata; once consumed, process_attrs() strips source
// compression info and OLH attrs, captures the manifest for etag
// verification, and wires up the filter chain
// (buffering -> compressor -> etag verifier -> object processor).
// handle_data() then feeds payload chunks through the filter chain.
// NOTE(review): interior lines (ctor initializers, some branches, the
// class epilogue) were elided in this chunk; code below is untouched.
3422 class RGWRadosPutObj
: public RGWHTTPStreamRWRequest::ReceiveCB
3424 const DoutPrefixProvider
*dpp
;
3427 rgw::sal::DataProcessor
*filter
;
3428 boost::optional
<RGWPutObj_Compress
>& compressor
;
3429 bool try_etag_verify
;
3430 rgw::putobj::etag_verifier_ptr etag_verifier
;
3431 boost::optional
<rgw::putobj::ChunkProcessor
> buffering
;
3432 CompressorRef
& plugin
;
3433 rgw::sal::ObjectProcessor
*processor
;
3434 void (*progress_cb
)(off_t
, void *);
3435 void *progress_data
;
3436 bufferlist extra_data_bl
, manifest_bl
;
3437 std::optional
<RGWCompressionInfo
> compression_info
;
3438 uint64_t extra_data_left
{0};
3439 bool need_to_process_attrs
{true};
3440 uint64_t data_len
{0};
3441 map
<string
, bufferlist
> src_attrs
;
3443 uint64_t lofs
{0}; /* logical ofs */
3444 std::function
<int(map
<string
, bufferlist
>&)> attrs_handler
;
3447 RGWRadosPutObj(const DoutPrefixProvider
*dpp
,
3449 CompressorRef
& plugin
,
3450 boost::optional
<RGWPutObj_Compress
>& compressor
,
3451 rgw::sal::ObjectProcessor
*p
,
3452 void (*_progress_cb
)(off_t
, void *),
3453 void *_progress_data
,
3454 std::function
<int(map
<string
, bufferlist
>&)> _attrs_handler
) :
3458 compressor(compressor
),
3459 try_etag_verify(cct
->_conf
->rgw_sync_obj_etag_verify
),
3462 progress_cb(_progress_cb
),
3463 progress_data(_progress_data
),
3464 attrs_handler(_attrs_handler
) {}
3467 int process_attrs(void) {
3468 if (extra_data_bl
.length()) {
3470 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3471 ldpp_dout(dpp
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3475 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3477 auto iter
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
3478 if (iter
!= src_attrs
.end()) {
3479 const bufferlist bl
= std::move(iter
->second
);
3480 src_attrs
.erase(iter
); // don't preserve source compression info
3482 if (try_etag_verify
) {
3483 // if we're trying to verify etags, we need to convert compressed
3484 // ranges in the manifest back into logical multipart part offsets
3485 RGWCompressionInfo info
;
3486 bool compressed
= false;
3487 int r
= rgw_compression_info_from_attr(bl
, compressed
, info
);
3489 ldpp_dout(dpp
, 4) << "failed to decode compression info, "
3490 "disabling etag verification" << dendl
;
3491 try_etag_verify
= false;
3492 } else if (compressed
) {
3493 compression_info
= std::move(info
);
3497 /* We need the manifest to recompute the ETag for verification */
3498 iter
= src_attrs
.find(RGW_ATTR_MANIFEST
);
3499 if (iter
!= src_attrs
.end()) {
3500 manifest_bl
= std::move(iter
->second
);
3501 src_attrs
.erase(iter
);
3504 // filter out olh attributes
3505 iter
= src_attrs
.lower_bound(RGW_ATTR_OLH_PREFIX
);
3506 while (iter
!= src_attrs
.end()) {
3507 if (!boost::algorithm::starts_with(iter
->first
, RGW_ATTR_OLH_PREFIX
)) {
3510 iter
= src_attrs
.erase(iter
);
3514 int ret
= attrs_handler(src_attrs
);
3519 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
3520 //do not compress if object is encrypted
3521 compressor
= boost::in_place(cct
, plugin
, filter
);
3522 // add a filter that buffers data so we don't try to compress tiny blocks.
3523 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3524 // compression ratio
3525 constexpr unsigned buffer_size
= 512 * 1024;
3526 buffering
= boost::in_place(&*compressor
, buffer_size
);
3527 filter
= &*buffering
;
3531 * Presently we don't support ETag based verification if encryption is
3532 * requested. We can enable simultaneous support once we have a mechanism
3533 * to know the sequence in which the filters must be applied.
3535 if (try_etag_verify
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
3536 ret
= rgw::putobj::create_etag_verifier(dpp
, cct
, filter
, manifest_bl
,
3540 ldpp_dout(dpp
, 4) << "failed to initial etag verifier, "
3541 "disabling etag verification" << dendl
;
3543 filter
= etag_verifier
.get();
3547 need_to_process_attrs
= false;
3552 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3554 progress_cb(data_len
, progress_data
);
3556 if (extra_data_left
) {
3557 uint64_t extra_len
= bl
.length();
3558 if (extra_len
> extra_data_left
)
3559 extra_len
= extra_data_left
;
3562 bl
.splice(0, extra_len
, &extra
);
3563 extra_data_bl
.append(extra
);
3565 extra_data_left
-= extra_len
;
3566 if (extra_data_left
== 0) {
3567 int res
= process_attrs();
3572 if (bl
.length() == 0) {
3576 if (need_to_process_attrs
) {
3577 /* need to call process_attrs() even if we don't get any attrs,
3578 * need it to call attrs_handler().
3580 int res
= process_attrs();
3586 ceph_assert(uint64_t(ofs
) >= extra_data_len
);
3588 uint64_t size
= bl
.length();
3591 const uint64_t lofs
= data_len
;
3594 return filter
->process(std::move(bl
), lofs
);
3598 return filter
->process({}, data_len
);
3601 bufferlist
& get_extra_data() { return extra_data_bl
; }
3603 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
3605 void set_extra_data_len(uint64_t len
) override
{
3606 extra_data_left
= len
;
3607 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len
);
3610 uint64_t get_data_len() {
3614 std::string
get_verifier_etag() {
3615 if (etag_verifier
) {
3616 etag_verifier
->calculate_etag();
3617 return etag_verifier
->get_calculated_etag();
3625 * prepare attrset depending on attrs_mod.
3627 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
3628 map
<string
, bufferlist
>& attrs
,
3629 RGWRados::AttrsMod attrs_mod
)
3631 switch (attrs_mod
) {
3632 case RGWRados::ATTRSMOD_NONE
:
3635 case RGWRados::ATTRSMOD_REPLACE
:
3636 if (!attrs
[RGW_ATTR_ETAG
].length()) {
3637 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
3639 if (!attrs
[RGW_ATTR_TAIL_TAG
].length()) {
3640 auto ttiter
= src_attrs
.find(RGW_ATTR_TAIL_TAG
);
3641 if (ttiter
!= src_attrs
.end()) {
3642 attrs
[RGW_ATTR_TAIL_TAG
] = src_attrs
[RGW_ATTR_TAIL_TAG
];
3646 case RGWRados::ATTRSMOD_MERGE
:
3647 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
3648 if (attrs
.find(it
->first
) == attrs
.end()) {
3649 attrs
[it
->first
] = it
->second
;
3656 int RGWRados::rewrite_obj(rgw::sal::Object
* obj
, const DoutPrefixProvider
*dpp
, optional_yield y
)
3658 RGWObjectCtx
rctx(this->store
);
3659 rgw::sal::Attrs attrset
;
3661 ceph::real_time mtime
;
3662 RGWRados::Object
op_target(this, obj
->get_bucket()->get_info(), rctx
, obj
->get_obj());
3663 RGWRados::Object::Read
read_op(&op_target
);
3665 read_op
.params
.attrs
= &attrset
;
3666 read_op
.params
.obj_size
= &obj_size
;
3667 read_op
.params
.lastmod
= &mtime
;
3669 int ret
= read_op
.prepare(y
, dpp
);
3673 attrset
.erase(RGW_ATTR_ID_TAG
);
3674 attrset
.erase(RGW_ATTR_TAIL_TAG
);
3676 return store
->getRados()->copy_obj_data(rctx
, obj
->get_bucket(),
3677 obj
->get_bucket()->get_info().placement_rule
,
3678 read_op
, obj_size
- 1, obj
, NULL
, mtime
,
3679 attrset
, 0, real_time(), NULL
, dpp
, y
);
3682 struct obj_time_weight
{
3684 uint32_t zone_short_id
;
3686 bool high_precision
;
3688 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3690 bool compare_low_precision(const obj_time_weight
& rhs
) {
3691 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
3692 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
3701 if (!zone_short_id
|| !rhs
.zone_short_id
) {
3702 /* don't compare zone ids, if one wasn't provided */
3705 if (zone_short_id
!= rhs
.zone_short_id
) {
3706 return (zone_short_id
< rhs
.zone_short_id
);
3708 return (pg_ver
< rhs
.pg_ver
);
3712 bool operator<(const obj_time_weight
& rhs
) {
3713 if (!high_precision
|| !rhs
.high_precision
) {
3714 return compare_low_precision(rhs
);
3716 if (mtime
> rhs
.mtime
) {
3719 if (mtime
< rhs
.mtime
) {
3722 if (!zone_short_id
|| !rhs
.zone_short_id
) {
3723 /* don't compare zone ids, if one wasn't provided */
3726 if (zone_short_id
!= rhs
.zone_short_id
) {
3727 return (zone_short_id
< rhs
.zone_short_id
);
3729 return (pg_ver
< rhs
.pg_ver
);
3732 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
3734 zone_short_id
= _short_id
;
3738 void init(RGWObjState
*state
) {
3739 mtime
= state
->mtime
;
3740 zone_short_id
= state
->zone_short_id
;
3741 pg_ver
= state
->pg_ver
;
3745 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
3748 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
3749 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
3755 class RGWGetExtraDataCB
: public RGWHTTPStreamRWRequest::ReceiveCB
{
3756 bufferlist extra_data
;
3758 RGWGetExtraDataCB() {}
3759 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3760 int bl_len
= (int)bl
.length();
3761 if (extra_data
.length() < extra_data_len
) {
3762 off_t max
= extra_data_len
- extra_data
.length();
3766 bl
.splice(0, max
, &extra_data
);
3771 bufferlist
& get_extra_data() {
3776 int RGWRados::stat_remote_obj(const DoutPrefixProvider
*dpp
,
3777 RGWObjectCtx
& obj_ctx
,
3778 const rgw_user
& user_id
,
3780 const rgw_zone_id
& source_zone
,
3781 rgw::sal::Object
* src_obj
,
3782 const RGWBucketInfo
*src_bucket_info
,
3783 real_time
*src_mtime
,
3785 const real_time
*mod_ptr
,
3786 const real_time
*unmod_ptr
,
3787 bool high_precision_time
,
3788 const char *if_match
,
3789 const char *if_nomatch
,
3790 map
<string
, bufferlist
> *pattrs
,
3791 map
<string
, string
> *pheaders
,
3796 /* source is in a different zonegroup, copy from there */
3798 RGWRESTStreamRWRequest
*in_stream_req
;
3800 map
<string
, bufferlist
> src_attrs
;
3801 append_rand_alpha(cct
, tag
, tag
, 32);
3802 obj_time_weight set_mtime_weight
;
3803 set_mtime_weight
.high_precision
= high_precision_time
;
3806 if (source_zone
.empty()) {
3807 if (!src_bucket_info
|| src_bucket_info
->zonegroup
.empty()) {
3808 /* source is in the master zonegroup */
3809 conn
= svc
.zone
->get_master_conn();
3811 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
3812 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
->zonegroup
);
3813 if (iter
== zonegroup_conn_map
.end()) {
3814 ldpp_dout(dpp
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
3817 conn
= iter
->second
;
3820 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
3821 auto iter
= zone_conn_map
.find(source_zone
);
3822 if (iter
== zone_conn_map
.end()) {
3823 ldpp_dout(dpp
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
3826 conn
= iter
->second
;
3829 RGWGetExtraDataCB cb
;
3830 map
<string
, string
> req_headers
;
3831 real_time set_mtime
;
3833 const real_time
*pmod
= mod_ptr
;
3835 obj_time_weight dest_mtime_weight
;
3837 constexpr bool prepend_meta
= true;
3838 constexpr bool get_op
= true;
3839 constexpr bool rgwx_stat
= true;
3840 constexpr bool sync_manifest
= true;
3841 constexpr bool skip_decrypt
= true;
3842 int ret
= conn
->get_obj(dpp
, user_id
, info
, src_obj
, pmod
, unmod_ptr
,
3843 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
3844 prepend_meta
, get_op
, rgwx_stat
,
3845 sync_manifest
, skip_decrypt
,
3846 true, &cb
, &in_stream_req
);
3851 ret
= conn
->complete_request(in_stream_req
, nullptr, &set_mtime
, psize
,
3852 nullptr, pheaders
, null_yield
);
3857 bufferlist
& extra_data_bl
= cb
.get_extra_data();
3858 if (extra_data_bl
.length()) {
3860 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3861 ldpp_dout(dpp
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3865 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3867 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
3871 *src_mtime
= set_mtime
;
3875 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
3876 if (iter
!= src_attrs
.end()) {
3877 bufferlist
& etagbl
= iter
->second
;
3878 *petag
= etagbl
.to_str();
3879 while (petag
->size() > 0 && (*petag
)[petag
->size() - 1] == '\0') {
3880 *petag
= petag
->substr(0, petag
->size() - 1);
3886 *pattrs
= std::move(src_attrs
);
3892 int RGWFetchObjFilter_Default::filter(CephContext
*cct
,
3893 const rgw_obj_key
& source_key
,
3894 const RGWBucketInfo
& dest_bucket_info
,
3895 std::optional
<rgw_placement_rule
> dest_placement_rule
,
3896 const map
<string
, bufferlist
>& obj_attrs
,
3897 std::optional
<rgw_user
> *poverride_owner
,
3898 const rgw_placement_rule
**prule
)
3900 const rgw_placement_rule
*ptail_rule
= (dest_placement_rule
? &(*dest_placement_rule
) : nullptr);
3902 auto iter
= obj_attrs
.find(RGW_ATTR_STORAGE_CLASS
);
3903 if (iter
!= obj_attrs
.end()) {
3904 dest_rule
.storage_class
= iter
->second
.to_str();
3905 dest_rule
.inherit_from(dest_bucket_info
.placement_rule
);
3906 ptail_rule
= &dest_rule
;
3908 ptail_rule
= &dest_bucket_info
.placement_rule
;
3911 *prule
= ptail_rule
;
3915 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
3916 const rgw_user
& user_id
,
3918 const rgw_zone_id
& source_zone
,
3919 rgw::sal::Object
* dest_obj
,
3920 rgw::sal::Object
* src_obj
,
3921 rgw::sal::Bucket
* dest_bucket
,
3922 rgw::sal::Bucket
* src_bucket
,
3923 std::optional
<rgw_placement_rule
> dest_placement_rule
,
3924 real_time
*src_mtime
,
3926 const real_time
*mod_ptr
,
3927 const real_time
*unmod_ptr
,
3928 bool high_precision_time
,
3929 const char *if_match
,
3930 const char *if_nomatch
,
3933 rgw::sal::Attrs
& attrs
,
3934 RGWObjCategory category
,
3935 std::optional
<uint64_t> olh_epoch
,
3936 real_time delete_at
,
3939 void (*progress_cb
)(off_t
, void *),
3940 void *progress_data
,
3941 const DoutPrefixProvider
*dpp
,
3942 RGWFetchObjFilter
*filter
,
3943 rgw_zone_set
*zones_trace
,
3944 std::optional
<uint64_t>* bytes_transferred
)
3946 /* source is in a different zonegroup, copy from there */
3948 RGWRESTStreamRWRequest
*in_stream_req
;
3951 append_rand_alpha(cct
, tag
, tag
, 32);
3952 obj_time_weight set_mtime_weight
;
3953 set_mtime_weight
.high_precision
= high_precision_time
;
3956 rgw::BlockingAioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
3957 using namespace rgw::putobj
;
3958 AtomicObjectProcessor
processor(&aio
, this->store
, nullptr, user_id
,
3959 obj_ctx
, dest_obj
->clone(), olh_epoch
,
3960 tag
, dpp
, null_yield
);
3962 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
3963 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
3964 if (source_zone
.empty()) {
3965 if (!src_bucket
|| src_bucket
->get_info().zonegroup
.empty()) {
3966 /* source is in the master zonegroup */
3967 conn
= svc
.zone
->get_master_conn();
3969 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket
->get_info().zonegroup
);
3970 if (iter
== zonegroup_conn_map
.end()) {
3971 ldpp_dout(dpp
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
3974 conn
= iter
->second
;
3977 auto iter
= zone_conn_map
.find(source_zone
);
3978 if (iter
== zone_conn_map
.end()) {
3979 ldpp_dout(dpp
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
3982 conn
= iter
->second
;
3985 boost::optional
<RGWPutObj_Compress
> compressor
;
3986 CompressorRef plugin
;
3988 RGWFetchObjFilter_Default source_filter
;
3990 filter
= &source_filter
;
3993 std::optional
<rgw_user
> override_owner
;
3995 RGWRadosPutObj
cb(dpp
, cct
, plugin
, compressor
, &processor
, progress_cb
, progress_data
,
3996 [&](map
<string
, bufferlist
>& obj_attrs
) {
3997 const rgw_placement_rule
*ptail_rule
;
3999 int ret
= filter
->filter(cct
,
4001 dest_bucket
->get_info(),
4002 dest_placement_rule
,
4007 ldpp_dout(dpp
, 5) << "Aborting fetch: source object filter returned ret=" << ret
<< dendl
;
4011 processor
.set_tail_placement(*ptail_rule
);
4013 const auto& compression_type
= svc
.zone
->get_zone_params().get_compression_type(*ptail_rule
);
4014 if (compression_type
!= "none") {
4015 plugin
= Compressor::create(cct
, compression_type
);
4017 ldpp_dout(dpp
, 1) << "Cannot load plugin for compression type "
4018 << compression_type
<< dendl
;
4022 ret
= processor
.prepare(null_yield
);
4030 real_time set_mtime
;
4031 uint64_t expected_size
= 0;
4033 RGWObjState
*dest_state
= NULL
;
4035 const real_time
*pmod
= mod_ptr
;
4037 obj_time_weight dest_mtime_weight
;
4039 if (copy_if_newer
) {
4040 /* need to get mtime for destination */
4041 ret
= get_obj_state(dpp
, &obj_ctx
, dest_bucket
->get_info(), dest_obj
->get_obj(), &dest_state
, false, null_yield
);
4045 if (!real_clock::is_zero(dest_state
->mtime
)) {
4046 dest_mtime_weight
.init(dest_state
);
4047 pmod
= &dest_mtime_weight
.mtime
;
4051 static constexpr bool prepend_meta
= true;
4052 static constexpr bool get_op
= true;
4053 static constexpr bool rgwx_stat
= false;
4054 static constexpr bool sync_manifest
= true;
4055 static constexpr bool skip_decrypt
= true;
4056 ret
= conn
->get_obj(dpp
, user_id
, info
, src_obj
, pmod
, unmod_ptr
,
4057 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
4058 prepend_meta
, get_op
, rgwx_stat
,
4059 sync_manifest
, skip_decrypt
,
4061 &cb
, &in_stream_req
);
4066 ret
= conn
->complete_request(in_stream_req
, &etag
, &set_mtime
,
4067 &expected_size
, nullptr, nullptr, null_yield
);
4075 if (cb
.get_data_len() != expected_size
) {
4077 ldpp_dout(dpp
, 0) << "ERROR: object truncated during fetching, expected "
4078 << expected_size
<< " bytes but received " << cb
.get_data_len() << dendl
;
4081 if (compressor
&& compressor
->is_compressed()) {
4083 RGWCompressionInfo cs_info
;
4084 cs_info
.compression_type
= plugin
->get_type_name();
4085 cs_info
.orig_size
= cb
.get_data_len();
4086 cs_info
.compressor_message
= compressor
->get_compressor_message();
4087 cs_info
.blocks
= move(compressor
->get_compression_blocks());
4088 encode(cs_info
, tmp
);
4089 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
4092 if (override_owner
) {
4093 processor
.set_owner(*override_owner
);
4095 auto& obj_attrs
= cb
.get_attrs();
4097 RGWUserInfo owner_info
;
4098 if (ctl
.user
->get_info_by_uid(dpp
, *override_owner
, &owner_info
, null_yield
) < 0) {
4099 ldpp_dout(dpp
, 10) << "owner info does not exist" << dendl
;
4103 RGWAccessControlPolicy acl
;
4105 auto aiter
= obj_attrs
.find(RGW_ATTR_ACL
);
4106 if (aiter
== obj_attrs
.end()) {
4107 ldpp_dout(dpp
, 0) << "WARNING: " << __func__
<< "(): object doesn't have ACL attribute, setting default ACLs" << dendl
;
4108 acl
.create_default(owner_info
.user_id
, owner_info
.display_name
);
4110 auto iter
= aiter
->second
.cbegin();
4113 } catch (buffer::error
& err
) {
4114 ldpp_dout(dpp
, 0) << "ERROR: " << __func__
<< "(): could not decode policy, caught buffer::error" << dendl
;
4120 new_owner
.set_id(*override_owner
);
4121 new_owner
.set_name(owner_info
.display_name
);
4123 acl
.set_owner(new_owner
);
4127 obj_attrs
[RGW_ATTR_ACL
] = std::move(bl
);
4130 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4131 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
4133 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
4134 if (iter
!= cb
.get_attrs().end()) {
4136 decode(delete_at
, iter
->second
);
4137 } catch (buffer::error
& err
) {
4138 ldpp_dout(dpp
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
4144 *src_mtime
= set_mtime
;
4148 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
4149 if (iter
!= cb
.get_attrs().end()) {
4150 *petag
= iter
->second
.to_str();
4154 //erase the append attr
4155 cb
.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM
);
4157 if (source_zone
.empty()) {
4158 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
4160 attrs
= cb
.get_attrs();
4163 if (copy_if_newer
) {
4164 uint64_t pg_ver
= 0;
4165 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
4166 if (i
!= attrs
.end() && i
->second
.length() > 0) {
4167 auto iter
= i
->second
.cbegin();
4169 decode(pg_ver
, iter
);
4170 } catch (buffer::error
& err
) {
4171 ldpp_dout(dpp
, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
4172 /* non critical error */
4175 set_mtime_weight
.init(set_mtime
, svc
.zone
->get_zone_short_id(), pg_ver
);
4178 /* Perform ETag verification is we have computed the object's MD5 sum at our end */
4179 if (const auto& verifier_etag
= cb
.get_verifier_etag();
4180 !verifier_etag
.empty()) {
4181 string trimmed_etag
= etag
;
4183 /* Remove the leading and trailing double quotes from etag */
4184 trimmed_etag
.erase(std::remove(trimmed_etag
.begin(), trimmed_etag
.end(),'\"'),
4185 trimmed_etag
.end());
4187 if (verifier_etag
!= trimmed_etag
) {
4189 ldpp_dout(dpp
, 0) << "ERROR: source and destination objects don't match. Expected etag:"
4190 << trimmed_etag
<< " Computed etag:" << verifier_etag
<< dendl
;
4195 #define MAX_COMPLETE_RETRY 100
4196 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
4197 bool canceled
= false;
4198 ret
= processor
.complete(cb
.get_data_len(), etag
, mtime
, set_mtime
,
4199 attrs
, delete_at
, nullptr, nullptr, nullptr,
4200 zones_trace
, &canceled
, null_yield
);
4205 if (copy_if_newer
&& canceled
) {
4206 ldpp_dout(dpp
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
4207 obj_ctx
.invalidate(dest_obj
->get_obj()); /* object was overwritten */
4208 ret
= get_obj_state(dpp
, &obj_ctx
, dest_bucket
->get_info(), dest_obj
->get_obj(), &dest_state
, false, null_yield
);
4210 ldpp_dout(dpp
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
4213 dest_mtime_weight
.init(dest_state
);
4214 dest_mtime_weight
.high_precision
= high_precision_time
;
4215 if (!dest_state
->exists
||
4216 dest_mtime_weight
< set_mtime_weight
) {
4217 ldpp_dout(dpp
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4220 ldpp_dout(dpp
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4226 if (i
== MAX_COMPLETE_RETRY
) {
4227 ldpp_dout(dpp
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
4232 if (bytes_transferred
) {
4233 *bytes_transferred
= cb
.get_data_len();
4237 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
4238 // we may have already fetched during sync of OP_ADD, but were waiting
4239 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4240 if (olh_epoch
&& *olh_epoch
> 0) {
4241 constexpr bool log_data_change
= true;
4242 ret
= set_olh(dpp
, obj_ctx
, dest_bucket
->get_info(), dest_obj
->get_obj(), false, nullptr,
4243 *olh_epoch
, real_time(), false, null_yield
, zones_trace
, log_data_change
);
4245 // we already have the latest copy
4253 int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider
*dpp
,
4254 RGWObjState
*astate
,
4255 map
<string
, bufferlist
>& src_attrs
,
4256 RGWRados::Object::Read
& read_op
,
4257 const rgw_user
& user_id
,
4258 rgw::sal::Object
* dest_obj
,
4263 RGWRESTStreamS3PutObj
*out_stream_req
;
4265 auto rest_master_conn
= svc
.zone
->get_master_conn();
4267 int ret
= rest_master_conn
->put_obj_async_init(dpp
, user_id
, dest_obj
, src_attrs
, &out_stream_req
);
4272 out_stream_req
->set_send_length(astate
->size
);
4274 ret
= RGWHTTP::send(out_stream_req
);
4276 delete out_stream_req
;
4280 ret
= read_op
.iterate(dpp
, 0, astate
->size
- 1, out_stream_req
->get_out_cb(), null_yield
);
4282 delete out_stream_req
;
4286 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
, null_yield
);
4295 * dest_obj: the object to copy into
4296 * src_obj: the object to copy from
4297 * attrs: usage depends on attrs_mod parameter
4298 * attrs_mod: the modification mode of the attrs, may have the following values:
4299 * ATTRSMOD_NONE - the attributes of the source object will be
4300 * copied without modifications, attrs parameter is ignored;
4301 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4302 * parameter, source object attributes are not copied;
4303 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4304 * are overwritten by values contained in attrs parameter.
4305 * err: stores any errors resulting from the get of the original object
4306 * Returns: 0 on success, -ERR# otherwise.
4308 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
4309 const rgw_user
& user_id
,
4311 const rgw_zone_id
& source_zone
,
4312 rgw::sal::Object
* dest_obj
,
4313 rgw::sal::Object
* src_obj
,
4314 rgw::sal::Bucket
* dest_bucket
,
4315 rgw::sal::Bucket
* src_bucket
,
4316 const rgw_placement_rule
& dest_placement
,
4317 real_time
*src_mtime
,
4319 const real_time
*mod_ptr
,
4320 const real_time
*unmod_ptr
,
4321 bool high_precision_time
,
4322 const char *if_match
,
4323 const char *if_nomatch
,
4326 rgw::sal::Attrs
& attrs
,
4327 RGWObjCategory category
,
4329 real_time delete_at
,
4333 void (*progress_cb
)(off_t
, void *),
4334 void *progress_data
,
4335 const DoutPrefixProvider
*dpp
,
4340 rgw_obj shadow_obj
= dest_obj
->get_obj();
4346 append_rand_alpha(cct
, dest_obj
->get_oid(), shadow_oid
, 32);
4347 shadow_obj
.init_ns(dest_obj
->get_bucket()->get_key(), shadow_oid
, shadow_ns
);
4349 auto& zonegroup
= svc
.zone
->get_zonegroup();
4351 remote_dest
= !zonegroup
.equals(dest_bucket
->get_info().zonegroup
);
4352 remote_src
= !zonegroup
.equals(src_bucket
->get_info().zonegroup
);
4354 if (remote_src
&& remote_dest
) {
4355 ldpp_dout(dpp
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
4359 ldpp_dout(dpp
, 5) << "Copy object " << src_obj
->get_bucket() << ":" << src_obj
->get_oid() << " => " << dest_obj
->get_bucket() << ":" << dest_obj
->get_oid() << dendl
;
4361 if (remote_src
|| !source_zone
.empty()) {
4362 return fetch_remote_obj(obj_ctx
, user_id
, info
, source_zone
,
4363 dest_obj
, src_obj
, dest_bucket
, src_bucket
,
4364 dest_placement
, src_mtime
, mtime
, mod_ptr
,
4365 unmod_ptr
, high_precision_time
,
4366 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
4367 olh_epoch
, delete_at
, ptag
, petag
, progress_cb
, progress_data
, dpp
,
4368 nullptr /* filter */);
4371 map
<string
, bufferlist
> src_attrs
;
4372 RGWRados::Object
src_op_target(this, src_bucket
->get_info(), obj_ctx
, src_obj
->get_obj());
4373 RGWRados::Object::Read
read_op(&src_op_target
);
4375 read_op
.conds
.mod_ptr
= mod_ptr
;
4376 read_op
.conds
.unmod_ptr
= unmod_ptr
;
4377 read_op
.conds
.high_precision_time
= high_precision_time
;
4378 read_op
.conds
.if_match
= if_match
;
4379 read_op
.conds
.if_nomatch
= if_nomatch
;
4380 read_op
.params
.attrs
= &src_attrs
;
4381 read_op
.params
.lastmod
= src_mtime
;
4382 read_op
.params
.obj_size
= &obj_size
;
4384 ret
= read_op
.prepare(y
, dpp
);
4388 if (src_attrs
.count(RGW_ATTR_CRYPT_MODE
)) {
4389 // Current implementation does not follow S3 spec and even
4390 // may result in data corruption silently when copying
4391 // multipart objects acorss pools. So reject COPY operations
4392 //on encrypted objects before it is fully functional.
4393 ldpp_dout(dpp
, 0) << "ERROR: copy op for encrypted object " << src_obj
4394 << " has not been implemented." << dendl
;
4395 return -ERR_NOT_IMPLEMENTED
;
4398 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
4399 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
4401 src_attrs
.erase(RGW_ATTR_OBJECT_RETENTION
);
4402 src_attrs
.erase(RGW_ATTR_OBJECT_LEGAL_HOLD
);
4403 map
<string
, bufferlist
>::iterator rt
= attrs
.find(RGW_ATTR_OBJECT_RETENTION
);
4404 if (rt
!= attrs
.end())
4405 src_attrs
[RGW_ATTR_OBJECT_RETENTION
] = rt
->second
;
4406 map
<string
, bufferlist
>::iterator lh
= attrs
.find(RGW_ATTR_OBJECT_LEGAL_HOLD
);
4407 if (lh
!= attrs
.end())
4408 src_attrs
[RGW_ATTR_OBJECT_LEGAL_HOLD
] = lh
->second
;
4410 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
4411 attrs
.erase(RGW_ATTR_ID_TAG
);
4412 attrs
.erase(RGW_ATTR_PG_VER
);
4413 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
4414 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
4415 if (cmp
!= src_attrs
.end())
4416 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
4418 RGWObjManifest manifest
;
4419 RGWObjState
*astate
= NULL
;
4421 ret
= get_obj_state(dpp
, &obj_ctx
, src_bucket
->get_info(), src_obj
->get_obj(), &astate
, y
);
4426 vector
<rgw_raw_obj
> ref_objs
;
4429 /* dest is in a different zonegroup, copy it there */
4430 return copy_obj_to_remote_dest(dpp
, astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
4432 uint64_t max_chunk_size
;
4434 ret
= get_max_chunk_size(dest_bucket
->get_placement_rule(), dest_obj
->get_obj(), &max_chunk_size
, dpp
);
4436 ldpp_dout(dpp
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
->get_bucket() << dendl
;
4443 const rgw_placement_rule
*src_rule
{nullptr};
4445 if (astate
->manifest
) {
4446 src_rule
= &astate
->manifest
->get_tail_placement().placement_rule
;
4447 ldpp_dout(dpp
, 20) << __func__
<< "(): manifest src_rule=" << src_rule
->to_str() << dendl
;
4450 if (!src_rule
|| src_rule
->empty()) {
4451 src_rule
= &src_bucket
->get_placement_rule();
4454 if (!get_obj_data_pool(*src_rule
, src_obj
->get_obj(), &src_pool
)) {
4455 ldpp_dout(dpp
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
4459 if (!get_obj_data_pool(dest_placement
, dest_obj
->get_obj(), &dest_pool
)) {
4460 ldpp_dout(dpp
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
4464 ldpp_dout(dpp
, 20) << __func__
<< "(): src_rule=" << src_rule
->to_str() << " src_pool=" << src_pool
4465 << " dest_rule=" << dest_placement
.to_str() << " dest_pool=" << dest_pool
<< dendl
;
4467 bool copy_data
= (!astate
->manifest
) ||
4468 (*src_rule
!= dest_placement
) ||
4469 (src_pool
!= dest_pool
);
4471 bool copy_first
= false;
4472 if (astate
->manifest
) {
4473 if (!astate
->manifest
->has_tail()) {
4476 uint64_t head_size
= astate
->manifest
->get_head_size();
4478 if (head_size
> 0) {
4479 if (head_size
> max_chunk_size
) {
4489 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4490 if (iter
!= attrs
.end()) {
4491 *petag
= iter
->second
.to_str();
4495 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
4496 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4497 return copy_obj_data(obj_ctx
, dest_bucket
, dest_placement
, read_op
, obj_size
- 1, dest_obj
,
4498 mtime
, real_time(), attrs
, olh_epoch
, delete_at
, petag
, dpp
, y
);
4501 RGWObjManifest::obj_iterator miter
= astate
->manifest
->obj_begin(dpp
);
4503 if (copy_first
) { // we need to copy first chunk, not increase refcount
4508 ret
= get_raw_obj_ref(dpp
, miter
.get_location().get_raw_obj(store
), &ref
);
4513 bufferlist first_chunk
;
4515 const bool copy_itself
= (dest_obj
->get_obj() == src_obj
->get_obj());
4516 RGWObjManifest
*pmanifest
;
4517 ldpp_dout(dpp
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
4519 RGWRados::Object
dest_op_target(this, dest_bucket
->get_info(), obj_ctx
, dest_obj
->get_obj());
4520 RGWRados::Object::Write
write_op(&dest_op_target
);
4529 append_rand_alpha(cct
, tag
, tag
, 32);
4533 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4534 manifest
= *astate
->manifest
;
4535 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
4536 if (tail_placement
.bucket
.name
.empty()) {
4537 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
->get_bucket()->get_key());
4540 for (; miter
!= astate
->manifest
->obj_end(dpp
); ++miter
) {
4541 ObjectWriteOperation op
;
4542 ref_tag
= tag
+ '\0';
4543 cls_refcount_get(op
, ref_tag
, true);
4544 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(store
);
4546 auto& ioctx
= ref
.pool
.ioctx();
4547 ioctx
.locator_set_key(loc
.loc
);
4549 ret
= rgw_rados_operate(dpp
, ioctx
, loc
.oid
, &op
, null_yield
);
4554 ref_objs
.push_back(loc
);
4557 pmanifest
= &manifest
;
4559 pmanifest
= &(*astate
->manifest
);
4560 /* don't send the object's tail for garbage collection */
4561 astate
->keep_tail
= true;
4565 ret
= read_op
.read(0, max_chunk_size
, first_chunk
, y
, dpp
);
4570 pmanifest
->set_head(dest_bucket
->get_placement_rule(), dest_obj
->get_obj(), first_chunk
.length());
4572 pmanifest
->set_head(dest_bucket
->get_placement_rule(), dest_obj
->get_obj(), 0);
4575 write_op
.meta
.data
= &first_chunk
;
4576 write_op
.meta
.manifest
= pmanifest
;
4577 write_op
.meta
.ptag
= &tag
;
4578 write_op
.meta
.owner
= dest_bucket
->get_info().owner
;
4579 write_op
.meta
.mtime
= mtime
;
4580 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
4581 write_op
.meta
.category
= category
;
4582 write_op
.meta
.olh_epoch
= olh_epoch
;
4583 write_op
.meta
.delete_at
= delete_at
;
4584 write_op
.meta
.modify_tail
= !copy_itself
;
4586 ret
= write_op
.write_meta(dpp
, obj_size
, astate
->accounted_size
, attrs
, y
);
4595 vector
<rgw_raw_obj
>::iterator riter
;
4597 /* rollback reference */
4598 string ref_tag
= tag
+ '\0';
4599 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
4600 ObjectWriteOperation op
;
4601 cls_refcount_put(op
, ref_tag
, true);
4603 ref
.pool
.ioctx().locator_set_key(riter
->loc
);
4605 int r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), riter
->oid
, &op
, null_yield
);
4607 ldpp_dout(dpp
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
4615 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
4616 rgw::sal::Bucket
* bucket
,
4617 const rgw_placement_rule
& dest_placement
,
4618 RGWRados::Object::Read
& read_op
, off_t end
,
4619 rgw::sal::Object
* dest_obj
,
4621 real_time set_mtime
,
4622 rgw::sal::Attrs
& attrs
,
4624 real_time delete_at
,
4626 const DoutPrefixProvider
*dpp
,
4630 append_rand_alpha(cct
, tag
, tag
, 32);
4632 rgw::BlockingAioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4633 using namespace rgw::putobj
;
4634 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4635 // it causes crashes in the ragweed tests
4636 AtomicObjectProcessor
processor(&aio
, this->store
, &dest_placement
,
4637 bucket
->get_info().owner
, obj_ctx
,
4638 dest_obj
->clone(), olh_epoch
, tag
,
4640 int ret
= processor
.prepare(y
);
4648 ret
= read_op
.read(ofs
, end
, bl
, y
, dpp
);
4650 ldpp_dout(dpp
, 0) << "ERROR: fail to read object data, ret = " << ret
<< dendl
;
4654 uint64_t read_len
= ret
;
4655 ret
= processor
.process(std::move(bl
), ofs
);
4661 } while (ofs
<= end
);
4664 ret
= processor
.process({}, ofs
);
4670 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4671 if (iter
!= attrs
.end()) {
4672 bufferlist
& bl
= iter
->second
;
4679 uint64_t accounted_size
;
4681 bool compressed
{false};
4682 RGWCompressionInfo cs_info
;
4683 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
4685 ldpp_dout(dpp
, 0) << "ERROR: failed to read compression info" << dendl
;
4688 // pass original size if compressed
4689 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
4692 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
,
4693 nullptr, nullptr, nullptr, nullptr, nullptr, y
);
4696 int RGWRados::transition_obj(RGWObjectCtx
& obj_ctx
,
4697 rgw::sal::Bucket
* bucket
,
4698 rgw::sal::Object
& obj
,
4699 const rgw_placement_rule
& placement_rule
,
4700 const real_time
& mtime
,
4702 const DoutPrefixProvider
*dpp
,
4705 rgw::sal::Attrs attrs
;
4706 real_time read_mtime
;
4709 obj
.set_atomic(&obj_ctx
);
4710 RGWRados::Object
op_target(this, bucket
->get_info(), obj_ctx
, obj
.get_obj());
4711 RGWRados::Object::Read
read_op(&op_target
);
4713 read_op
.params
.attrs
= &attrs
;
4714 read_op
.params
.lastmod
= &read_mtime
;
4715 read_op
.params
.obj_size
= &obj_size
;
4717 int ret
= read_op
.prepare(y
, dpp
);
4722 if (read_mtime
!= mtime
) {
4727 attrs
.erase(RGW_ATTR_ID_TAG
);
4728 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4730 ret
= copy_obj_data(obj_ctx
,
4736 nullptr /* pmtime */,
4741 nullptr /* petag */,
4751 int RGWRados::check_bucket_empty(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
, optional_yield y
)
4753 constexpr uint NUM_ENTRIES
= 1000u;
4755 rgw_obj_index_key marker
;
4760 std::vector
<rgw_bucket_dir_entry
> ent_list
;
4761 ent_list
.reserve(NUM_ENTRIES
);
4763 int r
= cls_bucket_list_unordered(dpp
,
4779 for (auto const& dirent
: ent_list
) {
4782 if (rgw_obj_key::oid_to_key_in_ns(dirent
.key
.name
, &obj
, ns
)) {
4786 } while (is_truncated
);
4793 * bucket: the name of the bucket to delete
4794 * Returns 0 on success, -ERR# otherwise.
4796 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, optional_yield y
, const DoutPrefixProvider
*dpp
, bool check_empty
)
4798 const rgw_bucket
& bucket
= bucket_info
.bucket
;
4799 RGWSI_RADOS::Pool index_pool
;
4800 map
<int, string
> bucket_objs
;
4801 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
4806 r
= check_bucket_empty(dpp
, bucket_info
, y
);
4812 bool remove_ep
= true;
4814 if (objv_tracker
.read_version
.empty()) {
4815 RGWBucketEntryPoint ep
;
4816 r
= ctl
.bucket
->read_bucket_entrypoint_info(bucket_info
.bucket
,
4820 RGWBucketCtl::Bucket::GetParams()
4821 .set_objv_tracker(&objv_tracker
));
4823 (!bucket_info
.bucket
.bucket_id
.empty() &&
4824 ep
.bucket
.bucket_id
!= bucket_info
.bucket
.bucket_id
)) {
4826 ldpp_dout(dpp
, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info
.bucket
<< " returned error: r=" << r
<< dendl
;
4827 /* we have no idea what caused the error, will not try to remove it */
4830 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4838 r
= ctl
.bucket
->remove_bucket_entrypoint_info(bucket_info
.bucket
, null_yield
, dpp
,
4839 RGWBucketCtl::Bucket::RemoveParams()
4840 .set_objv_tracker(&objv_tracker
));
4845 /* if the bucket is not synced we can remove the meta file */
4846 if (!svc
.zone
->is_syncing_bucket_meta(bucket
)) {
4847 RGWObjVersionTracker objv_tracker
;
4848 r
= ctl
.bucket
->remove_bucket_instance_info(bucket
, bucket_info
, null_yield
, dpp
);
4853 /* remove bucket index objects asynchronously by best effort */
4854 (void) CLSRGWIssueBucketIndexClean(index_pool
.ioctx(),
4856 cct
->_conf
->rgw_bucket_index_max_aio
)();
4862 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
, const DoutPrefixProvider
*dpp
)
4865 map
<string
, bufferlist
> attrs
;
4867 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
4869 if (bucket
.bucket_id
.empty()) {
4870 r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, info
, NULL
, null_yield
, dpp
, &attrs
);
4872 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
, null_yield
, dpp
);
4875 ldpp_dout(dpp
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
4879 info
.owner
= owner
.get_id();
4881 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
, dpp
);
4883 ldpp_dout(dpp
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
4891 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
, const DoutPrefixProvider
*dpp
)
4895 vector
<rgw_bucket
>::iterator iter
;
4897 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
4898 rgw_bucket
& bucket
= *iter
;
4900 ldpp_dout(dpp
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
4902 ldpp_dout(dpp
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
4906 map
<string
, bufferlist
> attrs
;
4907 int r
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, info
, NULL
, null_yield
, dpp
, &attrs
);
4909 ldpp_dout(dpp
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
4914 info
.flags
&= ~BUCKET_SUSPENDED
;
4916 info
.flags
|= BUCKET_SUSPENDED
;
4919 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
, dpp
);
4921 ldpp_dout(dpp
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
4929 int RGWRados::bucket_suspended(const DoutPrefixProvider
*dpp
, rgw_bucket
& bucket
, bool *suspended
)
4931 RGWBucketInfo bucket_info
;
4932 int ret
= get_bucket_info(&svc
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
, null_yield
, dpp
);
4937 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
4941 int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider
*dpp
)
4943 if ((!state
->manifest
)|| state
->keep_tail
)
4946 cls_rgw_obj_chain chain
;
4947 store
->update_gc_chain(dpp
, obj
, *state
->manifest
, &chain
);
4949 if (chain
.empty()) {
4953 string tag
= (state
->tail_tag
.length() > 0 ? state
->tail_tag
.to_str() : state
->obj_tag
.to_str());
4954 if (store
->gc
== nullptr) {
4955 ldpp_dout(dpp
, 0) << "deleting objects inline since gc isn't initialized" << dendl
;
4956 //Delete objects inline just in case gc hasn't been initialised, prevents crashes
4957 store
->delete_objs_inline(dpp
, chain
, tag
);
4959 auto ret
= store
->gc
->send_chain(chain
, tag
); // do it synchronously
4961 //Delete objects inline if send chain to gc fails
4962 store
->delete_objs_inline(dpp
, chain
, tag
);
4968 void RGWRados::update_gc_chain(const DoutPrefixProvider
*dpp
, rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
4970 RGWObjManifest::obj_iterator iter
;
4971 rgw_raw_obj raw_head
;
4972 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
4973 for (iter
= manifest
.obj_begin(dpp
); iter
!= manifest
.obj_end(dpp
); ++iter
) {
4974 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(store
);
4975 if (mobj
== raw_head
)
4977 cls_rgw_obj_key
key(mobj
.oid
);
4978 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
4982 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
)
4984 if (chain
.empty()) {
4988 return gc
->send_chain(chain
, tag
);
4991 void RGWRados::delete_objs_inline(const DoutPrefixProvider
*dpp
, cls_rgw_obj_chain
& chain
, const string
& tag
)
4994 std::unique_ptr
<IoCtx
> ctx(new IoCtx
);
4996 for (auto liter
= chain
.objs
.begin(); liter
!= chain
.objs
.end(); ++liter
) {
4997 cls_rgw_obj
& obj
= *liter
;
4998 if (obj
.pool
!= last_pool
) {
4999 ctx
.reset(new IoCtx
);
5000 ret
= rgw_init_ioctx(dpp
, get_rados_handle(), obj
.pool
, *ctx
);
5003 ldpp_dout(dpp
, 0) << "ERROR: failed to create ioctx pool=" <<
5007 last_pool
= obj
.pool
;
5009 ctx
->locator_set_key(obj
.loc
);
5010 const string
& oid
= obj
.key
.name
; /* just stored raw oid there */
5011 ldpp_dout(dpp
, 5) << "delete_objs_inline: removing " << obj
.pool
<<
5012 ":" << obj
.key
.name
<< dendl
;
5013 ObjectWriteOperation op
;
5014 cls_refcount_put(op
, tag
, true);
5015 ret
= ctx
->operate(oid
, &op
);
5017 ldpp_dout(dpp
, 5) << "delete_objs_inline: refcount put returned error " << ret
<< dendl
;
5022 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
5023 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
5025 for (const auto& pair
: header
.stats
) {
5026 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
5027 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
5029 RGWStorageStats
& s
= stats
[category
];
5031 s
.category
= category
;
5032 s
.size
+= header_stats
.total_size
;
5033 s
.size_rounded
+= header_stats
.total_size_rounded
;
5034 s
.size_utilized
+= header_stats
.actual_size
;
5035 s
.num_objects
+= header_stats
.num_entries
;
5039 int RGWRados::bucket_check_index(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
,
5040 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
5041 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
5043 RGWSI_RADOS::Pool index_pool
;
5045 // key - bucket index object id
5046 // value - bucket index check OP returned result with the given bucket index object (shard)
5047 map
<int, string
> oids
;
5048 int ret
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, std::nullopt
, &index_pool
, &oids
, nullptr);
5053 // declare and pre-populate
5054 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
5055 for (auto& iter
: oids
) {
5056 bucket_objs_ret
.emplace(iter
.first
, rgw_cls_check_index_ret());
5059 ret
= CLSRGWIssueBucketCheck(index_pool
.ioctx(), oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5064 // aggregate results (from different shards if there are any)
5065 for (const auto& iter
: bucket_objs_ret
) {
5066 accumulate_raw_stats(iter
.second
.existing_header
, *existing_stats
);
5067 accumulate_raw_stats(iter
.second
.calculated_header
, *calculated_stats
);
5073 int RGWRados::bucket_rebuild_index(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
)
5075 RGWSI_RADOS::Pool index_pool
;
5076 map
<int, string
> bucket_objs
;
5078 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
5083 return CLSRGWIssueBucketRebuild(index_pool
.ioctx(), bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5086 int RGWRados::bucket_set_reshard(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
5088 RGWSI_RADOS::Pool index_pool
;
5089 map
<int, string
> bucket_objs
;
5091 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
5096 return CLSRGWIssueSetBucketResharding(index_pool
.ioctx(), bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5099 int RGWRados::defer_gc(const DoutPrefixProvider
*dpp
, void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, optional_yield y
)
5101 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
5102 std::string oid
, key
;
5103 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5107 RGWObjState
*state
= NULL
;
5109 int r
= get_obj_state(dpp
, rctx
, bucket_info
, obj
, &state
, false, y
);
5113 if (!state
->is_atomic
) {
5114 ldpp_dout(dpp
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
5120 if (state
->tail_tag
.length() > 0) {
5121 tag
= state
->tail_tag
.c_str();
5122 } else if (state
->obj_tag
.length() > 0) {
5123 tag
= state
->obj_tag
.c_str();
5125 ldpp_dout(dpp
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
5129 ldpp_dout(dpp
, 0) << "defer chain tag=" << tag
<< dendl
;
5131 cls_rgw_obj_chain chain
;
5132 update_gc_chain(dpp
, state
->obj
, *state
->manifest
, &chain
);
5133 return gc
->async_defer_chain(tag
, chain
);
5136 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
5138 list
<string
> prefixes
;
5139 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
5140 cls_rgw_remove_obj(op
, prefixes
);
5143 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
5145 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
5148 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
5150 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
5153 struct tombstone_entry
{
5154 ceph::real_time mtime
;
5155 uint32_t zone_short_id
;
5158 tombstone_entry() = default;
5159 explicit tombstone_entry(const RGWObjState
& state
)
5160 : mtime(state
.mtime
), zone_short_id(state
.zone_short_id
),
5161 pg_ver(state
.pg_ver
) {}
5166 * bucket: name of the bucket storing the object
5167 * obj: name of the object to delete
5168 * Returns: 0 on success, -ERR# otherwise.
5170 int RGWRados::Object::Delete::delete_obj(optional_yield y
, const DoutPrefixProvider
*dpp
)
5172 RGWRados
*store
= target
->get_store();
5173 rgw_obj
& src_obj
= target
->get_obj();
5174 const string
& instance
= src_obj
.key
.instance
;
5175 rgw_obj obj
= src_obj
;
5177 if (instance
== "null") {
5178 obj
.key
.instance
.clear();
5181 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
5183 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
5184 if (instance
.empty() || explicit_marker_version
) {
5185 rgw_obj marker
= obj
;
5187 if (!params
.marker_version_id
.empty()) {
5188 if (params
.marker_version_id
!= "null") {
5189 marker
.key
.set_instance(params
.marker_version_id
);
5191 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
5192 store
->gen_rand_obj_instance_name(&marker
);
5195 result
.version_id
= marker
.key
.instance
;
5196 if (result
.version_id
.empty())
5197 result
.version_id
= "null";
5198 result
.delete_marker
= true;
5200 struct rgw_bucket_dir_entry_meta meta
;
5202 meta
.owner
= params
.obj_owner
.get_id().to_str();
5203 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
5205 if (real_clock::is_zero(params
.mtime
)) {
5206 meta
.mtime
= real_clock::now();
5208 meta
.mtime
= params
.mtime
;
5211 int r
= store
->set_olh(dpp
, target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, y
, params
.zones_trace
);
5216 rgw_bucket_dir_entry dirent
;
5218 int r
= store
->bi_get_instance(dpp
, target
->get_bucket_info(), obj
, &dirent
);
5222 result
.delete_marker
= dirent
.is_delete_marker();
5223 r
= store
->unlink_obj_instance(dpp
, target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, y
, params
.zones_trace
);
5227 result
.version_id
= instance
;
5230 BucketShard
*bs
= nullptr;
5231 int r
= target
->get_bucket_shard(&bs
, dpp
);
5233 ldpp_dout(dpp
, 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
5237 r
= store
->svc
.datalog_rados
->add_entry(dpp
, target
->bucket_info
, bs
->shard_id
);
5239 ldpp_dout(dpp
, -1) << "ERROR: failed writing data log" << dendl
;
5247 int r
= store
->get_obj_head_ref(dpp
, target
->get_bucket_info(), obj
, &ref
);
5253 r
= target
->get_state(dpp
, &state
, false, y
);
5257 ObjectWriteOperation op
;
5259 if (!real_clock::is_zero(params
.unmod_since
)) {
5260 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
5261 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
5262 if (!params
.high_precision_time
) {
5267 ldpp_dout(dpp
, 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
5268 if (ctime
> unmod
) {
5269 return -ERR_PRECONDITION_FAILED
;
5272 /* only delete object if mtime is less than or equal to params.unmod_since */
5273 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
5275 uint64_t obj_accounted_size
= state
->accounted_size
;
5277 if(params
.abortmp
) {
5278 obj_accounted_size
= params
.parts_accounted_size
;
5281 if (!real_clock::is_zero(params
.expiration_time
)) {
5283 real_time delete_at
;
5285 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
5287 auto iter
= bl
.cbegin();
5288 decode(delete_at
, iter
);
5289 } catch (buffer::error
& err
) {
5290 ldpp_dout(dpp
, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
5294 if (params
.expiration_time
!= delete_at
) {
5295 return -ERR_PRECONDITION_FAILED
;
5298 return -ERR_PRECONDITION_FAILED
;
5302 if (!state
->exists
) {
5303 target
->invalidate_state();
5307 r
= target
->prepare_atomic_modification(dpp
, op
, false, NULL
, NULL
, NULL
, true, false, y
);
5311 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
5313 RGWRados::Bucket
bop(store
, bucket_info
);
5314 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5316 index_op
.set_zones_trace(params
.zones_trace
);
5317 index_op
.set_bilog_flags(params
.bilog_flags
);
5319 r
= index_op
.prepare(dpp
, CLS_RGW_OP_DEL
, &state
->write_tag
, y
);
5323 store
->remove_rgw_head_obj(op
);
5325 auto& ioctx
= ref
.pool
.ioctx();
5326 r
= rgw_rados_operate(dpp
, ioctx
, ref
.obj
.oid
, &op
, null_yield
);
5328 /* raced with another operation, object state is indeterminate */
5329 const bool need_invalidate
= (r
== -ECANCELED
);
5331 int64_t poolid
= ioctx
.get_id();
5333 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
5334 if (obj_tombstone_cache
) {
5335 tombstone_entry entry
{*state
};
5336 obj_tombstone_cache
->add(obj
, entry
);
5338 r
= index_op
.complete_del(dpp
, poolid
, ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
5340 int ret
= target
->complete_atomic_modification(dpp
);
5342 ldpp_dout(dpp
, 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
5344 /* other than that, no need to propagate error */
5346 int ret
= index_op
.cancel(dpp
, params
.remove_objs
);
5348 ldpp_dout(dpp
, 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
5352 if (need_invalidate
) {
5353 target
->invalidate_state();
5359 /* update quota cache */
5360 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_accounted_size
);
5365 int RGWRados::delete_obj(const DoutPrefixProvider
*dpp
,
5366 RGWObjectCtx
& obj_ctx
,
5367 const RGWBucketInfo
& bucket_info
,
5369 int versioning_status
, // versioning flags defined in enum RGWBucketFlags
5370 uint16_t bilog_flags
,
5371 const real_time
& expiration_time
,
5372 rgw_zone_set
*zones_trace
)
5374 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
5375 RGWRados::Object::Delete
del_op(&del_target
);
5377 del_op
.params
.bucket_owner
= bucket_info
.owner
;
5378 del_op
.params
.versioning_status
= versioning_status
;
5379 del_op
.params
.bilog_flags
= bilog_flags
;
5380 del_op
.params
.expiration_time
= expiration_time
;
5381 del_op
.params
.zones_trace
= zones_trace
;
5383 return del_op
.delete_obj(null_yield
, dpp
);
5386 int RGWRados::delete_raw_obj(const DoutPrefixProvider
*dpp
, const rgw_raw_obj
& obj
)
5389 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
5394 ObjectWriteOperation op
;
5397 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
5404 int RGWRados::delete_obj_index(const rgw_obj
& obj
, ceph::real_time mtime
, const DoutPrefixProvider
*dpp
)
5406 std::string oid
, key
;
5407 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5409 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5411 RGWBucketInfo bucket_info
;
5412 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
, null_yield
, dpp
);
5414 ldpp_dout(dpp
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
5418 RGWRados::Bucket
bop(this, bucket_info
);
5419 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5421 return index_op
.complete_del(dpp
, -1 /* pool */, 0, mtime
, NULL
);
5424 static void generate_fake_tag(const DoutPrefixProvider
*dpp
, rgw::sal::Store
* store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
5428 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin(dpp
);
5429 if (mi
!= manifest
.obj_end(dpp
)) {
5430 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
5432 rgw::sal::RadosStore
* rstore
= dynamic_cast<rgw::sal::RadosStore
*>(store
);
5433 tag
= mi
.get_location().get_raw_obj(rstore
).oid
;
5437 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
5438 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
5440 // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
5441 hash
.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW
);
5442 hash
.Update((const unsigned char *)manifest_bl
.c_str(), manifest_bl
.length());
5444 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
5445 if (iter
!= attrset
.end()) {
5446 bufferlist
& bl
= iter
->second
;
5447 hash
.Update((const unsigned char *)bl
.c_str(), bl
.length());
5451 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
5452 tag
.append(md5_str
);
5454 ldpp_dout(dpp
, 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
5456 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
5459 static bool is_olh(map
<string
, bufferlist
>& attrs
)
5461 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
5462 return (iter
!= attrs
.end());
5465 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
5467 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
5468 return (iter
!= attrs
.end());
5471 int RGWRados::get_olh_target_state(const DoutPrefixProvider
*dpp
, RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5472 RGWObjState
*olh_state
, RGWObjState
**target_state
, optional_yield y
)
5474 ceph_assert(olh_state
->is_olh
);
5477 int r
= RGWRados::follow_olh(dpp
, bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
5481 r
= get_obj_state(dpp
, &obj_ctx
, bucket_info
, target
, target_state
, false, y
);
5489 int RGWRados::get_obj_state_impl(const DoutPrefixProvider
*dpp
, RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5490 RGWObjState
**state
, bool follow_olh
, optional_yield y
, bool assume_noent
)
5496 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
5498 RGWObjState
*s
= rctx
->get_state(obj
);
5499 ldpp_dout(dpp
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
5502 if (s
->is_olh
&& need_follow_olh
) {
5503 return get_olh_target_state(dpp
, *rctx
, bucket_info
, obj
, s
, state
, y
);
5510 rgw_raw_obj raw_obj
;
5511 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
5515 if (!assume_noent
) {
5516 r
= RGWRados::raw_obj_stat(dpp
, raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
, y
);
5521 s
->has_attrs
= true;
5522 tombstone_entry entry
;
5523 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
5524 s
->mtime
= entry
.mtime
;
5525 s
->zone_short_id
= entry
.zone_short_id
;
5526 s
->pg_ver
= entry
.pg_ver
;
5527 ldpp_dout(dpp
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
5528 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
5530 s
->mtime
= real_time();
5538 s
->has_attrs
= true;
5539 s
->accounted_size
= s
->size
;
5541 auto iter
= s
->attrset
.find(RGW_ATTR_ETAG
);
5542 if (iter
!= s
->attrset
.end()) {
5543 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5544 bufferlist
& bletag
= iter
->second
;
5545 if (bletag
.length() > 0 && bletag
[bletag
.length() - 1] == '\0') {
5547 bletag
.splice(0, bletag
.length() - 1, &newbl
);
5548 bletag
= std::move(newbl
);
5552 iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
5553 const bool compressed
= (iter
!= s
->attrset
.end());
5555 // use uncompressed size for accounted_size
5557 RGWCompressionInfo info
;
5558 auto p
= iter
->second
.cbegin();
5560 s
->accounted_size
= info
.orig_size
;
5561 } catch (buffer::error
&) {
5562 ldpp_dout(dpp
, 0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
5567 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
5568 if (iter
!= s
->attrset
.end()) {
5569 bufferlist bl
= iter
->second
;
5570 bufferlist::iterator it
= bl
.begin();
5571 it
.copy(bl
.length(), s
->shadow_obj
);
5572 s
->shadow_obj
[bl
.length()] = '\0';
5574 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
5575 auto ttiter
= s
->attrset
.find(RGW_ATTR_TAIL_TAG
);
5576 if (ttiter
!= s
->attrset
.end()) {
5577 s
->tail_tag
= s
->attrset
[RGW_ATTR_TAIL_TAG
];
5580 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
5581 if (manifest_bl
.length()) {
5582 auto miter
= manifest_bl
.cbegin();
5584 s
->manifest
.emplace();
5585 decode(*s
->manifest
, miter
);
5586 s
->manifest
->set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
5587 broken due to old bugs */
5588 s
->size
= s
->manifest
->get_obj_size();
5590 s
->accounted_size
= s
->size
;
5591 } catch (buffer::error
& err
) {
5592 ldpp_dout(dpp
, 0) << "ERROR: couldn't decode manifest" << dendl
;
5595 ldpp_dout(dpp
, 10) << "manifest: total_size = " << s
->manifest
->get_obj_size() << dendl
;
5596 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>() && \
5597 s
->manifest
->has_explicit_objs()) {
5598 RGWObjManifest::obj_iterator mi
;
5599 for (mi
= s
->manifest
->obj_begin(dpp
); mi
!= s
->manifest
->obj_end(dpp
); ++mi
) {
5600 ldpp_dout(dpp
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(store
) << dendl
;
5604 if (!s
->obj_tag
.length()) {
5606 * Uh oh, something's wrong, object with manifest should have tag. Let's
5607 * create one out of the manifest, would be unique
5609 generate_fake_tag(dpp
, store
, s
->attrset
, *s
->manifest
, manifest_bl
, s
->obj_tag
);
5613 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
5614 if (aiter
!= s
->attrset
.end()) {
5615 bufferlist
& pg_ver_bl
= aiter
->second
;
5616 if (pg_ver_bl
.length()) {
5617 auto pgbl
= pg_ver_bl
.cbegin();
5619 decode(s
->pg_ver
, pgbl
);
5620 } catch (buffer::error
& err
) {
5621 ldpp_dout(dpp
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5625 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
5626 if (aiter
!= s
->attrset
.end()) {
5627 bufferlist
& zone_short_id_bl
= aiter
->second
;
5628 if (zone_short_id_bl
.length()) {
5629 auto zbl
= zone_short_id_bl
.cbegin();
5631 decode(s
->zone_short_id
, zbl
);
5632 } catch (buffer::error
& err
) {
5633 ldpp_dout(dpp
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5637 if (s
->obj_tag
.length()) {
5638 ldpp_dout(dpp
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
5640 ldpp_dout(dpp
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
5643 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5644 * it exist, and not only if is_olh() returns true
5646 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
5647 if (iter
!= s
->attrset
.end()) {
5648 s
->olh_tag
= iter
->second
;
5651 if (is_olh(s
->attrset
)) {
5654 ldpp_dout(dpp
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
5656 if (need_follow_olh
) {
5657 return get_olh_target_state(dpp
, *rctx
, bucket_info
, obj
, s
, state
, y
);
5658 } else if (obj
.key
.have_null_instance() && !s
->manifest
) {
5659 // read null version, and the head object only have olh info
5668 int RGWRados::get_obj_state(const DoutPrefixProvider
*dpp
, RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
5669 bool follow_olh
, optional_yield y
, bool assume_noent
)
5674 ret
= get_obj_state_impl(dpp
, rctx
, bucket_info
, obj
, state
, follow_olh
, y
, assume_noent
);
5675 } while (ret
== -EAGAIN
);
5680 int RGWRados::Object::get_manifest(const DoutPrefixProvider
*dpp
, RGWObjManifest
**pmanifest
, optional_yield y
)
5682 RGWObjState
*astate
;
5683 int r
= get_state(dpp
, &astate
, true, y
);
5688 *pmanifest
= &(*astate
->manifest
);
5693 int RGWRados::Object::Read::get_attr(const DoutPrefixProvider
*dpp
, const char *name
, bufferlist
& dest
, optional_yield y
)
5696 int r
= source
->get_state(dpp
, &state
, true, y
);
5701 if (!state
->get_attr(name
, dest
))
5707 int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider
*dpp
)
5709 RGWObjectCtx
& ctx
= source
->get_ctx();
5710 rgw_obj
& obj
= source
->get_obj();
5711 RGWRados
*store
= source
->get_store();
5713 RGWObjState
*s
= ctx
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
5717 result
.size
= s
->size
;
5718 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
5719 result
.attrs
= s
->attrset
;
5720 result
.manifest
= s
->manifest
;
5726 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
5728 int r
= store
->get_obj_head_ioctx(dpp
, source
->get_bucket_info(), obj
, &state
.io_ctx
);
5733 librados::ObjectReadOperation op
;
5734 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
5735 op
.getxattrs(&result
.attrs
, NULL
);
5736 state
.completion
= librados::Rados::aio_create_completion(nullptr, nullptr);
5737 state
.io_ctx
.locator_set_key(loc
);
5738 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
5740 ldpp_dout(dpp
, 5) << __func__
5741 << ": ERROR: aio_operate() returned ret=" << r
5750 int RGWRados::Object::Stat::wait(const DoutPrefixProvider
*dpp
)
5752 if (!state
.completion
) {
5756 state
.completion
->wait_for_complete();
5757 state
.ret
= state
.completion
->get_return_value();
5758 state
.completion
->release();
5760 if (state
.ret
!= 0) {
5767 int RGWRados::Object::Stat::finish(const DoutPrefixProvider
*dpp
)
5769 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
5770 if (iter
!= result
.attrs
.end()) {
5771 bufferlist
& bl
= iter
->second
;
5772 auto biter
= bl
.cbegin();
5774 result
.manifest
.emplace();
5775 decode(*result
.manifest
, biter
);
5776 } catch (buffer::error
& err
) {
5777 ldpp_dout(dpp
, 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
5785 int RGWRados::append_atomic_test(const DoutPrefixProvider
*dpp
, RGWObjectCtx
*rctx
,
5786 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5787 ObjectOperation
& op
, RGWObjState
**pstate
, optional_yield y
)
5792 int r
= get_obj_state(dpp
, rctx
, bucket_info
, obj
, pstate
, false, y
);
5796 return append_atomic_test(dpp
, *pstate
, op
);
5799 int RGWRados::append_atomic_test(const DoutPrefixProvider
*dpp
,
5800 const RGWObjState
* state
,
5801 librados::ObjectOperation
& op
)
5803 if (!state
->is_atomic
) {
5804 ldpp_dout(dpp
, 20) << "state for obj=" << state
->obj
<< " is not atomic, not appending atomic test" << dendl
;
5808 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
5809 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
5811 ldpp_dout(dpp
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
5816 int RGWRados::Object::get_state(const DoutPrefixProvider
*dpp
, RGWObjState
**pstate
, bool follow_olh
, optional_yield y
, bool assume_noent
)
5818 return store
->get_obj_state(dpp
, &ctx
, bucket_info
, obj
, pstate
, follow_olh
, y
, assume_noent
);
5821 void RGWRados::Object::invalidate_state()
5823 ctx
.invalidate(obj
);
5826 int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider
*dpp
,
5827 ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
5828 const char *if_match
, const char *if_nomatch
, bool removal_op
,
5829 bool modify_tail
, optional_yield y
)
5831 int r
= get_state(dpp
, &state
, false, y
);
5835 bool need_guard
= ((state
->manifest
) || (state
->obj_tag
.length() != 0) ||
5836 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
5839 if (!state
->is_atomic
) {
5840 ldpp_dout(dpp
, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
5844 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
5851 /* first verify that the object wasn't replaced under */
5852 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
5853 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
5854 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5858 if (strcmp(if_match
, "*") == 0) {
5859 // test the object is existing
5860 if (!state
->exists
) {
5861 return -ERR_PRECONDITION_FAILED
;
5865 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
5866 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
5867 return -ERR_PRECONDITION_FAILED
;
5873 if (strcmp(if_nomatch
, "*") == 0) {
5874 // test the object is NOT existing
5875 if (state
->exists
) {
5876 return -ERR_PRECONDITION_FAILED
;
5880 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
5881 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
5882 return -ERR_PRECONDITION_FAILED
;
5889 if (state
->exists
) {
5891 store
->remove_rgw_head_obj(op
);
5898 /* the object is being removed, no need to update its tag */
5903 state
->write_tag
= *ptag
;
5905 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
5908 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
5910 ldpp_dout(dpp
, 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
5912 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
5914 op
.setxattr(RGW_ATTR_TAIL_TAG
, bl
);
5921 * Set an attr on an object.
5922 * bucket: name of the bucket holding the object
5923 * obj: name of the object to set the attr on
5924 * name: the attr to set
5925 * bl: the contents of the attr
5926 * Returns: 0 on success, -ERR# otherwise.
5928 int RGWRados::set_attr(const DoutPrefixProvider
*dpp
, void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
5930 map
<string
, bufferlist
> attrs
;
5932 return set_attrs(dpp
, ctx
, bucket_info
, obj
, attrs
, NULL
, null_yield
);
5935 int RGWRados::set_attrs(const DoutPrefixProvider
*dpp
, void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& src_obj
,
5936 map
<string
, bufferlist
>& attrs
,
5937 map
<string
, bufferlist
>* rmattrs
,
5940 rgw_obj obj
= src_obj
;
5941 if (obj
.key
.instance
== "null") {
5942 obj
.key
.instance
.clear();
5946 int r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
5950 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
5952 ObjectWriteOperation op
;
5953 RGWObjState
*state
= NULL
;
5955 r
= append_atomic_test(dpp
, rctx
, bucket_info
, obj
, op
, &state
, y
);
5959 // ensure null version object exist
5960 if (src_obj
.key
.instance
== "null" && !state
->manifest
) {
5964 map
<string
, bufferlist
>::iterator iter
;
5966 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
5967 const string
& name
= iter
->first
;
5968 op
.rmxattr(name
.c_str());
5972 const rgw_bucket
& bucket
= obj
.bucket
;
5974 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
5975 const string
& name
= iter
->first
;
5976 bufferlist
& bl
= iter
->second
;
5981 op
.setxattr(name
.c_str(), bl
);
5983 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
5988 rgw_obj_index_key obj_key
;
5989 obj
.key
.get_index_key(&obj_key
);
5991 obj_expirer
->hint_add(dpp
, ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
5992 } catch (buffer::error
& err
) {
5993 ldpp_dout(dpp
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
6001 RGWObjectCtx
obj_ctx(this->store
);
6004 RGWRados::Bucket
bop(this, bucket_info
);
6005 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
6009 append_rand_alpha(cct
, tag
, tag
, 32);
6010 state
->write_tag
= tag
;
6011 r
= index_op
.prepare(dpp
, CLS_RGW_OP_ADD
, &state
->write_tag
, y
);
6016 bl
.append(tag
.c_str(), tag
.size() + 1);
6017 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6021 real_time mtime
= real_clock::now();
6022 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
6023 op
.mtime2(&mtime_ts
);
6024 auto& ioctx
= ref
.pool
.ioctx();
6025 r
= rgw_rados_operate(dpp
, ioctx
, ref
.obj
.oid
, &op
, null_yield
);
6028 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
6029 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
6030 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
6031 string etag
= rgw_bl_str(etag_bl
);
6032 string content_type
= rgw_bl_str(content_type_bl
);
6033 string storage_class
;
6034 auto iter
= attrs
.find(RGW_ATTR_STORAGE_CLASS
);
6035 if (iter
!= attrs
.end()) {
6036 storage_class
= rgw_bl_str(iter
->second
);
6038 uint64_t epoch
= ioctx
.get_last_version();
6039 int64_t poolid
= ioctx
.get_id();
6040 r
= index_op
.complete(dpp
, poolid
, epoch
, state
->size
, state
->accounted_size
,
6041 mtime
, etag
, content_type
, storage_class
, &acl_bl
,
6042 RGWObjCategory::Main
, NULL
);
6044 int ret
= index_op
.cancel(dpp
, nullptr);
6046 ldpp_dout(dpp
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
6054 state
->obj_tag
.swap(bl
);
6056 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
6057 state
->attrset
.erase(iter
->first
);
6061 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6062 state
->attrset
[iter
->first
] = iter
->second
;
6065 auto iter
= state
->attrset
.find(RGW_ATTR_ID_TAG
);
6066 if (iter
!= state
->attrset
.end()) {
6067 iter
->second
= state
->obj_tag
;
6074 int RGWRados::Object::Read::prepare(optional_yield y
, const DoutPrefixProvider
*dpp
)
6076 RGWRados
*store
= source
->get_store();
6077 CephContext
*cct
= store
->ctx();
6081 map
<string
, bufferlist
>::iterator iter
;
6083 RGWObjState
*astate
;
6084 int r
= source
->get_state(dpp
, &astate
, true, y
);
6088 if (!astate
->exists
) {
6092 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
6094 state
.obj
= astate
->obj
;
6095 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
6097 state
.cur_pool
= state
.head_obj
.pool
;
6098 state
.cur_ioctx
= &state
.io_ctxs
[state
.cur_pool
];
6100 r
= store
->get_obj_head_ioctx(dpp
, bucket_info
, state
.obj
, state
.cur_ioctx
);
6104 if (params
.target_obj
) {
6105 *params
.target_obj
= state
.obj
;
6108 *params
.attrs
= astate
->attrset
;
6109 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>()) {
6110 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
6111 ldpp_dout(dpp
, 20) << "Read xattr rgw_rados: " << iter
->first
<< dendl
;
6116 /* Convert all times go GMT to make them compatible */
6117 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
6118 obj_time_weight src_weight
;
6119 src_weight
.init(astate
);
6120 src_weight
.high_precision
= conds
.high_precision_time
;
6122 obj_time_weight dest_weight
;
6123 dest_weight
.high_precision
= conds
.high_precision_time
;
6125 if (conds
.mod_ptr
&& !conds
.if_nomatch
) {
6126 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6127 ldpp_dout(dpp
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6128 if (!(dest_weight
< src_weight
)) {
6129 return -ERR_NOT_MODIFIED
;
6133 if (conds
.unmod_ptr
&& !conds
.if_match
) {
6134 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6135 ldpp_dout(dpp
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6136 if (dest_weight
< src_weight
) {
6137 return -ERR_PRECONDITION_FAILED
;
6141 if (conds
.if_match
|| conds
.if_nomatch
) {
6142 r
= get_attr(dpp
, RGW_ATTR_ETAG
, etag
, y
);
6146 if (conds
.if_match
) {
6147 string if_match_str
= rgw_string_unquote(conds
.if_match
);
6148 ldpp_dout(dpp
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-Match: " << if_match_str
<< dendl
;
6149 if (if_match_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) != 0) {
6150 return -ERR_PRECONDITION_FAILED
;
6154 if (conds
.if_nomatch
) {
6155 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
6156 ldpp_dout(dpp
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
6157 if (if_nomatch_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) == 0) {
6158 return -ERR_NOT_MODIFIED
;
6163 if (params
.obj_size
)
6164 *params
.obj_size
= astate
->size
;
6166 *params
.lastmod
= astate
->mtime
;
6171 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
6178 } else if (end
< 0) {
6183 if (ofs
>= (off_t
)obj_size
) {
6186 if (end
>= (off_t
)obj_size
) {
6193 int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider
*dpp
, BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
6195 RGWRados
*store
= target
->get_store();
6196 BucketShard
*bs
= nullptr;
6199 #define NUM_RESHARD_RETRIES 10
6200 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
6201 int ret
= get_bucket_shard(&bs
, dpp
);
6203 ldpp_dout(dpp
, 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6207 if (r
!= -ERR_BUSY_RESHARDING
) {
6210 ldpp_dout(dpp
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
6211 string new_bucket_id
;
6212 r
= store
->block_while_resharding(bs
, &new_bucket_id
,
6213 target
->bucket_info
, null_yield
, dpp
);
6214 if (r
== -ERR_BUSY_RESHARDING
) {
6220 ldpp_dout(dpp
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
6221 i
= 0; /* resharding is finished, make sure we can retry */
6222 r
= target
->update_bucket_id(new_bucket_id
, dpp
);
6224 ldpp_dout(dpp
, 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
6241 int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider
*dpp
, RGWModifyOp op
, const string
*write_tag
, optional_yield y
)
6246 RGWRados
*store
= target
->get_store();
6248 if (write_tag
&& write_tag
->length()) {
6249 optag
= string(write_tag
->c_str(), write_tag
->length());
6251 if (optag
.empty()) {
6252 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
6256 int r
= guard_reshard(dpp
, nullptr, [&](BucketShard
*bs
) -> int {
6257 return store
->cls_obj_prepare_op(dpp
, *bs
, op
, optag
, obj
, bilog_flags
, y
, zones_trace
);
6268 int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider
*dpp
, int64_t poolid
, uint64_t epoch
,
6269 uint64_t size
, uint64_t accounted_size
,
6270 ceph::real_time
& ut
, const string
& etag
,
6271 const string
& content_type
, const string
& storage_class
,
6273 RGWObjCategory category
,
6274 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
,
6280 RGWRados
*store
= target
->get_store();
6281 BucketShard
*bs
= nullptr;
6283 int ret
= get_bucket_shard(&bs
, dpp
);
6285 ldpp_dout(dpp
, 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6289 rgw_bucket_dir_entry ent
;
6290 obj
.key
.get_index_key(&ent
.key
);
6291 ent
.meta
.size
= size
;
6292 ent
.meta
.accounted_size
= accounted_size
;
6293 ent
.meta
.mtime
= ut
;
6294 ent
.meta
.etag
= etag
;
6295 ent
.meta
.storage_class
= storage_class
;
6297 ent
.meta
.user_data
= *user_data
;
6300 if (acl_bl
&& acl_bl
->length()) {
6301 int ret
= store
->decode_policy(dpp
, *acl_bl
, &owner
);
6303 ldpp_dout(dpp
, 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
6306 ent
.meta
.owner
= owner
.get_id().to_str();
6307 ent
.meta
.owner_display_name
= owner
.get_display_name();
6308 ent
.meta
.content_type
= content_type
;
6309 ent
.meta
.appendable
= appendable
;
6311 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
6313 int r
= store
->svc
.datalog_rados
->add_entry(dpp
, target
->bucket_info
, bs
->shard_id
);
6315 ldpp_dout(dpp
, -1) << "ERROR: failed writing data log" << dendl
;
6321 int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider
*dpp
,
6322 int64_t poolid
, uint64_t epoch
,
6323 real_time
& removed_mtime
,
6324 list
<rgw_obj_index_key
> *remove_objs
)
6329 RGWRados
*store
= target
->get_store();
6330 BucketShard
*bs
= nullptr;
6332 int ret
= get_bucket_shard(&bs
, dpp
);
6334 ldpp_dout(dpp
, 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6338 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
6340 int r
= store
->svc
.datalog_rados
->add_entry(dpp
, target
->bucket_info
, bs
->shard_id
);
6342 ldpp_dout(dpp
, -1) << "ERROR: failed writing data log" << dendl
;
6349 int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider
*dpp
,
6350 list
<rgw_obj_index_key
> *remove_objs
)
6355 RGWRados
*store
= target
->get_store();
6358 int ret
= guard_reshard(dpp
, &bs
, [&](BucketShard
*bs
) -> int {
6359 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, remove_objs
, bilog_flags
, zones_trace
);
6363 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6364 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6365 * have no way to tell that they're all caught up
6367 int r
= store
->svc
.datalog_rados
->add_entry(dpp
, target
->bucket_info
, bs
->shard_id
);
6369 ldpp_dout(dpp
, -1) << "ERROR: failed writing data log" << dendl
;
6375 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
, optional_yield y
, const DoutPrefixProvider
*dpp
)
6377 RGWRados
*store
= source
->get_store();
6379 rgw_raw_obj read_obj
;
6380 uint64_t read_ofs
= ofs
;
6381 uint64_t len
, read_len
;
6382 bool reading_from_head
= true;
6383 ObjectReadOperation op
;
6385 bool merge_bl
= false;
6386 bufferlist
*pbl
= &bl
;
6388 uint64_t max_chunk_size
;
6390 RGWObjState
*astate
;
6391 int r
= source
->get_state(dpp
, &astate
, true, y
);
6395 if (astate
->size
== 0) {
6397 } else if (end
>= (int64_t)astate
->size
) {
6398 end
= astate
->size
- 1;
6404 len
= end
- ofs
+ 1;
6406 if (astate
->manifest
&& astate
->manifest
->has_tail()) {
6407 /* now get the relevant object part */
6408 RGWObjManifest::obj_iterator iter
= astate
->manifest
->obj_find(dpp
, ofs
);
6410 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
6411 read_obj
= iter
.get_location().get_raw_obj(store
->store
);
6412 len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6413 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6414 reading_from_head
= (read_obj
== state
.head_obj
);
6416 read_obj
= state
.head_obj
;
6419 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
, dpp
);
6421 ldpp_dout(dpp
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
6425 if (len
> max_chunk_size
)
6426 len
= max_chunk_size
;
6431 if (reading_from_head
) {
6432 /* only when reading from the head object do we need to do the atomic test */
6433 r
= store
->append_atomic_test(dpp
, &source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
, y
);
6437 if (astate
&& astate
->prefetch_data
) {
6438 if (!ofs
&& astate
->data
.length() >= len
) {
6443 if (ofs
< astate
->data
.length()) {
6444 unsigned copy_len
= std::min((uint64_t)astate
->data
.length() - ofs
, len
);
6445 astate
->data
.begin(ofs
).copy(copy_len
, bl
);
6446 read_len
-= copy_len
;
6447 read_ofs
+= copy_len
;
6457 ldpp_dout(dpp
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
6458 op
.read(read_ofs
, read_len
, pbl
, NULL
);
6460 if (state
.cur_pool
!= read_obj
.pool
) {
6461 auto iter
= state
.io_ctxs
.find(read_obj
.pool
);
6462 if (iter
== state
.io_ctxs
.end()) {
6463 state
.cur_ioctx
= &state
.io_ctxs
[read_obj
.pool
];
6464 r
= store
->open_pool_ctx(dpp
, read_obj
.pool
, *state
.cur_ioctx
, false);
6466 ldpp_dout(dpp
, 20) << "ERROR: failed to open pool context for pool=" << read_obj
.pool
<< " r=" << r
<< dendl
;
6470 state
.cur_ioctx
= &iter
->second
;
6472 state
.cur_pool
= read_obj
.pool
;
6475 state
.cur_ioctx
->locator_set_key(read_obj
.loc
);
6477 r
= state
.cur_ioctx
->operate(read_obj
.oid
, &op
, NULL
);
6478 ldpp_dout(dpp
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
6491 int get_obj_data::flush(rgw::AioResultList
&& results
) {
6492 int r
= rgw::check_for_errors(results
);
6496 std::list
<bufferlist
> bl_list
;
6498 auto cmp
= [](const auto& lhs
, const auto& rhs
) { return lhs
.id
< rhs
.id
; };
6499 results
.sort(cmp
); // merge() requires results to be sorted first
6500 completed
.merge(results
, cmp
); // merge results in sorted order
6502 while (!completed
.empty() && completed
.front().id
== offset
) {
6503 auto bl
= std::move(completed
.front().data
);
6505 bl_list
.push_back(bl
);
6506 offset
+= bl
.length();
6507 int r
= client_cb
->handle_data(bl
, 0, bl
.length());
6512 if (rgwrados
->get_use_datacache()) {
6513 const std::lock_guard
l(d3n_get_data
.d3n_lock
);
6514 auto oid
= completed
.front().obj
.get_ref().obj
.oid
;
6515 if (bl
.length() <= g_conf()->rgw_get_obj_max_req_size
&& !d3n_bypass_cache_write
) {
6516 lsubdout(g_ceph_context
, rgw_datacache
, 10) << "D3nDataCache: " << __func__
<< "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl
.length() << dendl
;
6517 rgwrados
->d3n_data_cache
->put(bl
, bl
.length(), oid
);
6519 lsubdout(g_ceph_context
, rgw_datacache
, 10) << "D3nDataCache: " << __func__
<< "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl
.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write
<< dendl
;
6522 completed
.pop_front_and_dispose(std::default_delete
<rgw::AioResultEntry
>{});
6527 static int _get_obj_iterate_cb(const DoutPrefixProvider
*dpp
,
6528 const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6529 off_t read_ofs
, off_t len
, bool is_head_obj
,
6530 RGWObjState
*astate
, void *arg
)
6532 struct get_obj_data
* d
= static_cast<struct get_obj_data
*>(arg
);
6533 return d
->rgwrados
->get_obj_iterate_cb(dpp
, read_obj
, obj_ofs
, read_ofs
, len
,
6534 is_head_obj
, astate
, arg
);
6537 int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider
*dpp
,
6538 const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6539 off_t read_ofs
, off_t len
, bool is_head_obj
,
6540 RGWObjState
*astate
, void *arg
)
6542 ObjectReadOperation op
;
6543 struct get_obj_data
* d
= static_cast<struct get_obj_data
*>(arg
);
6547 /* only when reading from the head object do we need to do the atomic test */
6548 int r
= append_atomic_test(dpp
, astate
, op
);
6553 obj_ofs
< astate
->data
.length()) {
6554 unsigned chunk_len
= std::min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
6556 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
6561 d
->offset
+= chunk_len
;
6562 read_ofs
+= chunk_len
;
6563 obj_ofs
+= chunk_len
;
6569 auto obj
= d
->rgwrados
->svc
.rados
->obj(read_obj
);
6570 int r
= obj
.open(dpp
);
6572 ldpp_dout(dpp
, 4) << "failed to open rados context for " << read_obj
<< dendl
;
6576 ldpp_dout(dpp
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
6577 op
.read(read_ofs
, len
, nullptr, nullptr);
6579 const uint64_t cost
= len
;
6580 const uint64_t id
= obj_ofs
; // use logical object offset for sorting replies
6582 auto completed
= d
->aio
->get(obj
, rgw::Aio::librados_op(std::move(op
), d
->yield
), cost
, id
);
6584 return d
->flush(std::move(completed
));
6587 int RGWRados::Object::Read::iterate(const DoutPrefixProvider
*dpp
, int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
,
6590 RGWRados
*store
= source
->get_store();
6591 CephContext
*cct
= store
->ctx();
6592 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
6593 const uint64_t chunk_size
= cct
->_conf
->rgw_get_obj_max_req_size
;
6594 const uint64_t window_size
= cct
->_conf
->rgw_get_obj_window_size
;
6596 auto aio
= rgw::make_throttle(window_size
, y
);
6597 get_obj_data
data(store
, cb
, &*aio
, ofs
, y
);
6599 int r
= store
->iterate_obj(dpp
, obj_ctx
, source
->get_bucket_info(), state
.obj
,
6600 ofs
, end
, chunk_size
, _get_obj_iterate_cb
, &data
, y
);
6602 ldpp_dout(dpp
, 0) << "iterate_obj() failed with " << r
<< dendl
;
6603 data
.cancel(); // drain completions without writing back to client
6607 return data
.drain();
6610 int RGWRados::iterate_obj(const DoutPrefixProvider
*dpp
, RGWObjectCtx
& obj_ctx
,
6611 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6612 off_t ofs
, off_t end
, uint64_t max_chunk_size
,
6613 iterate_obj_cb cb
, void *arg
, optional_yield y
)
6615 rgw_raw_obj head_obj
;
6616 rgw_raw_obj read_obj
;
6617 uint64_t read_ofs
= ofs
;
6619 bool reading_from_head
= true;
6620 RGWObjState
*astate
= NULL
;
6622 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
6624 int r
= get_obj_state(dpp
, &obj_ctx
, bucket_info
, obj
, &astate
, false, y
);
6632 len
= end
- ofs
+ 1;
6634 if (astate
->manifest
) {
6635 /* now get the relevant object stripe */
6636 RGWObjManifest::obj_iterator iter
= astate
->manifest
->obj_find(dpp
, ofs
);
6638 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
->obj_end(dpp
);
6640 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
6641 off_t stripe_ofs
= iter
.get_stripe_ofs();
6642 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
6644 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
6645 read_obj
= iter
.get_location().get_raw_obj(store
);
6646 uint64_t read_len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6647 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6649 if (read_len
> max_chunk_size
) {
6650 read_len
= max_chunk_size
;
6653 reading_from_head
= (read_obj
== head_obj
);
6654 r
= cb(dpp
, read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
6664 while (ofs
<= end
) {
6665 read_obj
= head_obj
;
6666 uint64_t read_len
= std::min(len
, max_chunk_size
);
6668 r
= cb(dpp
, read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
6681 int RGWRados::obj_operate(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
6684 int r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
6689 return rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, op
, null_yield
);
6692 int RGWRados::obj_operate(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
6695 int r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
6702 return rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, op
, &outbl
, null_yield
);
6705 int RGWRados::olh_init_modification_impl(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
6707 ObjectWriteOperation op
;
6709 ceph_assert(olh_obj
.key
.instance
.empty());
6711 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
6713 if (!state
.exists
) {
6717 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
6718 op
.mtime2(&mtime_ts
);
6722 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6723 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6724 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6725 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6726 * log will reflect that.
6728 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6729 * is used for object data instance, olh_tag for olh instance.
6732 /* guard against racing writes */
6733 bucket_index_guard_olh_op(dpp
, state
, op
);
6738 string obj_tag
= gen_rand_alphanumeric_lower(cct
, 32);
6741 bl
.append(obj_tag
.c_str(), obj_tag
.size());
6742 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6744 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
6748 string olh_tag
= gen_rand_alphanumeric_lower(cct
, 32);
6751 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
6752 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
6754 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
6755 state
.olh_tag
= olh_bl
;
6756 state
.is_olh
= true;
6759 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
6763 RGWOLHPendingInfo pending_info
;
6764 pending_info
.time
= real_clock::now();
6765 encode(pending_info
, bl
);
6767 #define OLH_PENDING_TAG_LEN 32
6768 /* tag will start with current time epoch, this so that entries are sorted by time */
6770 utime_t
ut(pending_info
.time
);
6771 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
6774 string s
= gen_rand_alphanumeric_lower(cct
, OLH_PENDING_TAG_LEN
- op_tag
->size());
6778 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
6779 attr_name
.append(*op_tag
);
6781 op
.setxattr(attr_name
.c_str(), bl
);
6783 int ret
= obj_operate(dpp
, bucket_info
, olh_obj
, &op
);
6788 state
.exists
= true;
6789 state
.attrset
[attr_name
] = bl
;
6794 int RGWRados::olh_init_modification(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
6798 ret
= olh_init_modification_impl(dpp
, bucket_info
, state
, obj
, op_tag
);
6799 if (ret
== -EEXIST
) {
6806 int RGWRados::guard_reshard(const DoutPrefixProvider
*dpp
,
6808 const rgw_obj
& obj_instance
,
6809 const RGWBucketInfo
& bucket_info
,
6810 std::function
<int(BucketShard
*)> call
)
6813 const rgw_obj
*pobj
= &obj_instance
;
6816 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
6817 r
= bs
->init(pobj
->bucket
, *pobj
, nullptr /* no RGWBucketInfo */, dpp
);
6819 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << r
<< dendl
;
6823 if (r
!= -ERR_BUSY_RESHARDING
) {
6826 ldpp_dout(dpp
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
6827 string new_bucket_id
;
6828 r
= block_while_resharding(bs
, &new_bucket_id
, bucket_info
, null_yield
, dpp
);
6829 if (r
== -ERR_BUSY_RESHARDING
) {
6835 ldpp_dout(dpp
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
6836 i
= 0; /* resharding is finished, make sure we can retry */
6839 obj
.bucket
.update_bucket_id(new_bucket_id
);
6850 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
,
6851 string
*new_bucket_id
,
6852 const RGWBucketInfo
& bucket_info
,
6854 const DoutPrefixProvider
*dpp
)
6857 cls_rgw_bucket_instance_entry entry
;
6859 // since we want to run this recovery code from two distinct places,
6860 // let's just put it in a lambda so we can easily re-use; if the
6861 // lambda successfully fetches a new bucket id, it sets
6862 // new_bucket_id and returns 0, otherwise it returns a negative
6864 auto fetch_new_bucket_id
=
6865 [this, &bucket_info
, dpp
](const std::string
& log_tag
,
6866 std::string
* new_bucket_id
) -> int {
6867 RGWBucketInfo fresh_bucket_info
= bucket_info
;
6868 int ret
= try_refresh_bucket_info(fresh_bucket_info
, nullptr, dpp
);
6870 ldpp_dout(dpp
, 0) << __func__
<<
6871 " ERROR: failed to refresh bucket info after reshard at " <<
6872 log_tag
<< ": " << cpp_strerror(-ret
) << dendl
;
6875 *new_bucket_id
= fresh_bucket_info
.bucket
.bucket_id
;
6879 constexpr int num_retries
= 10;
6880 for (int i
= 1; i
<= num_retries
; i
++) { // nb: 1-based for loop
6881 auto& ref
= bs
->bucket_obj
.get_ref();
6882 ret
= cls_rgw_get_bucket_resharding(ref
.pool
.ioctx(), ref
.obj
.oid
, &entry
);
6883 if (ret
== -ENOENT
) {
6884 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id
);
6885 } else if (ret
< 0) {
6886 ldpp_dout(dpp
, 0) << __func__
<<
6887 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret
) <<
6892 if (!entry
.resharding_in_progress()) {
6893 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6897 ldpp_dout(dpp
, 20) << "NOTICE: reshard still in progress; " <<
6898 (i
< num_retries
? "retrying" : "too many retries") << dendl
;
6900 if (i
== num_retries
) {
6904 // If bucket is erroneously marked as resharding (e.g., crash or
6905 // other error) then fix it. If we can take the bucket reshard
6906 // lock then it means no other resharding should be taking place,
6907 // and we're free to clear the flags.
6909 // since we expect to do this rarely, we'll do our work in a
6910 // block and erase our work after each try
6912 RGWObjectCtx
obj_ctx(this->store
);
6913 const rgw_bucket
& b
= bs
->bucket
;
6914 std::string bucket_id
= b
.get_key();
6915 RGWBucketReshardLock
reshard_lock(this->store
, bucket_info
, true);
6916 ret
= reshard_lock
.lock(dpp
);
6918 ldpp_dout(dpp
, 20) << __PRETTY_FUNCTION__
<<
6919 ": failed to take reshard lock for bucket " <<
6920 bucket_id
<< "; expected if resharding underway" << dendl
;
6922 ldpp_dout(dpp
, 10) << __PRETTY_FUNCTION__
<<
6923 ": was able to take reshard lock for bucket " <<
6925 ret
= RGWBucketReshard::clear_resharding(dpp
, this->store
, bucket_info
);
6927 reshard_lock
.unlock();
6928 ldpp_dout(dpp
, 0) << __PRETTY_FUNCTION__
<<
6929 " ERROR: failed to clear resharding flags for bucket " <<
6932 reshard_lock
.unlock();
6933 ldpp_dout(dpp
, 5) << __PRETTY_FUNCTION__
<<
6934 ": apparently successfully cleared resharding flags for "
6935 "bucket " << bucket_id
<< dendl
;
6936 continue; // if we apparently succeed immediately test again
6937 } // if clear resharding succeeded
6938 } // if taking of lock succeeded
6939 } // block to encapsulate recovery from incomplete reshard
6941 ret
= reshard_wait
->wait(y
);
6943 ldpp_dout(dpp
, 0) << __PRETTY_FUNCTION__
<<
6944 " ERROR: bucket is still resharding, please retry" << dendl
;
6949 ldpp_dout(dpp
, 0) << __PRETTY_FUNCTION__
<<
6950 " ERROR: bucket is still resharding, please retry" << dendl
;
6951 return -ERR_BUSY_RESHARDING
;
6954 int RGWRados::bucket_index_link_olh(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
6956 const string
& op_tag
,
6957 struct rgw_bucket_dir_entry_meta
*meta
,
6959 real_time unmod_since
, bool high_precision_time
,
6960 rgw_zone_set
*_zones_trace
, bool log_data_change
)
6963 int r
= get_obj_head_ref(dpp
, bucket_info
, obj_instance
, &ref
);
6968 rgw_zone_set zones_trace
;
6970 zones_trace
= *_zones_trace
;
6972 zones_trace
.insert(svc
.zone
->get_zone().id
, bucket_info
.bucket
.get_key());
6974 BucketShard
bs(this);
6976 r
= guard_reshard(dpp
, &bs
, obj_instance
, bucket_info
,
6977 [&](BucketShard
*bs
) -> int {
6978 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
6979 auto& ref
= bs
->bucket_obj
.get_ref();
6980 librados::ObjectWriteOperation op
;
6981 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
6982 cls_rgw_bucket_link_olh(op
, key
, olh_state
.olh_tag
,
6983 delete_marker
, op_tag
, meta
, olh_epoch
,
6984 unmod_since
, high_precision_time
,
6985 svc
.zone
->get_zone().log_data
, zones_trace
);
6986 return rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
6989 ldpp_dout(dpp
, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
6993 r
= svc
.datalog_rados
->add_entry(dpp
, bucket_info
, bs
.shard_id
);
6995 ldpp_dout(dpp
, 0) << "ERROR: failed writing data log" << dendl
;
7001 void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider
*dpp
, RGWObjState
& olh_state
, ObjectOperation
& op
)
7003 ldpp_dout(dpp
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
7004 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
7007 int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
7008 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
7011 int r
= get_obj_head_ref(dpp
, bucket_info
, obj_instance
, &ref
);
7016 rgw_zone_set zones_trace
;
7018 zones_trace
= *_zones_trace
;
7020 zones_trace
.insert(svc
.zone
->get_zone().id
, bucket_info
.bucket
.get_key());
7022 BucketShard
bs(this);
7024 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
7025 r
= guard_reshard(dpp
, &bs
, obj_instance
, bucket_info
,
7026 [&](BucketShard
*bs
) -> int {
7027 auto& ref
= bs
->bucket_obj
.get_ref();
7028 librados::ObjectWriteOperation op
;
7029 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7030 cls_rgw_bucket_unlink_instance(op
, key
, op_tag
,
7031 olh_tag
, olh_epoch
, svc
.zone
->get_zone().log_data
, zones_trace
);
7032 return rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7035 ldpp_dout(dpp
, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r
<< dendl
;
7042 int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider
*dpp
,
7043 const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
7044 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
7045 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
7049 int r
= get_obj_head_ref(dpp
, bucket_info
, obj_instance
, &ref
);
7054 BucketShard
bs(this);
7056 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */, dpp
);
7058 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7062 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7064 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7066 ret
= guard_reshard(dpp
, &bs
, obj_instance
, bucket_info
,
7067 [&](BucketShard
*bs
) -> int {
7068 auto& ref
= bs
->bucket_obj
.get_ref();
7069 ObjectReadOperation op
;
7070 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7072 rgw_cls_read_olh_log_ret log_ret
;
7074 cls_rgw_get_olh_log(op
, key
, ver_marker
, olh_tag
, log_ret
, op_ret
);
7076 int r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, &outbl
, null_yield
);
7084 *log
= std::move(log_ret
.log
);
7085 *is_truncated
= log_ret
.is_truncated
;
7089 ldpp_dout(dpp
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
7096 // a multisite sync bug resulted in the OLH head attributes being overwritten by
7097 // the attributes from another zone, causing link_olh() to fail endlessly due to
7098 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7099 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7100 int RGWRados::repair_olh(const DoutPrefixProvider
*dpp
, RGWObjState
* state
, const RGWBucketInfo
& bucket_info
,
7103 // fetch the current olh entry from the bucket index
7104 rgw_bucket_olh_entry olh
;
7105 int r
= bi_get_olh(dpp
, bucket_info
, obj
, &olh
);
7107 ldpp_dout(dpp
, 0) << "repair_olh failed to read olh entry for " << obj
<< dendl
;
7110 if (olh
.tag
== rgw_bl_str(state
->olh_tag
)) { // mismatch already resolved?
7114 ldpp_dout(dpp
, 4) << "repair_olh setting olh_tag=" << olh
.tag
7115 << " key=" << olh
.key
<< " delete_marker=" << olh
.delete_marker
<< dendl
;
7117 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7118 ObjectWriteOperation op
;
7119 // assert this is the same olh tag we think we're fixing
7120 bucket_index_guard_olh_op(dpp
, *state
, op
);
7121 // preserve existing mtime
7122 struct timespec mtime_ts
= ceph::real_clock::to_timespec(state
->mtime
);
7123 op
.mtime2(&mtime_ts
);
7126 bl
.append(olh
.tag
.c_str(), olh
.tag
.size());
7127 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, bl
);
7131 info
.target
= rgw_obj(bucket_info
.bucket
, olh
.key
);
7132 info
.removed
= olh
.delete_marker
;
7135 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7138 r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
7142 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7144 ldpp_dout(dpp
, 0) << "repair_olh failed to write olh attributes with "
7145 << cpp_strerror(r
) << dendl
;
7151 int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
7154 int r
= get_obj_head_ref(dpp
, bucket_info
, obj_instance
, &ref
);
7159 BucketShard
bs(this);
7161 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */, dpp
);
7163 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7167 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7169 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7171 ret
= guard_reshard(dpp
, &bs
, obj_instance
, bucket_info
,
7172 [&](BucketShard
*pbs
) -> int {
7173 ObjectWriteOperation op
;
7174 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7175 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
7176 return pbs
->bucket_obj
.operate(dpp
, &op
, null_yield
);
7179 ldpp_dout(dpp
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
7186 int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
7189 int r
= get_obj_head_ref(dpp
, bucket_info
, obj_instance
, &ref
);
7194 BucketShard
bs(this);
7196 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7198 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7200 int ret
= guard_reshard(dpp
, &bs
, obj_instance
, bucket_info
,
7201 [&](BucketShard
*pbs
) -> int {
7202 ObjectWriteOperation op
;
7203 auto& ref
= pbs
->bucket_obj
.get_ref();
7204 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7205 cls_rgw_clear_olh(op
, key
, olh_tag
);
7206 return rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7209 ldpp_dout(dpp
, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
7216 static int decode_olh_info(const DoutPrefixProvider
*dpp
, CephContext
* cct
, const bufferlist
& bl
, RGWOLHInfo
*olh
)
7219 auto biter
= bl
.cbegin();
7220 decode(*olh
, biter
);
7222 } catch (buffer::error
& err
) {
7223 ldpp_dout(dpp
, 0) << "ERROR: failed to decode olh info" << dendl
;
7228 int RGWRados::apply_olh_log(const DoutPrefixProvider
*dpp
,
7229 RGWObjectCtx
& obj_ctx
,
7231 const RGWBucketInfo
& bucket_info
,
7233 bufferlist
& olh_tag
,
7234 std::map
<uint64_t, std::vector
<rgw_bucket_olh_log_entry
> >& log
,
7235 uint64_t *plast_ver
,
7236 rgw_zone_set
* zones_trace
)
7242 librados::ObjectWriteOperation op
;
7244 uint64_t last_ver
= log
.rbegin()->first
;
7245 *plast_ver
= last_ver
;
7247 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
7249 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7250 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GTE
, last_ver
);
7253 string last_ver_s
= to_string(last_ver
);
7254 ver_bl
.append(last_ver_s
.c_str(), last_ver_s
.size());
7255 op
.setxattr(RGW_ATTR_OLH_VER
, ver_bl
);
7257 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
7258 op
.mtime2(&mtime_ts
);
7260 bool need_to_link
= false;
7261 uint64_t link_epoch
= 0;
7262 cls_rgw_obj_key key
;
7263 bool delete_marker
= false;
7264 list
<cls_rgw_obj_key
> remove_instances
;
7265 bool need_to_remove
= false;
7267 // decode current epoch and instance
7268 auto olh_ver
= state
.attrset
.find(RGW_ATTR_OLH_VER
);
7269 if (olh_ver
!= state
.attrset
.end()) {
7270 std::string str
= olh_ver
->second
.to_str();
7272 link_epoch
= strict_strtoll(str
.c_str(), 10, &err
);
7274 auto olh_info
= state
.attrset
.find(RGW_ATTR_OLH_INFO
);
7275 if (olh_info
!= state
.attrset
.end()) {
7277 int r
= decode_olh_info(dpp
, cct
, olh_info
->second
, &info
);
7281 info
.target
.key
.get_index_key(&key
);
7282 delete_marker
= info
.removed
;
7285 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
7286 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
7287 for (; viter
!= iter
->second
.end(); ++viter
) {
7288 rgw_bucket_olh_log_entry
& entry
= *viter
;
7290 ldpp_dout(dpp
, 20) << "olh_log_entry: epoch=" << iter
->first
<< " op=" << (int)entry
.op
7291 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
7292 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
7294 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
7295 remove_instances
.push_back(entry
.key
);
7297 case CLS_RGW_OLH_OP_LINK_OLH
:
7298 // only overwrite a link of the same epoch if its key sorts before
7299 if (link_epoch
< iter
->first
|| key
.instance
.empty() ||
7300 key
.instance
> entry
.key
.instance
) {
7301 ldpp_dout(dpp
, 20) << "apply_olh_log applying key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7302 << " over current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7303 need_to_link
= true;
7304 need_to_remove
= false;
7306 delete_marker
= entry
.delete_marker
;
7308 ldpp_dout(dpp
, 20) << "apply_olh skipping key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7309 << " before current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7312 case CLS_RGW_OLH_OP_UNLINK_OLH
:
7313 need_to_remove
= true;
7314 need_to_link
= false;
7317 ldpp_dout(dpp
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
7320 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7321 attr_name
.append(entry
.op_tag
);
7322 op
.rmxattr(attr_name
.c_str());
7327 int r
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
7332 const rgw_bucket
& bucket
= obj
.bucket
;
7335 rgw_obj
target(bucket
, key
);
7337 info
.target
= target
;
7338 info
.removed
= delete_marker
;
7341 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7344 /* first remove object instances */
7345 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
7346 liter
!= remove_instances
.end(); ++liter
) {
7347 cls_rgw_obj_key
& key
= *liter
;
7348 rgw_obj
obj_instance(bucket
, key
);
7349 int ret
= delete_obj(dpp
, obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
7350 if (ret
< 0 && ret
!= -ENOENT
) {
7351 ldpp_dout(dpp
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
7356 /* update olh object */
7357 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7358 if (r
== -ECANCELED
) {
7362 ldpp_dout(dpp
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7366 r
= bucket_index_trim_olh_log(dpp
, bucket_info
, state
, obj
, last_ver
);
7368 ldpp_dout(dpp
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
7372 if (need_to_remove
) {
7373 ObjectWriteOperation rm_op
;
7375 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7376 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_EQ
, last_ver
);
7377 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
7380 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &rm_op
, null_yield
);
7381 if (r
== -ECANCELED
) {
7382 return 0; /* someone else won this race */
7385 * only clear if was successful, otherwise we might clobber pending operations on this object
7387 r
= bucket_index_clear_olh(dpp
, bucket_info
, state
, obj
);
7389 ldpp_dout(dpp
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
7399 * read olh log and apply it
7401 int RGWRados::update_olh(const DoutPrefixProvider
*dpp
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
7403 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
7405 uint64_t ver_marker
= 0;
7408 int ret
= bucket_index_read_olh_log(dpp
, bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
7412 ret
= apply_olh_log(dpp
, obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
7416 } while (is_truncated
);
7421 int RGWRados::set_olh(const DoutPrefixProvider
*dpp
, RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
7422 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
,
7423 optional_yield y
, rgw_zone_set
*zones_trace
, bool log_data_change
)
7427 rgw_obj olh_obj
= target_obj
;
7428 olh_obj
.key
.instance
.clear();
7430 RGWObjState
*state
= NULL
;
7435 #define MAX_ECANCELED_RETRY 100
7436 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7437 if (ret
== -ECANCELED
) {
7438 obj_ctx
.invalidate(olh_obj
);
7441 ret
= get_obj_state(dpp
, &obj_ctx
, bucket_info
, olh_obj
, &state
, false, y
); /* don't follow olh */
7446 ret
= olh_init_modification(dpp
, bucket_info
, *state
, olh_obj
, &op_tag
);
7448 ldpp_dout(dpp
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7449 if (ret
== -ECANCELED
) {
7454 ret
= bucket_index_link_olh(dpp
, bucket_info
, *state
, target_obj
, delete_marker
,
7455 op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
,
7456 zones_trace
, log_data_change
);
7458 ldpp_dout(dpp
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7459 if (ret
== -ECANCELED
) {
7460 // the bucket index rejected the link_olh() due to olh tag mismatch;
7461 // attempt to reconstruct olh head attributes based on the bucket index
7462 int r2
= repair_olh(dpp
, state
, bucket_info
, olh_obj
);
7463 if (r2
< 0 && r2
!= -ECANCELED
) {
7473 if (i
== MAX_ECANCELED_RETRY
) {
7474 ldpp_dout(dpp
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7478 ret
= update_olh(dpp
, obj_ctx
, state
, bucket_info
, olh_obj
);
7479 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7483 ldpp_dout(dpp
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7490 int RGWRados::unlink_obj_instance(const DoutPrefixProvider
*dpp
, RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
7491 uint64_t olh_epoch
, optional_yield y
, rgw_zone_set
*zones_trace
)
7495 rgw_obj olh_obj
= target_obj
;
7496 olh_obj
.key
.instance
.clear();
7498 RGWObjState
*state
= NULL
;
7503 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7504 if (ret
== -ECANCELED
) {
7505 obj_ctx
.invalidate(olh_obj
);
7508 ret
= get_obj_state(dpp
, &obj_ctx
, bucket_info
, olh_obj
, &state
, false, y
); /* don't follow olh */
7512 ret
= olh_init_modification(dpp
, bucket_info
, *state
, olh_obj
, &op_tag
);
7514 ldpp_dout(dpp
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7515 if (ret
== -ECANCELED
) {
7521 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
7523 ret
= bucket_index_unlink_instance(dpp
, bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
7525 ldpp_dout(dpp
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7526 if (ret
== -ECANCELED
) {
7534 if (i
== MAX_ECANCELED_RETRY
) {
7535 ldpp_dout(dpp
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7539 ret
= update_olh(dpp
, obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
7540 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7544 ldpp_dout(dpp
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7551 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key
*target_key
)
7553 #define OBJ_INSTANCE_LEN 32
7554 char buf
[OBJ_INSTANCE_LEN
+ 1];
7556 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
7557 no underscore for instance name due to the way we encode the raw keys */
7559 target_key
->set_instance(buf
);
7562 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
7564 gen_rand_obj_instance_name(&target_obj
->key
);
7567 int RGWRados::get_olh(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
7569 map
<string
, bufferlist
> attrset
;
7571 ObjectReadOperation op
;
7572 op
.getxattrs(&attrset
, NULL
);
7574 int r
= obj_operate(dpp
, bucket_info
, obj
, &op
);
7579 auto iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
7580 if (iter
== attrset
.end()) { /* not an olh */
7584 return decode_olh_info(dpp
, cct
, iter
->second
, olh
);
7587 void RGWRados::check_pending_olh_entries(const DoutPrefixProvider
*dpp
, map
<string
, bufferlist
>& pending_entries
,
7588 map
<string
, bufferlist
> *rm_pending_entries
)
7590 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
7592 real_time now
= real_clock::now();
7594 while (iter
!= pending_entries
.end()) {
7595 auto biter
= iter
->second
.cbegin();
7596 RGWOLHPendingInfo pending_info
;
7598 decode(pending_info
, biter
);
7599 } catch (buffer::error
& err
) {
7600 /* skipping bad entry, we could remove it but it might hide a bug */
7601 ldpp_dout(dpp
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
7606 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
7608 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
7609 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
7610 pending_entries
.erase(cur_iter
);
7612 /* entries names are sorted by time (rounded to a second) */
7618 int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
7621 int r
= get_obj_head_ref(dpp
, bucket_info
, olh_obj
, &ref
);
7626 // trim no more than 1000 entries per osd op
7627 constexpr int max_entries
= 1000;
7629 auto i
= pending_attrs
.begin();
7630 while (i
!= pending_attrs
.end()) {
7631 ObjectWriteOperation op
;
7632 bucket_index_guard_olh_op(dpp
, state
, op
);
7634 for (int n
= 0; n
< max_entries
&& i
!= pending_attrs
.end(); ++n
, ++i
) {
7635 op
.rmxattr(i
->first
.c_str());
7638 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
7639 if (r
== -ENOENT
|| r
== -ECANCELED
) {
7640 /* raced with some other change, shouldn't sweat about it */
7644 ldpp_dout(dpp
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7651 int RGWRados::follow_olh(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
7653 map
<string
, bufferlist
> pending_entries
;
7654 rgw_filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
7656 map
<string
, bufferlist
> rm_pending_entries
;
7657 check_pending_olh_entries(dpp
,pending_entries
, &rm_pending_entries
);
7659 if (!rm_pending_entries
.empty()) {
7660 int ret
= remove_olh_pending_entries(dpp
, bucket_info
, *state
, olh_obj
, rm_pending_entries
);
7662 ldpp_dout(dpp
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
7666 if (!pending_entries
.empty()) {
7667 ldpp_dout(dpp
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
7669 int ret
= update_olh(dpp
, obj_ctx
, state
, bucket_info
, olh_obj
);
7675 auto iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
7676 if (iter
== state
->attrset
.end()) {
7681 int ret
= decode_olh_info(dpp
, cct
, iter
->second
, &olh
);
7690 *target
= olh
.target
;
7695 int RGWRados::raw_obj_stat(const DoutPrefixProvider
*dpp
,
7696 rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
7697 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
7698 RGWObjVersionTracker
*objv_tracker
, optional_yield y
)
7701 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
7706 map
<string
, bufferlist
> unfiltered_attrset
;
7708 struct timespec mtime_ts
;
7710 ObjectReadOperation op
;
7712 objv_tracker
->prepare_op_for_read(&op
);
7715 op
.getxattrs(&unfiltered_attrset
, NULL
);
7717 if (psize
|| pmtime
) {
7718 op
.stat2(&size
, &mtime_ts
, NULL
);
7721 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
7724 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, &outbl
, null_yield
);
7727 *epoch
= ref
.pool
.ioctx().get_last_version();
7736 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
7738 rgw_filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
7744 int RGWRados::get_bucket_stats(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
7745 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
, bool *syncstopped
)
7747 vector
<rgw_bucket_dir_header
> headers
;
7748 map
<int, string
> bucket_instance_ids
;
7749 int r
= cls_bucket_head(dpp
, bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
7754 ceph_assert(headers
.size() == bucket_instance_ids
.size());
7756 auto iter
= headers
.begin();
7757 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
7758 BucketIndexShardsManager ver_mgr
;
7759 BucketIndexShardsManager master_ver_mgr
;
7760 BucketIndexShardsManager marker_mgr
;
7762 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
7763 accumulate_raw_stats(*iter
, stats
);
7764 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->ver
);
7765 ver_mgr
.add(viter
->first
, string(buf
));
7766 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->master_ver
);
7767 master_ver_mgr
.add(viter
->first
, string(buf
));
7768 if (shard_id
>= 0) {
7769 *max_marker
= iter
->max_marker
;
7771 marker_mgr
.add(viter
->first
, iter
->max_marker
);
7773 if (syncstopped
!= NULL
)
7774 *syncstopped
= iter
->syncstopped
;
7776 ver_mgr
.to_string(bucket_ver
);
7777 master_ver_mgr
.to_string(master_ver
);
7779 marker_mgr
.to_string(max_marker
);
7784 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
7785 RGWGetBucketStats_CB
*cb
;
7787 map
<RGWObjCategory
, RGWStorageStats
> stats
;
7790 ceph::mutex lock
= ceph::make_mutex("RGWGetBucketStatsContext");
7793 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
7794 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true)
7797 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
7798 std::lock_guard l
{lock
};
7801 accumulate_raw_stats(header
, stats
);
7807 if (--pendings
== 0) {
7809 cb
->set_response(&stats
);
7811 cb
->handle_response(ret_code
);
7818 std::lock_guard l
{lock
};
7823 int RGWRados::get_bucket_stats_async(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
7826 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.layout
.current_index
.layout
.normal
.num_shards
? : 1);
7827 ceph_assert(get_ctx
);
7828 int r
= cls_bucket_head_async(dpp
, bucket_info
, shard_id
, get_ctx
, &num_aio
);
7832 get_ctx
->unset_cb();
7839 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
,
7840 const string
& meta_key
,
7841 RGWBucketInfo
& info
,
7843 map
<string
, bufferlist
> *pattrs
,
7845 const DoutPrefixProvider
*dpp
)
7848 rgw_bucket_parse_bucket_key(cct
, meta_key
, &bucket
, nullptr);
7850 return get_bucket_instance_info(obj_ctx
, bucket
, info
, pmtime
, pattrs
, y
, dpp
);
7853 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
7854 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
, optional_yield y
,
7855 const DoutPrefixProvider
*dpp
)
7857 RGWSI_MetaBackend_CtxParams bectx_params
= RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx
);
7858 return ctl
.bucket
->read_bucket_instance_info(bucket
, &info
,
7861 RGWBucketCtl::BucketInstance::GetParams()
7864 .set_bectx_params(bectx_params
));
7867 int RGWRados::get_bucket_info(RGWServices
*svc
,
7868 const string
& tenant
, const string
& bucket_name
,
7869 RGWBucketInfo
& info
,
7872 const DoutPrefixProvider
*dpp
, map
<string
, bufferlist
> *pattrs
)
7874 auto obj_ctx
= svc
->sysobj
->init_obj_ctx();
7875 RGWSI_MetaBackend_CtxParams bectx_params
= RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx
);
7877 bucket
.tenant
= tenant
;
7878 bucket
.name
= bucket_name
;
7879 return ctl
.bucket
->read_bucket_info(bucket
, &info
, y
, dpp
,
7880 RGWBucketCtl::BucketInstance::GetParams()
7883 .set_bectx_params(bectx_params
));
7886 int RGWRados::try_refresh_bucket_info(RGWBucketInfo
& info
,
7887 ceph::real_time
*pmtime
,
7888 const DoutPrefixProvider
*dpp
,
7889 map
<string
, bufferlist
> *pattrs
)
7891 rgw_bucket bucket
= info
.bucket
;
7892 bucket
.bucket_id
.clear();
7894 auto rv
= info
.objv_tracker
.read_version
;
7896 return ctl
.bucket
->read_bucket_info(bucket
, &info
, null_yield
, dpp
,
7897 RGWBucketCtl::BucketInstance::GetParams()
7900 .set_refresh_version(rv
));
7903 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
7904 real_time mtime
, map
<string
, bufferlist
> *pattrs
,
7905 const DoutPrefixProvider
*dpp
)
7907 return ctl
.bucket
->store_bucket_instance_info(info
.bucket
, info
, null_yield
, dpp
,
7908 RGWBucketCtl::BucketInstance::PutParams()
7909 .set_exclusive(exclusive
)
7911 .set_attrs(pattrs
));
7914 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
7915 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
,
7916 const DoutPrefixProvider
*dpp
)
7918 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
7920 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
, dpp
);
7926 return 0; /* done! */
7928 RGWBucketEntryPoint entry_point
;
7929 entry_point
.bucket
= info
.bucket
;
7930 entry_point
.owner
= info
.owner
;
7931 entry_point
.creation_time
= info
.creation_time
;
7932 entry_point
.linked
= true;
7933 RGWObjVersionTracker ot
;
7934 if (pep_objv
&& !pep_objv
->tag
.empty()) {
7935 ot
.write_version
= *pep_objv
;
7937 ot
.generate_new_write_ver(cct
);
7939 *pep_objv
= ot
.write_version
;
7942 ret
= ctl
.bucket
->store_bucket_entrypoint_info(info
.bucket
, entry_point
, null_yield
, dpp
, RGWBucketCtl::Bucket::PutParams()
7943 .set_exclusive(exclusive
)
7944 .set_objv_tracker(&ot
)
7952 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
, const DoutPrefixProvider
*dpp
)
7954 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
7956 map
<string
, RGWBucketEnt
>::iterator iter
;
7957 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
7958 RGWBucketEnt
& ent
= iter
->second
;
7959 rgw_bucket
& bucket
= ent
.bucket
;
7962 ent
.size_rounded
= 0;
7964 vector
<rgw_bucket_dir_header
> headers
;
7966 RGWBucketInfo bucket_info
;
7967 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
, null_yield
, dpp
);
7972 int r
= cls_bucket_head(dpp
, bucket_info
, RGW_NO_SHARD
, headers
);
7976 auto hiter
= headers
.begin();
7977 for (; hiter
!= headers
.end(); ++hiter
) {
7978 RGWObjCategory category
= main_category
;
7979 auto iter
= (hiter
->stats
).find(category
);
7980 if (iter
!= hiter
->stats
.end()) {
7981 struct rgw_bucket_category_stats
& stats
= iter
->second
;
7982 ent
.count
+= stats
.num_entries
;
7983 ent
.size
+= stats
.total_size
;
7984 ent
.size_rounded
+= stats
.total_size_rounded
;
7988 // fill in placement_rule from the bucket instance for use in swift's
7989 // per-storage policy statistics
7990 ent
.placement_rule
= std::move(bucket_info
.placement_rule
);
7996 int RGWRados::append_async(const DoutPrefixProvider
*dpp
, rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
7999 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
8003 librados::Rados
*rad
= get_rados_handle();
8004 librados::AioCompletion
*completion
= rad
->aio_create_completion(nullptr, nullptr);
8006 r
= ref
.pool
.ioctx().aio_append(ref
.obj
.oid
, completion
, bl
, size
);
8007 completion
->release();
8011 int RGWRados::pool_iterate_begin(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
8013 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8014 librados::NObjectIterator
& iter
= ctx
.iter
;
8016 int r
= open_pool_ctx(dpp
, pool
, io_ctx
, false);
8020 iter
= io_ctx
.nobjects_begin();
8025 int RGWRados::pool_iterate_begin(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, const string
& cursor
, RGWPoolIterCtx
& ctx
)
8027 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8028 librados::NObjectIterator
& iter
= ctx
.iter
;
8030 int r
= open_pool_ctx(dpp
, pool
, io_ctx
, false);
8034 librados::ObjectCursor oc
;
8035 if (!oc
.from_str(cursor
)) {
8036 ldpp_dout(dpp
, 10) << "failed to parse cursor: " << cursor
<< dendl
;
8041 iter
= io_ctx
.nobjects_begin(oc
);
8043 } catch (const std::system_error
& e
) {
8044 r
= -e
.code().value();
8045 ldpp_dout(dpp
, 10) << "nobjects_begin threw " << e
.what()
8046 << ", returning " << r
<< dendl
;
8048 } catch (const std::exception
& e
) {
8049 ldpp_dout(dpp
, 10) << "nobjects_begin threw " << e
.what()
8050 << ", returning -5" << dendl
;
8055 string
RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx
& ctx
)
8057 return ctx
.iter
.get_cursor().to_str();
8060 static int do_pool_iterate(const DoutPrefixProvider
*dpp
, CephContext
* cct
, RGWPoolIterCtx
& ctx
, uint32_t num
,
8061 vector
<rgw_bucket_dir_entry
>& objs
,
8062 bool *is_truncated
, RGWAccessListFilter
*filter
)
8064 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8065 librados::NObjectIterator
& iter
= ctx
.iter
;
8067 if (iter
== io_ctx
.nobjects_end())
8072 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
8073 rgw_bucket_dir_entry e
;
8075 string oid
= iter
->get_oid();
8076 ldpp_dout(dpp
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
8078 // fill it in with initial values; we may correct later
8079 if (filter
&& !filter
->filter(oid
, oid
))
8087 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
8092 int RGWRados::pool_iterate(const DoutPrefixProvider
*dpp
, RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
8093 bool *is_truncated
, RGWAccessListFilter
*filter
)
8095 // catch exceptions from NObjectIterator::operator++()
8097 return do_pool_iterate(dpp
, cct
, ctx
, num
, objs
, is_truncated
, filter
);
8098 } catch (const std::system_error
& e
) {
8099 int r
= -e
.code().value();
8100 ldpp_dout(dpp
, 10) << "NObjectIterator threw exception " << e
.what()
8101 << ", returning " << r
<< dendl
;
8103 } catch (const std::exception
& e
) {
8104 ldpp_dout(dpp
, 10) << "NObjectIterator threw exception " << e
.what()
8105 << ", returning -5" << dendl
;
8110 int RGWRados::list_raw_objects_init(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, const string
& marker
, RGWListRawObjsCtx
*ctx
)
8112 if (!ctx
->initialized
) {
8113 int r
= pool_iterate_begin(dpp
, pool
, marker
, ctx
->iter_ctx
);
8115 ldpp_dout(dpp
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
8118 ctx
->initialized
= true;
8123 int RGWRados::list_raw_objects_next(const DoutPrefixProvider
*dpp
, const string
& prefix_filter
, int max
,
8124 RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8127 if (!ctx
.initialized
) {
8130 RGWAccessListFilterPrefix
filter(prefix_filter
);
8131 vector
<rgw_bucket_dir_entry
> objs
;
8132 int r
= pool_iterate(dpp
, ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
8135 ldpp_dout(dpp
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
8139 vector
<rgw_bucket_dir_entry
>::iterator iter
;
8140 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
8141 oids
.push_back(iter
->key
.name
);
8147 int RGWRados::list_raw_objects(const DoutPrefixProvider
*dpp
, const rgw_pool
& pool
, const string
& prefix_filter
,
8148 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8151 if (!ctx
.initialized
) {
8152 int r
= list_raw_objects_init(dpp
, pool
, string(), &ctx
);
8158 return list_raw_objects_next(dpp
, prefix_filter
, max
, ctx
, oids
, is_truncated
);
8161 string
RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx
& ctx
)
8163 return pool_iterate_get_cursor(ctx
.iter_ctx
);
8166 int RGWRados::bi_get_instance(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8167 rgw_bucket_dir_entry
*dirent
)
8169 rgw_cls_bi_entry bi_entry
;
8170 int r
= bi_get(dpp
, bucket_info
, obj
, BIIndexType::Instance
, &bi_entry
);
8171 if (r
< 0 && r
!= -ENOENT
) {
8172 ldpp_dout(dpp
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
8177 auto iter
= bi_entry
.data
.cbegin();
8179 decode(*dirent
, iter
);
8180 } catch (buffer::error
& err
) {
8181 ldpp_dout(dpp
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
8188 int RGWRados::bi_get_olh(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8189 rgw_bucket_olh_entry
*olh
)
8191 rgw_cls_bi_entry bi_entry
;
8192 int r
= bi_get(dpp
, bucket_info
, obj
, BIIndexType::OLH
, &bi_entry
);
8193 if (r
< 0 && r
!= -ENOENT
) {
8194 ldpp_dout(dpp
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
8199 auto iter
= bi_entry
.data
.cbegin();
8202 } catch (buffer::error
& err
) {
8203 ldpp_dout(dpp
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
8210 int RGWRados::bi_get(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8211 BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
8213 BucketShard
bs(this);
8214 int ret
= bs
.init(dpp
, bucket_info
, obj
);
8216 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8220 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
8222 auto& ref
= bs
.bucket_obj
.get_ref();
8224 return cls_rgw_bi_get(ref
.pool
.ioctx(), ref
.obj
.oid
, index_type
, key
, entry
);
8227 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8229 auto& ref
= bs
.bucket_obj
.get_ref();
8230 cls_rgw_bi_put(op
, ref
.obj
.oid
, entry
);
8233 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8235 auto& ref
= bs
.bucket_obj
.get_ref();
8236 int ret
= cls_rgw_bi_put(ref
.pool
.ioctx(), ref
.obj
.oid
, entry
);
8243 int RGWRados::bi_put(const DoutPrefixProvider
*dpp
, rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
8245 // make sure incomplete multipart uploads are hashed correctly
8246 if (obj
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
8248 mp
.from_meta(obj
.key
.name
);
8249 obj
.index_hash_source
= mp
.get_key();
8251 BucketShard
bs(this);
8253 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */, dpp
);
8255 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8259 return bi_put(bs
, entry
);
8262 int RGWRados::bi_list(const DoutPrefixProvider
*dpp
, rgw_bucket
& bucket
,
8263 const string
& obj_name_filter
, const string
& marker
, uint32_t max
,
8264 list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8266 rgw_obj
obj(bucket
, obj_name_filter
);
8267 BucketShard
bs(this);
8268 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */, dpp
);
8270 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8274 auto& ref
= bs
.bucket_obj
.get_ref();
8275 ret
= cls_rgw_bi_list(ref
.pool
.ioctx(), ref
.obj
.oid
, obj_name_filter
, marker
, max
, entries
, is_truncated
);
8276 if (ret
== -ENOENT
) {
8277 *is_truncated
= false;
8285 int RGWRados::bi_list(BucketShard
& bs
, const string
& obj_name_filter
, const string
& marker
, uint32_t max
,
8286 list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8288 auto& ref
= bs
.bucket_obj
.get_ref();
8289 int ret
= cls_rgw_bi_list(ref
.pool
.ioctx(), ref
.obj
.oid
, obj_name_filter
, marker
, max
, entries
, is_truncated
);
8296 int RGWRados::bi_list(const DoutPrefixProvider
*dpp
,
8297 const RGWBucketInfo
& bucket_info
, int shard_id
, const string
& obj_name_filter
, const string
& marker
, uint32_t max
,
8298 list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8300 BucketShard
bs(this);
8301 int ret
= bs
.init(bucket_info
.bucket
, shard_id
, bucket_info
.layout
.current_index
, nullptr /* no RGWBucketInfo */, dpp
);
8303 ldpp_dout(dpp
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8307 return bi_list(bs
, obj_name_filter
, marker
, max
, entries
, is_truncated
);
8310 int RGWRados::bi_remove(const DoutPrefixProvider
*dpp
, BucketShard
& bs
)
8312 auto& ref
= bs
.bucket_obj
.get_ref();
8313 int ret
= ref
.pool
.ioctx().remove(ref
.obj
.oid
);
8314 if (ret
== -ENOENT
) {
8318 ldpp_dout(dpp
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
8325 int RGWRados::gc_operate(const DoutPrefixProvider
*dpp
, string
& oid
, librados::ObjectWriteOperation
*op
)
8327 return rgw_rados_operate(dpp
, gc_pool_ctx
, oid
, op
, null_yield
);
8330 int RGWRados::gc_aio_operate(const string
& oid
, librados::AioCompletion
*c
,
8331 librados::ObjectWriteOperation
*op
)
8333 return gc_pool_ctx
.aio_operate(oid
, c
, op
);
8336 int RGWRados::gc_operate(const DoutPrefixProvider
*dpp
, string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
8338 return rgw_rados_operate(dpp
, gc_pool_ctx
, oid
, op
, pbl
, null_yield
);
8341 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
, bool& processing_queue
)
8343 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
, processing_queue
);
8346 int RGWRados::process_gc(bool expired_only
)
8348 return gc
->process(expired_only
);
8351 int RGWRados::list_lc_progress(string
& marker
, uint32_t max_entries
,
8352 vector
<rgw::sal::Lifecycle::LCEntry
>& progress_map
,
8355 return lc
->list_lc_progress(marker
, max_entries
, progress_map
, index
);
8358 int RGWRados::process_lc(const std::unique_ptr
<rgw::sal::Bucket
>& optional_bucket
)
8361 lc
.initialize(cct
, this->store
);
8362 RGWLC::LCWorker
worker(&lc
, cct
, &lc
, 0);
8363 auto ret
= lc
.process(&worker
, optional_bucket
, true /* once */);
8364 lc
.stop_processor(); // sets down_flag, but returns immediately
8368 bool RGWRados::process_expire_objects(const DoutPrefixProvider
*dpp
)
8370 return obj_expirer
->inspect_all_shards(dpp
, utime_t(), ceph_clock_now());
8373 int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider
*dpp
, BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
8374 rgw_obj
& obj
, uint16_t bilog_flags
, optional_yield y
, rgw_zone_set
*_zones_trace
)
8376 rgw_zone_set zones_trace
;
8378 zones_trace
= *_zones_trace
;
8380 zones_trace
.insert(svc
.zone
->get_zone().id
, bs
.bucket
.get_key());
8382 ObjectWriteOperation o
;
8383 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
8384 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
8385 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), svc
.zone
->get_zone().log_data
, bilog_flags
, zones_trace
);
8386 return bs
.bucket_obj
.operate(dpp
, &o
, y
);
8389 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
8390 int64_t pool
, uint64_t epoch
,
8391 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
8392 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
8394 ObjectWriteOperation o
;
8395 rgw_bucket_dir_entry_meta dir_meta
;
8396 dir_meta
= ent
.meta
;
8397 dir_meta
.category
= category
;
8399 rgw_zone_set zones_trace
;
8401 zones_trace
= *_zones_trace
;
8403 zones_trace
.insert(svc
.zone
->get_zone().id
, bs
.bucket
.get_key());
8405 rgw_bucket_entry_ver ver
;
8408 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
8409 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
8410 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
8411 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
);
8412 complete_op_data
*arg
;
8413 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
8414 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
, &arg
);
8415 librados::AioCompletion
*completion
= arg
->rados_completion
;
8416 int ret
= bs
.bucket_obj
.aio_operate(arg
->rados_completion
, &o
);
8417 completion
->release(); /* can't reference arg here, as it might have already been released */
8421 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
8422 int64_t pool
, uint64_t epoch
,
8423 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
8424 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
8426 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
8429 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
8430 int64_t pool
, uint64_t epoch
,
8432 real_time
& removed_mtime
,
8433 list
<rgw_obj_index_key
> *remove_objs
,
8434 uint16_t bilog_flags
,
8435 rgw_zone_set
*zones_trace
)
8437 rgw_bucket_dir_entry ent
;
8438 ent
.meta
.mtime
= removed_mtime
;
8439 obj
.key
.get_index_key(&ent
.key
);
8440 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
,
8441 ent
, RGWObjCategory::None
, remove_objs
,
8442 bilog_flags
, zones_trace
);
8445 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
,
8446 list
<rgw_obj_index_key
> *remove_objs
,
8447 uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
8449 rgw_bucket_dir_entry ent
;
8450 obj
.key
.get_index_key(&ent
.key
);
8451 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
,
8452 -1 /* pool id */, 0, ent
,
8453 RGWObjCategory::None
, remove_objs
, bilog_flags
,
8457 int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
, uint64_t timeout
)
8459 RGWSI_RADOS::Pool index_pool
;
8460 map
<int, string
> bucket_objs
;
8461 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, std::nullopt
, &index_pool
, &bucket_objs
, nullptr);
8465 return CLSRGWIssueSetTagTimeout(index_pool
.ioctx(), bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
8469 uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries
,
8470 uint32_t num_shards
)
8472 // We want to minimize the chances that when num_shards >>
8473 // num_entries that we return much fewer than num_entries to the
8474 // client. Given all the overhead of making a cls call to the osd,
8475 // returning a few entries is not much more work than returning one
8476 // entry. This minimum might be better tuned based on future
8477 // experiments where num_shards >> num_entries. (Note: ">>" should
8478 // be interpreted as "much greater than".)
8479 constexpr uint32_t min_read
= 8;
8481 // The following is based on _"Balls into Bins" -- A Simple and
8482 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8483 // cases when num_shards >> num_entries (it almost serves as a
8484 // ceiling calculation). We also assume alpha is 1.0 and extract it
8485 // from the calculation. Future work could involve memoizing some of
8486 // the transcendental functions to minimize repeatedly re-calling
8487 // them with the same parameters, which we expect to be the case the
8488 // majority of the time.
8489 uint32_t calc_read
=
8491 static_cast<uint32_t>((num_entries
/ num_shards
) +
8492 sqrt((2 * num_entries
) *
8493 log(num_shards
) / num_shards
));
8495 return std::max(min_read
, calc_read
);
// Merge-list a bucket's index across its shards in lexical (S3/Swift
// listing) order, issuing one batched per-shard list request and then
// consuming the per-shard results through a priority map of candidates.
//
// NOTE(review): this chunk of the file is missing several original
// source lines inside this function (parts of the parameter list —
// shard_id, the output map `m`, `is_truncated`, `cls_filtered`,
// optional_yield `y` — plus assorted braces and returns). All code
// tokens below are preserved exactly as found; only comments were added.
int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
                                      RGWBucketInfo& bucket_info,
                                      const rgw_obj_index_key& start_after,
                                      const std::string& prefix,
                                      const std::string& delimiter,
                                      const uint32_t num_entries,
                                      const bool list_versions,
                                      const uint16_t expansion_factor,
                                      rgw_obj_index_key *last_entry,
                                      RGWBucketListNameFilter force_check_filter)
  /* expansion_factor allows the number of entries to read to grow
   * exponentially; this is used when earlier reads are producing too
   * few results, perhaps due to filtering or to a series of
   * namespaced entries */

  // trace the full set of listing parameters at debug level 10
  ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": " <<
    bucket_info.bucket <<
    " start_after=\"" << start_after <<
    "\", prefix=\"" << prefix <<
    ", delimiter=\"" << delimiter <<
    "\", shard_id=" << shard_id <<
    "\", num_entries=" << num_entries <<
    ", list_versions=" << list_versions <<
    ", expansion_factor=" << expansion_factor <<
    ", force_check_filter is " <<
    (force_check_filter ? "set" : "unset") << dendl;

  RGWSI_RADOS::Pool index_pool;
  // key - oid (for different shards if there is any)
  // value - list result for the corresponding oid (shard), it is filled by
  std::map<int, std::string> shard_oids;
  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id,
                                          &index_pool, &shard_oids,
    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
      ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;

  // choose a per-shard request size; see calc_ordered_bucket_list_per_shard
  const uint32_t shard_count = shard_oids.size();
  uint32_t num_entries_per_shard;
  if (expansion_factor == 0) {
    num_entries_per_shard =
      calc_ordered_bucket_list_per_shard(num_entries, shard_count);
  } else if (expansion_factor <= 11) {
    // we'll max out the exponential multiplication factor at 1024 (2<<10)
    num_entries_per_shard =
      std::min(num_entries,
               (uint32_t(1 << (expansion_factor - 1)) *
                calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
    num_entries_per_shard = num_entries;

  ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
    ": request from each of " << shard_count <<
    " shard(s) for " << num_entries_per_shard << " entries to get " <<
    num_entries << " total entries" << dendl;

  // issue the batched per-shard list requests
  auto& ioctx = index_pool.ioctx();
  std::map<int, rgw_cls_list_ret> shard_list_results;
  cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
  r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
                            num_entries_per_shard,
                            list_versions, shard_oids, shard_list_results,
                            cct->_conf->rgw_bucket_index_max_aio)();
    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
      ": CLSRGWIssueBucketList for " << bucket_info.bucket <<

  // to manage the iterators through each shard's list results
  struct ShardTracker {
    const size_t shard_idx;
    rgw_cls_list_ret& result;
    const std::string& oid_name;
    RGWRados::ent_map_t::iterator cursor;
    RGWRados::ent_map_t::iterator end;

    // manages an iterator through a shard and provides other
    ShardTracker(size_t _shard_idx,
                 rgw_cls_list_ret& _result,
                 const std::string& _oid_name):
      shard_idx(_shard_idx),
      oid_name(_oid_name),
      cursor(_result.dir.m.begin()),
      end(_result.dir.m.end())

    inline const std::string& entry_name() const {
      return cursor->first;
    rgw_bucket_dir_entry& dir_entry() const {
      return cursor->second;
    inline bool is_truncated() const {
      return result.is_truncated;
    inline ShardTracker& advance() {
      // return a self-reference to allow for chaining of calls, such
      // as x.advance().at_end()
    inline bool at_end() const {
      return cursor == end;

  // add the next unique candidate, or return false if we reach the end
  auto next_candidate = [] (CephContext *cct, ShardTracker& t,
                            std::map<std::string, size_t>& candidates,
                            size_t tracker_idx) {
    while (!t.at_end()) {
      if (candidates.emplace(t.entry_name(), tracker_idx).second) {
      t.advance(); // skip duplicate common prefixes

  // one tracker per shard requested (may not be all shards)
  std::vector<ShardTracker> results_trackers;
  results_trackers.reserve(shard_list_results.size());
  for (auto& r : shard_list_results) {
    results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);

    // if any *one* shard's result is trucated, the entire result is
    *is_truncated = *is_truncated || r.second.is_truncated;

    // unless *all* are shards are cls_filtered, the entire result is
    *cls_filtered = *cls_filtered && r.second.cls_filtered;

  // create a map to track the next candidate entry from ShardTracker
  // (key=candidate, value=index into results_trackers); as we consume
  // entries from shards, we replace them with the next entries in the
  // shards until we run out
  std::map<std::string, size_t> candidates;
  size_t tracker_idx = 0;
  for (auto& t : results_trackers) {
    // it's important that the values in the map refer to the index
    // into the results_trackers vector, which may not be the same
    // as the shard number (i.e., when not all shards are requested)
    next_candidate(cct, t, candidates, tracker_idx);

  rgw_bucket_dir_entry*
    last_entry_visited = nullptr; // to set last_entry (marker)
  std::map<std::string, bufferlist> updates;

  while (count < num_entries && !candidates.empty()) {
    // select the next entry in lexical order (first key in map);
    // again tracker_idx is not necessarily shard number, but is index
    // into results_trackers vector
    tracker_idx = candidates.begin()->second;
    auto& tracker = results_trackers.at(tracker_idx);

    const std::string& name = tracker.entry_name();
    rgw_bucket_dir_entry& dirent = tracker.dir_entry();

    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ": currently processing " <<
      dirent.key << " from shard " << tracker.shard_idx << dendl;

    const bool force_check =
      force_check_filter && force_check_filter(dirent.key.name);

    if ((!dirent.exists &&
         !dirent.is_delete_marker() &&
         !dirent.is_common_prefix()) ||
        !dirent.pending_map.empty() ||
      /* there are uncommitted ops. We need to check the current
       * state, and if the tags are old we need to do clean-up as
      librados::IoCtx sub_ctx;
      r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
                           updates[tracker.oid_name], y);
      if (r < 0 && r != -ENOENT) {
        ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
          ": check_disk_state for \"" << dirent.key <<
          "\" failed with r=" << r << dendl;

    // at this point either r >= 0 or r == -ENOENT
    if (r >= 0) { // i.e., if r != -ENOENT
      ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": got " <<
        dirent.key << dendl;

      auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
      last_entry_visited = &it->second;
        ldpp_dout(dpp, 0) << "WARNING: " << __PRETTY_FUNCTION__ <<
          " reassigned map value at \"" << name <<
          "\", which should not happen" << dendl;
      ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": skipping " <<
        dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
      last_entry_visited = &tracker.dir_entry();

    // refresh the candidates map
    candidates.erase(candidates.begin());

    next_candidate(cct, tracker, candidates, tracker_idx);

    if (tracker.at_end() && tracker.is_truncated()) {
      // once we exhaust one shard that is truncated, we need to stop,
      // as we cannot be certain that one of the next entries needs to
      // come from that shard; S3 and swift protocols allow returning
      // fewer than what was requested
      ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
        ": stopped accumulating results at count=" << count <<
        ", dirent=\"" << dirent.key <<
        "\", because its shard is untruncated and exhaused" << dendl;
  } // while we haven't provided requested # of result entries

  // suggest updates if there are any
  for (auto& miter : updates) {
    if (miter.second.length()) {
      ObjectWriteOperation o;
      cls_rgw_suggest_changes(o, miter.second);
      // we don't care if we lose suggested updates, send them off blindly
        librados::Rados::aio_create_completion(nullptr, nullptr);
      ioctx.aio_operate(miter.first, c, &o);

  // determine truncation by checking if all the returned entries are
  *is_truncated = false;
  for (const auto& t : results_trackers) {
    if (!t.at_end() || t.is_truncated()) {
      *is_truncated = true;

  ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
    ": returning, count=" << count << ", is_truncated=" << *is_truncated <<

  if (*is_truncated && count < num_entries) {
    ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
      ": requested " << num_entries << " entries but returning " <<
      count << ", which is truncated" << dendl;

  if (last_entry_visited != nullptr && last_entry) {
    *last_entry = last_entry_visited->key;
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
      ": returning, last_entry=" << *last_entry << dendl;
    ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
      ": returning, last_entry NOT SET" << dendl;
// A helper function to retrieve the hash source from an incomplete
// multipart entry: strips everything from the second-to-last '.' to the
// end of the name (e.g. "obj.<uploadid>.meta" -> "obj"). Returns 0 on
// success or -EINVAL when the name does not contain two usable dots.
static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
  auto dot = oid_wo_ns.rfind('.');
  if (dot == std::string::npos || dot < 1) {
    return -EINVAL;
  }
  // back up to the dot before the last one
  dot = oid_wo_ns.rfind('.', dot - 1);
  if (dot == std::string::npos || dot < 1) {
    return -EINVAL;
  }
  *index_hash_source = oid_wo_ns.substr(0, dot);
  return 0;
}
// "Allow-unordered" bucket index listing: walks shards one at a time in
// shard order (cheaper than the merged, lexically ordered listing), and
// resumes from the shard that owns the start marker when one is given.
//
// NOTE(review): several original lines are absent from this view (the
// shard_id / list_versions / y / is_truncated parameter lines, plus
// assorted braces and returns). Code tokens are preserved exactly as
// found; only comments were added.
int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
                                        RGWBucketInfo& bucket_info,
                                        const rgw_obj_index_key& start_after,
                                        const std::string& prefix,
                                        uint32_t num_entries,
                                        std::vector<rgw_bucket_dir_entry>& ent_list,
                                        rgw_obj_index_key *last_entry,
                                        RGWBucketListNameFilter force_check_filter) {
  ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << " " <<
    bucket_info.bucket <<
    " start_after=\"" << start_after <<
    "\", prefix=\"" << prefix <<
    "\", shard_id=" << shard_id <<
    "\", num_entries=" << num_entries <<
    ", list_versions=" << list_versions <<
    ", force_check_filter is " <<
    (force_check_filter ? "set" : "unset") << dendl;

  static MultipartMetaFilter multipart_meta_filter;

  *is_truncated = false;
  RGWSI_RADOS::Pool index_pool;

  std::map<int, std::string> oids;
  int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, nullptr);

  auto& ioctx = index_pool.ioctx();

  const uint32_t num_shards = oids.size();

  // pick the shard to begin scanning from: explicit shard id, shard 0
  // for an empty marker, or the shard that owns the marker's key
  rgw_obj_index_key marker = start_after;
  uint32_t current_shard;
  if (shard_id >= 0) {
    current_shard = shard_id;
  } else if (start_after.empty()) {
    // at this point we have a marker (start_after) that has something
    // in it, so we need to get to the bucket shard index, so we can
    // start reading from there


    // now convert the key (oid) to an rgw_obj_key since that will
    // separate out the namespace, name, and instance
    rgw_obj_key obj_key;
    bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
        " received an invalid start marker: \"" << start_after << "\"" <<
    } else if (obj_key.name.empty()) {
      // if the name is empty that means the object name came in with
      // a namespace only, and therefore we need to start our scan at
      // the first bucket index shard
      // so now we have the key used to compute the bucket index shard
      // and can extract the specific shard from it
      if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
        // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
        // the implementation relying on MultipartMetaFilter
        // because MultipartMetaFilter only checks .meta suffix, which may
        // exclude data multiparts but include some regular objects with .meta suffix
        string index_hash_source;
        r = parse_index_hash_source(obj_key.name, &index_hash_source);
          ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
            " parse_index_hash_source unable to parse \"" << obj_key.name <<
            "\", r=" << r << dendl;
        current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
        current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);

  uint32_t count = 0u;
  std::map<std::string, bufferlist> updates;
  rgw_obj_index_key last_added_entry;
  // scan shard by shard until enough entries are gathered or shards run out
  while (count <= num_entries &&
         ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
          current_shard < num_shards)) {
    const std::string& oid = oids[current_shard];
    rgw_cls_list_ret result;

    librados::ObjectReadOperation op;
    const std::string empty_delimiter;
    cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
                           list_versions, &result);
    r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
      ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
        " error in rgw_rados_operate (bucket list op), r=" << r << dendl;

    for (auto& entry : result.dir.m) {
      rgw_bucket_dir_entry& dirent = entry.second;

      bool force_check = force_check_filter &&
        force_check_filter(dirent.key.name);
      if ((!dirent.exists && !dirent.is_delete_marker()) ||
          !dirent.pending_map.empty() ||
        /* there are uncommitted ops. We need to check the current state,
         * and if the tags are old we need to do cleanup as well. */
        librados::IoCtx sub_ctx;

        r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
        if (r < 0 && r != -ENOENT) {
          ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
            " error in check_disk_state, r=" << r << dendl;

      // at this point either r >= 0 or r == -ENOENT
      if (r >= 0) { // i.e., if r != -ENOENT
        ldpp_dout(dpp, 10) << __func__ << ": got " <<
          dirent.key << dendl;

        if (count < num_entries) {
          marker = last_added_entry = dirent.key; // double assign
          ent_list.emplace_back(std::move(dirent));
          last_added_entry = dirent.key;
          *is_truncated = true;
          ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
            ": reached max entries (" << num_entries << ") to return at \"" <<
            dirent.key << "\"" << dendl;
      } else { // r == -ENOENT
        // in the case of -ENOENT, make sure we're advancing marker
        // for possible next call to CLSRGWIssueBucketList
        marker = dirent.key;

    if (!result.is_truncated) {
      // if we reached the end of the shard read next shard
      marker = rgw_obj_index_key();

  // suggest updates if there is any
  std::map<std::string, bufferlist>::iterator miter = updates.begin();
  for (; miter != updates.end(); ++miter) {
    if (miter->second.length()) {
      ObjectWriteOperation o;
      cls_rgw_suggest_changes(o, miter->second);
      // we don't care if we lose suggested updates, send them off blindly
      AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
      ioctx.aio_operate(miter->first, c, &o);

  if (last_entry && !ent_list.empty()) {
    *last_entry = last_added_entry;

} // RGWRados::cls_bucket_list_unordered
8993 int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider
*dpp
, const string
& oid
,
8994 rgw_usage_log_info
& info
)
8996 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
8999 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
9004 ObjectWriteOperation op
;
9005 cls_rgw_usage_log_add(op
, info
);
9007 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
9011 int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider
*dpp
, const string
& oid
, const string
& user
, const string
& bucket
,
9012 uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
9013 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
,
9016 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9019 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
9024 *is_truncated
= false;
9026 r
= cls_rgw_usage_log_read(ref
.pool
.ioctx(), ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
,
9027 max_entries
, read_iter
, usage
, is_truncated
);
9032 static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider
*dpp
, rgw_rados_ref ref
, const string
& user
, const string
& bucket
, uint64_t start_epoch
, uint64_t end_epoch
)
9036 librados::ObjectWriteOperation op
;
9037 cls_rgw_usage_log_trim(op
, user
, bucket
, start_epoch
, end_epoch
);
9038 int r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
9048 int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider
*dpp
, const string
& oid
, const string
& user
, const string
& bucket
,
9049 uint64_t start_epoch
, uint64_t end_epoch
)
9051 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9054 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
9059 r
= cls_rgw_usage_log_trim_repeat(dpp
, ref
, user
, bucket
, start_epoch
, end_epoch
);
9063 int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider
*dpp
, string
& oid
)
9065 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9068 int r
= get_raw_obj_ref(dpp
, obj
, &ref
);
9072 librados::ObjectWriteOperation op
;
9073 cls_rgw_usage_log_clear(op
);
9074 r
= rgw_rados_operate(dpp
, ref
.pool
.ioctx(), ref
.obj
.oid
, &op
, null_yield
);
9079 int RGWRados::remove_objs_from_index(const DoutPrefixProvider
*dpp
, RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
9081 RGWSI_RADOS::Pool index_pool
;
9084 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
9086 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, &index_pool
, &dir_oid
);
9092 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
9093 rgw_bucket_dir_entry entry
;
9095 ldpp_dout(dpp
, 2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
9096 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9097 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
9098 encode(entry
, updates
);
9103 r
= index_pool
.ioctx().exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
// Reconcile one bucket-index entry (list_state) against the object's
// actual on-disk head state: fill `object` with authoritative metadata
// and append a suggested index correction (REMOVE or UPDATE) to
// `suggested_updates` when the index disagrees with reality.
//
// NOTE(review): multiple original lines are missing in this view
// (trailing parameters such as optional_yield y, several braces and
// returns, and the local declarations of oid/loc/etag/owner). Code
// tokens below are preserved exactly as found; only comments added.
int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
                               librados::IoCtx io_ctx,
                               const RGWBucketInfo& bucket_info,
                               rgw_bucket_dir_entry& list_state,
                               rgw_bucket_dir_entry& object,
                               bufferlist& suggested_updates,
  const rgw_bucket& bucket = bucket_info.bucket;
  uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);

  rgw_obj obj(bucket, list_state.key);

  get_obj_bucket_and_oid_loc(obj, oid, loc);

  if (loc != list_state.locator) {
    ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;

  io_ctx.locator_set_key(list_state.locator);

  RGWObjState *astate = NULL;
  RGWObjectCtx rctx(this->store);
  int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);

  list_state.pending_map.clear(); // we don't need this and it inflates size
  if (!list_state.is_delete_marker() && !astate->exists) {
    /* object doesn't exist right now -- hopefully because it's
     * marked as !exists and got deleted */
    if (list_state.exists) {
      /* FIXME: what should happen now? Work out if there are any
       * non-bad ways this could happen (there probably are, but annoying

    // encode a suggested removal of that key
    list_state.ver.epoch = io_ctx.get_last_version();
    list_state.ver.pool = io_ctx.get_id();
    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);

  string content_type;

  // copy the authoritative head state into the output entry
  object.meta.size = astate->size;
  object.meta.accounted_size = astate->accounted_size;
  object.meta.mtime = astate->mtime;

  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
  if (iter != astate->attrset.end()) {
    etag = rgw_bl_str(iter->second);
  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
  if (iter != astate->attrset.end()) {
    content_type = rgw_bl_str(iter->second);
  iter = astate->attrset.find(RGW_ATTR_ACL);
  if (iter != astate->attrset.end()) {
    r = decode_policy(dpp, iter->second, &owner);
      ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;

  if (astate->manifest) {
    RGWObjManifest::obj_iterator miter;
    RGWObjManifest& manifest = *astate->manifest;
    // walk the manifest and drop stray multipart parts from the index
    for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(store);
      RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);

      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
        ldpp_dout(dpp, 0) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
        r = delete_obj_index(loc, astate->mtime, dpp);
          ldpp_dout(dpp, 0) << "WARNING: delete_obj_index() returned r=" << r << dendl;

  object.meta.etag = etag;
  object.meta.content_type = content_type;
  object.meta.owner = owner.get_id().to_str();
  object.meta.owner_display_name = owner.get_display_name();

  // encode suggested updates

  list_state.meta.size = object.meta.size;
  list_state.meta.accounted_size = object.meta.accounted_size;
  list_state.meta.mtime = object.meta.mtime;
  list_state.meta.category = main_category;
  list_state.meta.etag = etag;
  list_state.meta.content_type = content_type;

  librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
  int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx);
    ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
      " WARNING: unable to find head object data pool for \"" <<
      obj << "\", not updating version pool/epoch" << dendl;
    list_state.ver.pool = head_obj_ctx.get_id();
    list_state.ver.epoch = astate->epoch;

  if (astate->obj_tag.length() > 0) {
    list_state.tag = astate->obj_tag.c_str();
  list_state.meta.owner = owner.get_id().to_str();
  list_state.meta.owner_display_name = owner.get_display_name();

  list_state.exists = true;

  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9235 int RGWRados::cls_bucket_head(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, int shard_id
, vector
<rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
9237 RGWSI_RADOS::Pool index_pool
;
9238 map
<int, string
> oids
;
9239 map
<int, struct rgw_cls_list_ret
> list_results
;
9240 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, shard_id
, &index_pool
, &oids
, bucket_instance_ids
);
9242 ldpp_dout(dpp
, 20) << "cls_bucket_head: open_bucket_index() returned "
9247 r
= CLSRGWIssueGetDirHeader(index_pool
.ioctx(), oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
9249 ldpp_dout(dpp
, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
9254 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
9255 for(; iter
!= list_results
.end(); ++iter
) {
9256 headers
.push_back(std::move(iter
->second
.dir
.header
));
9261 int RGWRados::cls_bucket_head_async(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
9263 RGWSI_RADOS::Pool index_pool
;
9264 map
<int, string
> bucket_objs
;
9265 int r
= svc
.bi_rados
->open_bucket_index(dpp
, bucket_info
, shard_id
, &index_pool
, &bucket_objs
, nullptr);
9269 map
<int, string
>::iterator iter
= bucket_objs
.begin();
9270 for (; iter
!= bucket_objs
.end(); ++iter
) {
9271 r
= cls_rgw_get_dir_header_async(index_pool
.ioctx(), iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
9282 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
,
9283 const rgw_bucket
& bucket
,
9285 const DoutPrefixProvider
*dpp
)
9287 if (! cct
->_conf
.get_val
<bool>("rgw_dynamic_resharding")) {
9291 bool need_resharding
= false;
9292 uint32_t num_source_shards
=
9293 (bucket_info
.layout
.current_index
.layout
.normal
.num_shards
> 0 ? bucket_info
.layout
.current_index
.layout
.normal
.num_shards
: 1);
9294 const uint32_t max_dynamic_shards
=
9295 uint32_t(cct
->_conf
.get_val
<uint64_t>("rgw_max_dynamic_shards"));
9297 if (num_source_shards
>= max_dynamic_shards
) {
9301 uint32_t suggested_num_shards
= 0;
9302 const uint64_t max_objs_per_shard
=
9303 cct
->_conf
.get_val
<uint64_t>("rgw_max_objs_per_shard");
9305 quota_handler
->check_bucket_shards(dpp
, max_objs_per_shard
, num_source_shards
,
9306 num_objs
, need_resharding
, &suggested_num_shards
);
9307 if (! need_resharding
) {
9311 const uint32_t final_num_shards
=
9312 RGWBucketReshard::get_preferred_shards(suggested_num_shards
,
9313 max_dynamic_shards
);
9314 // final verification, so we don't reduce number of shards
9315 if (final_num_shards
<= num_source_shards
) {
9319 ldpp_dout(dpp
, 1) << "RGWRados::" << __func__
<< " bucket " << bucket
.name
<<
9320 " needs resharding; current num shards " << bucket_info
.layout
.current_index
.layout
.normal
.num_shards
<<
9321 "; new num shards " << final_num_shards
<< " (suggested " <<
9322 suggested_num_shards
<< ")" << dendl
;
9324 return add_bucket_to_reshard(dpp
, bucket_info
, final_num_shards
);
9327 int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider
*dpp
, const RGWBucketInfo
& bucket_info
, uint32_t new_num_shards
)
9329 RGWReshard
reshard(this->store
, dpp
);
9331 uint32_t num_source_shards
= (bucket_info
.layout
.current_index
.layout
.normal
.num_shards
> 0 ? bucket_info
.layout
.current_index
.layout
.normal
.num_shards
: 1);
9333 new_num_shards
= std::min(new_num_shards
, get_max_bucket_shards());
9334 if (new_num_shards
<= num_source_shards
) {
9335 ldpp_dout(dpp
, 20) << "not resharding bucket name=" << bucket_info
.bucket
.name
<< ", orig_num=" << num_source_shards
<< ", new_num_shards=" << new_num_shards
<< dendl
;
9339 cls_rgw_reshard_entry entry
;
9340 entry
.time
= real_clock::now();
9341 entry
.tenant
= bucket_info
.owner
.tenant
;
9342 entry
.bucket_name
= bucket_info
.bucket
.name
;
9343 entry
.bucket_id
= bucket_info
.bucket
.bucket_id
;
9344 entry
.old_num_shards
= num_source_shards
;
9345 entry
.new_num_shards
= new_num_shards
;
9347 return reshard
.add(dpp
, entry
);
9350 int RGWRados::check_quota(const DoutPrefixProvider
*dpp
, const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
9351 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
,
9352 uint64_t obj_size
, optional_yield y
,
9353 bool check_size_only
)
9355 // if we only check size, then num_objs will set to 0
9357 return quota_handler
->check_quota(dpp
, bucket_owner
, bucket
, user_quota
, bucket_quota
, 0, obj_size
, y
);
9359 return quota_handler
->check_quota(dpp
, bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
, y
);
9362 int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout
& layout
, const string
& obj_key
,
9366 switch (layout
.hash_type
) {
9367 case rgw::BucketHashType::Mod
:
9368 if (!layout
.num_shards
) {
9373 uint32_t sid
= svc
.bi_rados
->bucket_shard_index(obj_key
, layout
.num_shards
);
9375 *shard_id
= (int)sid
;
9385 uint64_t RGWRados::instance_id()
9387 return get_rados_handle()->get_instance_id();
9390 uint64_t RGWRados::next_bucket_id()
9392 std::lock_guard l
{bucket_id_lock
};
9393 return ++max_bucket_id
;
9396 librados::Rados
* RGWRados::get_rados_handle()
9401 int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider
*dpp
, const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
9404 int ret
= get_raw_obj_ref(dpp
, obj
, &ref
);
9406 ldpp_dout(dpp
, -1) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
9410 ObjectWriteOperation op
;
9411 list
<string
> prefixes
;
9412 cls_rgw_remove_obj(op
, prefixes
);
9414 AioCompletion
*c
= librados::Rados::aio_create_completion(nullptr, nullptr);
9415 ret
= ref
.pool
.ioctx().aio_operate(ref
.obj
.oid
, c
, &op
);
9417 ldpp_dout(dpp
, -1) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
9422 handles
.push_back(c
);
9427 int RGWRados::delete_obj_aio(const DoutPrefixProvider
*dpp
, const rgw_obj
& obj
,
9428 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
9429 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
,
9433 int ret
= get_obj_head_ref(dpp
, bucket_info
, obj
, &ref
);
9435 ldpp_dout(dpp
, -1) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
9439 if (keep_index_consistent
) {
9440 RGWRados::Bucket
bop(this, bucket_info
);
9441 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9443 ret
= index_op
.prepare(dpp
, CLS_RGW_OP_DEL
, &astate
->write_tag
, y
);
9445 ldpp_dout(dpp
, -1) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
9450 ObjectWriteOperation op
;
9451 list
<string
> prefixes
;
9452 cls_rgw_remove_obj(op
, prefixes
);
9454 AioCompletion
*c
= librados::Rados::aio_create_completion(nullptr, nullptr);
9455 ret
= ref
.pool
.ioctx().aio_operate(ref
.obj
.oid
, c
, &op
);
9457 ldpp_dout(dpp
, -1) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
9462 handles
.push_back(c
);
9464 if (keep_index_consistent
) {
9465 ret
= delete_obj_index(obj
, astate
->mtime
, dpp
);
9467 ldpp_dout(dpp
, -1) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;
9474 void objexp_hint_entry::generate_test_instances(list
<objexp_hint_entry
*>& o
)
9476 auto it
= new objexp_hint_entry
;
9477 it
->tenant
= "tenant1";
9478 it
->bucket_name
= "bucket1";
9479 it
->bucket_id
= "1234";
9480 it
->obj_key
= rgw_obj_key("obj");
9482 o
.push_back(new objexp_hint_entry
);
9485 void objexp_hint_entry::dump(Formatter
*f
) const
9487 f
->open_object_section("objexp_hint_entry");
9488 encode_json("tenant", tenant
, f
);
9489 encode_json("bucket_name", bucket_name
, f
);
9490 encode_json("bucket_id", bucket_id
, f
);
9491 encode_json("rgw_obj_key", obj_key
, f
);
9492 utime_t
ut(exp_time
);
9493 encode_json("exp_time", ut
, f
);
9497 void RGWOLHInfo::generate_test_instances(list
<RGWOLHInfo
*> &o
)
9499 RGWOLHInfo
*olh
= new RGWOLHInfo
;
9500 olh
->removed
= false;
9502 o
.push_back(new RGWOLHInfo
);
9505 void RGWOLHInfo::dump(Formatter
*f
) const
9507 encode_json("target", target
, f
);
9510 void RGWOLHPendingInfo::dump(Formatter
*f
) const
9513 encode_json("time", ut
, f
);