1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "include/compat.h"
8 #include <boost/algorithm/string.hpp>
11 #include <boost/container/flat_set.hpp>
12 #include <boost/format.hpp>
13 #include <boost/optional.hpp>
14 #include <boost/utility/in_place_factory.hpp>
16 #include "common/ceph_json.h"
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
22 #include "rgw_rados.h"
24 #include "rgw_cache.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_aio_throttle.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32 #include "rgw_putobj_processor.h"
34 #include "cls/rgw/cls_rgw_ops.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "cls/otp/cls_otp_client.h"
44 #include "osd/osd_types.h"
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
50 #undef fork // fails to compile RGWPeriod::fork() below
52 #include "common/Clock.h"
54 using namespace librados
;
62 #include "include/random.h"
67 #include "rgw_object_expirer_core.h"
69 #include "rgw_sync_counters.h"
70 #include "rgw_sync_trace.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
82 #include "compressor/Compressor.h"
85 #define TRACEPOINT_DEFINE
86 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
87 #include "tracing/rgw_rados.h"
88 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
89 #undef TRACEPOINT_DEFINE
91 #define tracepoint(...)
94 #define dout_context g_ceph_context
95 #define dout_subsys ceph_subsys_rgw
98 static string shadow_ns
= "shadow";
99 static string dir_oid_prefix
= ".dir.";
100 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
101 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
103 static string log_lock_name
= "rgw_log_lock";
104 static RGWObjCategory main_category
= RGWObjCategory::Main
;
105 #define RGW_USAGE_OBJ_PREFIX "usage."
107 #define dout_subsys ceph_subsys_rgw
109 const std::string MP_META_SUFFIX
= ".meta";
// Resolve which rados data pool holds this object's data.
// First ask the zone params for the head data pool of head_placement_rule;
// on failure fall back to the zonegroup's default placement, choosing the
// extra-data pool for extra-data objects and the regular data pool (keyed
// by the default placement's storage class) otherwise.
// NOTE(review): braces/else/return lines are missing from this extracted
// fragment; comments annotate the surviving statements only.
112 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
113 const rgw_placement_rule
& head_placement_rule
,
114 const rgw_obj
& obj
, rgw_pool
*pool
)
// Fast path: zone params can resolve the head data pool directly.
116 if (!zone_params
.get_head_data_pool(head_placement_rule
, obj
, pool
)) {
117 RGWZonePlacementInfo placement
;
// Fallback: look up the zonegroup's default placement target.
118 if (!zone_params
.get_placement(zonegroup
.default_placement
.name
, &placement
)) {
// Regular object data goes to the placement's data pool.
122 if (!obj
.in_extra_data
) {
123 *pool
= placement
.get_data_pool(zonegroup
.default_placement
.storage_class
);
// Extra-data objects use the dedicated extra pool instead.
125 *pool
= placement
.get_data_extra_pool();
// Convert a logical rgw_obj into its raw rados representation: derive the
// oid and locator from the bucket+key, then resolve the backing data pool.
// Returns the bool result of the pool lookup (false = no pool resolved).
132 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
133 const rgw_placement_rule
& head_placement_rule
,
134 const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
// Fill in oid/loc; this part is infallible.
136 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
// Pool resolution is the only step that can fail.
138 return rgw_get_obj_data_pool(zonegroup
, zone_params
, head_placement_rule
, obj
, &raw_obj
->pool
);
// Resolve this object selection to a raw object using explicit
// zonegroup/zone params (used when no RGWRados store is at hand).
// NOTE(review): the local rgw_raw_obj 'r' declaration and the return are
// on lines dropped from this fragment.
141 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
145 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
// Resolve this object selection to a raw object via the store, which
// supplies the zone configuration internally.
// NOTE(review): 'r' declaration and return are on dropped lines.
151 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
155 store
->obj_to_raw(placement_rule
, obj
, &r
);
// Stack version handling onto a librados read op: assert the stored
// object version matches *check_objv (VER_COND_EQ), then read the current
// version into read_version.
// NOTE(review): the null-guard around the check (presumably
// `if (check_objv)`) is on a line dropped from this fragment.
161 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
163 obj_version
*check_objv
= version_for_check();
// Conditional compare-version step.
166 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
// Always fetch the version alongside the read.
169 cls_version_read(*op
, &read_version
);
// Stack version handling onto a librados write op: optionally assert the
// stored version (VER_COND_EQ), then either set an explicit new version
// or increment the existing one.
// NOTE(review): guard conditions around the check are on dropped lines.
172 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
174 obj_version
*check_objv
= version_for_check();
175 obj_version
*modify_version
= version_for_write();
// Conditional compare-version step before mutating.
178 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
// Explicit target version requested: set it...
181 if (modify_version
) {
182 cls_version_set(*op
, *modify_version
);
// ...otherwise just bump the stored version.
184 cls_version_inc(*op
);
188 void RGWObjManifest::obj_iterator::operator++()
190 if (manifest
->explicit_objs
) {
193 update_explicit_pos();
199 uint64_t obj_size
= manifest
->get_obj_size();
200 uint64_t head_size
= manifest
->get_head_size();
202 if (ofs
== obj_size
) {
206 if (manifest
->rules
.empty()) {
210 /* are we still pointing at the head? */
211 if (ofs
< head_size
) {
212 rule_iter
= manifest
->rules
.begin();
213 RGWObjManifestRule
*rule
= &rule_iter
->second
;
214 ofs
= std::min(head_size
, obj_size
);
217 stripe_size
= std::min(obj_size
- ofs
, rule
->stripe_max_size
);
218 if (rule
->part_size
> 0) {
219 stripe_size
= std::min(stripe_size
, rule
->part_size
);
225 RGWObjManifestRule
*rule
= &rule_iter
->second
;
227 stripe_ofs
+= rule
->stripe_max_size
;
229 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule
->part_size
<< " rules.size()=" << manifest
->rules
.size() << dendl
;
231 if (rule
->part_size
> 0) {
232 /* multi part, multi stripes object */
234 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
236 if (stripe_ofs
>= part_ofs
+ rule
->part_size
) {
237 /* moved to the next part */
239 part_ofs
+= rule
->part_size
;
240 stripe_ofs
= part_ofs
;
242 bool last_rule
= (next_rule_iter
== manifest
->rules
.end());
243 /* move to the next rule? */
244 if (!last_rule
&& stripe_ofs
>= next_rule_iter
->second
.start_ofs
) {
245 rule_iter
= next_rule_iter
;
246 last_rule
= (next_rule_iter
== manifest
->rules
.end());
250 cur_part_id
= rule_iter
->second
.start_part_num
;
255 rule
= &rule_iter
->second
;
258 stripe_size
= std::min(rule
->part_size
- (stripe_ofs
- part_ofs
), rule
->stripe_max_size
);
261 cur_override_prefix
= rule
->override_prefix
;
264 if (ofs
> obj_size
) {
270 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs
<< " stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
274 int RGWObjManifest::generator::create_begin(CephContext
*cct
, RGWObjManifest
*_m
,
275 const rgw_placement_rule
& head_placement_rule
,
276 const rgw_placement_rule
*tail_placement_rule
,
277 const rgw_bucket
& _b
, const rgw_obj
& _obj
)
281 if (!tail_placement_rule
) {
282 manifest
->set_tail_placement(head_placement_rule
, _b
);
284 rgw_placement_rule new_tail_rule
= *tail_placement_rule
;
285 new_tail_rule
.inherit_from(head_placement_rule
);
286 manifest
->set_tail_placement(new_tail_rule
, _b
);
289 manifest
->set_head(head_placement_rule
, _obj
, 0);
292 if (manifest
->get_prefix().empty()) {
294 gen_rand_alphanumeric(cct
, buf
, sizeof(buf
) - 1);
296 string oid_prefix
= ".";
297 oid_prefix
.append(buf
);
298 oid_prefix
.append("_");
300 manifest
->set_prefix(oid_prefix
);
303 bool found
= manifest
->get_rule(0, &rule
);
305 derr
<< "ERROR: manifest->get_rule() could not find rule" << dendl
;
309 uint64_t head_size
= manifest
->get_head_size();
312 cur_stripe_size
= head_size
;
314 cur_stripe_size
= rule
.stripe_max_size
;
317 cur_part_id
= rule
.start_part_num
;
319 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, 0, NULL
, &cur_obj
);
321 // Normal object which not generated through copy operation
322 manifest
->set_tail_instance(_obj
.key
.instance
);
324 manifest
->update_iterators();
// Advance the manifest generator to absolute offset `ofs`, updating head
// size, current stripe index/size, total object size, and the current
// tail object location. Offsets may only move forward.
// NOTE(review): several branch/return lines are dropped from this
// fragment; comments annotate surviving statements only.
329 int RGWObjManifest::generator::create_next(uint64_t ofs
)
// Generator is append-only; going backwards is an error.
331 if (ofs
< last_ofs
) /* only going forward */
334 uint64_t max_head_size
= manifest
->get_max_head_size();
// Still inside the head object: the head simply grows to ofs.
336 if (ofs
< max_head_size
) {
337 manifest
->set_head_size(ofs
);
// Past the head: head is capped, and stripe index/size are computed
// from the rule's stripe_max_size.
340 if (ofs
>= max_head_size
) {
341 manifest
->set_head_size(max_head_size
);
342 cur_stripe
= (ofs
- max_head_size
) / rule
.stripe_max_size
;
343 cur_stripe_size
= rule
.stripe_max_size
;
345 if (cur_part_id
== 0 && max_head_size
> 0) {
// Track the running total object size.
351 manifest
->set_obj_size(ofs
);
// Recompute the tail object for the current part/stripe.
353 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, ofs
, NULL
, &cur_obj
);
355 manifest
->update_iterators();
360 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_begin()
365 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_end()
// Return an iterator positioned at offset `ofs` within the manifest.
// NOTE(review): the out-of-range handling and the seek/return lines are
// dropped from this fragment.
370 RGWObjManifest::obj_iterator
RGWObjManifest::obj_find(uint64_t ofs
)
// Clamp/handle offsets beyond the object's size (body dropped).
372 if (ofs
> obj_size
) {
375 RGWObjManifest::obj_iterator
iter(this);
380 int RGWObjManifest::append(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
,
381 const RGWZoneParams
& zone_params
)
383 if (explicit_objs
|| m
.explicit_objs
) {
384 return append_explicit(m
, zonegroup
, zone_params
);
392 string override_prefix
;
394 if (prefix
.empty()) {
398 if (prefix
!= m
.prefix
) {
399 override_prefix
= m
.prefix
;
402 map
<uint64_t, RGWObjManifestRule
>::iterator miter
= m
.rules
.begin();
403 if (miter
== m
.rules
.end()) {
404 return append_explicit(m
, zonegroup
, zone_params
);
407 for (; miter
!= m
.rules
.end(); ++miter
) {
408 map
<uint64_t, RGWObjManifestRule
>::reverse_iterator last_rule
= rules
.rbegin();
410 RGWObjManifestRule
& rule
= last_rule
->second
;
412 if (rule
.part_size
== 0) {
413 rule
.part_size
= obj_size
- rule
.start_ofs
;
416 RGWObjManifestRule
& next_rule
= miter
->second
;
417 if (!next_rule
.part_size
) {
418 next_rule
.part_size
= m
.obj_size
- next_rule
.start_ofs
;
421 string rule_prefix
= prefix
;
422 if (!rule
.override_prefix
.empty()) {
423 rule_prefix
= rule
.override_prefix
;
426 string next_rule_prefix
= m
.prefix
;
427 if (!next_rule
.override_prefix
.empty()) {
428 next_rule_prefix
= next_rule
.override_prefix
;
431 if (rule
.part_size
!= next_rule
.part_size
||
432 rule
.stripe_max_size
!= next_rule
.stripe_max_size
||
433 rule_prefix
!= next_rule_prefix
) {
434 if (next_rule_prefix
!= prefix
) {
435 append_rules(m
, miter
, &next_rule_prefix
);
437 append_rules(m
, miter
, NULL
);
442 uint64_t expected_part_num
= rule
.start_part_num
+ 1;
443 if (rule
.part_size
> 0) {
444 expected_part_num
= rule
.start_part_num
+ (obj_size
+ next_rule
.start_ofs
- rule
.start_ofs
) / rule
.part_size
;
447 if (expected_part_num
!= next_rule
.start_part_num
) {
448 append_rules(m
, miter
, NULL
);
453 set_obj_size(obj_size
+ m
.obj_size
);
// Convenience overload: append manifest `m` using the zone service's
// current zonegroup and zone params.
458 int RGWObjManifest::append(RGWObjManifest
& m
, RGWSI_Zone
*zone_svc
)
460 return append(m
, zone_svc
->get_zonegroup(), zone_svc
->get_zone_params());
// Copy the remaining rules of manifest `m` (from `miter` onward) into this
// manifest, shifting each rule's start offset by our current obj_size and,
// when `override_prefix` is given, stamping it onto the copied rule.
// NOTE(review): the null-check guarding the override_prefix assignment is
// on a dropped line.
463 void RGWObjManifest::append_rules(RGWObjManifest
& m
, map
<uint64_t, RGWObjManifestRule
>::iterator
& miter
,
464 string
*override_prefix
)
466 for (; miter
!= m
.rules
.end(); ++miter
) {
// Copy by value so the source manifest's rule is untouched.
467 RGWObjManifestRule rule
= miter
->second
;
// Re-base the rule into this manifest's offset space.
468 rule
.start_ofs
+= obj_size
;
470 rule
.override_prefix
= *override_prefix
;
// Rules are keyed by their (re-based) start offset.
471 rules
[rule
.start_ofs
] = rule
;
// Materialize this rule-based (implicit) manifest into an explicit list of
// parts: walk the stripe iterator, record each stripe's location and size
// into `objs`, then flag the manifest as explicit.
// NOTE(review): iterator-advance lines inside the loop are dropped from
// this fragment.
475 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
480 obj_iterator iter
= obj_begin();
482 while (iter
!= obj_end()) {
// Create/lookup the part entry keyed by this stripe's offset.
483 RGWObjManifestPart
& part
= objs
[iter
.get_stripe_ofs()];
484 const rgw_obj_select
& os
= iter
.get_location();
// Resolve the stripe to its raw rados object.
485 const rgw_raw_obj
& raw_loc
= os
.get_raw_obj(zonegroup
, zone_params
);
488 uint64_t ofs
= iter
.get_stripe_ofs();
// Store the stripe's logical location (bucket-relative).
493 rgw_raw_obj_to_obj(tail_placement
.bucket
, raw_loc
, &part
.loc
);
// After advancing, the next stripe offset bounds this part's size.
496 uint64_t next_ofs
= iter
.get_stripe_ofs();
498 part
.size
= next_ofs
- ofs
;
// From here on the manifest is driven by `objs`, not rules.
501 explicit_objs
= true;
// Append manifest `m` to this one in explicit form: convert both sides to
// explicit part lists if needed, then copy m's parts shifted by our
// current obj_size and grow obj_size accordingly.
506 int RGWObjManifest::append_explicit(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
// Ensure this manifest is explicit.
508 if (!explicit_objs
) {
509 convert_to_explicit(zonegroup
, zone_params
);
// Ensure the appended manifest is explicit too.
511 if (!m
.explicit_objs
) {
512 m
.convert_to_explicit(zonegroup
, zone_params
);
514 map
<uint64_t, RGWObjManifestPart
>::iterator iter
;
// All of m's parts land after our current end offset.
515 uint64_t base
= obj_size
;
516 for (iter
= m
.objs
.begin(); iter
!= m
.objs
.end(); ++iter
) {
517 RGWObjManifestPart
& part
= iter
->second
;
// Re-key each part into this manifest's offset space.
518 objs
[base
+ iter
->first
] = part
;
520 obj_size
+= m
.obj_size
;
// Find the manifest rule governing offset `ofs`: rules are keyed by start
// offset, so take the entry just before upper_bound(ofs).
// NOTE(review): the iterator decrement and return lines are dropped from
// this fragment.
525 bool RGWObjManifest::get_rule(uint64_t ofs
, RGWObjManifestRule
*rule
)
// upper_bound gives the first rule starting AFTER ofs.
531 map
<uint64_t, RGWObjManifestRule
>::iterator iter
= rules
.upper_bound(ofs
);
// Step back to the rule that covers ofs (decrement on dropped line).
532 if (iter
!= rules
.begin()) {
536 *rule
= iter
->second
;
// Initialize a fresh write version: version number 1 plus a random
// alphanumeric tag of TAG_LEN characters.
541 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
543 write_version
.ver
= 1;
// Replace any stale tag before appending random characters.
546 write_version
.tag
.clear();
547 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
550 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
552 RGWHTTPManager http_manager
;
555 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
556 http_manager(store
->ctx(), completion_mgr
) {
557 http_manager
.start();
560 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
561 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
565 list
<RGWCoroutinesStack
*> stacks
;
566 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
567 RGWRESTConn
*conn
= iter
->second
;
568 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
569 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
571 stacks
.push_back(stack
);
577 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
579 RGWHTTPManager http_manager
;
582 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
583 http_manager(store
->ctx(), completion_mgr
) {
584 http_manager
.start();
587 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
588 rgw_http_param_pair pairs
[] = { { "type", "data" },
590 { "source-zone", store
->svc
.zone
->get_zone_params().get_id().c_str() },
593 list
<RGWCoroutinesStack
*> stacks
;
594 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
595 RGWRESTConn
*conn
= iter
->second
;
596 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
597 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
599 stacks
.push_back(stack
);
605 /* class RGWRadosThread */
607 void RGWRadosThread::start()
609 worker
= new Worker(cct
, this);
610 worker
->create(thread_name
.c_str());
613 void RGWRadosThread::stop()
625 void *RGWRadosThread::Worker::entry() {
626 uint64_t msec
= processor
->interval_msec();
627 utime_t interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
630 utime_t start
= ceph_clock_now();
631 int r
= processor
->process();
633 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
636 if (processor
->going_down())
639 utime_t end
= ceph_clock_now();
642 uint64_t cur_msec
= processor
->interval_msec();
643 if (cur_msec
!= msec
) { /* was it reconfigured? */
645 interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
650 continue; // next round
652 utime_t wait_time
= interval
;
655 wait_interval(wait_time
);
659 } while (!processor
->going_down());
664 class RGWMetaNotifier
: public RGWRadosThread
{
665 RGWMetaNotifierManager notify_mgr
;
666 RGWMetadataLog
*const log
;
668 uint64_t interval_msec() override
{
669 return cct
->_conf
->rgw_md_notify_interval_msec
;
671 void stop_process() override
{
675 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
676 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
678 int process() override
;
681 int RGWMetaNotifier::process()
685 log
->read_clear_modified(shards
);
687 if (shards
.empty()) {
691 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
692 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
695 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_conn_map(), shards
);
700 class RGWDataNotifier
: public RGWRadosThread
{
701 RGWDataNotifierManager notify_mgr
;
703 uint64_t interval_msec() override
{
704 return cct
->_conf
.get_val
<int64_t>("rgw_data_notify_interval_msec");
706 void stop_process() override
{
710 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
712 int process() override
;
715 int RGWDataNotifier::process()
717 if (!store
->data_log
) {
721 map
<int, set
<string
> > shards
;
723 store
->data_log
->read_clear_modified(shards
);
725 if (shards
.empty()) {
729 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
730 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
733 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_data_notify_to_map(), shards
);
738 class RGWSyncProcessorThread
: public RGWRadosThread
{
740 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
741 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
742 ~RGWSyncProcessorThread() override
{}
743 int init() override
= 0 ;
744 int process() override
= 0;
747 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
749 RGWMetaSyncStatusManager sync
;
751 uint64_t interval_msec() override
{
752 return 0; /* no interval associated, it'll run once until stopped */
754 void stop_process() override
{
758 RGWMetaSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
)
759 : RGWSyncProcessorThread(_store
, "meta-sync"), sync(_store
, async_rados
) {}
761 void wakeup_sync_shards(set
<int>& shard_ids
) {
762 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
766 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
768 int init() override
{
769 int ret
= sync
.init();
771 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
777 int process() override
{
783 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
785 PerfCountersRef counters
;
786 RGWDataSyncStatusManager sync
;
789 uint64_t interval_msec() override
{
791 return 0; /* no interval associated, it'll run once until stopped */
793 #define DATA_SYNC_INIT_WAIT_SEC 20
794 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
797 void stop_process() override
{
801 RGWDataSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
,
802 const RGWZone
* source_zone
)
803 : RGWSyncProcessorThread(_store
, "data-sync"),
804 counters(sync_counters::build(store
->ctx(), std::string("data-sync-from-") + source_zone
->name
)),
805 sync(_store
, async_rados
, source_zone
->id
, counters
.get()),
806 initialized(false) {}
808 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
809 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
810 sync
.wakeup(iter
->first
, iter
->second
);
813 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
815 int init() override
{
819 int process() override
{
820 while (!initialized
) {
824 int ret
= sync
.init();
837 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
, DoutPrefixProvider
839 RGWCoroutinesManager crs
;
841 rgw::BucketTrimManager
*bucket_trim
;
843 const utime_t trim_interval
;
845 uint64_t interval_msec() override
{ return 0; }
846 void stop_process() override
{ crs
.stop(); }
848 RGWSyncLogTrimThread(RGWRados
*store
, rgw::BucketTrimManager
*bucket_trim
,
850 : RGWSyncProcessorThread(store
, "sync-log-trim"),
851 crs(store
->ctx(), store
->get_cr_registry()), store(store
),
852 bucket_trim(bucket_trim
),
853 http(store
->ctx(), crs
.get_completion_mgr()),
854 trim_interval(interval
, 0)
857 int init() override
{
860 int process() override
{
861 list
<RGWCoroutinesStack
*> stacks
;
862 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
863 meta
->call(create_meta_log_trim_cr(this, store
, &http
,
864 cct
->_conf
->rgw_md_log_max_shards
,
866 stacks
.push_back(meta
);
868 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
869 data
->call(create_data_log_trim_cr(store
, &http
,
870 cct
->_conf
->rgw_data_log_num_shards
,
872 stacks
.push_back(data
);
874 auto bucket
= new RGWCoroutinesStack(store
->ctx(), &crs
);
875 bucket
->call(bucket_trim
->create_bucket_trim_cr(&http
));
876 stacks
.push_back(bucket
);
882 // implements DoutPrefixProvider
883 CephContext
*get_cct() const override
{ return store
->ctx(); }
884 unsigned get_subsys() const
889 std::ostream
& gen_prefix(std::ostream
& out
) const
891 return out
<< "sync log trim: ";
// Wake the metadata sync thread for the given mdlog shards; no-op when the
// sync thread isn't running. Lock guards the thread pointer.
896 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
898 Mutex::Locker
l(meta_sync_thread_lock
);
899 if (meta_sync_processor_thread
) {
900 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
// Wake the data sync thread for `source_zone`, passing the per-shard set
// of bucket shard keys to process. Logs and returns (return on a dropped
// line) if no sync thread exists for that zone.
904 void RGWRados::wakeup_data_sync_shards(const string
& source_zone
, map
<int, set
<string
> >& shard_ids
)
906 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
// Guard the per-zone thread map.
907 Mutex::Locker
l(data_sync_thread_lock
);
908 map
<string
, RGWDataSyncProcessorThread
*>::iterator iter
= data_sync_processor_threads
.find(source_zone
);
909 if (iter
== data_sync_processor_threads
.end()) {
910 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
914 RGWDataSyncProcessorThread
*thread
= iter
->second
;
916 thread
->wakeup_sync_shards(shard_ids
);
// Return the metadata sync status manager, or (on a dropped line,
// presumably) nullptr when no meta sync thread is running.
919 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
921 Mutex::Locker
l(meta_sync_thread_lock
);
922 if (meta_sync_processor_thread
) {
923 return meta_sync_processor_thread
->get_manager();
// Return the data sync status manager for `source_zone`, or (on a dropped
// line, presumably) nullptr when no sync thread exists for that zone.
928 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const std::string
& source_zone
)
930 Mutex::Locker
l(data_sync_thread_lock
);
931 auto thread
= data_sync_processor_threads
.find(source_zone
);
932 if (thread
== data_sync_processor_threads
.end()) {
935 return thread
->second
->get_manager();
938 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
941 int r
= open_pool_ctx(pool
, ioctx
, false);
943 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
948 r
= ioctx
.pool_requires_alignment2(&requires
);
950 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
961 r
= ioctx
.pool_required_alignment2(&align
);
963 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
968 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
// Round `size` down to the pool's alignment. Special cases: alignment 0
// (no alignment requirement; handling on a dropped line) and sizes up to
// one alignment unit, which yield exactly one unit.
974 void RGWRados::get_max_aligned_size(uint64_t size
, uint64_t alignment
, uint64_t *max_size
)
976 if (alignment
== 0) {
// Never return less than one full alignment unit.
981 if (size
<= alignment
) {
982 *max_size
= alignment
;
// General case: trim the unaligned remainder.
986 *max_size
= size
- (size
% alignment
);
// Compute the max write chunk size for a pool: take the configured
// rgw_max_chunk_size and round it down to the pool's required alignment.
// Optionally reports the alignment via palignment (null-check on a
// dropped line).
989 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
, uint64_t *palignment
)
992 int r
= get_required_alignment(pool
, &alignment
);
998 *palignment
= alignment
;
// Config value is the upper bound; alignment trims it down.
1001 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
1003 get_max_aligned_size(config_chunk_size
, alignment
, max_chunk_size
);
1005 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
// Max chunk size for a specific object: resolve the object's data pool
// from its placement rule, then delegate to the pool-based overload.
// Logs and errors out (return on a dropped line) if the pool can't be
// resolved.
1010 int RGWRados::get_max_chunk_size(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
,
1011 uint64_t *max_chunk_size
, uint64_t *palignment
)
1014 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
1015 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
1018 return get_max_chunk_size(pool
, max_chunk_size
, palignment
);
1021 class RGWIndexCompletionManager
;
1023 struct complete_op_data
{
1024 Mutex lock
{"complete_op_data"};
1025 AioCompletion
*rados_completion
{nullptr};
1026 int manager_shard_id
{-1};
1027 RGWIndexCompletionManager
*manager
{nullptr};
1031 rgw_bucket_entry_ver ver
;
1032 cls_rgw_obj_key key
;
1033 rgw_bucket_dir_entry_meta dir_meta
;
1034 list
<cls_rgw_obj_key
> remove_objs
;
1037 rgw_zone_set zones_trace
;
1039 bool stopped
{false};
1042 Mutex::Locker
l(lock
);
1047 class RGWIndexCompletionThread
: public RGWRadosThread
{
1050 uint64_t interval_msec() override
{
1054 list
<complete_op_data
*> completions
;
1056 Mutex completions_lock
;
1058 RGWIndexCompletionThread(RGWRados
*_store
)
1059 : RGWRadosThread(_store
, "index-complete"), store(_store
), completions_lock("RGWIndexCompletionThread::completions_lock") {}
1061 int process() override
;
1063 void add_completion(complete_op_data
*completion
) {
1065 Mutex::Locker
l(completions_lock
);
1066 completions
.push_back(completion
);
1073 int RGWIndexCompletionThread::process()
1075 list
<complete_op_data
*> comps
;
1078 Mutex::Locker
l(completions_lock
);
1079 completions
.swap(comps
);
1082 for (auto c
: comps
) {
1083 std::unique_ptr
<complete_op_data
> up
{c
};
1088 ldout(store
->ctx(), 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
1090 RGWRados::BucketShard
bs(store
);
1091 RGWBucketInfo bucket_info
;
1093 int r
= bs
.init(c
->obj
.bucket
, c
->obj
, &bucket_info
);
1095 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
1096 /* not much to do */
1100 r
= store
->guard_reshard(&bs
, c
->obj
, bucket_info
,
1101 [&](RGWRados::BucketShard
*bs
) -> int {
1102 librados::ObjectWriteOperation o
;
1103 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
1104 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
1105 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
1106 return bs
->index_ctx
.operate(bs
->bucket_obj
, &o
);
1109 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
1110 /* ignoring error, can't do anything about it */
1113 r
= store
->data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
1115 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
1122 class RGWIndexCompletionManager
{
1123 RGWRados
*store
{nullptr};
1124 vector
<Mutex
*> locks
;
1125 vector
<set
<complete_op_data
*> > completions
;
1127 RGWIndexCompletionThread
*completion_thread
{nullptr};
1131 std::atomic
<int> cur_shard
{0};
1135 RGWIndexCompletionManager(RGWRados
*_store
) : store(_store
) {
1136 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
1138 for (int i
= 0; i
< num_shards
; i
++) {
1140 snprintf(buf
, sizeof(buf
), "RGWIndexCompletionManager::lock::%d", i
);
1141 locks
.push_back(new Mutex(buf
));
1144 completions
.resize(num_shards
);
1146 ~RGWIndexCompletionManager() {
1149 for (auto l
: locks
) {
1155 int result
= cur_shard
% num_shards
;
1160 void create_completion(const rgw_obj
& obj
,
1161 RGWModifyOp op
, string
& tag
,
1162 rgw_bucket_entry_ver
& ver
,
1163 const cls_rgw_obj_key
& key
,
1164 rgw_bucket_dir_entry_meta
& dir_meta
,
1165 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
1167 rgw_zone_set
*zones_trace
,
1168 complete_op_data
**result
);
1169 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
1172 completion_thread
= new RGWIndexCompletionThread(store
);
1173 int ret
= completion_thread
->init();
1177 completion_thread
->start();
1181 if (completion_thread
) {
1182 completion_thread
->stop();
1183 delete completion_thread
;
1186 for (int i
= 0; i
< num_shards
; ++i
) {
1187 Mutex::Locker
l(*locks
[i
]);
1188 for (auto c
: completions
[i
]) {
1192 completions
.clear();
1196 static void obj_complete_cb(completion_t cb
, void *arg
)
1198 complete_op_data
*completion
= (complete_op_data
*)arg
;
1199 completion
->lock
.Lock();
1200 if (completion
->stopped
) {
1201 completion
->lock
.Unlock(); /* can drop lock, no one else is referencing us */
1205 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
1206 completion
->lock
.Unlock();
1213 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
1214 RGWModifyOp op
, string
& tag
,
1215 rgw_bucket_entry_ver
& ver
,
1216 const cls_rgw_obj_key
& key
,
1217 rgw_bucket_dir_entry_meta
& dir_meta
,
1218 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
1220 rgw_zone_set
*zones_trace
,
1221 complete_op_data
**result
)
1223 complete_op_data
*entry
= new complete_op_data
;
1225 int shard_id
= next_shard();
1227 entry
->manager_shard_id
= shard_id
;
1228 entry
->manager
= this;
1234 entry
->dir_meta
= dir_meta
;
1235 entry
->log_op
= log_op
;
1236 entry
->bilog_op
= bilog_op
;
1239 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
1240 entry
->remove_objs
.push_back(*iter
);
1245 entry
->zones_trace
= *zones_trace
;
1247 entry
->zones_trace
.insert(store
->svc
.zone
->get_zone().id
);
1252 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, NULL
, obj_complete_cb
);
1254 Mutex::Locker
l(*locks
[shard_id
]);
1255 completions
[shard_id
].insert(entry
);
1258 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
1260 int shard_id
= arg
->manager_shard_id
;
1262 Mutex::Locker
l(*locks
[shard_id
]);
1264 auto& comps
= completions
[shard_id
];
1266 auto iter
= comps
.find(arg
);
1267 if (iter
== comps
.end()) {
1274 int r
= rados_aio_get_return_value(cb
);
1275 if (r
!= -ERR_BUSY_RESHARDING
) {
1278 completion_thread
->add_completion(arg
);
1282 void RGWRados::finalize()
1284 cct
->get_admin_socket()->unregister_commands(this);
1285 if (run_sync_thread
) {
1286 Mutex::Locker
l(meta_sync_thread_lock
);
1287 meta_sync_processor_thread
->stop();
1289 Mutex::Locker
dl(data_sync_thread_lock
);
1290 for (auto iter
: data_sync_processor_threads
) {
1291 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1294 if (sync_log_trimmer
) {
1295 sync_log_trimmer
->stop();
1299 async_rados
->stop();
1301 if (run_sync_thread
) {
1302 delete meta_sync_processor_thread
;
1303 meta_sync_processor_thread
= NULL
;
1304 Mutex::Locker
dl(data_sync_thread_lock
);
1305 for (auto iter
: data_sync_processor_threads
) {
1306 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1309 data_sync_processor_threads
.clear();
1310 delete sync_log_trimmer
;
1311 sync_log_trimmer
= nullptr;
1312 bucket_trim
= boost::none
;
1314 if (meta_notifier
) {
1315 meta_notifier
->stop();
1316 delete meta_notifier
;
1318 if (data_notifier
) {
1319 data_notifier
->stop();
1320 delete data_notifier
;
1337 RGWQuotaHandler::free_handler(quota_handler
);
1346 delete obj_tombstone_cache
;
1348 if (reshard_wait
.get()) {
1349 reshard_wait
->stop();
1350 reshard_wait
.reset();
1353 if (run_reshard_thread
) {
1354 reshard
->stop_processor();
1357 delete index_completion_manager
;
1361 * Initialize the RADOS instance and prepare to do other ops
1362 * Returns 0 on success, -ERR# on failure.
1364 int RGWRados::init_rados()
1367 auto admin_socket
= cct
->get_admin_socket();
1368 for (auto cmd
: admin_commands
) {
1369 int r
= admin_socket
->register_command(cmd
[0], cmd
[1], this,
1372 lderr(cct
) << "ERROR: fail to register admin socket command (r=" << r
1378 ret
= rados
.init_with_context(cct
);
1382 ret
= rados
.connect();
1387 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
1388 new RGWCoroutinesManagerRegistry(cct
)};
1389 ret
= crs
->hook_to_admin_command("cr dump");
1394 meta_mgr
= new RGWMetadataManager(cct
, this);
1395 data_log
= new RGWDataChangesLog(cct
, this);
1396 cr_registry
= crs
.release();
1400 int RGWRados::register_to_service_map(const string
& daemon_type
, const map
<string
, string
>& meta
)
1402 map
<string
,string
> metadata
= meta
;
1403 metadata
["num_handles"] = "1"s
;
1404 metadata
["zonegroup_id"] = svc
.zone
->get_zonegroup().get_id();
1405 metadata
["zonegroup_name"] = svc
.zone
->get_zonegroup().get_name();
1406 metadata
["zone_name"] = svc
.zone
->zone_name();
1407 metadata
["zone_id"] = svc
.zone
->zone_id();
1408 string name
= cct
->_conf
->name
.get_id();
1409 if (name
.compare(0, 4, "rgw.") == 0) {
1410 name
= name
.substr(4);
1412 int ret
= rados
.service_daemon_register(daemon_type
, name
, metadata
);
1414 ldout(cct
, 0) << "ERROR: service_daemon_register() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
// Push an updated status map to the cluster's service-daemon registry
// via rados.service_daemon_update_status(); `status` is consumed by move.
// NOTE(review): this extraction is missing the surrounding braces and the
// `if (ret < 0)` guard / final `return ret;` around the error log below —
// confirm the control flow against the full source.
1421 int RGWRados::update_service_map(std::map
<std::string
, std::string
>&& status
)
1423 int ret
= rados
.service_daemon_update_status(move(status
));
// Error path: log the failing return code with its strerror text.
1425 ldout(cct
, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1433 * Initialize the RADOS instance and prepare to do other ops
1434 * Returns 0 on success, -ERR# on failure.
1436 int RGWRados::init_complete()
1441 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1443 auto& zone_public_config
= svc
.zone
->get_zone();
1444 ret
= svc
.sync_modules
->get_manager()->create_instance(cct
, zone_public_config
.tier_type
, svc
.zone
->get_zone_params().tier_config
, &sync_module
);
1446 lderr(cct
) << "ERROR: failed to init sync module instance, ret=" << ret
<< dendl
;
1447 if (ret
== -ENOENT
) {
1448 lderr(cct
) << "ERROR: " << zone_public_config
.tier_type
1449 << " sync module does not exist. valid sync modules: "
1450 << svc
.sync_modules
->get_manager()->get_registered_module_names()
1456 period_puller
.reset(new RGWPeriodPuller(this));
1457 period_history
.reset(new RGWPeriodHistory(cct
, period_puller
.get(),
1458 svc
.zone
->get_current_period()));
1460 ret
= open_root_pool_ctx();
1464 ret
= open_gc_pool_ctx();
1468 ret
= open_lc_pool_ctx();
1472 ret
= open_objexp_pool_ctx();
1476 ret
= open_reshard_pool_ctx();
1480 pools_initialized
= true;
1483 gc
->initialize(cct
, this);
1485 obj_expirer
= new RGWObjectExpirer(this);
1487 if (use_gc_thread
) {
1488 gc
->start_processor();
1489 obj_expirer
->start_processor();
1492 auto& current_period
= svc
.zone
->get_current_period();
1493 auto& zonegroup
= svc
.zone
->get_zonegroup();
1494 auto& zone_params
= svc
.zone
->get_zone_params();
1495 auto& zone
= svc
.zone
->get_zone();
1497 /* no point of running sync thread if we don't have a master zone configured
1498 or there is no rest_master_conn */
1499 if (zonegroup
.master_zone
.empty() || !svc
.zone
->get_master_conn()
1500 || current_period
.get_id().empty()) {
1501 run_sync_thread
= false;
1504 if (run_sync_thread
) {
1505 // initialize the log period history
1506 meta_mgr
->init_oldest_log_period();
1509 async_rados
= new RGWAsyncRadosProcessor(this, cct
->_conf
->rgw_num_async_rados_threads
);
1510 async_rados
->start();
1512 ret
= meta_mgr
->init(current_period
.get_id());
1514 lderr(cct
) << "ERROR: failed to initialize metadata log: "
1515 << cpp_strerror(-ret
) << dendl
;
1519 if (svc
.zone
->is_meta_master()) {
1520 auto md_log
= meta_mgr
->get_log(current_period
.get_id());
1521 meta_notifier
= new RGWMetaNotifier(this, md_log
);
1522 meta_notifier
->start();
1525 /* init it anyway, might run sync through radosgw-admin explicitly */
1526 sync_tracer
= new RGWSyncTraceManager(cct
, cct
->_conf
->rgw_sync_trace_history_size
);
1527 sync_tracer
->init(this);
1528 ret
= sync_tracer
->hook_to_admin_command();
1533 if (run_sync_thread
) {
1534 for (const auto &pt
: zonegroup
.placement_targets
) {
1535 if (zone_params
.placement_pools
.find(pt
.second
.name
)
1536 == zone_params
.placement_pools
.end()){
1537 ldout(cct
, 0) << "WARNING: This zone does not contain the placement target "
1538 << pt
.second
.name
<< " present in zonegroup" << dendl
;
1541 Mutex::Locker
l(meta_sync_thread_lock
);
1542 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this, async_rados
);
1543 ret
= meta_sync_processor_thread
->init();
1545 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
1548 meta_sync_processor_thread
->start();
1550 // configure the bucket trim manager
1551 rgw::BucketTrimConfig config
;
1552 rgw::configure_bucket_trim(cct
, config
);
1554 bucket_trim
.emplace(this, config
);
1555 ret
= bucket_trim
->init();
1557 ldout(cct
, 0) << "ERROR: failed to start bucket trim manager" << dendl
;
1560 data_log
->set_observer(&*bucket_trim
);
1562 Mutex::Locker
dl(data_sync_thread_lock
);
1563 for (auto source_zone
: svc
.zone
->get_data_sync_source_zones()) {
1564 ldout(cct
, 5) << "starting data sync thread for zone " << source_zone
->name
<< dendl
;
1565 auto *thread
= new RGWDataSyncProcessorThread(this, async_rados
, source_zone
);
1566 ret
= thread
->init();
1568 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
1572 data_sync_processor_threads
[source_zone
->id
] = thread
;
1574 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
1576 sync_log_trimmer
= new RGWSyncLogTrimThread(this, &*bucket_trim
, interval
);
1577 ret
= sync_log_trimmer
->init();
1579 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
1582 sync_log_trimmer
->start();
1585 data_notifier
= new RGWDataNotifier(this);
1586 data_notifier
->start();
1588 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
1589 binfo_cache
->init(svc
.cache
);
1592 lc
->initialize(cct
, this);
1595 lc
->start_processor();
1597 quota_handler
= RGWQuotaHandler::generate_handler(this, quota_threads
);
1599 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
1600 zone
.bucket_index_max_shards
);
1601 if (bucket_index_max_shards
> get_max_bucket_shards()) {
1602 bucket_index_max_shards
= get_max_bucket_shards();
1603 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
1604 << get_max_bucket_shards() << dendl
;
1606 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
1608 bool need_tombstone_cache
= !svc
.zone
->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1610 if (need_tombstone_cache
) {
1611 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
1614 reshard_wait
= std::make_shared
<RGWReshardWait
>();
1616 reshard
= new RGWReshard(this);
1618 /* only the master zone in the zonegroup reshards buckets */
1619 run_reshard_thread
= run_reshard_thread
&& (zonegroup
.master_zone
== zone
.id
);
1620 if (run_reshard_thread
) {
1621 reshard
->start_processor();
1624 index_completion_manager
= new RGWIndexCompletionManager(this);
1625 ret
= index_completion_manager
->start();
1630 int RGWRados::init_svc(bool raw
)
1633 return svc
.init_raw(cct
, use_cache
);
1636 return svc
.init(cct
, use_cache
);
1640 * Initialize the RADOS instance and prepare to do other ops
1641 * Returns 0 on success, -ERR# on failure.
1643 int RGWRados::initialize()
1647 inject_notify_timeout_probability
=
1648 cct
->_conf
.get_val
<double>("rgw_inject_notify_timeout_probability");
1649 max_notify_retries
= cct
->_conf
.get_val
<uint64_t>("rgw_max_notify_retries");
1651 ret
= init_svc(false);
1653 ldout(cct
, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1657 host_id
= svc
.zone_utils
->gen_host_id();
1663 return init_complete();
1667 * Open the pool used as root for this gateway
1668 * Returns: 0 on success, -ERR# otherwise.
1670 int RGWRados::open_root_pool_ctx()
1672 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().domain_root
, root_pool_ctx
, true, true);
1675 int RGWRados::open_gc_pool_ctx()
1677 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().gc_pool
, gc_pool_ctx
, true, true);
1680 int RGWRados::open_lc_pool_ctx()
1682 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().lc_pool
, lc_pool_ctx
, true, true);
1685 int RGWRados::open_objexp_pool_ctx()
1687 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, objexp_pool_ctx
, true, true);
1690 int RGWRados::open_reshard_pool_ctx()
1692 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().reshard_pool
, reshard_pool_ctx
, true, true);
1695 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
,
1698 constexpr bool create
= true; // create the pool if it doesn't exist
1699 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
, mostly_omap
);
1702 void RGWRados::build_bucket_index_marker(const string
& shard_id_str
, const string
& shard_marker
,
1705 *marker
= shard_id_str
;
1706 marker
->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR
);
1707 marker
->append(shard_marker
);
1711 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
)
1713 const rgw_pool
& explicit_pool
= bucket_info
.bucket
.explicit_placement
.index_pool
;
1715 if (!explicit_pool
.empty()) {
1716 return open_pool_ctx(explicit_pool
, index_ctx
, false);
1719 auto& zonegroup
= svc
.zone
->get_zonegroup();
1720 auto& zone_params
= svc
.zone
->get_zone_params();
1722 const rgw_placement_rule
*rule
= &bucket_info
.placement_rule
;
1723 if (rule
->empty()) {
1724 rule
= &zonegroup
.default_placement
;
1726 auto iter
= zone_params
.placement_pools
.find(rule
->name
);
1727 if (iter
== zone_params
.placement_pools
.end()) {
1728 ldout(cct
, 0) << "could not find placement rule " << *rule
<< " within zonegroup " << dendl
;
1732 int r
= open_pool_ctx(iter
->second
.index_pool
, index_ctx
, true);
// Cursor state for iterating log objects in the zone's log pool,
// passed between RGWRados::log_list_init/log_list_next as an opaque
// RGWAccessHandle.
1741 struct log_list_state
{
// ioctx bound to the log pool by log_list_init.
1743 librados::IoCtx io_ctx
;
// Current position of the pool-wide object iteration.
1744 librados::NObjectIterator obit
;
// NOTE(review): original line 1742 — presumably a `string prefix;`
// member, since log_list_init assigns state->prefix — is missing from
// this extraction; confirm against the full source.
1747 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
1749 log_list_state
*state
= new log_list_state
;
1750 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1755 state
->prefix
= prefix
;
1756 state
->obit
= state
->io_ctx
.nobjects_begin();
1757 *handle
= (RGWAccessHandle
)state
;
1761 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
1763 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
1765 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
1769 if (state
->prefix
.length() &&
1770 state
->obit
->get_oid().find(state
->prefix
) != 0) {
1774 *name
= state
->obit
->get_oid();
// Delete the log object `name` from the zone's log pool.
// NOTE(review): original lines 1785-1786 (presumably `if (r < 0)
// return r;` after rgw_init_ioctx) are missing from this extraction —
// confirm the error path against the full source.
1781 int RGWRados::log_remove(const string
& name
)
1783 librados::IoCtx io_ctx
;
// Bind a fresh ioctx to the zone's log pool.
1784 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
// Propagate the rados remove() result directly to the caller.
1787 return io_ctx
.remove(name
);
// Cursor state for streaming decoded entries out of a single log
// object, passed between RGWRados::log_show_init/log_show_next as an
// opaque RGWAccessHandle.
1790 struct log_show_state
{
1791 librados::IoCtx io_ctx
;
// Decode cursor into the buffered log data.
1793 bufferlist::const_iterator p
;
// NOTE(review): members from original lines 1792/1794-1796 (the
// buffered `bl`, object `name`, read offset `pos`, and `eof` flag —
// all referenced by log_show_next and by the ctor below) are missing
// from this extraction; confirm against the full source.
1797 log_show_state() : pos(0), eof(false) {}
1800 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
1802 log_show_state
*state
= new log_show_state
;
1803 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1809 *handle
= (RGWAccessHandle
)state
;
1813 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
1815 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
1816 off_t off
= state
->p
.get_off();
1818 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
1820 << " eof " << (int)state
->eof
1823 unsigned chunk
= 1024*1024;
1824 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
1826 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
1832 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
1833 } catch (buffer::error
& err
) {
1837 state
->bl
.claim(old
);
1838 state
->bl
.claim_append(more
);
1839 state
->p
= state
->bl
.cbegin();
1840 if ((unsigned)r
< chunk
)
1842 ldout(cct
, 10) << " read " << r
<< dendl
;
1846 return 0; // end of file
1848 decode(*entry
, state
->p
);
1850 catch (const buffer::error
&e
) {
1857 * usage_log_hash: get usage log key hash, based on name and index
1859 * Get the usage object name. Since a user may have more than 1
1860 * object holding that info (multiple shards), we use index to
1861 * specify that shard number. Once index exceeds max shards it
1863 * If name is not being set, results for all users will be returned
1864 * and index will wrap only after total shards number.
1866 * @param cct [in] ceph context
1867 * @param name [in] user name
1868 * @param hash [out] hash value
1869 * @param index [in] shard index number
// Map a user name + shard index to a usage-log object name (see the
// comment block above for the sharding scheme). When `name` is
// non-empty, the index wraps at rgw_usage_max_user_shards and is
// offset by a hash of the name; the result is always reduced modulo
// rgw_usage_max_shards and formatted as RGW_USAGE_OBJ_PREFIX "%u".
// NOTE(review): the `buf` declaration and the final `hash = buf;`
// assignment (original lines around 1879-1884) are missing from this
// extraction — confirm against the full source.
1871 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
1873 uint32_t val
= index
;
// Per-user sharding: fold the user name's hash into the shard value.
1875 if (!name
.empty()) {
1876 int max_user_shards
= cct
->_conf
->rgw_usage_max_user_shards
;
1877 val
%= max_user_shards
;
1878 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
// Global cap: shard value is always reduced modulo rgw_usage_max_shards.
1881 int max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1882 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
1886 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
1890 map
<string
, rgw_usage_log_info
> log_objs
;
1895 /* restructure usage map, zone by object hash */
1896 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
1897 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
1898 const rgw_user_bucket
& ub
= iter
->first
;
1899 RGWUsageBatch
& info
= iter
->second
;
1901 if (ub
.user
.empty()) {
1902 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
1906 if (ub
.user
!= last_user
) {
1907 /* index *should* be random, but why waste extra cycles
1908 in most cases max user shards is not going to exceed 1,
1909 so just incrementing it */
1910 usage_log_hash(cct
, ub
.user
, hash
, index
++);
1912 last_user
= ub
.user
;
1913 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
1915 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
1916 v
.push_back(miter
->second
);
1920 map
<string
, rgw_usage_log_info
>::iterator liter
;
1922 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
1923 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
1930 int RGWRados::read_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
,
1931 uint32_t max_entries
, bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
,
1932 rgw_usage_log_entry
>& usage
)
1934 uint32_t num
= max_entries
;
1935 string hash
, first_hash
;
1936 string user_str
= user
.to_str();
1937 usage_log_hash(cct
, user_str
, first_hash
, 0);
1939 if (usage_iter
.index
) {
1940 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
1948 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
1949 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
1951 int ret
= cls_obj_usage_log_read(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
, num
,
1952 usage_iter
.read_iter
, ret_usage
, is_truncated
);
1959 num
-= ret_usage
.size();
1961 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
1962 usage
[iter
->first
].aggregate(iter
->second
);
1966 if (!*is_truncated
) {
1967 usage_iter
.read_iter
.clear();
1968 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
1970 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
1974 int RGWRados::trim_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
)
1977 string hash
, first_hash
;
1978 string user_str
= user
.to_str();
1979 usage_log_hash(cct
, user_str
, first_hash
, index
);
1983 int ret
= cls_obj_usage_log_trim(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
);
1985 if (ret
< 0 && ret
!= -ENOENT
)
1988 usage_log_hash(cct
, user_str
, hash
, ++index
);
1989 } while (hash
!= first_hash
);
1995 int RGWRados::clear_usage()
1997 auto max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1999 for (unsigned i
=0; i
< max_shards
; i
++){
2000 string oid
= RGW_USAGE_OBJ_PREFIX
+ to_string(i
);
2001 ret
= cls_obj_usage_log_clear(oid
);
2003 ldout(cct
,0) << "usage clear on oid="<< oid
<< "failed with ret=" << ret
<< dendl
;
2010 int RGWRados::key_to_shard_id(const string
& key
, int max_shards
)
2012 return rgw_shard_id(key
, max_shards
);
// Compute the shard object name `prefix + (hash(key) % max_shards)`
// and report the numeric shard via *shard_id.
// NOTE(review): original lines 2018-2019 and 2021 are missing from
// this extraction — they presumably declare `buf` and may guard the
// `*shard_id` store behind `if (shard_id)`; confirm before assuming
// shard_id must be non-null.
2015 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& key
, string
& name
, int *shard_id
)
2017 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
2020 *shard_id
= val
% max_shards
;
// Render the shard number and append it to the prefix.
2022 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
2023 name
= prefix
+ buf
;
// Compute the shard object name for a (section, key) pair: the two
// string hashes are XOR-combined, reduced modulo max_shards, and
// appended to `prefix`.
// NOTE(review): the `buf` declaration (original line ~2030) is missing
// from this extraction — confirm against the full source.
2026 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& section
, const string
& key
, string
& name
)
2028 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
// Mix in the section hash so identical keys in different sections
// land on (generally) different shards.
2029 val
^= ceph_str_hash_linux(section
.c_str(), section
.size());
2031 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
2032 name
= prefix
+ buf
;
2035 void RGWRados::shard_name(const string
& prefix
, unsigned shard_id
, string
& name
)
2038 snprintf(buf
, sizeof(buf
), "%u", shard_id
);
2039 name
= prefix
+ buf
;
2043 void RGWRados::time_log_prepare_entry(cls_log_entry
& entry
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
2045 cls_log_add_prepare_entry(entry
, utime_t(ut
), section
, key
, bl
);
2048 int RGWRados::time_log_add_init(librados::IoCtx
& io_ctx
)
2050 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
, true);
2054 int RGWRados::time_log_add(const string
& oid
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
2056 librados::IoCtx io_ctx
;
2058 int r
= time_log_add_init(io_ctx
);
2063 ObjectWriteOperation op
;
2065 cls_log_add(op
, t
, section
, key
, bl
);
2067 return io_ctx
.operate(oid
, &op
);
2070 int RGWRados::time_log_add(const string
& oid
, list
<cls_log_entry
>& entries
,
2071 librados::AioCompletion
*completion
, bool monotonic_inc
)
2073 librados::IoCtx io_ctx
;
2075 int r
= time_log_add_init(io_ctx
);
2080 ObjectWriteOperation op
;
2081 cls_log_add(op
, entries
, monotonic_inc
);
2084 r
= io_ctx
.operate(oid
, &op
);
2086 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
2091 int RGWRados::time_log_list(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
2092 int max_entries
, list
<cls_log_entry
>& entries
,
2093 const string
& marker
,
2097 librados::IoCtx io_ctx
;
2099 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2102 librados::ObjectReadOperation op
;
2104 utime_t
st(start_time
);
2105 utime_t
et(end_time
);
2107 cls_log_list(op
, st
, et
, marker
, max_entries
, entries
,
2108 out_marker
, truncated
);
2112 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
2119 int RGWRados::time_log_info(const string
& oid
, cls_log_header
*header
)
2121 librados::IoCtx io_ctx
;
2123 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2126 librados::ObjectReadOperation op
;
2128 cls_log_info(op
, header
);
2132 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
2139 int RGWRados::time_log_info_async(librados::IoCtx
& io_ctx
, const string
& oid
, cls_log_header
*header
, librados::AioCompletion
*completion
)
2141 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2145 librados::ObjectReadOperation op
;
2147 cls_log_info(op
, header
);
2149 int ret
= io_ctx
.aio_operate(oid
, completion
, &op
, NULL
);
2156 int RGWRados::time_log_trim(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
2157 const string
& from_marker
, const string
& to_marker
,
2158 librados::AioCompletion
*completion
)
2160 librados::IoCtx io_ctx
;
2162 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2166 utime_t
st(start_time
);
2167 utime_t
et(end_time
);
2169 ObjectWriteOperation op
;
2170 cls_log_trim(op
, st
, et
, from_marker
, to_marker
);
2173 r
= io_ctx
.operate(oid
, &op
);
2175 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
2180 string
RGWRados::objexp_hint_get_shardname(int shard_num
)
2183 snprintf(buf
, sizeof(buf
), "%010u", (unsigned)shard_num
);
2185 string
objname("obj_delete_at_hint.");
2186 return objname
+ buf
;
2189 int RGWRados::objexp_key_shard(const rgw_obj_index_key
& key
)
2191 string obj_key
= key
.name
+ key
.instance
;
2192 int num_shards
= cct
->_conf
->rgw_objexp_hints_num_shards
;
2193 return rgw_bucket_shard_index(obj_key
, num_shards
);
2196 static string
objexp_hint_get_keyext(const string
& tenant_name
,
2197 const string
& bucket_name
,
2198 const string
& bucket_id
,
2199 const rgw_obj_key
& obj_key
)
2201 return tenant_name
+ (tenant_name
.empty() ? "" : ":") + bucket_name
+ ":" + bucket_id
+
2202 ":" + obj_key
.name
+ ":" + obj_key
.instance
;
2205 int RGWRados::objexp_hint_add(const ceph::real_time
& delete_at
,
2206 const string
& tenant_name
,
2207 const string
& bucket_name
,
2208 const string
& bucket_id
,
2209 const rgw_obj_index_key
& obj_key
)
2211 const string keyext
= objexp_hint_get_keyext(tenant_name
, bucket_name
,
2212 bucket_id
, obj_key
);
2213 objexp_hint_entry he
= {
2214 .tenant
= tenant_name
,
2215 .bucket_name
= bucket_name
,
2216 .bucket_id
= bucket_id
,
2218 .exp_time
= delete_at
};
2221 ObjectWriteOperation op
;
2222 cls_timeindex_add(op
, utime_t(delete_at
), keyext
, hebl
);
2224 string shard_name
= objexp_hint_get_shardname(objexp_key_shard(obj_key
));
2225 return objexp_pool_ctx
.operate(shard_name
, &op
);
2228 void RGWRados::objexp_get_shard(int shard_num
,
2229 string
& shard
) /* out */
2231 shard
= objexp_hint_get_shardname(shard_num
);
2234 int RGWRados::objexp_hint_list(const string
& oid
,
2235 const ceph::real_time
& start_time
,
2236 const ceph::real_time
& end_time
,
2237 const int max_entries
,
2238 const string
& marker
,
2239 list
<cls_timeindex_entry
>& entries
, /* out */
2240 string
*out_marker
, /* out */
2241 bool *truncated
) /* out */
2243 librados::ObjectReadOperation op
;
2244 cls_timeindex_list(op
, utime_t(start_time
), utime_t(end_time
), marker
, max_entries
, entries
,
2245 out_marker
, truncated
);
2248 int ret
= objexp_pool_ctx
.operate(oid
, &op
, &obl
);
2250 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
2254 if ((ret
== -ENOENT
) && truncated
) {
// Decode a cls_timeindex entry's value blob into an objexp_hint_entry.
// NOTE(review): the `try {` opener and the return statements (an error
// code on decode failure, 0 on success — original lines around
// 2263-2271) are missing from this extraction; confirm against the
// full source.
2261 int RGWRados::objexp_hint_parse(cls_timeindex_entry
&ti_entry
, /* in */
2262 objexp_hint_entry
& hint_entry
) /* out */
2265 auto iter
= ti_entry
.value
.cbegin();
2266 decode(hint_entry
, iter
);
// Decode failure is logged; note the message text says "avail_pools",
// which does not match what is decoded here — looks like a copy/paste
// from another call site (cannot change a runtime string in a
// doc-only pass).
2267 } catch (buffer::error
& err
) {
2268 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
2274 int RGWRados::objexp_hint_trim(const string
& oid
,
2275 const ceph::real_time
& start_time
,
2276 const ceph::real_time
& end_time
,
2277 const string
& from_marker
,
2278 const string
& to_marker
)
2280 int ret
= cls_timeindex_trim(objexp_pool_ctx
, oid
, utime_t(start_time
), utime_t(end_time
),
2281 from_marker
, to_marker
);
2282 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
2289 int RGWRados::lock_exclusive(const rgw_pool
& pool
, const string
& oid
, timespan
& duration
,
2290 string
& zone_id
, string
& owner_id
) {
2291 librados::IoCtx io_ctx
;
2293 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
2297 uint64_t msec
= std::chrono::duration_cast
<std::chrono::milliseconds
>(duration
).count();
2298 utime_t
ut(msec
/ 1000, msec
% 1000);
2300 rados::cls::lock::Lock
l(log_lock_name
);
2302 l
.set_cookie(owner_id
);
2304 l
.set_may_renew(true);
2306 return l
.lock_exclusive(&io_ctx
, oid
);
2309 int RGWRados::unlock(const rgw_pool
& pool
, const string
& oid
, string
& zone_id
, string
& owner_id
) {
2310 librados::IoCtx io_ctx
;
2312 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
2317 rados::cls::lock::Lock
l(log_lock_name
);
2319 l
.set_cookie(owner_id
);
2321 return l
.unlock(&io_ctx
, oid
);
// Extract only the owner from an encoded RGWAccessControlPolicy blob.
// NOTE(review): the `try {` opener and the return statements (an error
// code on decode failure, 0 on success — original lines around
// 2328/2332-2336) are missing from this extraction; confirm against
// the full source.
2324 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
2326 auto i
= bl
.cbegin();
2327 RGWAccessControlPolicy
policy(cct
);
// Only the owner portion of the policy is decoded, not the full ACL.
2329 policy
.decode_owner(i
);
2330 } catch (buffer::error
& err
) {
2331 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
// On success, hand the decoded owner back to the caller.
2334 *owner
= policy
.get_owner();
2338 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
2340 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
2341 if (aiter
== attrset
.end())
2344 bufferlist
& bl
= aiter
->second
;
2345 auto iter
= bl
.cbegin();
2347 policy
->decode(iter
);
2348 } catch (buffer::error
& err
) {
2349 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
2352 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 15>()) {
2353 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
2354 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
2355 s3policy
->to_xml(*_dout
);
2362 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
)
2364 rgw_bucket bucket
= bucket_info
.bucket
;
2365 bucket
.update_bucket_id(new_bucket_id
);
2367 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2369 bucket_info
.objv_tracker
.clear();
2370 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr);
2379 static inline std::string
after_delim(std::string_view delim
)
2381 // assert: ! delim.empty()
2382 std::string result
{delim
.data(), delim
.length()};
2383 result
+= char(255);
2389 * Get ordered listing of the objects in a bucket.
2391 * max: maximum number of results to return
2392 * bucket: bucket to list contents of
2393 * prefix: only return results that match this prefix
2394 * delim: do not include results that match this string.
2395 * Any skipped results will have the matching portion of their name
2396 * inserted in common_prefixes with a "true" mark.
2397 * marker: if filled in, begin the listing with this object.
2398 * end_marker: if filled in, end the listing with this object.
2399 * result: the objects are put in here.
2400 * common_prefixes: if delim is filled in, any matching prefixes are
2402 * is_truncated: if number of objects in the bucket is bigger than
2403 * max, then truncated.
2405 int RGWRados::Bucket::List::list_objects_ordered(
2407 vector
<rgw_bucket_dir_entry
> *result
,
2408 map
<string
, bool> *common_prefixes
,
2411 RGWRados
*store
= target
->get_store();
2412 CephContext
*cct
= store
->ctx();
2413 int shard_id
= target
->get_shard_id();
2416 bool truncated
= true;
2417 const int64_t max
= // protect against memory issues and negative vals
2418 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
2419 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
, max
);
2423 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
2424 rgw_obj_index_key cur_marker
;
2425 marker_obj
.get_index_key(&cur_marker
);
2427 rgw_obj_key
end_marker_obj(params
.end_marker
.name
, params
.end_marker
.instance
,
2429 rgw_obj_index_key cur_end_marker
;
2430 end_marker_obj
.get_index_key(&cur_end_marker
);
2431 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2433 rgw_obj_key
prefix_obj(params
.prefix
);
2434 prefix_obj
.ns
= params
.ns
;
2435 string cur_prefix
= prefix_obj
.get_index_key_name();
2436 string after_delim_s
; /* needed in !params.delim.empty() AND later */
2438 if (!params
.delim
.empty()) {
2439 after_delim_s
= after_delim(params
.delim
);
2440 /* if marker points at a common prefix, fast forward it into its
2441 * upper bound string */
2442 int delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
2443 if (delim_pos
>= 0) {
2444 string s
= cur_marker
.name
.substr(0, delim_pos
);
2445 s
.append(after_delim_s
);
2450 string skip_after_delim
;
2451 while (truncated
&& count
<= max
) {
2452 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
2453 int r
= store
->cls_bucket_list_ordered(target
->get_bucket_info(),
2457 read_ahead
+ 1 - count
,
2458 params
.list_versions
,
2465 for (auto eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
2466 rgw_bucket_dir_entry
& entry
= eiter
->second
;
2467 rgw_obj_index_key index_key
= entry
.key
;
2469 rgw_obj_key
obj(index_key
);
2471 /* note that parse_raw_oid() here will not set the correct
2472 * object's instance, as rgw_obj_index_key encodes that
2473 * separately. We don't need to set the instance because it's
2474 * not needed for the checks here and we end up using the raw
2475 * entry for the return vector
2477 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2479 ldout(cct
, 0) << "ERROR: could not parse object name: " << obj
.name
<< dendl
;
2483 bool check_ns
= (obj
.ns
== params
.ns
);
2484 if (!params
.list_versions
&& !entry
.is_visible()) {
2488 if (params
.enforce_ns
&& !check_ns
) {
2489 if (!params
.ns
.empty()) {
2490 /* we've iterated past the namespace we're searching -- done now */
2495 /* we're not looking at the namespace this object is in, next! */
2499 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2505 params
.marker
= index_key
;
2506 next_marker
= index_key
;
2509 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
2512 if (params
.prefix
.size() &&
2513 (obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
) != 0))
2516 if (!params
.delim
.empty()) {
2517 int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
2519 if (delim_pos
>= 0) {
2520 /* extract key -with trailing delimiter- for CommonPrefix */
2522 obj
.name
.substr(0, delim_pos
+ params
.delim
.length());
2524 if (common_prefixes
&&
2525 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
2530 next_marker
= prefix_key
;
2531 (*common_prefixes
)[prefix_key
] = true;
2545 result
->emplace_back(std::move(entry
));
2549 if (!params
.delim
.empty()) {
2550 int marker_delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
2551 if (marker_delim_pos
>= 0) {
2552 skip_after_delim
= cur_marker
.name
.substr(0, marker_delim_pos
);
2553 skip_after_delim
.append(after_delim_s
);
2555 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
2557 if (skip_after_delim
> cur_marker
.name
) {
2558 cur_marker
= skip_after_delim
;
2559 ldout(cct
, 20) << "setting cur_marker="
2561 << "[" << cur_marker
.instance
<< "]"
2570 *is_truncated
= truncated
;
2573 } // list_objects_ordered
2577 * Get listing of the objects in a bucket and allow the results to be out
2580 * Even though there are key differences with the ordered counterpart,
2581 * the parameters are the same to maintain some compatability.
2583 * max: maximum number of results to return
2584 * bucket: bucket to list contents of
2585 * prefix: only return results that match this prefix
2586 * delim: should not be set; if it is we should have indicated an error
2587 * marker: if filled in, begin the listing with this object.
2588 * end_marker: if filled in, end the listing with this object.
2589 * result: the objects are put in here.
2590 * common_prefixes: this is never filled with an unordered list; the param
2591 * is maintained for compatibility
2592 * is_truncated: if number of objects in the bucket is bigger than max, then
2595 int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p
,
2596 vector
<rgw_bucket_dir_entry
> *result
,
2597 map
<string
, bool> *common_prefixes
,
2600 RGWRados
*store
= target
->get_store();
2601 CephContext
*cct
= store
->ctx();
2602 int shard_id
= target
->get_shard_id();
2605 bool truncated
= true;
2607 const int64_t max
= // protect against memory issues and negative vals
2608 std::min(bucket_list_objects_absolute_max
, std::max(int64_t(0), max_p
));
2610 // read a few extra in each call to cls_bucket_list_unordered in
2611 // case some are filtered out due to namespace matching, versioning,
2613 const int64_t max_read_ahead
= 100;
2614 const uint32_t read_ahead
= uint32_t(max
+ std::min(max
, max_read_ahead
));
2618 rgw_obj_key
marker_obj(params
.marker
.name
,
2619 params
.marker
.instance
,
2621 rgw_obj_index_key cur_marker
;
2622 marker_obj
.get_index_key(&cur_marker
);
2624 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
2625 params
.end_marker
.instance
,
2627 rgw_obj_index_key cur_end_marker
;
2628 end_marker_obj
.get_index_key(&cur_end_marker
);
2629 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2631 rgw_obj_key
prefix_obj(params
.prefix
);
2632 prefix_obj
.ns
= params
.ns
;
2633 string cur_prefix
= prefix_obj
.get_index_key_name();
2635 while (truncated
&& count
<= max
) {
2636 std::vector
<rgw_bucket_dir_entry
> ent_list
;
2637 int r
= store
->cls_bucket_list_unordered(target
->get_bucket_info(),
2642 params
.list_versions
,
2649 // NB: while regions of ent_list will be sorted, we have no
2650 // guarantee that all items will be sorted since they can cross
2653 for (auto& entry
: ent_list
) {
2654 rgw_obj_index_key index_key
= entry
.key
;
2655 rgw_obj_key
obj(index_key
);
2657 /* note that parse_raw_oid() here will not set the correct
2658 * object's instance, as rgw_obj_index_key encodes that
2659 * separately. We don't need to set the instance because it's
2660 * not needed for the checks here and we end up using the raw
2661 * entry for the return vector
2663 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2665 ldout(cct
, 0) << "ERROR: could not parse object name: " <<
2670 if (!params
.list_versions
&& !entry
.is_visible()) {
2674 if (params
.enforce_ns
&& obj
.ns
!= params
.ns
) {
2678 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2679 // we're not guaranteed items will come in order, so we have
2680 // to loop through all
2685 params
.marker
.set(index_key
);
2686 next_marker
.set(index_key
);
2689 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
2692 if (params
.prefix
.size() &&
2693 (0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)))
2701 result
->emplace_back(std::move(entry
));
2703 } // for (auto& entry : ent_list)
2704 } // while (truncated && count <= max)
2708 *is_truncated
= truncated
;
2711 } // list_objects_unordered
2715 * create a rados pool, associated meta info
2716 * returns 0 on success, -ERR# otherwise.
2718 int RGWRados::create_pool(const rgw_pool
& pool
)
2720 librados::IoCtx io_ctx
;
2721 constexpr bool create
= true;
2722 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
2725 int RGWRados::init_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
2727 librados::IoCtx index_ctx
;
2729 string dir_oid
= dir_oid_prefix
;
2730 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
2735 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
2737 map
<int, string
> bucket_objs
;
2738 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
2740 return CLSRGWIssueBucketIndexInit(index_ctx
,
2742 cct
->_conf
->rgw_bucket_index_max_aio
)();
2745 int RGWRados::clean_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
2747 librados::IoCtx index_ctx
;
2749 std::string dir_oid
= dir_oid_prefix
;
2750 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
2755 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
2757 std::map
<int, std::string
> bucket_objs
;
2758 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
2760 return CLSRGWIssueBucketIndexClean(index_ctx
,
2762 cct
->_conf
->rgw_bucket_index_max_aio
)();
2765 void RGWRados::create_bucket_id(string
*bucket_id
)
2767 uint64_t iid
= instance_id();
2768 uint64_t bid
= next_bucket_id();
2769 char buf
[svc
.zone
->get_zone_params().get_id().size() + 48];
2770 snprintf(buf
, sizeof(buf
), "%s.%" PRIu64
".%" PRIu64
,
2771 svc
.zone
->get_zone_params().get_id().c_str(), iid
, bid
);
2775 int RGWRados::create_bucket(const RGWUserInfo
& owner
, rgw_bucket
& bucket
,
2776 const string
& zonegroup_id
,
2777 const rgw_placement_rule
& placement_rule
,
2778 const string
& swift_ver_location
,
2779 const RGWQuotaInfo
* pquota_info
,
2780 map
<std::string
, bufferlist
>& attrs
,
2781 RGWBucketInfo
& info
,
2783 obj_version
*pep_objv
,
2784 real_time creation_time
,
2785 rgw_bucket
*pmaster_bucket
,
2786 uint32_t *pmaster_num_shards
,
2789 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2790 rgw_placement_rule selected_placement_rule
;
2791 RGWZonePlacementInfo rule_info
;
2793 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
2795 ret
= svc
.zone
->select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
2796 &selected_placement_rule
, &rule_info
);
2800 if (!pmaster_bucket
) {
2801 create_bucket_id(&bucket
.marker
);
2802 bucket
.bucket_id
= bucket
.marker
;
2804 bucket
.marker
= pmaster_bucket
->marker
;
2805 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
2808 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
2811 objv_tracker
.write_version
= *pobjv
;
2813 objv_tracker
.generate_new_write_ver(cct
);
2816 info
.bucket
= bucket
;
2817 info
.owner
= owner
.user_id
;
2818 info
.zonegroup
= zonegroup_id
;
2819 info
.placement_rule
= selected_placement_rule
;
2820 info
.index_type
= rule_info
.index_type
;
2821 info
.swift_ver_location
= swift_ver_location
;
2822 info
.swift_versioning
= (!swift_ver_location
.empty());
2823 if (pmaster_num_shards
) {
2824 info
.num_shards
= *pmaster_num_shards
;
2826 info
.num_shards
= bucket_index_max_shards
;
2828 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
2829 info
.requester_pays
= false;
2830 if (real_clock::is_zero(creation_time
)) {
2831 info
.creation_time
= ceph::real_clock::now();
2833 info
.creation_time
= creation_time
;
2836 info
.quota
= *pquota_info
;
2839 int r
= init_bucket_index(info
, info
.num_shards
);
2844 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
2845 if (ret
== -EEXIST
) {
2846 librados::IoCtx index_ctx
;
2847 map
<int, string
> bucket_objs
;
2848 int r
= open_bucket_index(info
, index_ctx
, bucket_objs
);
2852 /* we need to reread the info and return it, caller will have a use for it */
2853 RGWObjVersionTracker instance_ver
= info
.objv_tracker
;
2854 info
.objv_tracker
.clear();
2855 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
2856 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, NULL
);
2861 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
2865 /* only remove it if it's a different bucket instance */
2866 if (info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
2867 /* remove bucket meta instance */
2868 r
= rgw_bucket_instance_remove_entry(this,
2874 /* remove bucket index objects asynchronously by best effort */
2875 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
2877 cct
->_conf
->rgw_bucket_index_max_aio
)();
2879 /* ret == -ENOENT here */
2884 /* this is highly unlikely */
2885 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
2889 bool RGWRados::get_obj_data_pool(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
2891 return rgw_get_obj_data_pool(svc
.zone
->get_zonegroup(), svc
.zone
->get_zone_params(), placement_rule
, obj
, pool
);
2894 bool RGWRados::obj_to_raw(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
2896 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
2898 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
2901 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
2904 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
2907 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2908 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2912 int r
= open_pool_ctx(pool
, *ioctx
, false);
2917 ioctx
->locator_set_key(key
);
2922 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
2924 get_obj_bucket_and_oid_loc(obj
, ref
->obj
.oid
, ref
->obj
.loc
);
2927 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2928 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2932 int r
= open_pool_ctx(pool
, ref
->ioctx
, false);
2937 ref
->ioctx
.locator_set_key(ref
->obj
.loc
);
2942 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2948 if (ref
->obj
.oid
.empty()) {
2949 ref
->obj
.oid
= obj
.pool
.to_str();
2950 ref
->obj
.pool
= svc
.zone
->get_zone_params().domain_root
;
2952 r
= open_pool_ctx(ref
->obj
.pool
, ref
->ioctx
, false);
2956 ref
->ioctx
.locator_set_key(ref
->obj
.loc
);
2961 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2963 return get_raw_obj_ref(obj
, ref
);
2967 * fixes an issue where head objects were supposed to have a locator created, but ended
2970 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
2972 const rgw_bucket
& bucket
= bucket_info
.bucket
;
2976 rgw_obj
obj(bucket
, key
);
2978 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
2980 if (locator
.empty()) {
2981 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
2985 librados::IoCtx ioctx
;
2987 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
2989 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
2992 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
2997 struct timespec mtime_ts
;
2998 map
<string
, bufferlist
> attrs
;
2999 librados::ObjectReadOperation op
;
3000 op
.getxattrs(&attrs
, NULL
);
3001 op
.stat2(&size
, &mtime_ts
, NULL
);
3002 #define HEAD_SIZE 512 * 1024
3003 op
.read(0, HEAD_SIZE
, &data
, NULL
);
3005 ret
= ioctx
.operate(oid
, &op
, NULL
);
3007 lderr(cct
) << "ERROR: ioctx.operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
3011 if (size
> HEAD_SIZE
) {
3012 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
3016 if (size
!= data
.length()) {
3017 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
3022 librados::ObjectWriteOperation wop
;
3024 wop
.mtime2(&mtime_ts
);
3026 map
<string
, bufferlist
>::iterator iter
;
3027 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3028 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
3033 ioctx
.locator_set_key(locator
);
3034 ioctx
.operate(oid
, &wop
);
3038 ioctx
.locator_set_key(string());
3040 ret
= ioctx
.remove(oid
);
3042 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
3050 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
3051 const string
& src_oid
, const string
& src_locator
,
3052 librados::IoCtx
& dst_ioctx
,
3053 const string
& dst_oid
, const string
& dst_locator
)
3056 #define COPY_BUF_SIZE (4 * 1024 * 1024)
3058 uint64_t chunk_size
= COPY_BUF_SIZE
;
3062 struct timespec mtime_ts
;
3065 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
3069 src_ioctx
.locator_set_key(src_locator
);
3070 dst_ioctx
.locator_set_key(dst_locator
);
3074 ObjectReadOperation rop
;
3075 ObjectWriteOperation wop
;
3078 rop
.stat2(&size
, &mtime_ts
, NULL
);
3079 mtime
= real_clock::from_timespec(mtime_ts
);
3081 rop
.read(ofs
, chunk_size
, &data
, NULL
);
3082 ret
= src_ioctx
.operate(src_oid
, &rop
, NULL
);
3087 if (data
.length() == 0) {
3092 wop
.create(true); /* make it exclusive */
3093 wop
.mtime2(&mtime_ts
);
3094 mtime
= real_clock::from_timespec(mtime_ts
);
3096 wop
.write(ofs
, data
);
3097 ret
= dst_ioctx
.operate(dst_oid
, &wop
);
3101 ofs
+= data
.length();
3102 done
= data
.length() != chunk_size
;
3106 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
3107 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
3112 src_ioctx
.remove(src_oid
);
3117 // TODO: clean up dst_oid if we created it
3118 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
3123 * fixes an issue where head objects were supposed to have a locator created, but ended
3126 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
)
3128 const rgw_bucket
& bucket
= bucket_info
.bucket
;
3129 rgw_obj
obj(bucket
, key
);
3136 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
3141 RGWObjState
*astate
= NULL
;
3142 RGWObjectCtx
rctx(this);
3143 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
3147 if (astate
->has_manifest
) {
3148 RGWObjManifest::obj_iterator miter
;
3149 RGWObjManifest
& manifest
= astate
->manifest
;
3150 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
3151 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
3156 rgw_raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
3158 if (loc
.key
.ns
.empty()) {
3159 /* continue, we're only interested in tail objects */
3163 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
3164 ref
.ioctx
.locator_set_key(locator
);
3166 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
3168 r
= ref
.ioctx
.stat(oid
, NULL
, NULL
);
3174 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
3176 /* create a new ioctx with the bad locator */
3177 librados::IoCtx src_ioctx
;
3178 src_ioctx
.dup(ref
.ioctx
);
3179 src_ioctx
.locator_set_key(bad_loc
);
3181 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
3183 /* cannot find a broken part */
3186 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
3191 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ref
.ioctx
, oid
, locator
);
3193 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
3202 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
3204 RGWBucketInfo
* bucket_info_out
)
3208 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
3210 RGWBucketInfo bucket_info
;
3211 RGWBucketInfo
* bucket_info_p
=
3212 bucket_info_out
? bucket_info_out
: &bucket_info
;
3214 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
3219 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
3221 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3224 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3229 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
3231 RGWBucketInfo
* bucket_info_out
)
3236 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
3238 RGWBucketInfo bucket_info
;
3239 RGWBucketInfo
* bucket_info_p
=
3240 bucket_info_out
? bucket_info_out
: &bucket_info
;
3241 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
3246 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, shard_id
, &bucket_obj
);
3248 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3251 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3256 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
,
3259 bucket
= bucket_info
.bucket
;
3261 int ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
,
3262 obj
.get_hash_object(), &bucket_obj
,
3265 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3268 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3273 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
, int sid
)
3275 bucket
= bucket_info
.bucket
;
3278 int ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, shard_id
, &bucket_obj
);
3280 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3283 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3289 /* Execute @handler on last item in bucket listing for bucket specified
3290 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
3291 * to objects matching these criterias. */
3292 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
3293 const std::string
& obj_prefix
,
3294 const std::string
& obj_delim
,
3295 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
3297 RGWRados::Bucket
target(this, bucket_info
);
3298 RGWRados::Bucket::List
list_op(&target
);
3300 list_op
.params
.prefix
= obj_prefix
;
3301 list_op
.params
.delim
= obj_delim
;
3303 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
3304 << ", obj_prefix=" << obj_prefix
3305 << ", obj_delim=" << obj_delim
3308 bool is_truncated
= false;
3310 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
3311 /* We need to rewind to the last object in a listing. */
3313 /* List bucket entries in chunks. */
3314 static constexpr int MAX_LIST_OBJS
= 100;
3315 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
3317 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
3321 } else if (!entries
.empty()) {
3322 last_entry
= entries
.back();
3324 } while (is_truncated
);
3327 return handler(*last_entry
);
3330 /* Empty listing - no items we can run handler on. */
3335 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
3336 const rgw_user
& user
,
3337 RGWBucketInfo
& bucket_info
,
3340 if (! swift_versioning_enabled(bucket_info
)) {
3344 obj_ctx
.set_atomic(obj
);
3346 RGWObjState
* state
= nullptr;
3347 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false);
3352 if (!state
->exists
) {
3356 const string
& src_name
= obj
.get_oid();
3357 char buf
[src_name
.size() + 32];
3358 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
3359 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
3360 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
3362 RGWBucketInfo dest_bucket_info
;
3364 auto sysobj_ctx
= svc
.sysobj
->init_obj_ctx();
3366 r
= get_bucket_info(sysobj_ctx
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, NULL
);
3368 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
3370 return -ERR_PRECONDITION_FAILED
;
3375 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
3376 return -ERR_PRECONDITION_FAILED
;
3379 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
3381 if (dest_bucket_info
.versioning_enabled()){
3382 gen_rand_obj_instance_name(&dest_obj
);
3385 obj_ctx
.set_atomic(dest_obj
);
3389 r
= copy_obj(obj_ctx
,
3391 NULL
, /* req_info *info */
3397 bucket_info
.placement_rule
,
3398 NULL
, /* time_t *src_mtime */
3399 NULL
, /* time_t *mtime */
3400 NULL
, /* const time_t *mod_ptr */
3401 NULL
, /* const time_t *unmod_ptr */
3402 false, /* bool high_precision_time */
3403 NULL
, /* const char *if_match */
3404 NULL
, /* const char *if_nomatch */
3405 RGWRados::ATTRSMOD_NONE
,
3406 true, /* bool copy_if_newer */
3408 RGWObjCategory::Main
,
3409 0, /* uint64_t olh_epoch */
3410 real_time(), /* time_t delete_at */
3411 NULL
, /* string *version_id */
3412 NULL
, /* string *ptag */
3413 NULL
, /* string *petag */
3414 NULL
, /* void (*progress_cb)(off_t, void *) */
3415 NULL
); /* void *progress_data */
3416 if (r
== -ECANCELED
|| r
== -ENOENT
) {
3417 /* Has already been overwritten, meaning another rgw process already
3425 int RGWRados::swift_versioning_restore(RGWSysObjectCtx
& sysobj_ctx
,
3426 RGWObjectCtx
& obj_ctx
,
3427 const rgw_user
& user
,
3428 RGWBucketInfo
& bucket_info
,
3430 bool& restored
) /* out */
3432 if (! swift_versioning_enabled(bucket_info
)) {
3436 /* Bucket info of the bucket that stores previous versions of our object. */
3437 RGWBucketInfo archive_binfo
;
3439 int ret
= get_bucket_info(sysobj_ctx
, bucket_info
.bucket
.tenant
,
3440 bucket_info
.swift_ver_location
, archive_binfo
,
3446 /* Abort the operation if the bucket storing our archive belongs to someone
3447 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3448 * into consideration. For we can live with that.
3450 * TODO: delegate this check to un upper layer and compare with ACLs. */
3451 if (bucket_info
.owner
!= archive_binfo
.owner
) {
3455 /* This code will be executed on latest version of the object. */
3456 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
3457 std::string no_zone
;
3459 /* We don't support object versioning of Swift API on those buckets that
3460 * are already versioned using the S3 mechanism. This affects also bucket
3461 * storing archived objects. Otherwise the delete operation would create
3462 * a deletion marker. */
3463 if (archive_binfo
.versioned()) {
3465 return -ERR_PRECONDITION_FAILED
;
3468 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3469 * irrelevant and may be safely skipped. */
3470 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
3472 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
3474 if (bucket_info
.versioning_enabled()){
3475 gen_rand_obj_instance_name(&obj
);
3478 obj_ctx
.set_atomic(archive_obj
);
3479 obj_ctx
.set_atomic(obj
);
3481 int ret
= copy_obj(obj_ctx
,
3483 nullptr, /* req_info *info */
3486 archive_obj
, /* src obj */
3487 bucket_info
, /* dest bucket info */
3488 archive_binfo
, /* src bucket info */
3489 bucket_info
.placement_rule
, /* placement_rule */
3490 nullptr, /* time_t *src_mtime */
3491 nullptr, /* time_t *mtime */
3492 nullptr, /* const time_t *mod_ptr */
3493 nullptr, /* const time_t *unmod_ptr */
3494 false, /* bool high_precision_time */
3495 nullptr, /* const char *if_match */
3496 nullptr, /* const char *if_nomatch */
3497 RGWRados::ATTRSMOD_NONE
,
3498 true, /* bool copy_if_newer */
3500 RGWObjCategory::Main
,
3501 0, /* uint64_t olh_epoch */
3502 real_time(), /* time_t delete_at */
3503 nullptr, /* string *version_id */
3504 nullptr, /* string *ptag */
3505 nullptr, /* string *petag */
3506 nullptr, /* void (*progress_cb)(off_t, void *) */
3507 nullptr); /* void *progress_data */
3508 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
3509 /* Has already been overwritten, meaning another rgw process already
3512 } else if (ret
< 0) {
3518 /* Need to remove the archived copy. */
3519 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
3520 archive_binfo
.versioning_status());
3525 const std::string
& obj_name
= obj
.get_oid();
3526 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
3529 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
3533 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
3534 map
<string
, bufferlist
>& attrs
,
3535 bool assume_noent
, bool modify_tail
,
3538 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
3539 RGWRados
*store
= target
->get_store();
3541 ObjectWriteOperation op
;
3543 const struct req_state
* s
= get_req_state();
3547 req_id
= store
->svc
.zone_utils
->unique_id(store
->get_new_req_id());
3554 int r
= target
->get_state(&state
, false, assume_noent
);
3558 rgw_obj
& obj
= target
->get_obj();
3560 if (obj
.get_oid().empty()) {
3561 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
3566 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
3570 bool is_olh
= state
->is_olh
;
3572 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
3574 const string
*ptag
= meta
.ptag
;
3575 if (!ptag
&& !index_op
->get_optag()->empty()) {
3576 ptag
= index_op
->get_optag();
3578 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false, modify_tail
);
3582 if (real_clock::is_zero(meta
.set_mtime
)) {
3583 meta
.set_mtime
= real_clock::now();
3586 if (target
->bucket_info
.obj_lock_enabled() && target
->bucket_info
.obj_lock
.has_rule() && meta
.flags
== PUT_OBJ_CREATE
) {
3587 auto iter
= attrs
.find(RGW_ATTR_OBJECT_RETENTION
);
3588 if (iter
== attrs
.end()) {
3589 real_time lock_until_date
= target
->bucket_info
.obj_lock
.get_lock_until_date(meta
.set_mtime
);
3590 string mode
= target
->bucket_info
.obj_lock
.get_mode();
3591 RGWObjectRetention
obj_retention(mode
, lock_until_date
);
3593 obj_retention
.encode(bl
);
3594 op
.setxattr(RGW_ATTR_OBJECT_RETENTION
, bl
);
3598 if (state
->is_olh
) {
3599 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
3602 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
3603 op
.mtime2(&mtime_ts
);
3606 /* if we want to overwrite the data, we also want to overwrite the
3607 xattrs, so just remove the object */
3608 op
.write_full(*meta
.data
);
3612 string content_type
;
3614 string storage_class
;
3616 map
<string
, bufferlist
>::iterator iter
;
3618 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
3619 const string
& name
= iter
->first
;
3620 op
.rmxattr(name
.c_str());
3624 if (meta
.manifest
) {
3625 storage_class
= meta
.manifest
->get_tail_placement().placement_rule
.storage_class
;
3627 /* remove existing manifest attr */
3628 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
3629 if (iter
!= attrs
.end())
3633 encode(*meta
.manifest
, bl
);
3634 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
3637 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3638 const string
& name
= iter
->first
;
3639 bufferlist
& bl
= iter
->second
;
3644 op
.setxattr(name
.c_str(), bl
);
3646 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
3647 etag
= rgw_bl_str(bl
);
3648 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
3649 content_type
= rgw_bl_str(bl
);
3650 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
3654 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
3655 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
3658 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
3660 encode(store
->svc
.zone
->get_zone_short_id(), bl
);
3661 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
3664 if (!storage_class
.empty()) {
3666 bl
.append(storage_class
);
3667 op
.setxattr(RGW_ATTR_STORAGE_CLASS
, bl
);
3678 if (!reset_obj
) { //Multipart upload, it has immutable head.
3679 orig_exists
= false;
3682 orig_exists
= state
->exists
;
3683 orig_size
= state
->accounted_size
;
3686 bool versioned_target
= (meta
.olh_epoch
&& *meta
.olh_epoch
> 0) ||
3687 !obj
.key
.instance
.empty();
3689 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
3692 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
3695 if (!index_op
->is_prepared()) {
3696 tracepoint(rgw_rados
, prepare_enter
, req_id
.c_str());
3697 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
3698 tracepoint(rgw_rados
, prepare_exit
, req_id
.c_str());
3703 tracepoint(rgw_rados
, operate_enter
, req_id
.c_str());
3704 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
3705 tracepoint(rgw_rados
, operate_exit
, req_id
.c_str());
3706 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
3707 or -ENOENT if was removed, or -EEXIST if it did not exist
3708 before and now it does */
3709 if (r
== -EEXIST
&& assume_noent
) {
3710 target
->invalidate_state();
3716 epoch
= ref
.ioctx
.get_last_version();
3717 poolid
= ref
.ioctx
.get_id();
3719 r
= target
->complete_atomic_modification();
3721 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
3724 tracepoint(rgw_rados
, complete_enter
, req_id
.c_str());
3725 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
3726 meta
.set_mtime
, etag
, content_type
,
3727 storage_class
, &acl_bl
,
3728 meta
.category
, meta
.remove_objs
, meta
.user_data
, meta
.appendable
);
3729 tracepoint(rgw_rados
, complete_exit
, req_id
.c_str());
3734 *meta
.mtime
= meta
.set_mtime
;
3737 /* note that index_op was using state so we couldn't invalidate it earlier */
3738 target
->invalidate_state();
3741 if (versioned_op
&& meta
.olh_epoch
) {
3742 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, *meta
.olh_epoch
, real_time(), false, meta
.zones_trace
);
3748 if (!real_clock::is_zero(meta
.delete_at
)) {
3749 rgw_obj_index_key obj_key
;
3750 obj
.key
.get_index_key(&obj_key
);
3752 r
= store
->objexp_hint_add(meta
.delete_at
,
3753 obj
.bucket
.tenant
, obj
.bucket
.name
, obj
.bucket
.bucket_id
, obj_key
);
3755 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
3756 /* ignoring error, nothing we can do at this point */
3759 meta
.canceled
= false;
3761 /* update quota cache */
3762 if (meta
.completeMultipart
){
3763 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3767 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3768 accounted_size
, orig_size
);
3773 int ret
= index_op
->cancel();
3775 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
3778 meta
.canceled
= true;
3780 /* we lost in a race. There are a few options:
3781 * - existing object was rewritten (ECANCELED)
3782 * - non existing object was created (EEXIST)
3783 * - object was removed (ENOENT)
3784 * should treat it as a success
3786 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
3787 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
3791 if (meta
.if_match
!= NULL
) {
3792 // only overwrite existing object
3793 if (strcmp(meta
.if_match
, "*") == 0) {
3795 r
= -ERR_PRECONDITION_FAILED
;
3796 } else if (r
== -ECANCELED
) {
3802 if (meta
.if_nomatch
!= NULL
) {
3803 // only create a new object
3804 if (strcmp(meta
.if_nomatch
, "*") == 0) {
3806 r
= -ERR_PRECONDITION_FAILED
;
3807 } else if (r
== -ENOENT
) {
3817 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
3818 map
<string
, bufferlist
>& attrs
)
3820 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
3822 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
3823 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
3824 index_op
.set_zones_trace(meta
.zones_trace
);
3826 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
3829 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
3831 assume_noent
= false;
3834 if (!assume_noent
) {
3835 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
3840 class RGWRadosPutObj
: public RGWHTTPStreamRWRequest::ReceiveCB
3844 rgw::putobj::DataProcessor
*filter
;
3845 boost::optional
<RGWPutObj_Compress
>& compressor
;
3846 boost::optional
<rgw::putobj::ChunkProcessor
> buffering
;
3847 CompressorRef
& plugin
;
3848 rgw::putobj::ObjectProcessor
*processor
;
3849 void (*progress_cb
)(off_t
, void *);
3850 void *progress_data
;
3851 bufferlist extra_data_bl
;
3852 uint64_t extra_data_left
{0};
3853 bool need_to_process_attrs
{true};
3854 uint64_t data_len
{0};
3855 map
<string
, bufferlist
> src_attrs
;
3857 uint64_t lofs
{0}; /* logical ofs */
3858 std::function
<int(const map
<string
, bufferlist
>&)> attrs_handler
;
3860 RGWRadosPutObj(CephContext
* cct
,
3861 CompressorRef
& plugin
,
3862 boost::optional
<RGWPutObj_Compress
>& compressor
,
3863 rgw::putobj::ObjectProcessor
*p
,
3864 void (*_progress_cb
)(off_t
, void *),
3865 void *_progress_data
,
3866 std::function
<int(const map
<string
, bufferlist
>&)> _attrs_handler
) :
3869 compressor(compressor
),
3872 progress_cb(_progress_cb
),
3873 progress_data(_progress_data
),
3874 attrs_handler(_attrs_handler
) {}
3876 int process_attrs(void) {
3877 if (extra_data_bl
.length()) {
3879 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3880 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3884 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3886 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
3887 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
3889 // filter out olh attributes
3890 auto iter
= src_attrs
.lower_bound(RGW_ATTR_OLH_PREFIX
);
3891 while (iter
!= src_attrs
.end()) {
3892 if (!boost::algorithm::starts_with(iter
->first
, RGW_ATTR_OLH_PREFIX
)) {
3895 iter
= src_attrs
.erase(iter
);
3899 int ret
= attrs_handler(src_attrs
);
3904 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
3905 //do not compress if object is encrypted
3906 compressor
= boost::in_place(cct
, plugin
, filter
);
3907 // add a filter that buffers data so we don't try to compress tiny blocks.
3908 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3909 // compression ratio
3910 constexpr unsigned buffer_size
= 512 * 1024;
3911 buffering
= boost::in_place(&*compressor
, buffer_size
);
3912 filter
= &*buffering
;
3915 need_to_process_attrs
= false;
3920 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3922 progress_cb(data_len
, progress_data
);
3924 if (extra_data_left
) {
3925 uint64_t extra_len
= bl
.length();
3926 if (extra_len
> extra_data_left
)
3927 extra_len
= extra_data_left
;
3930 bl
.splice(0, extra_len
, &extra
);
3931 extra_data_bl
.append(extra
);
3933 extra_data_left
-= extra_len
;
3934 if (extra_data_left
== 0) {
3935 int res
= process_attrs();
3940 if (bl
.length() == 0) {
3944 if (need_to_process_attrs
) {
3945 /* need to call process_attrs() even if we don't get any attrs,
3946 * need it to call attrs_handler().
3948 int res
= process_attrs();
3954 ceph_assert(uint64_t(ofs
) >= extra_data_len
);
3956 uint64_t size
= bl
.length();
3959 const uint64_t lofs
= data_len
;
3962 return filter
->process(std::move(bl
), lofs
);
3966 return filter
->process({}, data_len
);
3969 bufferlist
& get_extra_data() { return extra_data_bl
; }
3971 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
3973 void set_extra_data_len(uint64_t len
) override
{
3974 extra_data_left
= len
;
3975 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len
);
3978 uint64_t get_data_len() {
3984 * prepare attrset depending on attrs_mod.
3986 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
3987 map
<string
, bufferlist
>& attrs
,
3988 RGWRados::AttrsMod attrs_mod
)
3990 switch (attrs_mod
) {
3991 case RGWRados::ATTRSMOD_NONE
:
3994 case RGWRados::ATTRSMOD_REPLACE
:
3995 if (!attrs
[RGW_ATTR_ETAG
].length()) {
3996 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
3998 if (!attrs
[RGW_ATTR_TAIL_TAG
].length()) {
3999 auto ttiter
= src_attrs
.find(RGW_ATTR_TAIL_TAG
);
4000 if (ttiter
!= src_attrs
.end()) {
4001 attrs
[RGW_ATTR_TAIL_TAG
] = src_attrs
[RGW_ATTR_TAIL_TAG
];
4005 case RGWRados::ATTRSMOD_MERGE
:
4006 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
4007 if (attrs
.find(it
->first
) == attrs
.end()) {
4008 attrs
[it
->first
] = it
->second
;
4015 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, const rgw_obj
& obj
)
4017 map
<string
, bufferlist
> attrset
;
4021 RGWObjectCtx
rctx(this);
4023 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
4024 RGWRados::Object::Read
read_op(&op_target
);
4026 read_op
.params
.attrs
= &attrset
;
4027 read_op
.params
.lastmod
= &mtime
;
4028 read_op
.params
.obj_size
= &obj_size
;
4030 int ret
= read_op
.prepare();
4034 attrset
.erase(RGW_ATTR_ID_TAG
);
4035 attrset
.erase(RGW_ATTR_TAIL_TAG
);
4037 return copy_obj_data(rctx
, dest_bucket_info
, dest_bucket_info
.placement_rule
,
4038 read_op
, obj_size
- 1, obj
, NULL
, mtime
, attrset
,
4039 0, real_time(), NULL
);
4042 struct obj_time_weight
{
4044 uint32_t zone_short_id
;
4046 bool high_precision
;
4048 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
4050 bool compare_low_precision(const obj_time_weight
& rhs
) {
4051 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
4052 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
4061 if (!zone_short_id
|| !rhs
.zone_short_id
) {
4062 /* don't compare zone ids, if one wasn't provided */
4065 if (zone_short_id
!= rhs
.zone_short_id
) {
4066 return (zone_short_id
< rhs
.zone_short_id
);
4068 return (pg_ver
< rhs
.pg_ver
);
4072 bool operator<(const obj_time_weight
& rhs
) {
4073 if (!high_precision
|| !rhs
.high_precision
) {
4074 return compare_low_precision(rhs
);
4076 if (mtime
> rhs
.mtime
) {
4079 if (mtime
< rhs
.mtime
) {
4082 if (!zone_short_id
|| !rhs
.zone_short_id
) {
4083 /* don't compare zone ids, if one wasn't provided */
4086 if (zone_short_id
!= rhs
.zone_short_id
) {
4087 return (zone_short_id
< rhs
.zone_short_id
);
4089 return (pg_ver
< rhs
.pg_ver
);
4092 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
4094 zone_short_id
= _short_id
;
4098 void init(RGWObjState
*state
) {
4099 mtime
= state
->mtime
;
4100 zone_short_id
= state
->zone_short_id
;
4101 pg_ver
= state
->pg_ver
;
4105 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
4108 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
4109 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
4115 class RGWGetExtraDataCB
: public RGWHTTPStreamRWRequest::ReceiveCB
{
4116 bufferlist extra_data
;
4118 RGWGetExtraDataCB() {}
4119 int handle_data(bufferlist
& bl
, bool *pause
) override
{
4120 int bl_len
= (int)bl
.length();
4121 if (extra_data
.length() < extra_data_len
) {
4122 off_t max
= extra_data_len
- extra_data
.length();
4126 bl
.splice(0, max
, &extra_data
);
4131 bufferlist
& get_extra_data() {
4136 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
4137 const rgw_user
& user_id
,
4139 const string
& source_zone
,
4141 RGWBucketInfo
& src_bucket_info
,
4142 real_time
*src_mtime
,
4144 const real_time
*mod_ptr
,
4145 const real_time
*unmod_ptr
,
4146 bool high_precision_time
,
4147 const char *if_match
,
4148 const char *if_nomatch
,
4149 map
<string
, bufferlist
> *pattrs
,
4150 map
<string
, string
> *pheaders
,
4155 /* source is in a different zonegroup, copy from there */
4157 RGWRESTStreamRWRequest
*in_stream_req
;
4159 map
<string
, bufferlist
> src_attrs
;
4160 append_rand_alpha(cct
, tag
, tag
, 32);
4161 obj_time_weight set_mtime_weight
;
4162 set_mtime_weight
.high_precision
= high_precision_time
;
4165 if (source_zone
.empty()) {
4166 if (src_bucket_info
.zonegroup
.empty()) {
4167 /* source is in the master zonegroup */
4168 conn
= svc
.zone
->get_master_conn();
4170 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
4171 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
4172 if (iter
== zonegroup_conn_map
.end()) {
4173 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
4176 conn
= iter
->second
;
4179 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
4180 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
4181 if (iter
== zone_conn_map
.end()) {
4182 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
4185 conn
= iter
->second
;
4188 RGWGetExtraDataCB cb
;
4189 map
<string
, string
> req_headers
;
4190 real_time set_mtime
;
4192 const real_time
*pmod
= mod_ptr
;
4194 obj_time_weight dest_mtime_weight
;
4196 constexpr bool prepend_meta
= true;
4197 constexpr bool get_op
= true;
4198 constexpr bool rgwx_stat
= true;
4199 constexpr bool sync_manifest
= true;
4200 constexpr bool skip_decrypt
= true;
4201 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
4202 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
4203 prepend_meta
, get_op
, rgwx_stat
,
4204 sync_manifest
, skip_decrypt
,
4205 true, &cb
, &in_stream_req
);
4210 ret
= conn
->complete_request(in_stream_req
, nullptr, &set_mtime
, psize
, nullptr, pheaders
);
4215 bufferlist
& extra_data_bl
= cb
.get_extra_data();
4216 if (extra_data_bl
.length()) {
4218 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
4219 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
4223 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
4225 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
4229 *src_mtime
= set_mtime
;
4233 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
4234 if (iter
!= src_attrs
.end()) {
4235 bufferlist
& etagbl
= iter
->second
;
4236 *petag
= etagbl
.to_str();
4237 while (petag
->size() > 0 && (*petag
)[petag
->size() - 1] == '\0') {
4238 *petag
= petag
->substr(0, petag
->size() - 1);
4244 *pattrs
= std::move(src_attrs
);
4250 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
4251 const rgw_user
& user_id
,
4253 const string
& source_zone
,
4254 const rgw_obj
& dest_obj
,
4255 const rgw_obj
& src_obj
,
4256 RGWBucketInfo
& dest_bucket_info
,
4257 RGWBucketInfo
& src_bucket_info
,
4258 std::optional
<rgw_placement_rule
> dest_placement_rule
,
4259 real_time
*src_mtime
,
4261 const real_time
*mod_ptr
,
4262 const real_time
*unmod_ptr
,
4263 bool high_precision_time
,
4264 const char *if_match
,
4265 const char *if_nomatch
,
4268 map
<string
, bufferlist
>& attrs
,
4269 RGWObjCategory category
,
4270 std::optional
<uint64_t> olh_epoch
,
4271 real_time delete_at
,
4274 void (*progress_cb
)(off_t
, void *),
4275 void *progress_data
,
4276 rgw_zone_set
*zones_trace
,
4277 std::optional
<uint64_t>* bytes_transferred
)
4279 /* source is in a different zonegroup, copy from there */
4281 RGWRESTStreamRWRequest
*in_stream_req
;
4284 append_rand_alpha(cct
, tag
, tag
, 32);
4285 obj_time_weight set_mtime_weight
;
4286 set_mtime_weight
.high_precision
= high_precision_time
;
4289 rgw::AioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4290 using namespace rgw::putobj
;
4291 const rgw_placement_rule
*ptail_rule
= (dest_placement_rule
? &(*dest_placement_rule
) : nullptr);
4292 AtomicObjectProcessor
processor(&aio
, this, dest_bucket_info
, ptail_rule
, user_id
,
4293 obj_ctx
, dest_obj
, olh_epoch
, tag
);
4295 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
4296 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
4297 if (source_zone
.empty()) {
4298 if (dest_bucket_info
.zonegroup
.empty()) {
4299 /* source is in the master zonegroup */
4300 conn
= svc
.zone
->get_master_conn();
4302 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
4303 if (iter
== zonegroup_conn_map
.end()) {
4304 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
4307 conn
= iter
->second
;
4310 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
4311 if (iter
== zone_conn_map
.end()) {
4312 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
4315 conn
= iter
->second
;
4318 string obj_name
= dest_obj
.bucket
.name
+ "/" + dest_obj
.get_oid();
4320 boost::optional
<RGWPutObj_Compress
> compressor
;
4321 CompressorRef plugin
;
4323 rgw_placement_rule dest_rule
;
4324 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, progress_cb
, progress_data
,
4325 [&](const map
<string
, bufferlist
>& obj_attrs
) {
4327 auto iter
= obj_attrs
.find(RGW_ATTR_STORAGE_CLASS
);
4328 if (iter
!= obj_attrs
.end()) {
4329 dest_rule
.storage_class
= iter
->second
.to_str();
4330 dest_rule
.inherit_from(dest_bucket_info
.placement_rule
);
4331 processor
.set_tail_placement(std::move(dest_rule
));
4332 ptail_rule
= &dest_rule
;
4334 ptail_rule
= &dest_bucket_info
.placement_rule
;
4337 const auto& compression_type
= svc
.zone
->get_zone_params().get_compression_type(*ptail_rule
);
4338 if (compression_type
!= "none") {
4339 plugin
= Compressor::create(cct
, compression_type
);
4341 ldout(cct
, 1) << "Cannot load plugin for compression type "
4342 << compression_type
<< dendl
;
4346 int ret
= processor
.prepare();
4354 real_time set_mtime
;
4355 uint64_t expected_size
= 0;
4357 RGWObjState
*dest_state
= NULL
;
4359 const real_time
*pmod
= mod_ptr
;
4361 obj_time_weight dest_mtime_weight
;
4363 if (copy_if_newer
) {
4364 /* need to get mtime for destination */
4365 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
4369 if (!real_clock::is_zero(dest_state
->mtime
)) {
4370 dest_mtime_weight
.init(dest_state
);
4371 pmod
= &dest_mtime_weight
.mtime
;
4375 static constexpr bool prepend_meta
= true;
4376 static constexpr bool get_op
= true;
4377 static constexpr bool rgwx_stat
= false;
4378 static constexpr bool sync_manifest
= true;
4379 static constexpr bool skip_decrypt
= true;
4380 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
4381 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
4382 prepend_meta
, get_op
, rgwx_stat
,
4383 sync_manifest
, skip_decrypt
,
4385 &cb
, &in_stream_req
);
4390 ret
= conn
->complete_request(in_stream_req
, &etag
, &set_mtime
,
4391 &expected_size
, nullptr, nullptr);
4399 if (cb
.get_data_len() != expected_size
) {
4401 ldout(cct
, 0) << "ERROR: object truncated during fetching, expected "
4402 << expected_size
<< " bytes but received " << cb
.get_data_len() << dendl
;
4405 if (compressor
&& compressor
->is_compressed()) {
4407 RGWCompressionInfo cs_info
;
4408 cs_info
.compression_type
= plugin
->get_type_name();
4409 cs_info
.orig_size
= cb
.get_data_len();
4410 cs_info
.blocks
= move(compressor
->get_compression_blocks());
4411 encode(cs_info
, tmp
);
4412 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
4415 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4416 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
4418 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
4419 if (iter
!= cb
.get_attrs().end()) {
4421 decode(delete_at
, iter
->second
);
4422 } catch (buffer::error
& err
) {
4423 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
4429 *src_mtime
= set_mtime
;
4433 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
4434 if (iter
!= cb
.get_attrs().end()) {
4435 *petag
= iter
->second
.to_str();
4439 //erase the append attr
4440 cb
.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM
);
4442 if (source_zone
.empty()) {
4443 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
4445 attrs
= cb
.get_attrs();
4448 if (copy_if_newer
) {
4449 uint64_t pg_ver
= 0;
4450 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
4451 if (i
!= attrs
.end() && i
->second
.length() > 0) {
4452 auto iter
= i
->second
.cbegin();
4454 decode(pg_ver
, iter
);
4455 } catch (buffer::error
& err
) {
4456 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
4457 /* non critical error */
4460 set_mtime_weight
.init(set_mtime
, svc
.zone
->get_zone_short_id(), pg_ver
);
4463 #define MAX_COMPLETE_RETRY 100
4464 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
4465 bool canceled
= false;
4466 ret
= processor
.complete(cb
.get_data_len(), etag
, mtime
, set_mtime
,
4467 attrs
, delete_at
, nullptr, nullptr, nullptr,
4468 zones_trace
, &canceled
);
4472 if (copy_if_newer
&& canceled
) {
4473 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
4474 obj_ctx
.invalidate(dest_obj
); /* object was overwritten */
4475 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
4477 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
4480 dest_mtime_weight
.init(dest_state
);
4481 dest_mtime_weight
.high_precision
= high_precision_time
;
4482 if (!dest_state
->exists
||
4483 dest_mtime_weight
< set_mtime_weight
) {
4484 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4487 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4493 if (i
== MAX_COMPLETE_RETRY
) {
4494 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
4499 if (bytes_transferred
) {
4500 *bytes_transferred
= cb
.get_data_len();
4504 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
4505 // we may have already fetched during sync of OP_ADD, but were waiting
4506 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4507 if (olh_epoch
&& *olh_epoch
> 0) {
4508 constexpr bool log_data_change
= true;
4509 ret
= set_olh(obj_ctx
, dest_bucket_info
, dest_obj
, false, nullptr,
4510 *olh_epoch
, real_time(), false, zones_trace
, log_data_change
);
4512 // we already have the latest copy
4520 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
4521 map
<string
, bufferlist
>& src_attrs
,
4522 RGWRados::Object::Read
& read_op
,
4523 const rgw_user
& user_id
,
4529 RGWRESTStreamS3PutObj
*out_stream_req
;
4531 auto rest_master_conn
= svc
.zone
->get_master_conn();
4533 int ret
= rest_master_conn
->put_obj_async(user_id
, dest_obj
, astate
->size
, src_attrs
, true, &out_stream_req
);
4538 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb());
4540 delete out_stream_req
;
4544 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
4553 * dest_obj: the object to copy into
4554 * src_obj: the object to copy from
4555 * attrs: usage depends on attrs_mod parameter
4556 * attrs_mod: the modification mode of the attrs, may have the following values:
4557 * ATTRSMOD_NONE - the attributes of the source object will be
4558 * copied without modifications, attrs parameter is ignored;
4559 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4560 * parameter, source object attributes are not copied;
4561 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4562 * are overwritten by values contained in attrs parameter.
4563 * err: stores any errors resulting from the get of the original object
4564 * Returns: 0 on success, -ERR# otherwise.
4566 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
4567 const rgw_user
& user_id
,
4569 const string
& source_zone
,
4572 RGWBucketInfo
& dest_bucket_info
,
4573 RGWBucketInfo
& src_bucket_info
,
4574 const rgw_placement_rule
& dest_placement
,
4575 real_time
*src_mtime
,
4577 const real_time
*mod_ptr
,
4578 const real_time
*unmod_ptr
,
4579 bool high_precision_time
,
4580 const char *if_match
,
4581 const char *if_nomatch
,
4584 map
<string
, bufferlist
>& attrs
,
4585 RGWObjCategory category
,
4587 real_time delete_at
,
4591 void (*progress_cb
)(off_t
, void *),
4592 void *progress_data
)
4596 rgw_obj shadow_obj
= dest_obj
;
4602 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
4603 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
4605 auto& zonegroup
= svc
.zone
->get_zonegroup();
4607 remote_dest
= !zonegroup
.equals(dest_bucket_info
.zonegroup
);
4608 remote_src
= !zonegroup
.equals(src_bucket_info
.zonegroup
);
4610 if (remote_src
&& remote_dest
) {
4611 ldout(cct
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
4615 ldout(cct
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
4617 if (remote_src
|| !source_zone
.empty()) {
4618 return fetch_remote_obj(obj_ctx
, user_id
, info
, source_zone
,
4619 dest_obj
, src_obj
, dest_bucket_info
, src_bucket_info
,
4620 dest_placement
, src_mtime
, mtime
, mod_ptr
,
4621 unmod_ptr
, high_precision_time
,
4622 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
4623 olh_epoch
, delete_at
, ptag
, petag
, progress_cb
, progress_data
);
4626 map
<string
, bufferlist
> src_attrs
;
4627 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
4628 RGWRados::Object::Read
read_op(&src_op_target
);
4630 read_op
.conds
.mod_ptr
= mod_ptr
;
4631 read_op
.conds
.unmod_ptr
= unmod_ptr
;
4632 read_op
.conds
.high_precision_time
= high_precision_time
;
4633 read_op
.conds
.if_match
= if_match
;
4634 read_op
.conds
.if_nomatch
= if_nomatch
;
4635 read_op
.params
.attrs
= &src_attrs
;
4636 read_op
.params
.lastmod
= src_mtime
;
4637 read_op
.params
.obj_size
= &obj_size
;
4639 ret
= read_op
.prepare();
4643 if (src_attrs
.count(RGW_ATTR_CRYPT_MODE
)) {
4644 // Current implementation does not follow S3 spec and even
4645 // may result in data corruption silently when copying
4646 // multipart objects acorss pools. So reject COPY operations
4647 //on encrypted objects before it is fully functional.
4648 ldout(cct
, 0) << "ERROR: copy op for encrypted object " << src_obj
4649 << " has not been implemented." << dendl
;
4650 return -ERR_NOT_IMPLEMENTED
;
4653 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
4654 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
4656 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
4657 attrs
.erase(RGW_ATTR_ID_TAG
);
4658 attrs
.erase(RGW_ATTR_PG_VER
);
4659 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
4660 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
4661 if (cmp
!= src_attrs
.end())
4662 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
4664 RGWObjManifest manifest
;
4665 RGWObjState
*astate
= NULL
;
4667 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
);
4672 vector
<rgw_raw_obj
> ref_objs
;
4675 /* dest is in a different zonegroup, copy it there */
4676 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
4678 uint64_t max_chunk_size
;
4680 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
4682 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
4689 const rgw_placement_rule
*src_rule
{nullptr};
4691 if (astate
->has_manifest
) {
4692 src_rule
= &astate
->manifest
.get_tail_placement().placement_rule
;
4693 ldout(cct
, 20) << __func__
<< "(): manifest src_rule=" << src_rule
->to_str() << dendl
;
4696 if (!src_rule
|| src_rule
->empty()) {
4697 src_rule
= &src_bucket_info
.placement_rule
;
4700 if (!get_obj_data_pool(*src_rule
, src_obj
, &src_pool
)) {
4701 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
4705 if (!get_obj_data_pool(dest_placement
, dest_obj
, &dest_pool
)) {
4706 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
4710 ldout(cct
, 20) << __func__
<< "(): src_rule=" << src_rule
->to_str() << " src_pool=" << src_pool
4711 << " dest_rule=" << dest_placement
.to_str() << " dest_pool=" << dest_pool
<< dendl
;
4713 bool copy_data
= !astate
->has_manifest
||
4714 (*src_rule
!= dest_placement
) ||
4715 (src_pool
!= dest_pool
);
4717 bool copy_first
= false;
4718 if (astate
->has_manifest
) {
4719 if (!astate
->manifest
.has_tail()) {
4722 uint64_t head_size
= astate
->manifest
.get_head_size();
4724 if (head_size
> 0) {
4725 if (head_size
> max_chunk_size
) {
4735 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4736 if (iter
!= attrs
.end()) {
4737 *petag
= iter
->second
.to_str();
4741 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
4742 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4743 return copy_obj_data(obj_ctx
, dest_bucket_info
, dest_placement
, read_op
, obj_size
- 1, dest_obj
,
4744 mtime
, real_time(), attrs
, olh_epoch
, delete_at
, petag
);
4747 RGWObjManifest::obj_iterator miter
= astate
->manifest
.obj_begin();
4749 if (copy_first
) { // we need to copy first chunk, not increase refcount
4754 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
4759 bufferlist first_chunk
;
4761 bool copy_itself
= (dest_obj
== src_obj
);
4762 RGWObjManifest
*pmanifest
;
4763 ldout(cct
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
4765 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
4766 RGWRados::Object::Write
write_op(&dest_op_target
);
4775 append_rand_alpha(cct
, tag
, tag
, 32);
4779 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4780 manifest
= astate
->manifest
;
4781 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
4782 if (tail_placement
.bucket
.name
.empty()) {
4783 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
4786 for (; miter
!= astate
->manifest
.obj_end(); ++miter
) {
4787 ObjectWriteOperation op
;
4788 ref_tag
= tag
+ '\0';
4789 cls_refcount_get(op
, ref_tag
, true);
4790 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
4791 ref
.ioctx
.locator_set_key(loc
.loc
);
4793 ret
= ref
.ioctx
.operate(loc
.oid
, &op
);
4798 ref_objs
.push_back(loc
);
4801 pmanifest
= &manifest
;
4803 pmanifest
= &astate
->manifest
;
4804 /* don't send the object's tail for garbage collection */
4805 astate
->keep_tail
= true;
4809 ret
= read_op
.read(0, max_chunk_size
, first_chunk
);
4814 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
4816 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
4819 write_op
.meta
.data
= &first_chunk
;
4820 write_op
.meta
.manifest
= pmanifest
;
4821 write_op
.meta
.ptag
= &tag
;
4822 write_op
.meta
.owner
= dest_bucket_info
.owner
;
4823 write_op
.meta
.mtime
= mtime
;
4824 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
4825 write_op
.meta
.category
= category
;
4826 write_op
.meta
.olh_epoch
= olh_epoch
;
4827 write_op
.meta
.delete_at
= delete_at
;
4828 write_op
.meta
.modify_tail
= !copy_itself
;
4830 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
);
4839 vector
<rgw_raw_obj
>::iterator riter
;
4841 /* rollback reference */
4842 string ref_tag
= tag
+ '\0';
4843 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
4844 ObjectWriteOperation op
;
4845 cls_refcount_put(op
, ref_tag
, true);
4847 ref
.ioctx
.locator_set_key(riter
->loc
);
4849 int r
= ref
.ioctx
.operate(riter
->oid
, &op
);
4851 ldout(cct
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
4859 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
4860 RGWBucketInfo
& dest_bucket_info
,
4861 const rgw_placement_rule
& dest_placement
,
4862 RGWRados::Object::Read
& read_op
, off_t end
,
4863 const rgw_obj
& dest_obj
,
4865 real_time set_mtime
,
4866 map
<string
, bufferlist
>& attrs
,
4868 real_time delete_at
,
4872 append_rand_alpha(cct
, tag
, tag
, 32);
4874 rgw::AioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4875 using namespace rgw::putobj
;
4876 AtomicObjectProcessor
processor(&aio
, this, dest_bucket_info
, &dest_placement
,
4877 dest_bucket_info
.owner
, obj_ctx
,
4878 dest_obj
, olh_epoch
, tag
);
4879 int ret
= processor
.prepare();
4887 ret
= read_op
.read(ofs
, end
, bl
);
4889 ldout(cct
, 0) << "ERROR: fail to read object data, ret = " << ret
<< dendl
;
4893 uint64_t read_len
= ret
;
4894 ret
= processor
.process(std::move(bl
), ofs
);
4900 } while (ofs
<= end
);
4903 ret
= processor
.process({}, ofs
);
4909 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4910 if (iter
!= attrs
.end()) {
4911 bufferlist
& bl
= iter
->second
;
4918 uint64_t accounted_size
;
4920 bool compressed
{false};
4921 RGWCompressionInfo cs_info
;
4922 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
4924 ldout(cct
, 0) << "ERROR: failed to read compression info" << dendl
;
4927 // pass original size if compressed
4928 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
4931 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
,
4932 nullptr, nullptr, nullptr, nullptr, nullptr);
4935 int RGWRados::transition_obj(RGWObjectCtx
& obj_ctx
,
4936 RGWBucketInfo
& bucket_info
,
4938 const rgw_placement_rule
& placement_rule
,
4939 const real_time
& mtime
,
4942 map
<string
, bufferlist
> attrs
;
4943 real_time read_mtime
;
4946 RGWRados::Object
op_target(this, bucket_info
, obj_ctx
, obj
);
4947 RGWRados::Object::Read
read_op(&op_target
);
4949 read_op
.params
.attrs
= &attrs
;
4950 read_op
.params
.lastmod
= &read_mtime
;
4951 read_op
.params
.obj_size
= &obj_size
;
4953 int ret
= read_op
.prepare();
4958 if (read_mtime
!= mtime
) {
4963 ret
= copy_obj_data(obj_ctx
,
4969 nullptr /* pmtime */,
4974 nullptr /* petag */);
4982 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
)
4984 std::vector
<rgw_bucket_dir_entry
> ent_list
;
4985 rgw_obj_index_key marker
;
4990 constexpr uint NUM_ENTRIES
= 1000u;
4991 int r
= cls_bucket_list_unordered(bucket_info
,
5004 for (auto const& dirent
: ent_list
) {
5007 if (rgw_obj_key::oid_to_key_in_ns(dirent
.key
.name
, &obj
, ns
))
5010 } while (is_truncated
);
5017 * bucket: the name of the bucket to delete
5018 * Returns 0 on success, -ERR# otherwise.
5020 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, bool check_empty
)
5022 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5023 librados::IoCtx index_ctx
;
5024 map
<int, string
> bucket_objs
;
5025 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5030 r
= check_bucket_empty(bucket_info
);
5036 r
= rgw_bucket_delete_bucket_obj(this, bucket
.tenant
, bucket
.name
, objv_tracker
);
5040 /* if the bucket is not synced we can remove the meta file */
5041 if (!svc
.zone
->is_syncing_bucket_meta(bucket
)) {
5042 RGWObjVersionTracker objv_tracker
;
5043 r
= rgw_bucket_instance_remove_entry(this, bucket
.get_key(), &objv_tracker
);
5048 /* remove bucket index objects asynchronously by best effort */
5049 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
5051 cct
->_conf
->rgw_bucket_index_max_aio
)();
5057 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
5060 map
<string
, bufferlist
> attrs
;
5061 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5063 if (bucket
.bucket_id
.empty()) {
5064 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
5066 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
);
5069 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
5073 info
.owner
= owner
.get_id();
5075 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
5077 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
5085 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
5089 vector
<rgw_bucket
>::iterator iter
;
5091 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
5092 rgw_bucket
& bucket
= *iter
;
5094 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
5096 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
5099 map
<string
, bufferlist
> attrs
;
5100 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5101 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
5103 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
5108 info
.flags
&= ~BUCKET_SUSPENDED
;
5110 info
.flags
|= BUCKET_SUSPENDED
;
5113 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
5115 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
5123 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
5125 RGWBucketInfo bucket_info
;
5126 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5127 int ret
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
);
5132 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
5136 int RGWRados::Object::complete_atomic_modification()
5138 if (!state
->has_manifest
|| state
->keep_tail
)
5141 cls_rgw_obj_chain chain
;
5142 store
->update_gc_chain(obj
, state
->manifest
, &chain
);
5144 if (chain
.empty()) {
5148 string tag
= (state
->tail_tag
.length() > 0 ? state
->tail_tag
.to_str() : state
->obj_tag
.to_str());
5149 return store
->gc
->send_chain(chain
, tag
, false); // do it async
5152 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
5154 RGWObjManifest::obj_iterator iter
;
5155 rgw_raw_obj raw_head
;
5156 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
5157 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
5158 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
5159 if (mobj
== raw_head
)
5161 cls_rgw_obj_key
key(mobj
.oid
);
5162 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
5166 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
, bool sync
)
5168 return gc
->send_chain(chain
, tag
, sync
);
5171 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
5172 librados::IoCtx
& index_ctx
,
5175 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5176 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5180 if (bucket
.bucket_id
.empty()) {
5181 ldout(cct
, 0) << "ERROR: empty bucket id for bucket operation" << dendl
;
5185 bucket_oid
= dir_oid_prefix
;
5186 bucket_oid
.append(bucket
.bucket_id
);
5191 int RGWRados::open_bucket_index_base(const RGWBucketInfo
& bucket_info
,
5192 librados::IoCtx
& index_ctx
,
5193 string
& bucket_oid_base
) {
5194 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5195 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5199 if (bucket
.bucket_id
.empty()) {
5200 ldout(cct
, 0) << "ERROR: empty bucket_id for bucket operation" << dendl
;
5204 bucket_oid_base
= dir_oid_prefix
;
5205 bucket_oid_base
.append(bucket
.bucket_id
);
5211 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
5212 librados::IoCtx
& index_ctx
,
5213 map
<int, string
>& bucket_objs
,
5215 map
<int, string
> *bucket_instance_ids
) {
5216 string bucket_oid_base
;
5217 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5222 get_bucket_index_objects(bucket_oid_base
, bucket_info
.num_shards
, bucket_objs
, shard_id
);
5223 if (bucket_instance_ids
) {
5224 get_bucket_instance_ids(bucket_info
, shard_id
, bucket_instance_ids
);
5229 template<typename T
>
5230 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5231 map
<int, string
>& oids
, map
<int, T
>& bucket_objs
,
5232 int shard_id
, map
<int, string
> *bucket_instance_ids
)
5234 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, bucket_instance_ids
);
5238 map
<int, string
>::const_iterator iter
= oids
.begin();
5239 for (; iter
!= oids
.end(); ++iter
) {
5240 bucket_objs
[iter
->first
] = T();
5245 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5246 const string
& obj_key
, string
*bucket_obj
, int *shard_id
)
5248 string bucket_oid_base
;
5249 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5253 RGWObjectCtx
obj_ctx(this);
5255 ret
= get_bucket_index_object(bucket_oid_base
, obj_key
, bucket_info
.num_shards
,
5256 (RGWBucketInfo::BIShardsHashType
)bucket_info
.bucket_index_shard_hash_type
, bucket_obj
, shard_id
);
5258 ldout(cct
, 10) << "get_bucket_index_object() returned ret=" << ret
<< dendl
;
5264 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5265 int shard_id
, string
*bucket_obj
)
5267 string bucket_oid_base
;
5268 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5272 RGWObjectCtx
obj_ctx(this);
5274 get_bucket_index_object(bucket_oid_base
, bucket_info
.num_shards
,
5275 shard_id
, bucket_obj
);
5279 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
5280 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
5282 for (const auto& pair
: header
.stats
) {
5283 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
5284 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
5286 RGWStorageStats
& s
= stats
[category
];
5288 s
.category
= category
;
5289 s
.size
+= header_stats
.total_size
;
5290 s
.size_rounded
+= header_stats
.total_size_rounded
;
5291 s
.size_utilized
+= header_stats
.actual_size
;
5292 s
.num_objects
+= header_stats
.num_entries
;
5296 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
5297 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
5298 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
5300 librados::IoCtx index_ctx
;
5301 // key - bucket index object id
5302 // value - bucket index check OP returned result with the given bucket index object (shard)
5303 map
<int, string
> oids
;
5304 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
5306 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, bucket_objs_ret
);
5311 ret
= CLSRGWIssueBucketCheck(index_ctx
, oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5316 // Aggregate results (from different shards if there is any)
5317 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
5318 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
5319 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
5320 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
5326 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
5328 librados::IoCtx index_ctx
;
5329 map
<int, string
> bucket_objs
;
5331 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5336 return CLSRGWIssueBucketRebuild(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5339 int RGWRados::bucket_set_reshard(const RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
5341 librados::IoCtx index_ctx
;
5342 map
<int, string
> bucket_objs
;
5344 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5349 return CLSRGWIssueSetBucketResharding(index_ctx
, bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5352 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
5354 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
5355 std::string oid
, key
;
5356 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5360 RGWObjState
*state
= NULL
;
5362 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false);
5366 if (!state
->is_atomic
) {
5367 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
5373 if (state
->tail_tag
.length() > 0) {
5374 tag
= state
->tail_tag
.c_str();
5375 } else if (state
->obj_tag
.length() > 0) {
5376 tag
= state
->obj_tag
.c_str();
5378 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
5382 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
5384 return gc
->defer_chain(tag
, false);
5387 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
5389 list
<string
> prefixes
;
5390 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
5391 cls_rgw_remove_obj(op
, prefixes
);
5394 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
5396 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
5399 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
5401 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
5407 * bucket: name of the bucket storing the object
5408 * obj: name of the object to delete
5409 * Returns: 0 on success, -ERR# otherwise.
5411 int RGWRados::Object::Delete::delete_obj()
5413 RGWRados
*store
= target
->get_store();
5414 rgw_obj
& src_obj
= target
->get_obj();
5415 const string
& instance
= src_obj
.key
.instance
;
5416 rgw_obj obj
= src_obj
;
5418 if (instance
== "null") {
5419 obj
.key
.instance
.clear();
5422 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
5424 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
5425 if (instance
.empty() || explicit_marker_version
) {
5426 rgw_obj marker
= obj
;
5428 if (!params
.marker_version_id
.empty()) {
5429 if (params
.marker_version_id
!= "null") {
5430 marker
.key
.set_instance(params
.marker_version_id
);
5432 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
5433 store
->gen_rand_obj_instance_name(&marker
);
5436 result
.version_id
= marker
.key
.instance
;
5437 if (result
.version_id
.empty())
5438 result
.version_id
= "null";
5439 result
.delete_marker
= true;
5441 struct rgw_bucket_dir_entry_meta meta
;
5443 meta
.owner
= params
.obj_owner
.get_id().to_str();
5444 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
5446 if (real_clock::is_zero(params
.mtime
)) {
5447 meta
.mtime
= real_clock::now();
5449 meta
.mtime
= params
.mtime
;
5452 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, params
.zones_trace
);
5457 rgw_bucket_dir_entry dirent
;
5459 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
5463 result
.delete_marker
= dirent
.is_delete_marker();
5464 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, params
.zones_trace
);
5468 result
.version_id
= instance
;
5472 int r
= target
->get_bucket_shard(&bs
);
5474 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
5478 if (target
->bucket_info
.datasync_flag_enabled()) {
5479 r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
5481 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
5490 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
5496 r
= target
->get_state(&state
, false);
5500 ObjectWriteOperation op
;
5502 if (!real_clock::is_zero(params
.unmod_since
)) {
5503 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
5504 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
5505 if (!params
.high_precision_time
) {
5510 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
5511 if (ctime
> unmod
) {
5512 return -ERR_PRECONDITION_FAILED
;
5515 /* only delete object if mtime is less than or equal to params.unmod_since */
5516 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
5518 uint64_t obj_accounted_size
= state
->accounted_size
;
5520 if (!real_clock::is_zero(params
.expiration_time
)) {
5522 real_time delete_at
;
5524 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
5526 auto iter
= bl
.cbegin();
5527 decode(delete_at
, iter
);
5528 } catch (buffer::error
& err
) {
5529 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
5533 if (params
.expiration_time
!= delete_at
) {
5534 return -ERR_PRECONDITION_FAILED
;
5537 return -ERR_PRECONDITION_FAILED
;
5541 if (!state
->exists
) {
5542 target
->invalidate_state();
5546 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true, false);
5550 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
5552 RGWRados::Bucket
bop(store
, bucket_info
);
5553 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5555 index_op
.set_zones_trace(params
.zones_trace
);
5556 index_op
.set_bilog_flags(params
.bilog_flags
);
5558 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
);
5562 store
->remove_rgw_head_obj(op
);
5563 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
5565 /* raced with another operation, object state is indeterminate */
5566 const bool need_invalidate
= (r
== -ECANCELED
);
5568 int64_t poolid
= ref
.ioctx
.get_id();
5570 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
5571 if (obj_tombstone_cache
) {
5572 tombstone_entry entry
{*state
};
5573 obj_tombstone_cache
->add(obj
, entry
);
5575 r
= index_op
.complete_del(poolid
, ref
.ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
5577 int ret
= target
->complete_atomic_modification();
5579 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
5581 /* other than that, no need to propagate error */
5583 int ret
= index_op
.cancel();
5585 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
5589 if (need_invalidate
) {
5590 target
->invalidate_state();
5596 /* update quota cache */
5597 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_accounted_size
);
5602 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
5603 const RGWBucketInfo
& bucket_info
,
5605 int versioning_status
,
5606 uint16_t bilog_flags
,
5607 const real_time
& expiration_time
,
5608 rgw_zone_set
*zones_trace
)
5610 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
5611 RGWRados::Object::Delete
del_op(&del_target
);
5613 del_op
.params
.bucket_owner
= bucket_info
.owner
;
5614 del_op
.params
.versioning_status
= versioning_status
;
5615 del_op
.params
.bilog_flags
= bilog_flags
;
5616 del_op
.params
.expiration_time
= expiration_time
;
5617 del_op
.params
.zones_trace
= zones_trace
;
5619 return del_op
.delete_obj();
5622 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
5625 int r
= get_raw_obj_ref(obj
, &ref
);
5630 ObjectWriteOperation op
;
5633 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
5640 int RGWRados::delete_obj_index(const rgw_obj
& obj
, ceph::real_time mtime
)
5642 std::string oid
, key
;
5643 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5645 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5647 RGWBucketInfo bucket_info
;
5648 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
);
5650 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
5654 RGWRados::Bucket
bop(this, bucket_info
);
5655 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5657 return index_op
.complete_del(-1 /* pool */, 0, mtime
, NULL
);
5660 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
5664 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
5665 if (mi
!= manifest
.obj_end()) {
5666 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
5668 tag
= mi
.get_location().get_raw_obj(store
).oid
;
5672 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
5673 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
5675 hash
.Update((const unsigned char *)manifest_bl
.c_str(), manifest_bl
.length());
5677 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
5678 if (iter
!= attrset
.end()) {
5679 bufferlist
& bl
= iter
->second
;
5680 hash
.Update((const unsigned char *)bl
.c_str(), bl
.length());
5684 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
5685 tag
.append(md5_str
);
5687 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
5689 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
5692 static bool is_olh(map
<string
, bufferlist
>& attrs
)
5694 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
5695 return (iter
!= attrs
.end());
5698 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
5700 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
5701 return (iter
!= attrs
.end());
5704 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5705 RGWObjState
*olh_state
, RGWObjState
**target_state
)
5707 ceph_assert(olh_state
->is_olh
);
5710 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
5714 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false);
5722 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5723 RGWObjState
**state
, bool follow_olh
, bool assume_noent
)
5729 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
5731 RGWObjState
*s
= rctx
->get_state(obj
);
5732 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
5735 if (s
->is_olh
&& need_follow_olh
) {
5736 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
5743 rgw_raw_obj raw_obj
;
5744 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
5748 if (!assume_noent
) {
5749 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
);
5754 s
->has_attrs
= true;
5755 tombstone_entry entry
;
5756 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
5757 s
->mtime
= entry
.mtime
;
5758 s
->zone_short_id
= entry
.zone_short_id
;
5759 s
->pg_ver
= entry
.pg_ver
;
5760 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
5761 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
5763 s
->mtime
= real_time();
5771 s
->has_attrs
= true;
5772 s
->accounted_size
= s
->size
;
5774 auto iter
= s
->attrset
.find(RGW_ATTR_ETAG
);
5775 if (iter
!= s
->attrset
.end()) {
5776 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5777 bufferlist
& bletag
= iter
->second
;
5778 if (bletag
.length() > 0 && bletag
[bletag
.length() - 1] == '\0') {
5780 bletag
.splice(0, bletag
.length() - 1, &newbl
);
5781 bletag
.claim(newbl
);
5785 iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
5786 const bool compressed
= (iter
!= s
->attrset
.end());
5788 // use uncompressed size for accounted_size
5790 RGWCompressionInfo info
;
5791 auto p
= iter
->second
.cbegin();
5793 s
->accounted_size
= info
.orig_size
;
5794 } catch (buffer::error
&) {
5795 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
5800 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
5801 if (iter
!= s
->attrset
.end()) {
5802 bufferlist bl
= iter
->second
;
5803 bufferlist::iterator it
= bl
.begin();
5804 it
.copy(bl
.length(), s
->shadow_obj
);
5805 s
->shadow_obj
[bl
.length()] = '\0';
5807 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
5808 auto ttiter
= s
->attrset
.find(RGW_ATTR_TAIL_TAG
);
5809 if (ttiter
!= s
->attrset
.end()) {
5810 s
->tail_tag
= s
->attrset
[RGW_ATTR_TAIL_TAG
];
5813 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
5814 if (manifest_bl
.length()) {
5815 auto miter
= manifest_bl
.cbegin();
5817 decode(s
->manifest
, miter
);
5818 s
->has_manifest
= true;
5819 s
->manifest
.set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
5820 broken due to old bugs */
5821 s
->size
= s
->manifest
.get_obj_size();
5823 s
->accounted_size
= s
->size
;
5824 } catch (buffer::error
& err
) {
5825 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
5828 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
.get_obj_size() << dendl
;
5829 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>() && \
5830 s
->manifest
.has_explicit_objs()) {
5831 RGWObjManifest::obj_iterator mi
;
5832 for (mi
= s
->manifest
.obj_begin(); mi
!= s
->manifest
.obj_end(); ++mi
) {
5833 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
5837 if (!s
->obj_tag
.length()) {
5839 * Uh oh, something's wrong, object with manifest should have tag. Let's
5840 * create one out of the manifest, would be unique
5842 generate_fake_tag(this, s
->attrset
, s
->manifest
, manifest_bl
, s
->obj_tag
);
5846 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
5847 if (aiter
!= s
->attrset
.end()) {
5848 bufferlist
& pg_ver_bl
= aiter
->second
;
5849 if (pg_ver_bl
.length()) {
5850 auto pgbl
= pg_ver_bl
.cbegin();
5852 decode(s
->pg_ver
, pgbl
);
5853 } catch (buffer::error
& err
) {
5854 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5858 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
5859 if (aiter
!= s
->attrset
.end()) {
5860 bufferlist
& zone_short_id_bl
= aiter
->second
;
5861 if (zone_short_id_bl
.length()) {
5862 auto zbl
= zone_short_id_bl
.cbegin();
5864 decode(s
->zone_short_id
, zbl
);
5865 } catch (buffer::error
& err
) {
5866 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5870 if (s
->obj_tag
.length())
5871 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
5873 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
5875 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5876 * it exist, and not only if is_olh() returns true
5878 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
5879 if (iter
!= s
->attrset
.end()) {
5880 s
->olh_tag
= iter
->second
;
5883 if (is_olh(s
->attrset
)) {
5886 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
5888 if (need_follow_olh
) {
5889 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
5890 } else if (obj
.key
.have_null_instance() && !s
->has_manifest
) {
5891 // read null version, and the head object only have olh info
5900 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
5901 bool follow_olh
, bool assume_noent
)
5906 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, assume_noent
);
5907 } while (ret
== -EAGAIN
);
5912 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
)
5914 RGWObjState
*astate
;
5915 int r
= get_state(&astate
, true);
5920 *pmanifest
= &astate
->manifest
;
5925 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
)
5928 int r
= source
->get_state(&state
, true);
5933 if (!state
->get_attr(name
, dest
))
5940 int RGWRados::Object::Stat::stat_async()
5942 RGWObjectCtx
& ctx
= source
->get_ctx();
5943 rgw_obj
& obj
= source
->get_obj();
5944 RGWRados
*store
= source
->get_store();
5946 RGWObjState
*s
= ctx
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
5950 result
.size
= s
->size
;
5951 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
5952 result
.attrs
= s
->attrset
;
5953 result
.has_manifest
= s
->has_manifest
;
5954 result
.manifest
= s
->manifest
;
5960 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
5962 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
5967 librados::ObjectReadOperation op
;
5968 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
5969 op
.getxattrs(&result
.attrs
, NULL
);
5970 state
.completion
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
5971 state
.io_ctx
.locator_set_key(loc
);
5972 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
5974 ldout(store
->ctx(), 5) << __func__
5975 << ": ERROR: aio_operate() returned ret=" << r
5984 int RGWRados::Object::Stat::wait()
5986 if (!state
.completion
) {
5990 state
.completion
->wait_for_safe();
5991 state
.ret
= state
.completion
->get_return_value();
5992 state
.completion
->release();
5994 if (state
.ret
!= 0) {
6001 int RGWRados::Object::Stat::finish()
6003 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
6004 if (iter
!= result
.attrs
.end()) {
6005 bufferlist
& bl
= iter
->second
;
6006 auto biter
= bl
.cbegin();
6008 decode(result
.manifest
, biter
);
6009 } catch (buffer::error
& err
) {
6010 RGWRados
*store
= source
->get_store();
6011 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
6014 result
.has_manifest
= true;
6020 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
6021 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6022 ObjectOperation
& op
, RGWObjState
**pstate
)
6027 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false);
6031 return append_atomic_test(*pstate
, op
);
6034 int RGWRados::append_atomic_test(const RGWObjState
* state
,
6035 librados::ObjectOperation
& op
)
6037 if (!state
->is_atomic
) {
6038 ldout(cct
, 20) << "state for obj=" << state
->obj
<< " is not atomic, not appending atomic test" << dendl
;
6042 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
6043 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
6045 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
6050 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, bool assume_noent
)
6052 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, assume_noent
);
6055 void RGWRados::Object::invalidate_state()
6057 ctx
.invalidate(obj
);
6060 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
6061 const char *if_match
, const char *if_nomatch
, bool removal_op
,
6064 int r
= get_state(&state
, false);
6068 bool need_guard
= (state
->has_manifest
|| (state
->obj_tag
.length() != 0) ||
6069 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
6072 if (!state
->is_atomic
) {
6073 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
6077 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
6084 /* first verify that the object wasn't replaced under */
6085 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
6086 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
6087 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
6091 if (strcmp(if_match
, "*") == 0) {
6092 // test the object is existing
6093 if (!state
->exists
) {
6094 return -ERR_PRECONDITION_FAILED
;
6098 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
6099 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
6100 return -ERR_PRECONDITION_FAILED
;
6106 if (strcmp(if_nomatch
, "*") == 0) {
6107 // test the object is NOT existing
6108 if (state
->exists
) {
6109 return -ERR_PRECONDITION_FAILED
;
6113 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
6114 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
6115 return -ERR_PRECONDITION_FAILED
;
6122 if (state
->exists
) {
6124 store
->remove_rgw_head_obj(op
);
6131 /* the object is being removed, no need to update its tag */
6136 state
->write_tag
= *ptag
;
6138 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
6141 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
6143 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
6145 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6147 op
.setxattr(RGW_ATTR_TAIL_TAG
, bl
);
6154 * Set an attr on an object.
6155 * bucket: name of the bucket holding the object
6156 * obj: name of the object to set the attr on
6157 * name: the attr to set
6158 * bl: the contents of the attr
6159 * Returns: 0 on success, -ERR# otherwise.
6161 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
6163 map
<string
, bufferlist
> attrs
;
6165 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
);
6168 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& src_obj
,
6169 map
<string
, bufferlist
>& attrs
,
6170 map
<string
, bufferlist
>* rmattrs
)
6172 rgw_obj obj
= src_obj
;
6173 if (obj
.key
.instance
== "null") {
6174 obj
.key
.instance
.clear();
6178 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6182 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
6184 ObjectWriteOperation op
;
6185 RGWObjState
*state
= NULL
;
6187 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
);
6191 // ensure null version object exist
6192 if (src_obj
.key
.instance
== "null" && !state
->has_manifest
) {
6196 map
<string
, bufferlist
>::iterator iter
;
6198 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
6199 const string
& name
= iter
->first
;
6200 op
.rmxattr(name
.c_str());
6204 const rgw_bucket
& bucket
= obj
.bucket
;
6206 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6207 const string
& name
= iter
->first
;
6208 bufferlist
& bl
= iter
->second
;
6213 op
.setxattr(name
.c_str(), bl
);
6215 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
6220 rgw_obj_index_key obj_key
;
6221 obj
.key
.get_index_key(&obj_key
);
6223 objexp_hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
6224 } catch (buffer::error
& err
) {
6225 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
6233 RGWObjectCtx
obj_ctx(this);
6236 RGWRados::Bucket
bop(this, bucket_info
);
6237 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
6241 append_rand_alpha(cct
, tag
, tag
, 32);
6242 state
->write_tag
= tag
;
6243 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
6248 bl
.append(tag
.c_str(), tag
.size() + 1);
6249 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6253 real_time mtime
= real_clock::now();
6254 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
6255 op
.mtime2(&mtime_ts
);
6256 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
6259 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
6260 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
6261 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
6262 string etag
= rgw_bl_str(etag_bl
);
6263 string content_type
= rgw_bl_str(content_type_bl
);
6264 string storage_class
;
6265 auto iter
= attrs
.find(RGW_ATTR_STORAGE_CLASS
);
6266 if (iter
!= attrs
.end()) {
6267 storage_class
= rgw_bl_str(iter
->second
);
6269 uint64_t epoch
= ref
.ioctx
.get_last_version();
6270 int64_t poolid
= ref
.ioctx
.get_id();
6271 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
6272 mtime
, etag
, content_type
, storage_class
, &acl_bl
,
6273 RGWObjCategory::Main
, NULL
);
6275 int ret
= index_op
.cancel();
6277 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
6285 state
->obj_tag
.swap(bl
);
6287 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
6288 state
->attrset
.erase(iter
->first
);
6292 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6293 state
->attrset
[iter
->first
] = iter
->second
;
6296 auto iter
= state
->attrset
.find(RGW_ATTR_ID_TAG
);
6297 if (iter
!= state
->attrset
.end()) {
6298 iter
->second
= state
->obj_tag
;
6305 int RGWRados::Object::Read::prepare()
6307 RGWRados
*store
= source
->get_store();
6308 CephContext
*cct
= store
->ctx();
6312 map
<string
, bufferlist
>::iterator iter
;
6314 RGWObjState
*astate
;
6315 int r
= source
->get_state(&astate
, true);
6319 if (!astate
->exists
) {
6323 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
6325 state
.obj
= astate
->obj
;
6326 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
6328 state
.cur_pool
= state
.head_obj
.pool
;
6329 state
.cur_ioctx
= &state
.io_ctxs
[state
.cur_pool
];
6331 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, state
.cur_ioctx
);
6335 if (params
.target_obj
) {
6336 *params
.target_obj
= state
.obj
;
6339 *params
.attrs
= astate
->attrset
;
6340 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>()) {
6341 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
6342 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
6347 /* Convert all times go GMT to make them compatible */
6348 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
6349 obj_time_weight src_weight
;
6350 src_weight
.init(astate
);
6351 src_weight
.high_precision
= conds
.high_precision_time
;
6353 obj_time_weight dest_weight
;
6354 dest_weight
.high_precision
= conds
.high_precision_time
;
6356 if (conds
.mod_ptr
) {
6357 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6358 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6359 if (!(dest_weight
< src_weight
)) {
6360 return -ERR_NOT_MODIFIED
;
6364 if (conds
.unmod_ptr
) {
6365 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6366 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6367 if (dest_weight
< src_weight
) {
6368 return -ERR_PRECONDITION_FAILED
;
6372 if (conds
.if_match
|| conds
.if_nomatch
) {
6373 r
= get_attr(RGW_ATTR_ETAG
, etag
);
6379 if (conds
.if_match
) {
6380 string if_match_str
= rgw_string_unquote(conds
.if_match
);
6381 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-Match: " << if_match_str
<< dendl
;
6382 if (if_match_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) != 0) {
6383 return -ERR_PRECONDITION_FAILED
;
6387 if (conds
.if_nomatch
) {
6388 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
6389 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
6390 if (if_nomatch_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) == 0) {
6391 return -ERR_NOT_MODIFIED
;
6396 if (params
.obj_size
)
6397 *params
.obj_size
= astate
->size
;
6399 *params
.lastmod
= astate
->mtime
;
6404 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
6411 } else if (end
< 0) {
6416 if (ofs
>= (off_t
)obj_size
) {
6419 if (end
>= (off_t
)obj_size
) {
6426 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
6428 RGWRados
*store
= target
->get_store();
6432 #define NUM_RESHARD_RETRIES 10
6433 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
6434 int ret
= get_bucket_shard(&bs
);
6436 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6440 if (r
!= -ERR_BUSY_RESHARDING
) {
6443 ldout(store
->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
6444 string new_bucket_id
;
6445 r
= store
->block_while_resharding(bs
, &new_bucket_id
,
6446 target
->bucket_info
, null_yield
);
6447 if (r
== -ERR_BUSY_RESHARDING
) {
6453 ldout(store
->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
6454 i
= 0; /* resharding is finished, make sure we can retry */
6455 r
= target
->update_bucket_id(new_bucket_id
);
6457 ldout(store
->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
6474 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
)
6479 RGWRados
*store
= target
->get_store();
6481 if (write_tag
&& write_tag
->length()) {
6482 optag
= string(write_tag
->c_str(), write_tag
->length());
6484 if (optag
.empty()) {
6485 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
6489 int r
= guard_reshard(nullptr, [&](BucketShard
*bs
) -> int {
6490 return store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
, zones_trace
);
6501 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
6502 uint64_t size
, uint64_t accounted_size
,
6503 ceph::real_time
& ut
, const string
& etag
,
6504 const string
& content_type
, const string
& storage_class
,
6506 RGWObjCategory category
,
6507 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
,
6513 RGWRados
*store
= target
->get_store();
6516 int ret
= get_bucket_shard(&bs
);
6518 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6522 rgw_bucket_dir_entry ent
;
6523 obj
.key
.get_index_key(&ent
.key
);
6524 ent
.meta
.size
= size
;
6525 ent
.meta
.accounted_size
= accounted_size
;
6526 ent
.meta
.mtime
= ut
;
6527 ent
.meta
.etag
= etag
;
6528 ent
.meta
.storage_class
= storage_class
;
6530 ent
.meta
.user_data
= *user_data
;
6533 if (acl_bl
&& acl_bl
->length()) {
6534 int ret
= store
->decode_policy(*acl_bl
, &owner
);
6536 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
6539 ent
.meta
.owner
= owner
.get_id().to_str();
6540 ent
.meta
.owner_display_name
= owner
.get_display_name();
6541 ent
.meta
.content_type
= content_type
;
6542 ent
.meta
.appendable
= appendable
;
6544 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
6546 if (target
->bucket_info
.datasync_flag_enabled()) {
6547 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6549 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6556 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
6557 real_time
& removed_mtime
,
6558 list
<rgw_obj_index_key
> *remove_objs
)
6563 RGWRados
*store
= target
->get_store();
6566 int ret
= get_bucket_shard(&bs
);
6568 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6572 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
6574 if (target
->bucket_info
.datasync_flag_enabled()) {
6575 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6577 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6585 int RGWRados::Bucket::UpdateIndex::cancel()
6590 RGWRados
*store
= target
->get_store();
6593 int ret
= guard_reshard(&bs
, [&](BucketShard
*bs
) -> int {
6594 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
, zones_trace
);
6598 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6599 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6600 * have no way to tell that they're all caught up
6602 if (target
->bucket_info
.datasync_flag_enabled()) {
6603 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6605 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6612 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
)
6614 RGWRados
*store
= source
->get_store();
6615 CephContext
*cct
= store
->ctx();
6617 rgw_raw_obj read_obj
;
6618 uint64_t read_ofs
= ofs
;
6619 uint64_t len
, read_len
;
6620 bool reading_from_head
= true;
6621 ObjectReadOperation op
;
6623 bool merge_bl
= false;
6624 bufferlist
*pbl
= &bl
;
6626 uint64_t max_chunk_size
;
6628 RGWObjState
*astate
;
6629 int r
= source
->get_state(&astate
, true);
6633 if (astate
->size
== 0) {
6635 } else if (end
>= (int64_t)astate
->size
) {
6636 end
= astate
->size
- 1;
6642 len
= end
- ofs
+ 1;
6644 if (astate
->has_manifest
&& astate
->manifest
.has_tail()) {
6645 /* now get the relevant object part */
6646 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
6648 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
6649 read_obj
= iter
.get_location().get_raw_obj(store
);
6650 len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6651 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6652 reading_from_head
= (read_obj
== state
.head_obj
);
6654 read_obj
= state
.head_obj
;
6657 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
6659 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
6663 if (len
> max_chunk_size
)
6664 len
= max_chunk_size
;
6669 if (reading_from_head
) {
6670 /* only when reading from the head object do we need to do the atomic test */
6671 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
);
6675 if (astate
&& astate
->prefetch_data
) {
6676 if (!ofs
&& astate
->data
.length() >= len
) {
6681 if (ofs
< astate
->data
.length()) {
6682 unsigned copy_len
= std::min((uint64_t)astate
->data
.length() - ofs
, len
);
6683 astate
->data
.copy(ofs
, copy_len
, bl
);
6684 read_len
-= copy_len
;
6685 read_ofs
+= copy_len
;
6695 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
6696 op
.read(read_ofs
, read_len
, pbl
, NULL
);
6698 if (state
.cur_pool
!= read_obj
.pool
) {
6699 auto iter
= state
.io_ctxs
.find(read_obj
.pool
);
6700 if (iter
== state
.io_ctxs
.end()) {
6701 state
.cur_ioctx
= &state
.io_ctxs
[read_obj
.pool
];
6702 r
= store
->open_pool_ctx(read_obj
.pool
, *state
.cur_ioctx
, false);
6704 ldout(cct
, 20) << "ERROR: failed to open pool context for pool=" << read_obj
.pool
<< " r=" << r
<< dendl
;
6708 state
.cur_ioctx
= &iter
->second
;
6710 state
.cur_pool
= read_obj
.pool
;
6713 state
.cur_ioctx
->locator_set_key(read_obj
.loc
);
6715 r
= state
.cur_ioctx
->operate(read_obj
.oid
, &op
, NULL
);
6716 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
6729 struct get_obj_data
{
6731 RGWGetDataCB
* client_cb
;
6733 uint64_t offset
; // next offset to write to client
6734 rgw::AioResultList completed
; // completed read results, sorted by offset
6736 get_obj_data(RGWRados
* store
, RGWGetDataCB
* cb
, rgw::Aio
* aio
, uint64_t offset
)
6737 : store(store
), client_cb(cb
), aio(aio
), offset(offset
) {}
6739 int flush(rgw::AioResultList
&& results
) {
6740 int r
= rgw::check_for_errors(results
);
6745 auto cmp
= [](const auto& lhs
, const auto& rhs
) { return lhs
.id
< rhs
.id
; };
6746 results
.sort(cmp
); // merge() requires results to be sorted first
6747 completed
.merge(results
, cmp
); // merge results in sorted order
6749 while (!completed
.empty() && completed
.front().id
== offset
) {
6750 auto bl
= std::move(completed
.front().data
);
6751 completed
.pop_front_and_dispose(std::default_delete
<rgw::AioResultEntry
>{});
6753 offset
+= bl
.length();
6754 int r
= client_cb
->handle_data(bl
, 0, bl
.length());
6763 // wait for all completions to drain and ignore the results
6768 auto c
= aio
->wait();
6769 while (!c
.empty()) {
6770 int r
= flush(std::move(c
));
6777 return flush(std::move(c
));
6781 static int _get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6782 off_t read_ofs
, off_t len
, bool is_head_obj
,
6783 RGWObjState
*astate
, void *arg
)
6785 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6787 return d
->store
->get_obj_iterate_cb(read_obj
, obj_ofs
, read_ofs
, len
,
6788 is_head_obj
, astate
, arg
);
6791 int RGWRados::get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6792 off_t read_ofs
, off_t len
, bool is_head_obj
,
6793 RGWObjState
*astate
, void *arg
)
6795 ObjectReadOperation op
;
6796 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6800 /* only when reading from the head object do we need to do the atomic test */
6801 int r
= append_atomic_test(astate
, op
);
6806 obj_ofs
< astate
->data
.length()) {
6807 unsigned chunk_len
= std::min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
6809 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
6814 d
->offset
+= chunk_len
;
6815 read_ofs
+= chunk_len
;
6816 obj_ofs
+= chunk_len
;
6822 auto obj
= d
->store
->svc
.rados
->obj(read_obj
);
6825 ldout(cct
, 4) << "failed to open rados context for " << read_obj
<< dendl
;
6829 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
6830 op
.read(read_ofs
, len
, nullptr, nullptr);
6832 const uint64_t cost
= len
;
6833 const uint64_t id
= obj_ofs
; // use logical object offset for sorting replies
6835 auto completed
= d
->aio
->submit(obj
, &op
, cost
, id
);
6837 return d
->flush(std::move(completed
));
6840 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
)
6842 RGWRados
*store
= source
->get_store();
6843 CephContext
*cct
= store
->ctx();
6844 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
6845 const uint64_t chunk_size
= cct
->_conf
->rgw_get_obj_max_req_size
;
6846 const uint64_t window_size
= cct
->_conf
->rgw_get_obj_window_size
;
6848 rgw::AioThrottle
aio(window_size
);
6849 get_obj_data
data(store
, cb
, &aio
, ofs
);
6851 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
,
6852 ofs
, end
, chunk_size
, _get_obj_iterate_cb
, &data
);
6854 ldout(cct
, 0) << "iterate_obj() failed with " << r
<< dendl
;
6855 data
.cancel(); // drain completions without writing back to client
6859 return data
.drain();
6862 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
6863 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6864 off_t ofs
, off_t end
, uint64_t max_chunk_size
,
6865 iterate_obj_cb cb
, void *arg
)
6867 rgw_raw_obj head_obj
;
6868 rgw_raw_obj read_obj
;
6869 uint64_t read_ofs
= ofs
;
6871 bool reading_from_head
= true;
6872 RGWObjState
*astate
= NULL
;
6874 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
6876 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false);
6884 len
= end
- ofs
+ 1;
6886 if (astate
->has_manifest
) {
6887 /* now get the relevant object stripe */
6888 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
6890 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
.obj_end();
6892 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
6893 off_t stripe_ofs
= iter
.get_stripe_ofs();
6894 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
6896 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
6897 read_obj
= iter
.get_location().get_raw_obj(this);
6898 uint64_t read_len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6899 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6901 if (read_len
> max_chunk_size
) {
6902 read_len
= max_chunk_size
;
6905 reading_from_head
= (read_obj
== head_obj
);
6906 r
= cb(read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
6916 while (ofs
<= end
) {
6917 read_obj
= head_obj
;
6918 uint64_t read_len
= std::min(len
, max_chunk_size
);
6920 r
= cb(read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
6933 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
6936 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6941 return ref
.ioctx
.operate(ref
.obj
.oid
, op
);
6944 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
6947 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6954 return ref
.ioctx
.operate(ref
.obj
.oid
, op
, &outbl
);
6957 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
6959 ObjectWriteOperation op
;
6961 ceph_assert(olh_obj
.key
.instance
.empty());
6963 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
6965 if (!state
.exists
) {
6969 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
6970 op
.mtime2(&mtime_ts
);
6974 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6975 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6976 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6977 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6978 * log will reflect that.
6980 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6981 * is used for object data instance, olh_tag for olh instance.
6984 /* guard against racing writes */
6985 bucket_index_guard_olh_op(state
, op
);
6991 gen_rand_alphanumeric_lower(cct
, &obj_tag
, 32);
6994 bl
.append(obj_tag
.c_str(), obj_tag
.size());
6995 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6997 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
7002 gen_rand_alphanumeric_lower(cct
, &olh_tag
, 32);
7005 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
7006 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
7008 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
7009 state
.olh_tag
= olh_bl
;
7010 state
.is_olh
= true;
7013 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
7017 RGWOLHPendingInfo pending_info
;
7018 pending_info
.time
= real_clock::now();
7019 encode(pending_info
, bl
);
7021 #define OLH_PENDING_TAG_LEN 32
7022 /* tag will start with current time epoch, this so that entries are sorted by time */
7024 utime_t
ut(pending_info
.time
);
7025 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
7029 gen_rand_alphanumeric_lower(cct
, &s
, OLH_PENDING_TAG_LEN
- op_tag
->size());
7033 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7034 attr_name
.append(*op_tag
);
7036 op
.setxattr(attr_name
.c_str(), bl
);
7038 int ret
= obj_operate(bucket_info
, olh_obj
, &op
);
7043 state
.exists
= true;
7044 state
.attrset
[attr_name
] = bl
;
7049 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
7053 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
7054 if (ret
== -EEXIST
) {
7061 int RGWRados::guard_reshard(BucketShard
*bs
,
7062 const rgw_obj
& obj_instance
,
7063 const RGWBucketInfo
& bucket_info
,
7064 std::function
<int(BucketShard
*)> call
)
7067 const rgw_obj
*pobj
= &obj_instance
;
7070 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
7071 r
= bs
->init(pobj
->bucket
, *pobj
, nullptr /* no RGWBucketInfo */);
7073 ldout(cct
, 5) << "bs.init() returned ret=" << r
<< dendl
;
7077 if (r
!= -ERR_BUSY_RESHARDING
) {
7080 ldout(cct
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
7081 string new_bucket_id
;
7082 r
= block_while_resharding(bs
, &new_bucket_id
, bucket_info
, null_yield
);
7083 if (r
== -ERR_BUSY_RESHARDING
) {
7089 ldout(cct
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
7090 i
= 0; /* resharding is finished, make sure we can retry */
7093 obj
.bucket
.update_bucket_id(new_bucket_id
);
7104 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
,
7105 string
*new_bucket_id
,
7106 const RGWBucketInfo
& bucket_info
,
7110 cls_rgw_bucket_instance_entry entry
;
7112 // since we want to run this recovery code from two distinct places,
7113 // let's just put it in a lambda so we can easily re-use; if the
7114 // lambda successfully fetches a new bucket id, it sets
7115 // new_bucket_id and returns 0, otherwise it returns a negative
7117 auto fetch_new_bucket_id
=
7118 [this, bucket_info
](const std::string
& log_tag
,
7119 std::string
* new_bucket_id
) -> int {
7120 RGWBucketInfo fresh_bucket_info
= bucket_info
;
7121 int ret
= try_refresh_bucket_info(fresh_bucket_info
, nullptr);
7123 ldout(cct
, 0) << __func__
<<
7124 " ERROR: failed to refresh bucket info after reshard at " <<
7125 log_tag
<< ": " << cpp_strerror(-ret
) << dendl
;
7128 *new_bucket_id
= fresh_bucket_info
.bucket
.bucket_id
;
7132 constexpr int num_retries
= 10;
7133 for (int i
= 1; i
<= num_retries
; i
++) { // nb: 1-based for loop
7134 ret
= cls_rgw_get_bucket_resharding(bs
->index_ctx
, bs
->bucket_obj
, &entry
);
7135 if (ret
== -ENOENT
) {
7136 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id
);
7137 } else if (ret
< 0) {
7138 ldout(cct
, 0) << __func__
<<
7139 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret
) <<
7144 if (!entry
.resharding_in_progress()) {
7145 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
7149 ldout(cct
, 20) << "NOTICE: reshard still in progress; " <<
7150 (i
< num_retries
? "retrying" : "too many retries") << dendl
;
7152 if (i
== num_retries
) {
7156 // If bucket is erroneously marked as resharding (e.g., crash or
7157 // other error) then fix it. If we can take the bucket reshard
7158 // lock then it means no other resharding should be taking place,
7159 // and we're free to clear the flags.
7161 // since we expect to do this rarely, we'll do our work in a
7162 // block and erase our work after each try
7164 RGWObjectCtx
obj_ctx(this);
7165 const rgw_bucket
& b
= bs
->bucket
;
7166 std::string bucket_id
= b
.get_key();
7167 RGWBucketReshardLock
reshard_lock(this, bucket_info
, true);
7168 ret
= reshard_lock
.lock();
7170 ldout(cct
, 20) << __func__
<<
7171 " INFO: failed to take reshard lock for bucket " <<
7172 bucket_id
<< "; expected if resharding underway" << dendl
;
7174 ldout(cct
, 10) << __func__
<<
7175 " INFO: was able to take reshard lock for bucket " <<
7177 ret
= RGWBucketReshard::clear_resharding(this, bucket_info
);
7179 reshard_lock
.unlock();
7180 ldout(cct
, 0) << __func__
<<
7181 " ERROR: failed to clear resharding flags for bucket " <<
7184 reshard_lock
.unlock();
7185 ldout(cct
, 5) << __func__
<<
7186 " INFO: apparently successfully cleared resharding flags for "
7187 "bucket " << bucket_id
<< dendl
;
7188 continue; // if we apparently succeed immediately test again
7189 } // if clear resharding succeeded
7190 } // if taking of lock succeeded
7191 } // block to encapsulate recovery from incomplete reshard
7193 ret
= reshard_wait
->wait(y
);
7195 ldout(cct
, 0) << __func__
<<
7196 " ERROR: bucket is still resharding, please retry" << dendl
;
7201 ldout(cct
, 0) << __func__
<<
7202 " ERROR: bucket is still resharding, please retry" << dendl
;
7203 return -ERR_BUSY_RESHARDING
;
7206 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
7208 const string
& op_tag
,
7209 struct rgw_bucket_dir_entry_meta
*meta
,
7211 real_time unmod_since
, bool high_precision_time
,
7212 rgw_zone_set
*_zones_trace
, bool log_data_change
)
7215 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7220 rgw_zone_set zones_trace
;
7222 zones_trace
= *_zones_trace
;
7224 zones_trace
.insert(svc
.zone
->get_zone().id
);
7226 BucketShard
bs(this);
7228 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
7229 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7230 [&](BucketShard
*bs
) -> int {
7231 librados::ObjectWriteOperation op
;
7232 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7233 return cls_rgw_bucket_link_olh(bs
->index_ctx
, op
,
7234 bs
->bucket_obj
, key
, olh_state
.olh_tag
, delete_marker
, op_tag
, meta
, olh_epoch
,
7235 unmod_since
, high_precision_time
,
7236 svc
.zone
->get_zone().log_data
, zones_trace
);
7239 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
7243 if (log_data_change
&& bucket_info
.datasync_flag_enabled()) {
7244 data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
7250 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
7252 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
7253 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
7256 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
7257 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
7260 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7265 rgw_zone_set zones_trace
;
7267 zones_trace
= *_zones_trace
;
7269 zones_trace
.insert(svc
.zone
->get_zone().id
);
7271 BucketShard
bs(this);
7273 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
7274 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7275 [&](BucketShard
*bs
) -> int {
7276 librados::ObjectWriteOperation op
;
7277 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7278 return cls_rgw_bucket_unlink_instance(bs
->index_ctx
, op
, bs
->bucket_obj
, key
, op_tag
,
7279 olh_tag
, olh_epoch
, svc
.zone
->get_zone().log_data
, zones_trace
);
7282 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
7289 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
7290 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
7291 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
7295 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7300 BucketShard
bs(this);
7302 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
7304 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7308 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7310 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7312 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7313 [&](BucketShard
*bs
) -> int {
7314 ObjectReadOperation op
;
7315 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7316 return cls_rgw_get_olh_log(bs
->index_ctx
, bs
->bucket_obj
, op
,
7317 key
, ver_marker
, olh_tag
, log
, is_truncated
);
7320 ldout(cct
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
7327 // a multisite sync bug resulted in the OLH head attributes being overwritten by
7328 // the attributes from another zone, causing link_olh() to fail endlessly due to
7329 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7330 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7331 int RGWRados::repair_olh(RGWObjState
* state
, const RGWBucketInfo
& bucket_info
,
7334 // fetch the current olh entry from the bucket index
7335 rgw_bucket_olh_entry olh
;
7336 int r
= bi_get_olh(bucket_info
, obj
, &olh
);
7338 ldout(cct
, 0) << "repair_olh failed to read olh entry for " << obj
<< dendl
;
7341 if (olh
.tag
== rgw_bl_str(state
->olh_tag
)) { // mismatch already resolved?
7345 ldout(cct
, 4) << "repair_olh setting olh_tag=" << olh
.tag
7346 << " key=" << olh
.key
<< " delete_marker=" << olh
.delete_marker
<< dendl
;
7348 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7349 ObjectWriteOperation op
;
7350 // assert this is the same olh tag we think we're fixing
7351 bucket_index_guard_olh_op(*state
, op
);
7352 // preserve existing mtime
7353 struct timespec mtime_ts
= ceph::real_clock::to_timespec(state
->mtime
);
7354 op
.mtime2(&mtime_ts
);
7357 bl
.append(olh
.tag
.c_str(), olh
.tag
.size());
7358 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, bl
);
7362 info
.target
= rgw_obj(bucket_info
.bucket
, olh
.key
);
7363 info
.removed
= olh
.delete_marker
;
7366 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7369 r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
7373 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7375 ldout(cct
, 0) << "repair_olh failed to write olh attributes with "
7376 << cpp_strerror(r
) << dendl
;
7382 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
7385 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7390 BucketShard
bs(this);
7392 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
7394 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7398 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7400 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7402 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7403 [&](BucketShard
*pbs
) -> int {
7404 ObjectWriteOperation op
;
7405 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7406 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
7407 return pbs
->index_ctx
.operate(pbs
->bucket_obj
, &op
);
7410 ldout(cct
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
7417 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
7420 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7425 BucketShard
bs(this);
7427 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7429 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7431 int ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7432 [&](BucketShard
*pbs
) -> int {
7433 ObjectWriteOperation op
;
7434 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7435 return cls_rgw_clear_olh(pbs
->index_ctx
, op
, pbs
->bucket_obj
, key
, olh_tag
);
7438 ldout(cct
, 5) << "cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
7445 static int decode_olh_info(CephContext
* cct
, const bufferlist
& bl
, RGWOLHInfo
*olh
)
7448 auto biter
= bl
.cbegin();
7449 decode(*olh
, biter
);
7451 } catch (buffer::error
& err
) {
7452 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
7457 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
7458 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
7459 uint64_t *plast_ver
, rgw_zone_set
* zones_trace
)
7465 librados::ObjectWriteOperation op
;
7467 uint64_t last_ver
= log
.rbegin()->first
;
7468 *plast_ver
= last_ver
;
7470 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
7472 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7473 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GTE
, last_ver
);
7476 string last_ver_s
= to_string(last_ver
);
7477 ver_bl
.append(last_ver_s
.c_str(), last_ver_s
.size());
7478 op
.setxattr(RGW_ATTR_OLH_VER
, ver_bl
);
7480 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
7481 op
.mtime2(&mtime_ts
);
7483 bool need_to_link
= false;
7484 uint64_t link_epoch
= 0;
7485 cls_rgw_obj_key key
;
7486 bool delete_marker
= false;
7487 list
<cls_rgw_obj_key
> remove_instances
;
7488 bool need_to_remove
= false;
7490 // decode current epoch and instance
7491 auto olh_ver
= state
.attrset
.find(RGW_ATTR_OLH_VER
);
7492 if (olh_ver
!= state
.attrset
.end()) {
7493 std::string str
= olh_ver
->second
.to_str();
7495 link_epoch
= strict_strtoll(str
.c_str(), 10, &err
);
7497 auto olh_info
= state
.attrset
.find(RGW_ATTR_OLH_INFO
);
7498 if (olh_info
!= state
.attrset
.end()) {
7500 int r
= decode_olh_info(cct
, olh_info
->second
, &info
);
7504 info
.target
.key
.get_index_key(&key
);
7505 delete_marker
= info
.removed
;
7508 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
7509 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
7510 for (; viter
!= iter
->second
.end(); ++viter
) {
7511 rgw_bucket_olh_log_entry
& entry
= *viter
;
7513 ldout(cct
, 20) << "olh_log_entry: epoch=" << iter
->first
<< " op=" << (int)entry
.op
7514 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
7515 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
7517 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
7518 remove_instances
.push_back(entry
.key
);
7520 case CLS_RGW_OLH_OP_LINK_OLH
:
7521 // only overwrite a link of the same epoch if its key sorts before
7522 if (link_epoch
< iter
->first
|| key
.instance
.empty() ||
7523 key
.instance
> entry
.key
.instance
) {
7524 ldout(cct
, 20) << "apply_olh_log applying key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7525 << " over current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7526 need_to_link
= true;
7527 need_to_remove
= false;
7529 delete_marker
= entry
.delete_marker
;
7531 ldout(cct
, 20) << "apply_olh skipping key=" << entry
.key
<< " epoch=" << iter
->first
<< " delete_marker=" << entry
.delete_marker
7532 << " before current=" << key
<< " epoch=" << link_epoch
<< " delete_marker=" << delete_marker
<< dendl
;
7535 case CLS_RGW_OLH_OP_UNLINK_OLH
:
7536 need_to_remove
= true;
7537 need_to_link
= false;
7540 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
7543 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7544 attr_name
.append(entry
.op_tag
);
7545 op
.rmxattr(attr_name
.c_str());
7550 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
7555 const rgw_bucket
& bucket
= obj
.bucket
;
7558 rgw_obj
target(bucket
, key
);
7560 info
.target
= target
;
7561 info
.removed
= delete_marker
;
7564 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7567 /* first remove object instances */
7568 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
7569 liter
!= remove_instances
.end(); ++liter
) {
7570 cls_rgw_obj_key
& key
= *liter
;
7571 rgw_obj
obj_instance(bucket
, key
);
7572 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
7573 if (ret
< 0 && ret
!= -ENOENT
) {
7574 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
7579 /* update olh object */
7580 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7581 if (r
== -ECANCELED
) {
7585 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7589 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
7591 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
7595 if (need_to_remove
) {
7596 ObjectWriteOperation rm_op
;
7598 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7599 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_EQ
, last_ver
);
7600 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
7603 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &rm_op
);
7604 if (r
== -ECANCELED
) {
7605 return 0; /* someone else won this race */
7608 * only clear if was successful, otherwise we might clobber pending operations on this object
7610 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
7612 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
7622 * read olh log and apply it
7624 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
7626 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
7628 uint64_t ver_marker
= 0;
7631 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
7635 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
7639 } while (is_truncated
);
7644 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
7645 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
,
7646 rgw_zone_set
*zones_trace
, bool log_data_change
)
7650 rgw_obj olh_obj
= target_obj
;
7651 olh_obj
.key
.instance
.clear();
7653 RGWObjState
*state
= NULL
;
7658 #define MAX_ECANCELED_RETRY 100
7659 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7660 if (ret
== -ECANCELED
) {
7661 obj_ctx
.invalidate(olh_obj
);
7664 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
7669 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7671 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7672 if (ret
== -ECANCELED
) {
7677 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
,
7678 op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
,
7679 zones_trace
, log_data_change
);
7681 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7682 if (ret
== -ECANCELED
) {
7683 // the bucket index rejected the link_olh() due to olh tag mismatch;
7684 // attempt to reconstruct olh head attributes based on the bucket index
7685 int r2
= repair_olh(state
, bucket_info
, olh_obj
);
7686 if (r2
< 0 && r2
!= -ECANCELED
) {
7696 if (i
== MAX_ECANCELED_RETRY
) {
7697 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7701 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7702 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7706 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7713 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
7714 uint64_t olh_epoch
, rgw_zone_set
*zones_trace
)
7718 rgw_obj olh_obj
= target_obj
;
7719 olh_obj
.key
.instance
.clear();
7721 RGWObjState
*state
= NULL
;
7726 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7727 if (ret
== -ECANCELED
) {
7728 obj_ctx
.invalidate(olh_obj
);
7731 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
7735 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7737 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7738 if (ret
== -ECANCELED
) {
7744 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
7746 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
7748 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7749 if (ret
== -ECANCELED
) {
7757 if (i
== MAX_ECANCELED_RETRY
) {
7758 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7762 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
7763 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7767 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7774 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key
*target_key
)
7776 #define OBJ_INSTANCE_LEN 32
7777 char buf
[OBJ_INSTANCE_LEN
+ 1];
7779 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
7780 no underscore for instance name due to the way we encode the raw keys */
7782 target_key
->set_instance(buf
);
7785 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
7787 gen_rand_obj_instance_name(&target_obj
->key
);
7790 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
7792 map
<string
, bufferlist
> attrset
;
7794 ObjectReadOperation op
;
7795 op
.getxattrs(&attrset
, NULL
);
7797 int r
= obj_operate(bucket_info
, obj
, &op
);
7802 auto iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
7803 if (iter
== attrset
.end()) { /* not an olh */
7807 return decode_olh_info(cct
, iter
->second
, olh
);
7810 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
7811 map
<string
, bufferlist
> *rm_pending_entries
)
7813 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
7815 real_time now
= real_clock::now();
7817 while (iter
!= pending_entries
.end()) {
7818 auto biter
= iter
->second
.cbegin();
7819 RGWOLHPendingInfo pending_info
;
7821 decode(pending_info
, biter
);
7822 } catch (buffer::error
& err
) {
7823 /* skipping bad entry, we could remove it but it might hide a bug */
7824 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
7829 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
7831 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
7832 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
7833 pending_entries
.erase(cur_iter
);
7835 /* entries names are sorted by time (rounded to a second) */
7841 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
7844 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
7849 // trim no more than 1000 entries per osd op
7850 constexpr int max_entries
= 1000;
7852 auto i
= pending_attrs
.begin();
7853 while (i
!= pending_attrs
.end()) {
7854 ObjectWriteOperation op
;
7855 bucket_index_guard_olh_op(state
, op
);
7857 for (int n
= 0; n
< max_entries
&& i
!= pending_attrs
.end(); ++n
, ++i
) {
7858 op
.rmxattr(i
->first
.c_str());
7861 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7862 if (r
== -ENOENT
|| r
== -ECANCELED
) {
7863 /* raced with some other change, shouldn't sweat about it */
7867 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7874 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
7876 map
<string
, bufferlist
> pending_entries
;
7877 rgw_filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
7879 map
<string
, bufferlist
> rm_pending_entries
;
7880 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
7882 if (!rm_pending_entries
.empty()) {
7883 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
7885 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
7889 if (!pending_entries
.empty()) {
7890 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
7892 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7898 auto iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
7899 if (iter
== state
->attrset
.end()) {
7904 int ret
= decode_olh_info(cct
, iter
->second
, &olh
);
7913 *target
= olh
.target
;
7918 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
7919 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
7920 RGWObjVersionTracker
*objv_tracker
)
7923 int r
= get_raw_obj_ref(obj
, &ref
);
7928 map
<string
, bufferlist
> unfiltered_attrset
;
7930 struct timespec mtime_ts
;
7932 ObjectReadOperation op
;
7934 objv_tracker
->prepare_op_for_read(&op
);
7937 op
.getxattrs(&unfiltered_attrset
, NULL
);
7939 if (psize
|| pmtime
) {
7940 op
.stat2(&size
, &mtime_ts
, NULL
);
7943 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
7946 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &outbl
);
7949 *epoch
= ref
.ioctx
.get_last_version();
7958 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
7960 rgw_filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
7966 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
7967 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
, bool *syncstopped
)
7969 vector
<rgw_bucket_dir_header
> headers
;
7970 map
<int, string
> bucket_instance_ids
;
7971 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
7976 ceph_assert(headers
.size() == bucket_instance_ids
.size());
7978 auto iter
= headers
.begin();
7979 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
7980 BucketIndexShardsManager ver_mgr
;
7981 BucketIndexShardsManager master_ver_mgr
;
7982 BucketIndexShardsManager marker_mgr
;
7984 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
7985 accumulate_raw_stats(*iter
, stats
);
7986 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->ver
);
7987 ver_mgr
.add(viter
->first
, string(buf
));
7988 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->master_ver
);
7989 master_ver_mgr
.add(viter
->first
, string(buf
));
7990 if (shard_id
>= 0) {
7991 *max_marker
= iter
->max_marker
;
7993 marker_mgr
.add(viter
->first
, iter
->max_marker
);
7995 if (syncstopped
!= NULL
)
7996 *syncstopped
= iter
->syncstopped
;
7998 ver_mgr
.to_string(bucket_ver
);
7999 master_ver_mgr
.to_string(master_ver
);
8001 marker_mgr
.to_string(max_marker
);
8006 int RGWRados::get_bi_log_status(RGWBucketInfo
& bucket_info
, int shard_id
,
8007 map
<int, string
>& markers
)
8009 vector
<rgw_bucket_dir_header
> headers
;
8010 map
<int, string
> bucket_instance_ids
;
8011 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
8015 ceph_assert(headers
.size() == bucket_instance_ids
.size());
8017 auto iter
= headers
.begin();
8018 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
8020 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
8021 if (shard_id
>= 0) {
8022 markers
[shard_id
] = iter
->max_marker
;
8024 markers
[viter
->first
] = iter
->max_marker
;
8030 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
8031 RGWGetBucketStats_CB
*cb
;
8033 map
<RGWObjCategory
, RGWStorageStats
> stats
;
8039 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
8040 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true),
8041 lock("RGWGetBucketStatsContext") {}
8043 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
8044 Mutex::Locker
l(lock
);
8047 accumulate_raw_stats(header
, stats
);
8053 if (--pendings
== 0) {
8055 cb
->set_response(&stats
);
8057 cb
->handle_response(ret_code
);
8064 Mutex::Locker
l(lock
);
8069 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
8072 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
? : 1);
8073 ceph_assert(get_ctx
);
8074 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
8078 get_ctx
->unset_cb();
8085 class RGWGetUserStatsContext
: public RGWGetUserHeader_CB
{
8086 RGWGetUserStats_CB
*cb
;
8089 explicit RGWGetUserStatsContext(RGWGetUserStats_CB
* const cb
)
8092 void handle_response(int r
, cls_user_header
& header
) override
{
8093 const cls_user_stats
& hs
= header
.stats
;
8095 RGWStorageStats stats
;
8097 stats
.size
= hs
.total_bytes
;
8098 stats
.size_rounded
= hs
.total_bytes_rounded
;
8099 stats
.num_objects
= hs
.total_entries
;
8101 cb
->set_response(stats
);
8104 cb
->handle_response(r
);
8110 int RGWRados::get_user_stats(const rgw_user
& user
, RGWStorageStats
& stats
)
8112 string user_str
= user
.to_str();
8114 cls_user_header header
;
8115 int r
= cls_user_get_header(user_str
, &header
);
8119 const cls_user_stats
& hs
= header
.stats
;
8121 stats
.size
= hs
.total_bytes
;
8122 stats
.size_rounded
= hs
.total_bytes_rounded
;
8123 stats
.num_objects
= hs
.total_entries
;
8128 int RGWRados::get_user_stats_async(const rgw_user
& user
, RGWGetUserStats_CB
*ctx
)
8130 string user_str
= user
.to_str();
8132 RGWGetUserStatsContext
*get_ctx
= new RGWGetUserStatsContext(ctx
);
8133 int r
= cls_user_get_header_async(user_str
, get_ctx
);
8143 void RGWRados::get_bucket_meta_oid(const rgw_bucket
& bucket
, string
& oid
)
8145 oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ bucket
.get_key(':');
8148 void RGWRados::get_bucket_instance_obj(const rgw_bucket
& bucket
, rgw_raw_obj
& obj
)
8150 if (!bucket
.oid
.empty()) {
8151 obj
.init(svc
.zone
->get_zone_params().domain_root
, bucket
.oid
);
8154 get_bucket_meta_oid(bucket
, oid
);
8155 obj
.init(svc
.zone
->get_zone_params().domain_root
, oid
);
8159 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
8160 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8162 size_t pos
= meta_key
.find(':');
8163 if (pos
== string::npos
) {
8166 string oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ meta_key
;
8167 rgw_bucket_instance_key_to_oid(oid
);
8169 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
8172 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
8173 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8176 if (bucket
.oid
.empty()) {
8177 get_bucket_meta_oid(bucket
, oid
);
8182 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
8185 int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx
& obj_ctx
, const string
& oid
, RGWBucketInfo
& info
,
8186 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
,
8187 rgw_cache_entry_info
*cache_info
,
8188 boost::optional
<obj_version
> refresh_version
)
8190 auto& domain_root
= svc
.zone
->get_zone_params().domain_root
;
8192 ldout(cct
, 20) << "reading from " << domain_root
<< ":" << oid
<< dendl
;
8196 int ret
= rgw_get_system_obj(this, obj_ctx
, domain_root
,
8197 oid
, epbl
, &info
.objv_tracker
, pmtime
, pattrs
,
8198 cache_info
, refresh_version
);
8203 auto iter
= epbl
.cbegin();
8206 } catch (buffer::error
& err
) {
8207 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
8210 info
.bucket
.oid
= oid
;
8214 int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx
& obj_ctx
,
8215 const string
& tenant_name
,
8216 const string
& bucket_name
,
8217 RGWBucketEntryPoint
& entry_point
,
8218 RGWObjVersionTracker
*objv_tracker
,
8220 map
<string
, bufferlist
> *pattrs
,
8221 rgw_cache_entry_info
*cache_info
,
8222 boost::optional
<obj_version
> refresh_version
)
8225 string bucket_entry
;
8227 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
8228 int ret
= rgw_get_system_obj(this, obj_ctx
, svc
.zone
->get_zone_params().domain_root
,
8229 bucket_entry
, bl
, objv_tracker
, pmtime
, pattrs
,
8230 cache_info
, refresh_version
);
8235 auto iter
= bl
.cbegin();
8237 decode(entry_point
, iter
);
8238 } catch (buffer::error
& err
) {
8239 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
8245 int RGWRados::convert_old_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8246 const string
& tenant_name
,
8247 const string
& bucket_name
)
8249 RGWBucketEntryPoint entry_point
;
8251 RGWObjVersionTracker ot
;
8252 map
<string
, bufferlist
> attrs
;
8255 ldout(cct
, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name
<< dendl
;
8257 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant_name
, bucket_name
, entry_point
, &ot
, &ep_mtime
, &attrs
);
8259 ldout(cct
, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret
<< " bucket=" << bucket_name
<< dendl
;
8263 if (!entry_point
.has_bucket_info
) {
8264 /* already converted! */
8268 info
= entry_point
.old_bucket_info
;
8269 info
.bucket
.oid
= bucket_name
;
8270 info
.ep_objv
= ot
.read_version
;
8272 ot
.generate_new_write_ver(cct
);
8274 ret
= put_linked_bucket_info(info
, false, ep_mtime
, &ot
.write_version
, &attrs
, true);
8276 ldout(cct
, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret
<< dendl
;
8283 int RGWRados::_get_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8284 const string
& tenant
,
8285 const string
& bucket_name
,
8286 RGWBucketInfo
& info
,
8288 map
<string
, bufferlist
> *pattrs
,
8289 boost::optional
<obj_version
> refresh_version
)
8291 string bucket_entry
;
8292 rgw_make_bucket_entry_name(tenant
, bucket_name
, bucket_entry
);
8295 if (auto e
= binfo_cache
->find(bucket_entry
)) {
8296 if (refresh_version
&&
8297 e
->info
.objv_tracker
.read_version
.compare(&(*refresh_version
))) {
8298 lderr(cct
) << "WARNING: The bucket info cache is inconsistent. This is "
8299 << "a failure that should be debugged. I am a nice machine, "
8300 << "so I will try to recover." << dendl
;
8301 binfo_cache
->invalidate(bucket_entry
);
8312 bucket_info_entry e
;
8313 RGWBucketEntryPoint entry_point
;
8315 RGWObjVersionTracker ot
;
8316 rgw_cache_entry_info entry_cache_info
;
8317 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant
, bucket_name
,
8318 entry_point
, &ot
, &ep_mtime
, pattrs
,
8319 &entry_cache_info
, refresh_version
);
8321 /* only init these fields */
8322 info
.bucket
.tenant
= tenant
;
8323 info
.bucket
.name
= bucket_name
;
8327 if (entry_point
.has_bucket_info
) {
8328 info
= entry_point
.old_bucket_info
;
8329 info
.bucket
.oid
= bucket_name
;
8330 info
.bucket
.tenant
= tenant
;
8331 info
.ep_objv
= ot
.read_version
;
8332 ldout(cct
, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info
.bucket
<< " owner " << info
.owner
<< dendl
;
8336 /* data is in the bucket instance object, we need to get attributes from there, clear everything
8343 ldout(cct
, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point
.bucket
<< dendl
;
8346 /* read bucket instance info */
8349 get_bucket_meta_oid(entry_point
.bucket
, oid
);
8351 rgw_cache_entry_info cache_info
;
8353 ret
= get_bucket_instance_from_oid(obj_ctx
, oid
, e
.info
, &e
.mtime
, &e
.attrs
,
8354 &cache_info
, refresh_version
);
8355 e
.info
.ep_objv
= ot
.read_version
;
8358 lderr(cct
) << "ERROR: get_bucket_instance_from_oid failed: " << ret
<< dendl
;
8359 info
.bucket
.tenant
= tenant
;
8360 info
.bucket
.name
= bucket_name
;
8361 // XXX and why return anything in case of an error anyway?
8370 /* chain to both bucket entry point and bucket instance */
8371 if (!binfo_cache
->put(svc
.cache
, bucket_entry
, &e
, {&entry_cache_info
, &cache_info
})) {
8372 ldout(cct
, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl
;
8375 if (refresh_version
&&
8376 refresh_version
->compare(&info
.objv_tracker
.read_version
)) {
8377 lderr(cct
) << "WARNING: The OSD has the same version I have. Something may "
8378 << "have gone squirrelly. An administrator may have forced a "
8379 << "change; otherwise there is a problem somewhere." << dendl
;
8385 int RGWRados::get_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8386 const string
& tenant
, const string
& bucket_name
,
8387 RGWBucketInfo
& info
,
8388 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8390 return _get_bucket_info(obj_ctx
, tenant
, bucket_name
, info
, pmtime
,
8391 pattrs
, boost::none
);
8394 int RGWRados::try_refresh_bucket_info(RGWBucketInfo
& info
,
8395 ceph::real_time
*pmtime
,
8396 map
<string
, bufferlist
> *pattrs
)
8398 RGWSysObjectCtx obj_ctx
= svc
.sysobj
->init_obj_ctx();
8400 return _get_bucket_info(obj_ctx
, info
.bucket
.tenant
, info
.bucket
.name
,
8401 info
, pmtime
, pattrs
, info
.objv_tracker
.read_version
);
8404 int RGWRados::put_bucket_entrypoint_info(const string
& tenant_name
, const string
& bucket_name
, RGWBucketEntryPoint
& entry_point
,
8405 bool exclusive
, RGWObjVersionTracker
& objv_tracker
, real_time mtime
,
8406 map
<string
, bufferlist
> *pattrs
)
8409 encode(entry_point
, epbl
);
8410 string bucket_entry
;
8411 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
8412 return rgw_bucket_store_info(this, bucket_entry
, epbl
, exclusive
, pattrs
, &objv_tracker
, mtime
);
8415 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
8416 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
8418 info
.has_instance_obj
= true;
8423 string key
= info
.bucket
.get_key(); /* when we go through meta api, we don't use oid directly */
8424 int ret
= rgw_bucket_instance_store_info(this, key
, bl
, exclusive
, pattrs
, &info
.objv_tracker
, mtime
);
8425 if (ret
== -EEXIST
) {
8426 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
8427 * bucket operation on this specific bucket (e.g., being synced from the master), but
8428 * since bucket instace meta object is unique for this specific bucket instace, we don't
8429 * need to return an error.
8430 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
8431 * master, creating a bucket, sending bucket creation to the master, we create the bucket
8432 * locally, while in the sync thread we sync the new bucket.
8439 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
8440 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
8442 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
8444 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
8450 return 0; /* done! */
8452 RGWBucketEntryPoint entry_point
;
8453 entry_point
.bucket
= info
.bucket
;
8454 entry_point
.owner
= info
.owner
;
8455 entry_point
.creation_time
= info
.creation_time
;
8456 entry_point
.linked
= true;
8457 RGWObjVersionTracker ot
;
8458 if (pep_objv
&& !pep_objv
->tag
.empty()) {
8459 ot
.write_version
= *pep_objv
;
8461 ot
.generate_new_write_ver(cct
);
8463 *pep_objv
= ot
.write_version
;
8466 ret
= put_bucket_entrypoint_info(info
.bucket
.tenant
, info
.bucket
.name
, entry_point
, exclusive
, ot
, mtime
, NULL
);
8473 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
8475 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
8477 map
<string
, RGWBucketEnt
>::iterator iter
;
8478 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
8479 RGWBucketEnt
& ent
= iter
->second
;
8480 rgw_bucket
& bucket
= ent
.bucket
;
8483 ent
.size_rounded
= 0;
8485 vector
<rgw_bucket_dir_header
> headers
;
8487 RGWBucketInfo bucket_info
;
8488 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
8493 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
8497 auto hiter
= headers
.begin();
8498 for (; hiter
!= headers
.end(); ++hiter
) {
8499 RGWObjCategory category
= main_category
;
8500 auto iter
= (hiter
->stats
).find(category
);
8501 if (iter
!= hiter
->stats
.end()) {
8502 struct rgw_bucket_category_stats
& stats
= iter
->second
;
8503 ent
.count
+= stats
.num_entries
;
8504 ent
.size
+= stats
.total_size
;
8505 ent
.size_rounded
+= stats
.total_size_rounded
;
8509 // fill in placement_rule from the bucket instance for use in swift's
8510 // per-storage policy statistics
8511 ent
.placement_rule
= std::move(bucket_info
.placement_rule
);
8517 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
8520 int r
= get_raw_obj_ref(obj
, &ref
);
8524 librados::Rados
*rad
= get_rados_handle();
8525 librados::AioCompletion
*completion
= rad
->aio_create_completion(NULL
, NULL
, NULL
);
8527 r
= ref
.ioctx
.aio_append(ref
.obj
.oid
, completion
, bl
, size
);
8528 completion
->release();
8532 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
8534 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8535 librados::NObjectIterator
& iter
= ctx
.iter
;
8537 int r
= open_pool_ctx(pool
, io_ctx
, false);
8541 iter
= io_ctx
.nobjects_begin();
8546 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, const string
& cursor
, RGWPoolIterCtx
& ctx
)
8548 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8549 librados::NObjectIterator
& iter
= ctx
.iter
;
8551 int r
= open_pool_ctx(pool
, io_ctx
, false);
8555 librados::ObjectCursor oc
;
8556 if (!oc
.from_str(cursor
)) {
8557 ldout(cct
, 10) << "failed to parse cursor: " << cursor
<< dendl
;
8562 iter
= io_ctx
.nobjects_begin(oc
);
8564 } catch (const std::system_error
& e
) {
8565 r
= -e
.code().value();
8566 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
8567 << ", returning " << r
<< dendl
;
8569 } catch (const std::exception
& e
) {
8570 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
8571 << ", returning -5" << dendl
;
8576 string
RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx
& ctx
)
8578 return ctx
.iter
.get_cursor().to_str();
8581 static int do_pool_iterate(CephContext
* cct
, RGWPoolIterCtx
& ctx
, uint32_t num
,
8582 vector
<rgw_bucket_dir_entry
>& objs
,
8583 bool *is_truncated
, RGWAccessListFilter
*filter
)
8585 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8586 librados::NObjectIterator
& iter
= ctx
.iter
;
8588 if (iter
== io_ctx
.nobjects_end())
8593 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
8594 rgw_bucket_dir_entry e
;
8596 string oid
= iter
->get_oid();
8597 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
8599 // fill it in with initial values; we may correct later
8600 if (filter
&& !filter
->filter(oid
, oid
))
8608 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
8613 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
8614 bool *is_truncated
, RGWAccessListFilter
*filter
)
8616 // catch exceptions from NObjectIterator::operator++()
8618 return do_pool_iterate(cct
, ctx
, num
, objs
, is_truncated
, filter
);
8619 } catch (const std::system_error
& e
) {
8620 int r
= -e
.code().value();
8621 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
8622 << ", returning " << r
<< dendl
;
8624 } catch (const std::exception
& e
) {
8625 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
8626 << ", returning -5" << dendl
;
8631 int RGWRados::list_raw_objects_init(const rgw_pool
& pool
, const string
& marker
, RGWListRawObjsCtx
*ctx
)
8633 if (!ctx
->initialized
) {
8634 int r
= pool_iterate_begin(pool
, marker
, ctx
->iter_ctx
);
8636 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
8639 ctx
->initialized
= true;
8644 int RGWRados::list_raw_objects_next(const string
& prefix_filter
, int max
,
8645 RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8648 if (!ctx
.initialized
) {
8651 RGWAccessListFilterPrefix
filter(prefix_filter
);
8652 vector
<rgw_bucket_dir_entry
> objs
;
8653 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
8656 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
8660 vector
<rgw_bucket_dir_entry
>::iterator iter
;
8661 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
8662 oids
.push_back(iter
->key
.name
);
8668 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
8669 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8672 if (!ctx
.initialized
) {
8673 int r
= list_raw_objects_init(pool
, string(), &ctx
);
8679 return list_raw_objects_next(prefix_filter
, max
, ctx
, oids
, is_truncated
);
8682 string
RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx
& ctx
)
8684 return pool_iterate_get_cursor(ctx
.iter_ctx
);
// List up to 'max' bucket-index (bilog) entries across all shards of a bucket,
// starting at 'marker'; merges per-shard results, rewrites entry ids with a
// shard prefix, and updates 'marker'/'truncated' for the next call.
// NOTE(review): this chunk is a lossy extraction — several original lines
// (error-return checks, braces) are missing between the numbered statements.
8687 int RGWRados::list_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& marker
, uint32_t max
,
8688 std::list
<rgw_bi_log_entry
>& result
, bool *truncated
)
8690 ldout(cct
, 20) << __func__
<< ": " << bucket_info
.bucket
<< " marker " << marker
<< " shard_id=" << shard_id
<< " max " << max
<< dendl
;
8693 librados::IoCtx index_ctx
;
// oids: shard number -> bucket index object name; bi_log_lists collects the
// per-shard bilog listing results.
8694 map
<int, string
> oids
;
8695 map
<int, cls_rgw_bi_log_list_ret
> bi_log_lists
;
8696 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
8700 BucketIndexShardsManager marker_mgr
;
8701 bool has_shards
= (oids
.size() > 1 || shard_id
>= 0);
8702 // If there are multiple shards for the bucket index object, the marker
8703 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
8704 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
8705 // only contain one record, and the key is the bucket instance id.
8706 r
= marker_mgr
.from_string(marker
, shard_id
);
// Issue the bilog-list call to all shard objects in parallel, bounded by
// rgw_bucket_index_max_aio concurrent operations.
8710 r
= CLSRGWIssueBILogList(index_ctx
, marker_mgr
, max
, oids
, bi_log_lists
, cct
->_conf
->rgw_bucket_index_max_aio
)();
// Per-shard iterator pairs (current position / end) used for the merge below.
8714 map
<int, list
<rgw_bi_log_entry
>::iterator
> vcurrents
;
8715 map
<int, list
<rgw_bi_log_entry
>::iterator
> vends
;
8719 map
<int, cls_rgw_bi_log_list_ret
>::iterator miter
= bi_log_lists
.begin();
8720 for (; miter
!= bi_log_lists
.end(); ++miter
) {
8721 int shard_id
= miter
->first
;
8722 vcurrents
[shard_id
] = miter
->second
.entries
.begin();
8723 vends
[shard_id
] = miter
->second
.entries
.end();
// Truncated if any shard reported truncation.
8725 *truncated
= (*truncated
|| miter
->second
.truncated
);
// Round-robin across shards, taking one entry from each shard per pass,
// until 'max' entries are collected or every shard is exhausted.
8730 bool has_more
= true;
8731 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator viter
;
8732 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator eiter
;
8733 while (total
< max
&& has_more
) {
8736 viter
= vcurrents
.begin();
8737 eiter
= vends
.begin();
8739 for (; total
< max
&& viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
8740 assert (eiter
!= vends
.end());
8742 int shard_id
= viter
->first
;
8743 list
<rgw_bi_log_entry
>::iterator
& liter
= viter
->second
;
8745 if (liter
== eiter
->second
){
8748 rgw_bi_log_entry
& entry
= *(liter
);
// Prefix the entry id with its shard id ("<shard>#<id>") so the combined
// stream is unambiguous across shards.
8751 snprintf(buf
, sizeof(buf
), "%d", shard_id
);
8753 build_bucket_index_marker(buf
, entry
.id
, &tmp_id
);
8754 entry
.id
.swap(tmp_id
);
8756 marker_mgr
.add(shard_id
, entry
.id
);
8757 result
.push_back(entry
);
// Any shard with unconsumed entries means the overall listing is truncated.
8765 for (viter
= vcurrents
.begin(), eiter
= vends
.begin(); viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
8766 assert (eiter
!= vends
.end());
8767 *truncated
= (*truncated
|| (viter
->second
!= eiter
->second
));
8771 // Refresh marker, if there are multiple shards, the output will look like
8772 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
8773 // if there is no sharding, the simply marker (without oid) is returned
8775 marker_mgr
.to_string(&marker
);
8777 if (!result
.empty()) {
8778 marker
= result
.rbegin()->id
;
// Trim (delete) bilog entries in the range [start_marker, end_marker] on all
// shards of the bucket index (or a single shard when shard_id >= 0).
8785 int RGWRados::trim_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& start_marker
, string
& end_marker
)
8787 librados::IoCtx index_ctx
;
8788 map
<int, string
> bucket_objs
;
// Markers may be composite per-shard strings; parse them per shard.
8790 BucketIndexShardsManager start_marker_mgr
;
8791 BucketIndexShardsManager end_marker_mgr
;
8793 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
8798 r
= start_marker_mgr
.from_string(start_marker
, shard_id
);
8803 r
= end_marker_mgr
.from_string(end_marker
, shard_id
);
// Issue the trim to every shard object, bounded by rgw_bucket_index_max_aio.
8808 return CLSRGWIssueBILogTrim(index_ctx
, start_marker_mgr
, end_marker_mgr
, bucket_objs
,
8809 cct
->_conf
->rgw_bucket_index_max_aio
)();
// Mark all shards of a bucket index for bilog resync (used when re-enabling
// sync on a bucket).
8812 int RGWRados::resync_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
8814 librados::IoCtx index_ctx
;
8815 map
<int, string
> bucket_objs
;
8816 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
8820 return CLSRGWIssueResyncBucketBILog(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
// Stop bilog recording on all shards of a bucket index (disables data sync
// logging for this bucket).
8823 int RGWRados::stop_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
8825 librados::IoCtx index_ctx
;
8826 map
<int, string
> bucket_objs
;
8827 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
8831 return CLSRGWIssueBucketBILogStop(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
// Fetch the 'instance' bucket-index record for an object and decode it into
// *dirent. -ENOENT is passed through; other errors are logged.
8834 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8835 rgw_bucket_dir_entry
*dirent
)
8837 rgw_cls_bi_entry bi_entry
;
8838 int r
= bi_get(bucket_info
, obj
, BIIndexType::Instance
, &bi_entry
);
8839 if (r
< 0 && r
!= -ENOENT
) {
8840 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
// Decode the raw bi entry payload; a buffer::error means the stored record
// is corrupt or from an incompatible encoding.
8845 auto iter
= bi_entry
.data
.cbegin();
8847 decode(*dirent
, iter
);
8848 } catch (buffer::error
& err
) {
8849 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
// Fetch the OLH (object logical head, versioning) bucket-index record for an
// object and decode it into *olh. -ENOENT is passed through.
8856 int RGWRados::bi_get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8857 rgw_bucket_olh_entry
*olh
)
8859 rgw_cls_bi_entry bi_entry
;
8860 int r
= bi_get(bucket_info
, obj
, BIIndexType::OLH
, &bi_entry
);
8861 if (r
< 0 && r
!= -ENOENT
) {
8862 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
// Decode the payload (decode call itself falls in an extraction gap here);
// buffer::error indicates a corrupt/incompatible stored record.
8867 auto iter
= bi_entry
.data
.cbegin();
8870 } catch (buffer::error
& err
) {
8871 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
// Read a single bucket-index entry of the given type (Plain/Instance/OLH) for
// an object: resolve the object's bucket shard, then query it via cls_rgw.
8878 int RGWRados::bi_get(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8879 BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
8881 BucketShard
bs(this);
8882 int ret
= bs
.init(bucket_info
, obj
);
8884 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
// The index key combines the object name (with namespace mangling) and the
// version instance id.
8888 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
8890 return cls_rgw_bi_get(bs
.index_ctx
, bs
.bucket_obj
, index_type
, key
, entry
);
// Append a bucket-index put of 'entry' onto an existing write op for the
// given shard; the caller is responsible for submitting the op.
8893 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8895 cls_rgw_bi_put(op
, bs
.bucket_obj
, entry
);
// Synchronously write a bucket-index entry to the given bucket shard.
8898 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8900 int ret
= cls_rgw_bi_put(bs
.index_ctx
, bs
.bucket_obj
, entry
);
// Convenience overload: resolve the shard for (bucket, obj) without bucket
// info, then write the bucket-index entry via bi_put(bs, entry).
8907 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
8909 BucketShard
bs(this);
8910 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
8912 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8916 return bi_put(bs
, entry
);
// List raw bucket-index entries for the shard that holds 'obj_name', starting
// at 'marker'. -ENOENT from the shard object is translated into an empty,
// non-truncated result rather than an error.
8919 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8921 rgw_obj
obj(bucket
, obj_name
);
8922 BucketShard
bs(this);
8923 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
8925 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8929 ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, obj_name
, marker
, max
, entries
, is_truncated
);
8930 if (ret
== -ENOENT
) {
8931 *is_truncated
= false;
// List raw bucket-index entries from an already-resolved bucket shard,
// optionally filtered by object name.
8939 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8941 int ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, filter_obj
, marker
, max
, entries
, is_truncated
);
// Delete the bucket-index shard object itself; a missing object (-ENOENT) is
// treated as success, other failures are logged.
8948 int RGWRados::bi_remove(BucketShard
& bs
)
8950 int ret
= bs
.index_ctx
.remove(bs
.bucket_obj
);
8951 if (ret
== -ENOENT
) {
8955 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
// List raw bucket-index entries for an explicit shard id, delegating to
// bi_list(bs, ...) after resolving the shard.
8962 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8964 BucketShard
bs(this);
8965 int ret
= bs
.init(bucket
, shard_id
, nullptr /* no RGWBucketInfo */);
8967 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8971 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
// Execute a synchronous write operation against a garbage-collection pool
// object.
8974 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
8976 return gc_pool_ctx
.operate(oid
, op
);
// Execute an asynchronous write against a GC pool object. A completion is
// created here; handling/release of it falls in an extraction gap — from the
// visible code it is returned via *pc (presumably; confirm against the full
// source).
8979 int RGWRados::gc_aio_operate(string
& oid
, librados::ObjectWriteOperation
*op
, AioCompletion
**pc
)
8981 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
8982 int r
= gc_pool_ctx
.aio_operate(oid
, c
, op
);
// Execute a synchronous read operation against a GC pool object, returning
// the payload in *pbl.
8991 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
8993 return gc_pool_ctx
.operate(oid
, op
, pbl
);
// List pending GC entries; thin delegation to the RGWGC instance.
8996 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
)
8998 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
);
// Run one garbage-collection pass; delegates to the RGWGC instance.
9001 int RGWRados::process_gc(bool expired_only
)
9003 return gc
->process(expired_only
);
// List lifecycle processing progress per bucket; delegates to the RGWLC
// instance.
9006 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
9008 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
// Run one lifecycle processing pass; delegates to the RGWLC instance.
9011 int RGWRados::process_lc()
9013 return lc
->process();
// Scan all object-expirer shards up to the current time and process due
// expirations; delegates to the object expirer.
9016 bool RGWRados::process_expire_objects()
9018 return obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
// Phase 1 of the two-phase bucket-index update: record a pending op (tagged
// by 'tag') on the object's index shard before the data write happens.
// The op is guarded so it fails with ERR_BUSY_RESHARDING during a reshard.
9021 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
9022 rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
9024 rgw_zone_set zones_trace
;
9026 zones_trace
= *_zones_trace
;
// Record our own zone in the trace to prevent multisite sync loops.
9028 zones_trace
.insert(svc
.zone
->get_zone().id
);
9030 ObjectWriteOperation o
;
9031 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
9032 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
9033 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), svc
.zone
->get_zone().log_data
, bilog_flags
, zones_trace
);
9034 return bs
.index_ctx
.operate(bs
.bucket_obj
, &o
);
// Phase 2 of the two-phase bucket-index update: asynchronously complete the
// pending op identified by 'tag' on the object's index shard, recording the
// final dir entry metadata. Completion bookkeeping goes through the
// index_completion_manager so it survives reshard retries.
9037 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
9038 int64_t pool
, uint64_t epoch
,
9039 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
9040 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
9042 ObjectWriteOperation o
;
9043 rgw_bucket_dir_entry_meta dir_meta
;
9044 dir_meta
= ent
.meta
;
9045 dir_meta
.category
= category
;
9047 rgw_zone_set zones_trace
;
9049 zones_trace
= *_zones_trace
;
// Record our own zone in the trace to prevent multisite sync loops.
9051 zones_trace
.insert(svc
.zone
->get_zone().id
);
// Version of the head object's write (pool/epoch), used by the index class
// to order completions. (Assignment lines fall in an extraction gap.)
9053 rgw_bucket_entry_ver ver
;
9056 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
9057 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
9058 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
9059 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
);
// Register a completion record so the manager can re-drive the op if the
// shard is being resharded when the aio lands.
9060 complete_op_data
*arg
;
9061 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
9062 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
, &arg
);
9063 librados::AioCompletion
*completion
= arg
->rados_completion
;
9064 int ret
= bs
.index_ctx
.aio_operate(bs
.bucket_obj
, arg
->rados_completion
, &o
);
9065 completion
->release(); /* can't reference arg here, as it might have already been released */
// Complete a pending index op as an ADD (object written); wraps
// cls_obj_complete_op with CLS_RGW_OP_ADD.
9069 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
9070 int64_t pool
, uint64_t epoch
,
9071 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
9072 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
9074 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
// Complete a pending index op as a DEL (object removed), passing the removal
// mtime so the bilog carries the deletion time; wraps cls_obj_complete_op
// with CLS_RGW_OP_DEL. (The 'obj' parameter declaration falls in an
// extraction gap between lines 9078 and 9080.)
9077 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
9078 int64_t pool
, uint64_t epoch
,
9080 real_time
& removed_mtime
,
9081 list
<rgw_obj_index_key
> *remove_objs
,
9082 uint16_t bilog_flags
,
9083 rgw_zone_set
*zones_trace
)
9085 rgw_bucket_dir_entry ent
;
9086 ent
.meta
.mtime
= removed_mtime
;
9087 obj
.key
.get_index_key(&ent
.key
);
9088 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
,
9089 ent
, RGWObjCategory::None
, remove_objs
,
9090 bilog_flags
, zones_trace
);
// Cancel a pending index op (e.g. the data write failed); wraps
// cls_obj_complete_op with CLS_RGW_OP_CANCEL and a dummy pool/epoch.
9093 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
9095 rgw_bucket_dir_entry ent
;
9096 obj
.key
.get_index_key(&ent
.key
);
9097 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
,
9098 -1 /* pool id */, 0, ent
,
9099 RGWObjCategory::None
, NULL
, bilog_flags
,
// Set the pending-op tag timeout on every shard of a bucket index; after the
// timeout, stale pending entries may be cleaned up by the index class.
9103 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
9105 librados::IoCtx index_ctx
;
9106 map
<int, string
> bucket_objs
;
9107 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
9111 return CLSRGWIssueSetTagTimeout(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
// Ordered bucket listing: query every index shard, then merge the per-shard
// sorted results into 'm' by repeatedly taking the lexicographically smallest
// candidate. Entries with uncommitted pending ops are verified against the
// head object (check_disk_state) and index repairs are suggested back.
// NOTE(review): extraction gaps hide some parameters (shard_id, list_versions,
// is_truncated appear to arrive via lines not shown) and some error returns.
9115 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo
& bucket_info
,
9117 const rgw_obj_index_key
& start
,
9118 const string
& prefix
,
9119 uint32_t num_entries
,
9121 map
<string
, rgw_bucket_dir_entry
>& m
,
9123 rgw_obj_index_key
*last_entry
,
9124 bool (*force_check_filter
)(const string
& name
))
9126 ldout(cct
, 10) << "cls_bucket_list_ordered " << bucket_info
.bucket
<<
9127 " start " << start
.name
<< "[" << start
.instance
<< "] num_entries " <<
9128 num_entries
<< dendl
;
9130 librados::IoCtx index_ctx
;
9131 // key - oid (for different shards if there is any)
9132 // value - list result for the corresponding oid (shard), it is filled by
9134 map
<int, string
> oids
;
9135 map
<int, struct rgw_cls_list_ret
> list_results
;
9136 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
// Ask every shard for up to num_entries starting at start_key, in parallel.
9140 cls_rgw_obj_key
start_key(start
.name
, start
.instance
);
9141 r
= CLSRGWIssueBucketList(index_ctx
, start_key
, prefix
, num_entries
,
9142 list_versions
, oids
, list_results
,
9143 cct
->_conf
->rgw_bucket_index_max_aio
)();
9147 // Create a list of iterators that are used to iterate each shard
9148 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vcurrents
;
9149 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vends
;
9150 vector
<string
> vnames
;
9151 vcurrents
.reserve(list_results
.size());
9152 vends
.reserve(list_results
.size());
9153 vnames
.reserve(list_results
.size());
9154 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
9155 *is_truncated
= false;
9156 for (; iter
!= list_results
.end(); ++iter
) {
9157 vcurrents
.push_back(iter
->second
.dir
.m
.begin());
9158 vends
.push_back(iter
->second
.dir
.m
.end());
9159 vnames
.push_back(oids
[iter
->first
]);
9160 *is_truncated
= (*is_truncated
|| iter
->second
.is_truncated
);
9163 // Create a map to track the next candidate entry from each shard, if the entry
9164 // from a specified shard is selected/erased, the next entry from that shard will
9165 // be inserted for next round selection
9166 map
<string
, size_t> candidates
;
9167 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
9168 if (vcurrents
[i
] != vends
[i
]) {
9169 candidates
[vcurrents
[i
]->first
] = i
;
// Per-shard-oid suggested index repairs, flushed after the merge loop.
9173 map
<string
, bufferlist
> updates
;
9176 while (count
< num_entries
&& !candidates
.empty()) {
9178 // Select the next one
9179 pos
= candidates
.begin()->second
;
9180 const string
& name
= vcurrents
[pos
]->first
;
9181 struct rgw_bucket_dir_entry
& dirent
= vcurrents
[pos
]->second
;
// Entries that don't exist, are mid-operation, or match the caller's filter
// are validated against the actual head object.
9183 bool force_check
= force_check_filter
&&
9184 force_check_filter(dirent
.key
.name
);
9185 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
9186 !dirent
.pending_map
.empty() ||
9188 /* there are uncommitted ops. We need to check the current state,
9189 * and if the tags are old we need to do cleanup as well. */
9190 librados::IoCtx sub_ctx
;
9191 sub_ctx
.dup(index_ctx
);
9192 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
,
9193 updates
[vnames
[pos
]]);
9194 if (r
< 0 && r
!= -ENOENT
) {
9201 ldout(cct
, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
9202 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
9203 m
[name
] = std::move(dirent
);
9207 // Refresh the candidates map
9208 candidates
.erase(candidates
.begin());
9210 if (vcurrents
[pos
] != vends
[pos
]) {
9211 candidates
[vcurrents
[pos
]->first
] = pos
;
9215 // Suggest updates if there is any
9216 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
9217 for (; miter
!= updates
.end(); ++miter
) {
9218 if (miter
->second
.length()) {
9219 ObjectWriteOperation o
;
9220 cls_rgw_suggest_changes(o
, miter
->second
);
9221 // we don't care if we lose suggested updates, send them off blindly
9222 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9223 index_ctx
.aio_operate(miter
->first
, c
, &o
);
9228 // Check if all the returned entries are consumed or not
9229 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
9230 if (vcurrents
[i
] != vends
[i
]) {
9231 *is_truncated
= true;
// Report the last entry actually consumed so the caller can resume there.
9237 *last_entry
= std::move((--vcurrents
[pos
])->first
);
// Unordered bucket listing: walk the index shards one at a time (no global
// sort/merge), which is much cheaper for very large buckets. The starting
// shard is derived from the start marker's object name; entries with pending
// ops are validated via check_disk_state and repairs are suggested back.
// NOTE(review): extraction gaps hide some parameters (shard_id, list_versions,
// is_truncated, the local 'key') and several error-return lines.
9243 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo
& bucket_info
,
9245 const rgw_obj_index_key
& start
,
9246 const string
& prefix
,
9247 uint32_t num_entries
,
9249 std::vector
<rgw_bucket_dir_entry
>& ent_list
,
9251 rgw_obj_index_key
*last_entry
,
9252 bool (*force_check_filter
)(const string
& name
)) {
9253 ldout(cct
, 10) << "cls_bucket_list_unordered " << bucket_info
.bucket
<<
9254 " start " << start
.name
<< "[" << start
.instance
<<
9255 "] num_entries " << num_entries
<< dendl
;
9257 static MultipartMetaFilter multipart_meta_filter
;
9259 *is_truncated
= false;
9260 librados::IoCtx index_ctx
;
9262 map
<int, string
> oids
;
9263 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
9266 const uint32_t num_shards
= oids
.size();
// Work out which shard to start from: an explicit shard_id wins; an empty
// marker starts at shard 0; otherwise hash the marker's object name.
9268 rgw_obj_index_key marker
= start
;
9269 uint32_t current_shard
;
9270 if (shard_id
>= 0) {
9271 current_shard
= shard_id
;
9272 } else if (start
.empty()) {
9275 // at this point we have a marker (start) that has something in
9276 // it, so we need to get to the bucket shard index, so we can
9277 // start reading from there
9280 // test whether object name is a multipart meta name
9281 if(! multipart_meta_filter
.filter(start
.name
, key
)) {
9282 // if multipart_meta_filter fails, must be "regular" (i.e.,
9283 // unadorned) and the name is the key
9287 // now convert the key (oid) to an rgw_obj_key since that will
9288 // separate out the namespace, name, and instance
9289 rgw_obj_key obj_key
;
9290 bool parsed
= rgw_obj_key::parse_raw_oid(key
, &obj_key
);
9293 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9294 "start marker: '" << start
<< "'" << dendl
;
9296 } else if (obj_key
.name
.empty()) {
9297 // if the name is empty that means the object name came in with
9298 // a namespace only, and therefore we need to start our scan at
9299 // the first bucket index shard
9302 // so now we have the key used to compute the bucket index shard
9303 // and can extract the specific shard from it
9304 current_shard
= rgw_bucket_shard_index(obj_key
.name
, num_shards
);
// Main loop: read each shard in turn until enough entries are gathered or
// all shards are exhausted (or just the one shard when shard_id >= 0).
9308 uint32_t count
= 0u;
9309 map
<string
, bufferlist
> updates
;
9310 rgw_obj_index_key last_added_entry
;
9311 while (count
<= num_entries
&&
9312 ((shard_id
>= 0 && current_shard
== uint32_t(shard_id
)) ||
9313 current_shard
< num_shards
)) {
9314 const std::string
& oid
= oids
[current_shard
];
9315 rgw_cls_list_ret result
;
9317 librados::ObjectReadOperation op
;
9318 cls_rgw_bucket_list_op(op
, marker
, prefix
, num_entries
,
9319 list_versions
, &result
);
9320 r
= index_ctx
.operate(oid
, &op
, nullptr);
9324 for (auto& entry
: result
.dir
.m
) {
9325 rgw_bucket_dir_entry
& dirent
= entry
.second
;
// Validate suspicious entries (missing/pending/filtered) against the head
// object; collect suggested index repairs per shard oid.
9327 bool force_check
= force_check_filter
&&
9328 force_check_filter(dirent
.key
.name
);
9329 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
9330 !dirent
.pending_map
.empty() ||
9332 /* there are uncommitted ops. We need to check the current state,
9333 * and if the tags are old we need to do cleanup as well. */
9334 librados::IoCtx sub_ctx
;
9335 sub_ctx
.dup(index_ctx
);
9336 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[oid
]);
9337 if (r
< 0 && r
!= -ENOENT
) {
9344 // at this point either r >=0 or r == -ENOENT
9345 if (r
>= 0) { // i.e., if r != -ENOENT
9346 ldout(cct
, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
9347 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
9349 if (count
< num_entries
) {
9350 marker
= last_added_entry
= dirent
.key
; // double assign
9351 ent_list
.emplace_back(std::move(dirent
));
9354 *is_truncated
= true;
9357 } else { // r == -ENOENT
9358 // in the case of -ENOENT, make sure we're advancing marker
9359 // for possible next call to CLSRGWIssueBucketList
9360 marker
= dirent
.key
;
9364 if (!result
.is_truncated
) {
9365 // if we reached the end of the shard read next shard
9367 marker
= rgw_obj_index_key();
9373 // suggest updates if there is any
9374 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
9375 for (; miter
!= updates
.end(); ++miter
) {
9376 if (miter
->second
.length()) {
9377 ObjectWriteOperation o
;
9378 cls_rgw_suggest_changes(o
, miter
->second
);
9379 // we don't care if we lose suggested updates, send them off blindly
9380 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9381 index_ctx
.aio_operate(miter
->first
, c
, &o
);
9386 if (last_entry
&& !ent_list
.empty()) {
9387 *last_entry
= last_added_entry
;
9391 } // RGWRados::cls_bucket_list_unordered
// Append usage-log records to the named object in the zone's usage-log pool.
9394 int RGWRados::cls_obj_usage_log_add(const string
& oid
,
9395 rgw_usage_log_info
& info
)
9397 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9400 int r
= get_raw_obj_ref(obj
, &ref
);
9405 ObjectWriteOperation op
;
9406 cls_rgw_usage_log_add(op
, info
);
9408 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Read usage-log entries for a user/bucket in [start_epoch, end_epoch) from
// the named usage-log object, resuming at read_iter and reporting truncation.
9412 int RGWRados::cls_obj_usage_log_read(const string
& oid
, const string
& user
, const string
& bucket
,
9413 uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
9414 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
,
9417 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9420 int r
= get_raw_obj_ref(obj
, &ref
);
9425 *is_truncated
= false;
9427 r
= cls_rgw_usage_log_read(ref
.ioctx
, ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
,
9428 max_entries
, read_iter
, usage
, is_truncated
);
// Trim (delete) usage-log entries for a user/bucket in the given epoch range
// from the named usage-log object.
9433 int RGWRados::cls_obj_usage_log_trim(const string
& oid
, const string
& user
, const string
& bucket
,
9434 uint64_t start_epoch
, uint64_t end_epoch
)
9436 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9439 int r
= get_raw_obj_ref(obj
, &ref
);
9444 r
= cls_rgw_usage_log_trim(ref
.ioctx
, ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
);
// Clear all usage-log entries from the named usage-log object.
9448 int RGWRados::cls_obj_usage_log_clear(string
& oid
)
9450 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9453 int r
= get_raw_obj_ref(obj
, &ref
);
9457 librados::ObjectWriteOperation op
;
9458 cls_rgw_usage_log_clear(op
);
9459 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Remove a batch of keys from a bucket's index by sending a "dir suggest"
// change set with CEPH_RGW_REMOVE for each key; the index class applies them
// opportunistically. (Entry-key assignment lines fall in an extraction gap.)
9464 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
9466 librados::IoCtx index_ctx
;
// Only log the change to the bilog when this zone records data-log entries.
9469 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
9471 int r
= open_bucket_index(bucket_info
, index_ctx
, dir_oid
);
9477 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
9478 rgw_bucket_dir_entry entry
;
9480 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
9481 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9482 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
9483 encode(entry
, updates
);
9488 r
= index_ctx
.exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
// Reconcile a bucket-index listing entry against the actual head object:
// - if the object is gone, suggest a CEPH_RGW_REMOVE for the index key;
// - otherwise refresh size/mtime/etag/content-type/owner from the head's
//   attrs, drop multipart manifest parts from the index, and suggest a
//   CEPH_RGW_UPDATE with the corrected entry.
// 'list_state' is the index's view; 'object' receives the corrected view;
// suggested repairs are appended to 'suggested_updates'.
// NOTE(review): extraction gaps hide some locals (oid/loc/etag/owner,
// main_category) and several error-return lines.
9493 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
9494 const RGWBucketInfo
& bucket_info
,
9495 rgw_bucket_dir_entry
& list_state
,
9496 rgw_bucket_dir_entry
& object
,
9497 bufferlist
& suggested_updates
)
9499 const rgw_bucket
& bucket
= bucket_info
.bucket
;
9500 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
9504 rgw_obj
obj(bucket
, list_state
.key
);
9507 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
// Warn if the locator recomputed from the key disagrees with the one the
// index stored; the stored locator is what we must use to read the head.
9509 if (loc
!= list_state
.locator
) {
9510 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
9513 io_ctx
.locator_set_key(list_state
.locator
);
9515 RGWObjState
*astate
= NULL
;
9516 RGWObjectCtx
rctx(this);
9517 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
9521 list_state
.pending_map
.clear(); // we don't need this and it inflates size
9522 if (!astate
->exists
) {
9523 /* object doesn't exist right now -- hopefully because it's
9524 * marked as !exists and got deleted */
9525 if (list_state
.exists
) {
9526 /* FIXME: what should happen now? Work out if there are any
9527 * non-bad ways this could happen (there probably are, but annoying
9530 // encode a suggested removal of that key
9531 list_state
.ver
.epoch
= io_ctx
.get_last_version();
9532 list_state
.ver
.pool
= io_ctx
.get_id();
9533 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
// Object exists: rebuild the authoritative entry from the head's state.
9538 string content_type
;
9541 object
.meta
.size
= astate
->size
;
9542 object
.meta
.accounted_size
= astate
->accounted_size
;
9543 object
.meta
.mtime
= astate
->mtime
;
9545 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
9546 if (iter
!= astate
->attrset
.end()) {
9547 etag
= rgw_bl_str(iter
->second
);
9549 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
9550 if (iter
!= astate
->attrset
.end()) {
9551 content_type
= rgw_bl_str(iter
->second
);
9553 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
9554 if (iter
!= astate
->attrset
.end()) {
9555 r
= decode_policy(iter
->second
, &owner
);
9557 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
// For multipart objects, the per-part objects in the multipart namespace
// should not appear in the index; delete their index entries.
9561 if (astate
->has_manifest
) {
9562 RGWObjManifest::obj_iterator miter
;
9563 RGWObjManifest
& manifest
= astate
->manifest
;
9564 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
9565 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
9567 rgw_raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
9569 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
9570 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
9571 r
= delete_obj_index(loc
, astate
->mtime
);
9573 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
9579 object
.meta
.etag
= etag
;
9580 object
.meta
.content_type
= content_type
;
9581 object
.meta
.owner
= owner
.get_id().to_str();
9582 object
.meta
.owner_display_name
= owner
.get_display_name();
9584 // encode suggested updates
9585 list_state
.ver
.pool
= io_ctx
.get_id();
9586 list_state
.ver
.epoch
= astate
->epoch
;
9587 list_state
.meta
.size
= object
.meta
.size
;
9588 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
9589 list_state
.meta
.mtime
= object
.meta
.mtime
;
9590 list_state
.meta
.category
= main_category
;
9591 list_state
.meta
.etag
= etag
;
9592 list_state
.meta
.content_type
= content_type
;
9593 if (astate
->obj_tag
.length() > 0)
9594 list_state
.tag
= astate
->obj_tag
.c_str();
9595 list_state
.meta
.owner
= owner
.get_id().to_str();
9596 list_state
.meta
.owner_display_name
= owner
.get_display_name();
9598 list_state
.exists
= true;
9599 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
// Fetch the directory header (per-category stats, markers) from every shard
// of a bucket index and collect them into 'headers'.
9603 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, vector
<rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
9605 librados::IoCtx index_ctx
;
9606 map
<int, string
> oids
;
9607 map
<int, struct rgw_cls_list_ret
> list_results
;
9608 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, list_results
, shard_id
, bucket_instance_ids
);
9612 r
= CLSRGWIssueGetDirHeader(index_ctx
, oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
9616 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
9617 for(; iter
!= list_results
.end(); ++iter
) {
9618 headers
.push_back(std::move(iter
->second
.dir
.header
));
// Asynchronously fetch the dir header from every shard of a bucket index;
// each shard invokes the shared callback. (*num_aio accounting falls in an
// extraction gap — presumably incremented per issued aio; confirm against
// the full source.)
9623 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
9625 librados::IoCtx index_ctx
;
9626 map
<int, string
> bucket_objs
;
9627 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
9631 map
<int, string
>::iterator iter
= bucket_objs
.begin();
9632 for (; iter
!= bucket_objs
.end(); ++iter
) {
// ctx->get() takes an extra ref per in-flight shard callback.
9633 r
= cls_rgw_get_dir_header_async(index_ctx
, iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
// Read the cls_user header (aggregate user stats) from the user's buckets
// object in the user-uid pool.
9644 int RGWRados::cls_user_get_header(const string
& user_id
, cls_user_header
*header
)
9646 string buckets_obj_id
;
9647 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9648 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9651 int r
= get_raw_obj_ref(obj
, &ref
);
9656 librados::ObjectReadOperation op
;
9658 ::cls_user_get_header(op
, header
, &rc
);
9660 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &ibl
);
// Reset the aggregated usage stats stored in the user's buckets object.
9669 int RGWRados::cls_user_reset_stats(const string
& user_id
)
9671 string buckets_obj_id
;
9672 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9673 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9676 int r
= get_raw_obj_ref(obj
, &ref
);
9681 librados::ObjectWriteOperation op
;
9682 ::cls_user_reset_stats(op
);
9683 return ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Asynchronously read the cls_user header for a user; the result is
// delivered through the supplied callback.
9686 int RGWRados::cls_user_get_header_async(const string
& user_id
, RGWGetUserHeader_CB
*ctx
)
9688 string buckets_obj_id
;
9689 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9690 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9693 int r
= get_raw_obj_ref(obj
, &ref
);
9698 r
= ::cls_user_get_header_async(ref
.ioctx
, ref
.obj
.oid
, ctx
);
// Recompute one bucket's stats from its index shard headers (Main and
// MultiMeta categories only) and push the totals into the user's stats
// object (non-additive update: cls_user_update_buckets with add=false).
9705 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj
& user_obj
,
9706 const RGWBucketInfo
& bucket_info
)
9708 vector
<rgw_bucket_dir_header
> headers
;
9709 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
9711 ldout(cct
, 20) << "cls_bucket_header() returned " << r
<< dendl
;
9715 cls_user_bucket_entry entry
;
9717 bucket_info
.bucket
.convert(&entry
.bucket
);
// Sum sizes/counts across all shards, counting only user-visible object
// categories (Main, MultiMeta).
9719 for (const auto& hiter
: headers
) {
9720 for (const auto& iter
: hiter
.stats
) {
9721 if (RGWObjCategory::Main
== iter
.first
||
9722 RGWObjCategory::MultiMeta
== iter
.first
) {
9723 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
9724 entry
.size
+= header_stats
.total_size
;
9725 entry
.size_rounded
+= header_stats
.total_size_rounded
;
9726 entry
.count
+= header_stats
.num_entries
;
9731 list
<cls_user_bucket_entry
> entries
;
9732 entries
.push_back(entry
);
9734 r
= cls_user_update_buckets(user_obj
, entries
, false);
9736 ldout(cct
, 20) << "cls_user_update_buckets() returned " << r
<< dendl
;
// Compute a bucket's aggregate stats entry by loading its instance info and
// summing the per-category stats from all index shard headers (all
// categories, unlike cls_user_sync_bucket_stats).
9743 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket
& bucket
, cls_user_bucket_entry
& entry
)
9745 vector
<rgw_bucket_dir_header
> headers
;
9746 RGWBucketInfo bucket_info
;
9747 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
9748 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
9753 ret
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
9755 ldout(cct
, 20) << "cls_bucket_header() returned " << ret
<< dendl
;
9759 bucket
.convert(&entry
.bucket
);
9761 for (const auto& hiter
: headers
) {
9762 for (const auto& iter
: hiter
.stats
) {
9763 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
9764 entry
.size
+= header_stats
.total_size
;
9765 entry
.size_rounded
+= header_stats
.total_size_rounded
;
9766 entry
.count
+= header_stats
.num_entries
;
// List a user's bucket entries from their buckets object, paginated between
// in_marker and end_marker; out_marker/truncated report resume state.
9773 int RGWRados::cls_user_list_buckets(rgw_raw_obj
& obj
,
9774 const string
& in_marker
,
9775 const string
& end_marker
,
9776 const int max_entries
,
9777 list
<cls_user_bucket_entry
>& entries
,
9778 string
* const out_marker
,
9779 bool * const truncated
)
9782 int r
= get_raw_obj_ref(obj
, &ref
);
9787 librados::ObjectReadOperation op
;
9790 cls_user_bucket_list(op
, in_marker
, end_marker
, max_entries
, entries
, out_marker
, truncated
, &rc
);
9792 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &ibl
);
// Write a batch of bucket entries into a user's stats object; 'add'
// selects additive accumulation vs. replacement semantics in cls_user.
9801 int RGWRados::cls_user_update_buckets(rgw_raw_obj
& obj
, list
<cls_user_bucket_entry
>& entries
, bool add
)
9804 int r
= get_raw_obj_ref(obj
, &ref
);
9809 librados::ObjectWriteOperation op
;
9810 cls_user_set_buckets(op
, entries
, add
);
9811 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Mark a user's stats sync as complete on their buckets object (resolves the
// user object path, then delegates).
9818 int RGWRados::complete_sync_user_stats(const rgw_user
& user_id
)
9820 string buckets_obj_id
;
9821 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9822 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9823 return cls_user_complete_stats_sync(obj
);
// Issue the cls_user "complete stats sync" write op on the given user object.
9826 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj
& obj
)
9829 int r
= get_raw_obj_ref(obj
, &ref
);
9834 librados::ObjectWriteOperation op
;
9835 ::cls_user_complete_stats_sync(op
);
9836 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Add a single bucket entry to a user's stats object (additive update).
// The push of 'entry' into 'l' falls in an extraction gap.
9843 int RGWRados::cls_user_add_bucket(rgw_raw_obj
& obj
, const cls_user_bucket_entry
& entry
)
9845 list
<cls_user_bucket_entry
> l
;
9848 return cls_user_update_buckets(obj
, l
, true);
// Remove a bucket entry from a user's stats object via cls_user.
9851 int RGWRados::cls_user_remove_bucket(rgw_raw_obj
& obj
, const cls_user_bucket
& bucket
)
9854 int r
= get_system_obj_ref(obj
, &ref
);
9859 librados::ObjectWriteOperation op
;
9860 ::cls_user_remove_bucket(op
, bucket
);
9861 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Decide whether a bucket needs resharding: no-op unless dynamic resharding
// is enabled; otherwise ask the quota handler (adding 1 pending object) for a
// suggested shard count and, if resharding is needed, queue the bucket.
9868 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
, const rgw_bucket
& bucket
,
9869 RGWQuotaInfo
& bucket_quota
)
// Dynamic resharding disabled — skip the check entirely.
9871 if (! cct
->_conf
.get_val
<bool>("rgw_dynamic_resharding")) {
9875 bool need_resharding
= false;
// An unsharded bucket (num_shards == 0) is treated as one shard.
9876 int num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
9877 uint32_t suggested_num_shards
;
9879 const uint64_t max_objs_per_shard
=
9880 cct
->_conf
.get_val
<uint64_t>("rgw_max_objs_per_shard");
// The '1' accounts for the object about to be added on top of current usage.
9882 quota_handler
->check_bucket_shards(max_objs_per_shard
, num_source_shards
,
9883 bucket_info
.owner
, bucket
, bucket_quota
,
9884 1, need_resharding
, &suggested_num_shards
);
9889 if (need_resharding
) {
9890 ldout(cct
, 20) << __func__
<< " bucket " << bucket
.name
<< " need resharding " <<
9891 " old num shards " << bucket_info
.num_shards
<< " new num shards " << suggested_num_shards
<<
9893 return add_bucket_to_reshard(bucket_info
, suggested_num_shards
);
// Queue a bucket for resharding: clamp the requested shard count to the
// configured maximum, skip if it would not grow the shard count, then build a
// cls_rgw_reshard_entry and append it to the reshard log.
9899 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo
& bucket_info
, uint32_t new_num_shards
)
9901 RGWReshard
reshard(this);
// Unsharded buckets count as a single shard for the comparison below.
9903 uint32_t num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
// Never exceed the configured per-bucket shard cap.
9905 new_num_shards
= std::min(new_num_shards
, get_max_bucket_shards());
// Shrinking (or staying equal) is not resharding — log and bail.
9906 if (new_num_shards
<= num_source_shards
) {
9907 ldout(cct
, 20) << "not resharding bucket name=" << bucket_info
.bucket
.name
<< ", orig_num=" << num_source_shards
<< ", new_num_shards=" << new_num_shards
<< dendl
;
// Populate the reshard-queue entry identifying the bucket and shard change.
9911 cls_rgw_reshard_entry entry
;
9912 entry
.time
= real_clock::now();
9913 entry
.tenant
= bucket_info
.owner
.tenant
;
9914 entry
.bucket_name
= bucket_info
.bucket
.name
;
9915 entry
.bucket_id
= bucket_info
.bucket
.bucket_id
;
9916 entry
.old_num_shards
= num_source_shards
;
9917 entry
.new_num_shards
= new_num_shards
;
9919 return reshard
.add(entry
);
// Check user and bucket quota for an incoming object of obj_size bytes.
// When check_size_only is set, the object count delta passed to the quota
// handler is 0 (size-only check); otherwise it is 1.
9922 int RGWRados::check_quota(const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
9923 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
, uint64_t obj_size
, bool check_size_only
)
9925 // if we only check size, then num_objs will be set to 0
9927 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 0, obj_size
);
9929 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
);
// Fill bucket_objects with the oids of the bucket-index shard objects:
// either the single unsharded oid (slot 0), all shards "<base>.<i>", or —
// when a specific shard_id is requested — just that one shard's oid.
// NOTE(review): the shard_id parameter and the branch structure between these
// cases are partially elided in this excerpt.
9932 void RGWRados::get_bucket_index_objects(const string
& bucket_oid_base
,
9933 uint32_t num_shards
,
9934 map
<int, string
>& bucket_objects
,
// Unsharded case: the base oid itself is the index object.
9937 bucket_objects
[0] = bucket_oid_base
;
// VLA sized for base + ".<shard>" suffix; 32 bytes is ample for an int.
9939 char buf
[bucket_oid_base
.size() + 32];
9941 for (uint32_t i
= 0; i
< num_shards
; ++i
) {
9942 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), i
);
9943 bucket_objects
[i
] = buf
;
// Reject an out-of-range shard id before emitting a single-shard oid.
9946 if ((uint32_t)shard_id
> num_shards
) {
9949 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
9950 bucket_objects
[shard_id
] = buf
;
// Build the bucket-instance id strings "<name>:<bucket_id>[:<shard>]" into
// *result, keyed by shard number: one plain id for unsharded buckets, one
// per shard otherwise, or a single entry when a specific shard_id is given.
// NOTE(review): 'buf' declaration and part of the branch structure are
// elided in this excerpt.
9955 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<int, string
> *result
)
9957 const rgw_bucket
& bucket
= bucket_info
.bucket
;
9958 string plain_id
= bucket
.name
+ ":" + bucket
.bucket_id
;
// Unsharded bucket: single instance id without a shard suffix.
9959 if (!bucket_info
.num_shards
) {
9960 (*result
)[0] = plain_id
;
9964 for (uint32_t i
= 0; i
< bucket_info
.num_shards
; ++i
) {
9965 snprintf(buf
, sizeof(buf
), ":%d", i
);
9966 (*result
)[i
] = plain_id
+ buf
;
// Validate the requested shard id before emitting a single entry for it.
9969 if ((uint32_t)shard_id
> bucket_info
.num_shards
) {
9972 snprintf(buf
, sizeof(buf
), ":%d", shard_id
);
9973 (*result
)[shard_id
] = plain_id
+ buf
;
// Compute which bucket-index shard an object key hashes to, dispatching on
// the bucket's configured shard hash type (only MOD is visible here).
// NOTE(review): the shard_id out-parameter and other switch cases/returns
// are elided in this excerpt.
9978 int RGWRados::get_target_shard_id(const RGWBucketInfo
& bucket_info
, const string
& obj_key
,
9982 switch (bucket_info
.bucket_index_shard_hash_type
) {
9983 case RGWBucketInfo::MOD
:
// Unsharded bucket: no shard computation needed.
9984 if (!bucket_info
.num_shards
) {
9989 uint32_t sid
= rgw_bucket_shard_index(obj_key
, bucket_info
.num_shards
);
9991 *shard_id
= (int)sid
;
// Produce the oid of a specific bucket-index shard object: the base oid for
// unsharded buckets, otherwise "<base>.<shard_id>".
// NOTE(review): the num_shards/shard_id branch lines are partially elided in
// this excerpt.
10001 void RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, uint32_t num_shards
,
10002 int shard_id
, string
*bucket_obj
)
10005 // By default with no sharding, we use the bucket oid as itself
10006 (*bucket_obj
) = bucket_oid_base
;
10008 char buf
[bucket_oid_base
.size() + 32];
10009 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
10010 (*bucket_obj
) = buf
;
// Resolve an object key to its bucket-index shard oid and shard id using the
// given hash type (only MOD visible here): unsharded buckets use the base
// oid; otherwise the key is hashed to shard 'sid' and oid "<base>.<sid>".
10014 int RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, const string
& obj_key
,
10015 uint32_t num_shards
, RGWBucketInfo::BIShardsHashType hash_type
, string
*bucket_obj
, int *shard_id
)
10018 switch (hash_type
) {
10019 case RGWBucketInfo::MOD
:
10021 // By default with no sharding, we use the bucket oid as itself
10022 (*bucket_obj
) = bucket_oid_base
;
10027 uint32_t sid
= rgw_bucket_shard_index(obj_key
, num_shards
);
10028 char buf
[bucket_oid_base
.size() + 32];
10029 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), sid
);
10030 (*bucket_obj
) = buf
;
10032 *shard_id
= (int)sid
;
// Return the librados client instance id of this store's RADOS handle.
10042 uint64_t RGWRados::instance_id()
10044 return get_rados_handle()->get_instance_id();
// Atomically (under bucket_id_lock) allocate and return the next bucket id.
10047 uint64_t RGWRados::next_bucket_id()
10049 Mutex::Locker
l(bucket_id_lock
);
10050 return ++max_bucket_id
;
// Construct and initialize a full RGWRados store, wiring up the requested
// background threads (gc, lc, quota, sync, reshard) and cache usage via the
// fluent setters before initialize(); the failure branch is elided here.
10053 RGWRados
*RGWStoreManager::init_storage_provider(CephContext
*cct
, bool use_gc_thread
, bool use_lc_thread
,
10054 bool quota_threads
, bool run_sync_thread
, bool run_reshard_thread
, bool use_cache
)
10056 RGWRados
*store
= new RGWRados
;
// Chained configuration; initialize(cct) returns < 0 on failure.
10058 if ((*store
).set_use_cache(use_cache
)
10059 .set_run_gc_thread(use_gc_thread
)
10060 .set_run_lc_thread(use_lc_thread
)
10061 .set_run_quota_threads(quota_threads
)
10062 .set_run_sync_thread(run_sync_thread
)
10063 .set_run_reshard_thread(run_reshard_thread
)
10064 .initialize(cct
) < 0) {
// Construct a "raw" RGWRados store: only services (init_svc(true)) and the
// RADOS connection are brought up — no background threads. Error-branch
// bodies are elided in this excerpt.
10072 RGWRados
*RGWStoreManager::init_raw_storage_provider(CephContext
*cct
)
10074 RGWRados
*store
= NULL
;
10075 store
= new RGWRados
;
10077 store
->set_context(cct
);
// 'true' requests raw/minimal service initialization.
10079 int ret
= store
->init_svc(true);
10081 ldout(cct
, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
10085 if (store
->init_rados() < 0) {
// Tear down a store created by the init_*_storage_provider() factories.
// NOTE(review): the body is entirely elided in this excerpt.
10093 void RGWStoreManager::close_storage(RGWRados
*store
)
// Accessor for the underlying librados::Rados client handle.
// NOTE(review): the body is elided in this excerpt.
10103 librados::Rados
* RGWRados::get_rados_handle()
// Asynchronously delete a raw RADOS object via cls_rgw_remove_obj and push
// the AioCompletion onto 'handles' so the caller can wait on it later.
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10108 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
10111 int ret
= get_raw_obj_ref(obj
, &ref
);
10113 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
10117 ObjectWriteOperation op
;
// Empty prefix list: no tail-object prefixes to clean up for a raw object.
10118 list
<string
> prefixes
;
10119 cls_rgw_remove_obj(op
, prefixes
);
10121 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
10122 ret
= ref
.ioctx
.aio_operate(ref
.obj
.oid
, c
, &op
);
10124 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
// Ownership of the completion passes to the caller via 'handles'.
10129 handles
.push_back(c
);
// Asynchronously delete a bucket object's head. When keep_index_consistent
// is set, the bucket index is prepared (CLS_RGW_OP_DEL) before the aio
// delete and the index entry is removed afterwards, keeping index and data
// in sync. The completion is appended to 'handles' for the caller to reap.
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10134 int RGWRados::delete_obj_aio(const rgw_obj
& obj
,
10135 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
10136 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
)
10139 int ret
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10141 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
// Pre-log the delete in the bucket index so a crash can be reconciled.
10145 if (keep_index_consistent
) {
10146 RGWRados::Bucket
bop(this, bucket_info
);
10147 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
10149 ret
= index_op
.prepare(CLS_RGW_OP_DEL
, &astate
->write_tag
);
10151 lderr(cct
) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
10156 ObjectWriteOperation op
;
10157 list
<string
> prefixes
;
10158 cls_rgw_remove_obj(op
, prefixes
);
10160 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
10161 ret
= ref
.ioctx
.aio_operate(ref
.obj
.oid
, c
, &op
);
10163 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
// Caller reaps the completion; we do not wait here.
10168 handles
.push_back(c
);
// Finish the index transaction started above.
10170 if (keep_index_consistent
) {
10171 ret
= delete_obj_index(obj
, astate
->mtime
);
10173 lderr(cct
) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;
// Decode RGWCompressionInfo from an object's attribute set. Sets
// need_decompress to true only when a compression attr exists, decodes
// cleanly, has blocks, and its type is not "none"; otherwise false.
// NOTE(review): the error-return lines inside the catch/empty-blocks
// branches are elided in this excerpt.
10180 int rgw_compression_info_from_attrset(map
<string
, bufferlist
>& attrs
, bool& need_decompress
, RGWCompressionInfo
& cs_info
) {
10181 map
<string
, bufferlist
>::iterator value
= attrs
.find(RGW_ATTR_COMPRESSION
);
10182 if (value
!= attrs
.end()) {
10183 auto bliter
= value
->second
.cbegin();
10185 decode(cs_info
, bliter
);
// Corrupt/undecodable attribute — handled in the (elided) error branch.
10186 } catch (buffer::error
& err
) {
// A compression attr with no blocks is malformed.
10189 if (cs_info
.blocks
.size() == 0) {
10192 if (cs_info
.compression_type
!= "none")
10193 need_decompress
= true;
10195 need_decompress
= false;
// No compression attribute at all: nothing to decompress.
10198 need_decompress
= false;
// Admin-socket command dispatcher for the RGW cache: handles "cache list"
// (optionally filtered), "cache inspect <target>", "cache erase <target>"
// and "cache zap", formatting output into 'out' via a ceph::Formatter.
// NOTE(review): Formatter-null checks, success returns and the zap branch
// body are elided in this excerpt.
10203 bool RGWRados::call(std::string_view command
, const cmdmap_t
& cmdmap
,
10204 std::string_view format
, bufferlist
& out
)
10206 if (command
== "cache list"sv
) {
// Optional substring filter from the command map.
10207 std::optional
<std::string
> filter
;
10208 if (auto i
= cmdmap
.find("filter"); i
!= cmdmap
.cend()) {
10209 filter
= boost::get
<std::string
>(i
->second
);
// Table output by default for list; create() may return nullptr.
10211 std::unique_ptr
<Formatter
> f(ceph::Formatter::create(format
, "table"));
10213 f
->open_array_section("cache_entries");
10214 call_list(filter
, f
.get());
10215 f
->close_section();
10219 out
.append("Unable to create Formatter.\n");
10222 } else if (command
== "cache inspect"sv
) {
10223 std::unique_ptr
<Formatter
> f(ceph::Formatter::create(format
, "json-pretty"));
10225 const auto& target
= boost::get
<std::string
>(cmdmap
.at("target"));
10226 if (call_inspect(target
, f
.get())) {
10230 out
.append("Unable to find entry "s
+ target
+ ".\n");
10234 out
.append("Unable to create Formatter.\n");
10237 } else if (command
== "cache erase"sv
) {
10238 const auto& target
= boost::get
<std::string
>(cmdmap
.at("target"));
10239 if (call_erase(target
)) {
10242 out
.append("Unable to find entry "s
+ target
+ ".\n");
10245 } else if (command
== "cache zap"sv
) {
// Forward a cache-list request (optional filter 's') to the sysobj cache
// service, which writes entries into Formatter 'f'.
10252 void RGWRados::call_list(const std::optional
<std::string
>& s
,
10253 ceph::Formatter
*f
)
10258 svc
.cache
->call_list(s
, f
);
// Forward a cache-inspect request for entry 's' to the sysobj cache service;
// returns its success flag.
10261 bool RGWRados::call_inspect(const std::string
& s
, Formatter
*f
)
10266 return svc
.cache
->call_inspect(s
, f
);
// Forward a cache-erase request for entry 's' to the sysobj cache service;
// returns its success flag.
10269 bool RGWRados::call_erase(const std::string
& s
) {
10273 return svc
.cache
->call_erase(s
);
// Forward a full cache flush ("zap") to the sysobj cache service.
10276 void RGWRados::call_zap() {
10280 svc
.cache
->call_zap();
// Build the OTP (MFA) object name for a user: "user:<user-id>".
10283 string
RGWRados::get_mfa_oid(const rgw_user
& user
)
10285 return string("user:") + user
.to_str();
// Resolve the rados ref (ioctx + oid) for a user's MFA object in the zone's
// otp_pool.
10288 int RGWRados::get_mfa_ref(const rgw_user
& user
, rgw_rados_ref
*ref
)
10290 string oid
= get_mfa_oid(user
);
10291 rgw_raw_obj
obj(svc
.zone
->get_zone_params().otp_pool
, oid
);
10292 return get_system_obj_ref(obj
, ref
);
// Verify an MFA pin for a user's OTP device 'otp_id' via the cls_otp check
// call; returns 0 on success, -EACCES when the pin does not verify.
// NOTE(review): 'ref' declaration and the early error-return lines are
// elided in this excerpt.
10295 int RGWRados::check_mfa(const rgw_user
& user
, const string
& otp_id
, const string
& pin
)
10299 int r
= get_mfa_ref(user
, &ref
);
10304 rados::cls::otp::otp_check_t result
;
10306 r
= rados::cls::otp::OTP::check(cct
, ref
.ioctx
, ref
.obj
.oid
, otp_id
, pin
, &result
);
10310 ldout(cct
, 20) << "OTP check, otp_id=" << otp_id
<< " result=" << (int)result
.result
<< dendl
;
// Map the cls_otp verdict to 0 / -EACCES for callers.
10312 return (result
.result
== rados::cls::otp::OTP_CHECK_SUCCESS
? 0 : -EACCES
);
// Stamp an MFA write op with object-version tracking and an explicit mtime:
// copies the caller's tracker (if any), derives a write version (fresh tag
// when none is known, else read version + 1), and sets mtime2 on the op.
10315 void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation
*op
,
10316 RGWObjVersionTracker
*objv_tracker
,
10317 const ceph::real_time
& mtime
)
// Local tracker; a null objv_tracker leaves it default-constructed.
10319 RGWObjVersionTracker ot
;
10321 if (objv_tracker
) {
10322 ot
= *objv_tracker
;
10325 if (ot
.write_version
.tag
.empty()) {
// No version known at all: start a brand-new write version.
10326 if (ot
.read_version
.tag
.empty()) {
10327 ot
.generate_new_write_ver(cct
);
// Otherwise bump the version we read.
10329 ot
.write_version
= ot
.read_version
;
10330 ot
.write_version
.ver
++;
10334 ot
.prepare_op_for_write(op
);
10335 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
10336 op
->mtime2(&mtime_ts
);
// Create an OTP (MFA) device for a user: version-stamped write op carrying a
// cls_otp create of 'config', executed on the user's MFA object.
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10339 int RGWRados::create_mfa(const rgw_user
& user
, const rados::cls::otp::otp_info_t
& config
,
10340 RGWObjVersionTracker
*objv_tracker
, const ceph::real_time
& mtime
)
10344 int r
= get_mfa_ref(user
, &ref
);
10349 librados::ObjectWriteOperation op
;
10350 prepare_mfa_write(&op
, objv_tracker
, mtime
);
10351 rados::cls::otp::OTP::create(&op
, config
);
10352 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
10354 ldout(cct
, 20) << "OTP create, otp_id=" << config
.id
<< " result=" << (int)r
<< dendl
;
// Remove the OTP (MFA) device 'id' from a user's MFA object via a
// version-stamped cls_otp remove op.
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10361 int RGWRados::remove_mfa(const rgw_user
& user
, const string
& id
,
10362 RGWObjVersionTracker
*objv_tracker
,
10363 const ceph::real_time
& mtime
)
10367 int r
= get_mfa_ref(user
, &ref
);
10372 librados::ObjectWriteOperation op
;
10373 prepare_mfa_write(&op
, objv_tracker
, mtime
);
10374 rados::cls::otp::OTP::remove(&op
, id
);
10375 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
10377 ldout(cct
, 20) << "OTP remove, otp_id=" << id
<< " result=" << (int)r
<< dendl
;
// Fetch a single OTP (MFA) device 'id' for a user into *result via cls_otp
// get (nullptr: no pre-built read op, executed directly).
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10384 int RGWRados::get_mfa(const rgw_user
& user
, const string
& id
, rados::cls::otp::otp_info_t
*result
)
10388 int r
= get_mfa_ref(user
, &ref
);
10393 r
= rados::cls::otp::OTP::get(nullptr, ref
.ioctx
, ref
.obj
.oid
, id
, result
);
// List all OTP (MFA) devices of a user into *result via cls_otp get_all.
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10401 int RGWRados::list_mfa(const rgw_user
& user
, list
<rados::cls::otp::otp_info_t
> *result
)
10405 int r
= get_mfa_ref(user
, &ref
);
10410 r
= rados::cls::otp::OTP::get_all(nullptr, ref
.ioctx
, ref
.obj
.oid
, result
);
// Read the OSD-side current time used for OTP validation of this user's MFA
// object (keeps TOTP windows consistent with the cluster clock).
// NOTE(review): 'ref' declaration and the error-return lines are elided in
// this excerpt.
10418 int RGWRados::otp_get_current_time(const rgw_user
& user
, ceph::real_time
*result
)
10422 int r
= get_mfa_ref(user
, &ref
);
10427 r
= rados::cls::otp::OTP::get_current_time(ref
.ioctx
, ref
.obj
.oid
, result
);
// Replace/append the OTP (MFA) entries on object 'oid' in the otp_pool with
// a version-stamped cls_otp set op. The FAILOK-flagged sub-op lets the
// (elided) reset_obj removal step fail without aborting the whole write.
// NOTE(review): 'ref' declaration, the reset_obj branch body and the
// error-return lines are elided in this excerpt.
10435 int RGWRados::set_mfa(const string
& oid
, const list
<rados::cls::otp::otp_info_t
>& entries
,
10436 bool reset_obj
, RGWObjVersionTracker
*objv_tracker
,
10437 const real_time
& mtime
)
10439 rgw_raw_obj
obj(svc
.zone
->get_zone_params().otp_pool
, oid
);
10441 int r
= get_system_obj_ref(obj
, &ref
);
10446 librados::ObjectWriteOperation op
;
// Allow the preceding sub-op to fail (e.g. object absent) without failing
// the compound operation.
10449 op
.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK
);
10452 prepare_mfa_write(&op
, objv_tracker
, mtime
);
10453 rados::cls::otp::OTP::set(&op
, entries
);
10454 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
10456 ldout(cct
, 20) << "OTP set entries.size()=" << entries
.size() << " result=" << (int)r
<< dendl
;
10463 int RGWRados::list_mfa(const string
& oid
, list
<rados::cls::otp::otp_info_t
> *result
,
10464 RGWObjVersionTracker
*objv_tracker
, ceph::real_time
*pmtime
)
10466 rgw_raw_obj
obj(svc
.zone
->get_zone_params().otp_pool
, oid
);
10468 int r
= get_system_obj_ref(obj
, &ref
);
10472 librados::ObjectReadOperation op
;
10473 struct timespec mtime_ts
;
10475 op
.stat2(nullptr, &mtime_ts
, nullptr);
10477 objv_tracker
->prepare_op_for_read(&op
);
10478 r
= rados::cls::otp::OTP::get_all(&op
, ref
.ioctx
, ref
.obj
.oid
, result
);
10483 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);