1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "include/compat.h"
8 #include <boost/algorithm/string.hpp>
11 #include <boost/container/flat_set.hpp>
12 #include <boost/format.hpp>
13 #include <boost/optional.hpp>
14 #include <boost/utility/in_place_factory.hpp>
16 #include "common/ceph_json.h"
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
22 #include "rgw_rados.h"
24 #include "rgw_cache.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_aio_throttle.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32 #include "rgw_putobj_processor.h"
34 #include "cls/rgw/cls_rgw_ops.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "cls/otp/cls_otp_client.h"
44 #include "osd/osd_types.h"
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
50 #undef fork // fails to compile RGWPeriod::fork() below
52 #include "common/Clock.h"
54 using namespace librados
;
62 #include "include/random.h"
67 #include "rgw_object_expirer_core.h"
69 #include "rgw_sync_counters.h"
70 #include "rgw_sync_trace.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
82 #include "compressor/Compressor.h"
85 #define TRACEPOINT_DEFINE
86 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
87 #include "tracing/rgw_rados.h"
88 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
89 #undef TRACEPOINT_DEFINE
91 #define tracepoint(...)
94 #define dout_context g_ceph_context
95 #define dout_subsys ceph_subsys_rgw
98 static string shadow_ns
= "shadow";
99 static string dir_oid_prefix
= ".dir.";
100 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
101 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
103 static string log_lock_name
= "rgw_log_lock";
104 static RGWObjCategory main_category
= RGWObjCategory::Main
;
105 #define RGW_USAGE_OBJ_PREFIX "usage."
107 #define dout_subsys ceph_subsys_rgw
109 const std::string MP_META_SUFFIX
= ".meta";
112 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
113 const rgw_placement_rule
& head_placement_rule
,
114 const rgw_obj
& obj
, rgw_pool
*pool
)
116 if (!zone_params
.get_head_data_pool(head_placement_rule
, obj
, pool
)) {
117 RGWZonePlacementInfo placement
;
118 if (!zone_params
.get_placement(zonegroup
.default_placement
.name
, &placement
)) {
122 if (!obj
.in_extra_data
) {
123 *pool
= placement
.get_data_pool(zonegroup
.default_placement
.storage_class
);
125 *pool
= placement
.get_data_extra_pool();
132 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
133 const rgw_placement_rule
& head_placement_rule
,
134 const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
136 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
138 return rgw_get_obj_data_pool(zonegroup
, zone_params
, head_placement_rule
, obj
, &raw_obj
->pool
);
141 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
145 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
151 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
155 store
->obj_to_raw(placement_rule
, obj
, &r
);
161 int rgw_init_ioctx(librados::Rados
*rados
, const rgw_pool
& pool
, IoCtx
& ioctx
, bool create
)
163 int r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
164 if (r
== -ENOENT
&& create
) {
165 r
= rados
->pool_create(pool
.name
.c_str());
169 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r
)
170 << " (this can be due to a pool or placement group misconfiguration, e.g."
171 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
174 if (r
< 0 && r
!= -EEXIST
) {
178 r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
183 r
= ioctx
.application_enable(pg_pool_t::APPLICATION_NAME_RGW
, false);
184 if (r
< 0 && r
!= -EOPNOTSUPP
) {
190 if (!pool
.ns
.empty()) {
191 ioctx
.set_namespace(pool
.ns
);
196 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
198 obj_version
*check_objv
= version_for_check();
201 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
204 cls_version_read(*op
, &read_version
);
207 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
209 obj_version
*check_objv
= version_for_check();
210 obj_version
*modify_version
= version_for_write();
213 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
216 if (modify_version
) {
217 cls_version_set(*op
, *modify_version
);
219 cls_version_inc(*op
);
223 void RGWObjManifest::obj_iterator::operator++()
225 if (manifest
->explicit_objs
) {
228 update_explicit_pos();
234 uint64_t obj_size
= manifest
->get_obj_size();
235 uint64_t head_size
= manifest
->get_head_size();
237 if (ofs
== obj_size
) {
241 if (manifest
->rules
.empty()) {
245 /* are we still pointing at the head? */
246 if (ofs
< head_size
) {
247 rule_iter
= manifest
->rules
.begin();
248 RGWObjManifestRule
*rule
= &rule_iter
->second
;
249 ofs
= std::min(head_size
, obj_size
);
252 stripe_size
= std::min(obj_size
- ofs
, rule
->stripe_max_size
);
253 if (rule
->part_size
> 0) {
254 stripe_size
= std::min(stripe_size
, rule
->part_size
);
260 RGWObjManifestRule
*rule
= &rule_iter
->second
;
262 stripe_ofs
+= rule
->stripe_max_size
;
264 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule
->part_size
<< " rules.size()=" << manifest
->rules
.size() << dendl
;
266 if (rule
->part_size
> 0) {
267 /* multi part, multi stripes object */
269 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
271 if (stripe_ofs
>= part_ofs
+ rule
->part_size
) {
272 /* moved to the next part */
274 part_ofs
+= rule
->part_size
;
275 stripe_ofs
= part_ofs
;
277 bool last_rule
= (next_rule_iter
== manifest
->rules
.end());
278 /* move to the next rule? */
279 if (!last_rule
&& stripe_ofs
>= next_rule_iter
->second
.start_ofs
) {
280 rule_iter
= next_rule_iter
;
281 last_rule
= (next_rule_iter
== manifest
->rules
.end());
285 cur_part_id
= rule_iter
->second
.start_part_num
;
290 rule
= &rule_iter
->second
;
293 stripe_size
= std::min(rule
->part_size
- (stripe_ofs
- part_ofs
), rule
->stripe_max_size
);
296 cur_override_prefix
= rule
->override_prefix
;
299 if (ofs
> obj_size
) {
305 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs
<< " stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
309 int RGWObjManifest::generator::create_begin(CephContext
*cct
, RGWObjManifest
*_m
,
310 const rgw_placement_rule
& head_placement_rule
,
311 const rgw_placement_rule
*tail_placement_rule
,
312 const rgw_bucket
& _b
, const rgw_obj
& _obj
)
316 if (!tail_placement_rule
) {
317 manifest
->set_tail_placement(head_placement_rule
, _b
);
319 rgw_placement_rule new_tail_rule
= *tail_placement_rule
;
320 new_tail_rule
.inherit_from(head_placement_rule
);
321 manifest
->set_tail_placement(new_tail_rule
, _b
);
324 manifest
->set_head(head_placement_rule
, _obj
, 0);
327 if (manifest
->get_prefix().empty()) {
329 gen_rand_alphanumeric(cct
, buf
, sizeof(buf
) - 1);
331 string oid_prefix
= ".";
332 oid_prefix
.append(buf
);
333 oid_prefix
.append("_");
335 manifest
->set_prefix(oid_prefix
);
338 bool found
= manifest
->get_rule(0, &rule
);
340 derr
<< "ERROR: manifest->get_rule() could not find rule" << dendl
;
344 uint64_t head_size
= manifest
->get_head_size();
347 cur_stripe_size
= head_size
;
349 cur_stripe_size
= rule
.stripe_max_size
;
352 cur_part_id
= rule
.start_part_num
;
354 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, 0, NULL
, &cur_obj
);
356 // Normal object which not generated through copy operation
357 manifest
->set_tail_instance(_obj
.key
.instance
);
359 manifest
->update_iterators();
364 int RGWObjManifest::generator::create_next(uint64_t ofs
)
366 if (ofs
< last_ofs
) /* only going forward */
369 uint64_t max_head_size
= manifest
->get_max_head_size();
371 if (ofs
< max_head_size
) {
372 manifest
->set_head_size(ofs
);
375 if (ofs
>= max_head_size
) {
376 manifest
->set_head_size(max_head_size
);
377 cur_stripe
= (ofs
- max_head_size
) / rule
.stripe_max_size
;
378 cur_stripe_size
= rule
.stripe_max_size
;
380 if (cur_part_id
== 0 && max_head_size
> 0) {
386 manifest
->set_obj_size(ofs
);
388 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, ofs
, NULL
, &cur_obj
);
390 manifest
->update_iterators();
395 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_begin()
400 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_end()
405 RGWObjManifest::obj_iterator
RGWObjManifest::obj_find(uint64_t ofs
)
407 if (ofs
> obj_size
) {
410 RGWObjManifest::obj_iterator
iter(this);
415 int RGWObjManifest::append(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
,
416 const RGWZoneParams
& zone_params
)
418 if (explicit_objs
|| m
.explicit_objs
) {
419 return append_explicit(m
, zonegroup
, zone_params
);
427 string override_prefix
;
429 if (prefix
.empty()) {
433 if (prefix
!= m
.prefix
) {
434 override_prefix
= m
.prefix
;
437 map
<uint64_t, RGWObjManifestRule
>::iterator miter
= m
.rules
.begin();
438 if (miter
== m
.rules
.end()) {
439 return append_explicit(m
, zonegroup
, zone_params
);
442 for (; miter
!= m
.rules
.end(); ++miter
) {
443 map
<uint64_t, RGWObjManifestRule
>::reverse_iterator last_rule
= rules
.rbegin();
445 RGWObjManifestRule
& rule
= last_rule
->second
;
447 if (rule
.part_size
== 0) {
448 rule
.part_size
= obj_size
- rule
.start_ofs
;
451 RGWObjManifestRule
& next_rule
= miter
->second
;
452 if (!next_rule
.part_size
) {
453 next_rule
.part_size
= m
.obj_size
- next_rule
.start_ofs
;
456 string rule_prefix
= prefix
;
457 if (!rule
.override_prefix
.empty()) {
458 rule_prefix
= rule
.override_prefix
;
461 string next_rule_prefix
= m
.prefix
;
462 if (!next_rule
.override_prefix
.empty()) {
463 next_rule_prefix
= next_rule
.override_prefix
;
466 if (rule
.part_size
!= next_rule
.part_size
||
467 rule
.stripe_max_size
!= next_rule
.stripe_max_size
||
468 rule_prefix
!= next_rule_prefix
) {
469 if (next_rule_prefix
!= prefix
) {
470 append_rules(m
, miter
, &next_rule_prefix
);
472 append_rules(m
, miter
, NULL
);
477 uint64_t expected_part_num
= rule
.start_part_num
+ 1;
478 if (rule
.part_size
> 0) {
479 expected_part_num
= rule
.start_part_num
+ (obj_size
+ next_rule
.start_ofs
- rule
.start_ofs
) / rule
.part_size
;
482 if (expected_part_num
!= next_rule
.start_part_num
) {
483 append_rules(m
, miter
, NULL
);
488 set_obj_size(obj_size
+ m
.obj_size
);
493 int RGWObjManifest::append(RGWObjManifest
& m
, RGWSI_Zone
*zone_svc
)
495 return append(m
, zone_svc
->get_zonegroup(), zone_svc
->get_zone_params());
498 void RGWObjManifest::append_rules(RGWObjManifest
& m
, map
<uint64_t, RGWObjManifestRule
>::iterator
& miter
,
499 string
*override_prefix
)
501 for (; miter
!= m
.rules
.end(); ++miter
) {
502 RGWObjManifestRule rule
= miter
->second
;
503 rule
.start_ofs
+= obj_size
;
505 rule
.override_prefix
= *override_prefix
;
506 rules
[rule
.start_ofs
] = rule
;
510 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
515 obj_iterator iter
= obj_begin();
517 while (iter
!= obj_end()) {
518 RGWObjManifestPart
& part
= objs
[iter
.get_stripe_ofs()];
519 const rgw_obj_select
& os
= iter
.get_location();
520 const rgw_raw_obj
& raw_loc
= os
.get_raw_obj(zonegroup
, zone_params
);
523 uint64_t ofs
= iter
.get_stripe_ofs();
528 rgw_raw_obj_to_obj(tail_placement
.bucket
, raw_loc
, &part
.loc
);
531 uint64_t next_ofs
= iter
.get_stripe_ofs();
533 part
.size
= next_ofs
- ofs
;
536 explicit_objs
= true;
541 int RGWObjManifest::append_explicit(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
543 if (!explicit_objs
) {
544 convert_to_explicit(zonegroup
, zone_params
);
546 if (!m
.explicit_objs
) {
547 m
.convert_to_explicit(zonegroup
, zone_params
);
549 map
<uint64_t, RGWObjManifestPart
>::iterator iter
;
550 uint64_t base
= obj_size
;
551 for (iter
= m
.objs
.begin(); iter
!= m
.objs
.end(); ++iter
) {
552 RGWObjManifestPart
& part
= iter
->second
;
553 objs
[base
+ iter
->first
] = part
;
555 obj_size
+= m
.obj_size
;
560 bool RGWObjManifest::get_rule(uint64_t ofs
, RGWObjManifestRule
*rule
)
566 map
<uint64_t, RGWObjManifestRule
>::iterator iter
= rules
.upper_bound(ofs
);
567 if (iter
!= rules
.begin()) {
571 *rule
= iter
->second
;
576 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
578 write_version
.ver
= 1;
581 write_version
.tag
.clear();
582 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
585 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
587 RGWHTTPManager http_manager
;
590 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
591 http_manager(store
->ctx(), completion_mgr
) {
592 http_manager
.start();
595 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
596 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
600 list
<RGWCoroutinesStack
*> stacks
;
601 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
602 RGWRESTConn
*conn
= iter
->second
;
603 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
604 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
606 stacks
.push_back(stack
);
612 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
614 RGWHTTPManager http_manager
;
617 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
618 http_manager(store
->ctx(), completion_mgr
) {
619 http_manager
.start();
622 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
623 rgw_http_param_pair pairs
[] = { { "type", "data" },
625 { "source-zone", store
->svc
.zone
->get_zone_params().get_id().c_str() },
628 list
<RGWCoroutinesStack
*> stacks
;
629 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
630 RGWRESTConn
*conn
= iter
->second
;
631 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
632 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
634 stacks
.push_back(stack
);
640 /* class RGWRadosThread */
642 void RGWRadosThread::start()
644 worker
= new Worker(cct
, this);
645 worker
->create(thread_name
.c_str());
648 void RGWRadosThread::stop()
660 void *RGWRadosThread::Worker::entry() {
661 uint64_t msec
= processor
->interval_msec();
662 utime_t interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
665 utime_t start
= ceph_clock_now();
666 int r
= processor
->process();
668 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
671 if (processor
->going_down())
674 utime_t end
= ceph_clock_now();
677 uint64_t cur_msec
= processor
->interval_msec();
678 if (cur_msec
!= msec
) { /* was it reconfigured? */
680 interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
685 continue; // next round
687 utime_t wait_time
= interval
;
690 wait_interval(wait_time
);
694 } while (!processor
->going_down());
699 class RGWMetaNotifier
: public RGWRadosThread
{
700 RGWMetaNotifierManager notify_mgr
;
701 RGWMetadataLog
*const log
;
703 uint64_t interval_msec() override
{
704 return cct
->_conf
->rgw_md_notify_interval_msec
;
706 void stop_process() override
{
710 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
711 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
713 int process() override
;
716 int RGWMetaNotifier::process()
720 log
->read_clear_modified(shards
);
722 if (shards
.empty()) {
726 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
727 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
730 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_conn_map(), shards
);
735 class RGWDataNotifier
: public RGWRadosThread
{
736 RGWDataNotifierManager notify_mgr
;
738 uint64_t interval_msec() override
{
739 return cct
->_conf
.get_val
<int64_t>("rgw_data_notify_interval_msec");
741 void stop_process() override
{
745 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
747 int process() override
;
750 int RGWDataNotifier::process()
752 if (!store
->data_log
) {
756 map
<int, set
<string
> > shards
;
758 store
->data_log
->read_clear_modified(shards
);
760 if (shards
.empty()) {
764 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
765 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
768 notify_mgr
.notify_all(store
->svc
.zone
->get_zone_data_notify_to_map(), shards
);
773 class RGWSyncProcessorThread
: public RGWRadosThread
{
775 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
776 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
777 ~RGWSyncProcessorThread() override
{}
778 int init() override
= 0 ;
779 int process() override
= 0;
782 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
784 RGWMetaSyncStatusManager sync
;
786 uint64_t interval_msec() override
{
787 return 0; /* no interval associated, it'll run once until stopped */
789 void stop_process() override
{
793 RGWMetaSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
)
794 : RGWSyncProcessorThread(_store
, "meta-sync"), sync(_store
, async_rados
) {}
796 void wakeup_sync_shards(set
<int>& shard_ids
) {
797 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
801 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
803 int init() override
{
804 int ret
= sync
.init();
806 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
812 int process() override
{
818 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
820 PerfCountersRef counters
;
821 RGWDataSyncStatusManager sync
;
824 uint64_t interval_msec() override
{
826 return 0; /* no interval associated, it'll run once until stopped */
828 #define DATA_SYNC_INIT_WAIT_SEC 20
829 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
832 void stop_process() override
{
836 RGWDataSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
,
837 const RGWZone
* source_zone
)
838 : RGWSyncProcessorThread(_store
, "data-sync"),
839 counters(sync_counters::build(store
->ctx(), std::string("data-sync-from-") + source_zone
->name
)),
840 sync(_store
, async_rados
, source_zone
->id
, counters
.get()),
841 initialized(false) {}
843 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
844 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
845 sync
.wakeup(iter
->first
, iter
->second
);
848 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
850 int init() override
{
854 int process() override
{
855 while (!initialized
) {
859 int ret
= sync
.init();
872 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
, DoutPrefixProvider
874 RGWCoroutinesManager crs
;
876 rgw::BucketTrimManager
*bucket_trim
;
878 const utime_t trim_interval
;
880 uint64_t interval_msec() override
{ return 0; }
881 void stop_process() override
{ crs
.stop(); }
883 RGWSyncLogTrimThread(RGWRados
*store
, rgw::BucketTrimManager
*bucket_trim
,
885 : RGWSyncProcessorThread(store
, "sync-log-trim"),
886 crs(store
->ctx(), store
->get_cr_registry()), store(store
),
887 bucket_trim(bucket_trim
),
888 http(store
->ctx(), crs
.get_completion_mgr()),
889 trim_interval(interval
, 0)
892 int init() override
{
895 int process() override
{
896 list
<RGWCoroutinesStack
*> stacks
;
897 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
898 meta
->call(create_meta_log_trim_cr(this, store
, &http
,
899 cct
->_conf
->rgw_md_log_max_shards
,
901 stacks
.push_back(meta
);
903 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
904 data
->call(create_data_log_trim_cr(store
, &http
,
905 cct
->_conf
->rgw_data_log_num_shards
,
907 stacks
.push_back(data
);
909 auto bucket
= new RGWCoroutinesStack(store
->ctx(), &crs
);
910 bucket
->call(bucket_trim
->create_bucket_trim_cr(&http
));
911 stacks
.push_back(bucket
);
917 // implements DoutPrefixProvider
918 CephContext
*get_cct() const override
{ return store
->ctx(); }
919 unsigned get_subsys() const
924 std::ostream
& gen_prefix(std::ostream
& out
) const
926 return out
<< "sync log trim: ";
931 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
933 Mutex::Locker
l(meta_sync_thread_lock
);
934 if (meta_sync_processor_thread
) {
935 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
939 void RGWRados::wakeup_data_sync_shards(const string
& source_zone
, map
<int, set
<string
> >& shard_ids
)
941 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
942 Mutex::Locker
l(data_sync_thread_lock
);
943 map
<string
, RGWDataSyncProcessorThread
*>::iterator iter
= data_sync_processor_threads
.find(source_zone
);
944 if (iter
== data_sync_processor_threads
.end()) {
945 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
949 RGWDataSyncProcessorThread
*thread
= iter
->second
;
951 thread
->wakeup_sync_shards(shard_ids
);
954 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
956 Mutex::Locker
l(meta_sync_thread_lock
);
957 if (meta_sync_processor_thread
) {
958 return meta_sync_processor_thread
->get_manager();
963 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const std::string
& source_zone
)
965 Mutex::Locker
l(data_sync_thread_lock
);
966 auto thread
= data_sync_processor_threads
.find(source_zone
);
967 if (thread
== data_sync_processor_threads
.end()) {
970 return thread
->second
->get_manager();
973 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
976 int r
= open_pool_ctx(pool
, ioctx
);
978 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
983 r
= ioctx
.pool_requires_alignment2(&requires
);
985 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
996 r
= ioctx
.pool_required_alignment2(&align
);
998 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
1003 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
1009 void RGWRados::get_max_aligned_size(uint64_t size
, uint64_t alignment
, uint64_t *max_size
)
1011 if (alignment
== 0) {
1016 if (size
<= alignment
) {
1017 *max_size
= alignment
;
1021 *max_size
= size
- (size
% alignment
);
1024 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
, uint64_t *palignment
)
1027 int r
= get_required_alignment(pool
, &alignment
);
1033 *palignment
= alignment
;
1036 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
1038 get_max_aligned_size(config_chunk_size
, alignment
, max_chunk_size
);
1040 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
1045 int RGWRados::get_max_chunk_size(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
,
1046 uint64_t *max_chunk_size
, uint64_t *palignment
)
1049 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
1050 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
1053 return get_max_chunk_size(pool
, max_chunk_size
, palignment
);
1056 class RGWIndexCompletionManager
;
1058 struct complete_op_data
{
1059 Mutex lock
{"complete_op_data"};
1060 AioCompletion
*rados_completion
{nullptr};
1061 int manager_shard_id
{-1};
1062 RGWIndexCompletionManager
*manager
{nullptr};
1066 rgw_bucket_entry_ver ver
;
1067 cls_rgw_obj_key key
;
1068 rgw_bucket_dir_entry_meta dir_meta
;
1069 list
<cls_rgw_obj_key
> remove_objs
;
1072 rgw_zone_set zones_trace
;
1074 bool stopped
{false};
1077 Mutex::Locker
l(lock
);
1082 class RGWIndexCompletionThread
: public RGWRadosThread
{
1085 uint64_t interval_msec() override
{
1089 list
<complete_op_data
*> completions
;
1091 Mutex completions_lock
;
1093 RGWIndexCompletionThread(RGWRados
*_store
)
1094 : RGWRadosThread(_store
, "index-complete"), store(_store
), completions_lock("RGWIndexCompletionThread::completions_lock") {}
1096 int process() override
;
1098 void add_completion(complete_op_data
*completion
) {
1100 Mutex::Locker
l(completions_lock
);
1101 completions
.push_back(completion
);
1108 int RGWIndexCompletionThread::process()
1110 list
<complete_op_data
*> comps
;
1113 Mutex::Locker
l(completions_lock
);
1114 completions
.swap(comps
);
1117 for (auto c
: comps
) {
1118 std::unique_ptr
<complete_op_data
> up
{c
};
1123 ldout(store
->ctx(), 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
1125 RGWRados::BucketShard
bs(store
);
1126 RGWBucketInfo bucket_info
;
1128 int r
= bs
.init(c
->obj
.bucket
, c
->obj
, &bucket_info
);
1130 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
1131 /* not much to do */
1135 r
= store
->guard_reshard(&bs
, c
->obj
, bucket_info
,
1136 [&](RGWRados::BucketShard
*bs
) -> int {
1137 librados::ObjectWriteOperation o
;
1138 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
1139 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
1140 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
1141 return bs
->index_ctx
.operate(bs
->bucket_obj
, &o
);
1144 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
1145 /* ignoring error, can't do anything about it */
1148 r
= store
->data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
1150 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
1157 class RGWIndexCompletionManager
{
1158 RGWRados
*store
{nullptr};
1159 vector
<Mutex
*> locks
;
1160 vector
<set
<complete_op_data
*> > completions
;
1162 RGWIndexCompletionThread
*completion_thread
{nullptr};
1166 std::atomic
<int> cur_shard
{0};
1170 RGWIndexCompletionManager(RGWRados
*_store
) : store(_store
) {
1171 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
1173 for (int i
= 0; i
< num_shards
; i
++) {
1175 snprintf(buf
, sizeof(buf
), "RGWIndexCompletionManager::lock::%d", i
);
1176 locks
.push_back(new Mutex(buf
));
1179 completions
.resize(num_shards
);
1181 ~RGWIndexCompletionManager() {
1184 for (auto l
: locks
) {
1190 int result
= cur_shard
% num_shards
;
1195 void create_completion(const rgw_obj
& obj
,
1196 RGWModifyOp op
, string
& tag
,
1197 rgw_bucket_entry_ver
& ver
,
1198 const cls_rgw_obj_key
& key
,
1199 rgw_bucket_dir_entry_meta
& dir_meta
,
1200 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
1202 rgw_zone_set
*zones_trace
,
1203 complete_op_data
**result
);
1204 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
1207 completion_thread
= new RGWIndexCompletionThread(store
);
1208 int ret
= completion_thread
->init();
1212 completion_thread
->start();
1216 if (completion_thread
) {
1217 completion_thread
->stop();
1218 delete completion_thread
;
1221 for (int i
= 0; i
< num_shards
; ++i
) {
1222 Mutex::Locker
l(*locks
[i
]);
1223 for (auto c
: completions
[i
]) {
1227 completions
.clear();
1231 static void obj_complete_cb(completion_t cb
, void *arg
)
1233 complete_op_data
*completion
= (complete_op_data
*)arg
;
1234 completion
->lock
.Lock();
1235 if (completion
->stopped
) {
1236 completion
->lock
.Unlock(); /* can drop lock, no one else is referencing us */
1240 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
1241 completion
->lock
.Unlock();
1248 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
1249 RGWModifyOp op
, string
& tag
,
1250 rgw_bucket_entry_ver
& ver
,
1251 const cls_rgw_obj_key
& key
,
1252 rgw_bucket_dir_entry_meta
& dir_meta
,
1253 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
1255 rgw_zone_set
*zones_trace
,
1256 complete_op_data
**result
)
1258 complete_op_data
*entry
= new complete_op_data
;
1260 int shard_id
= next_shard();
1262 entry
->manager_shard_id
= shard_id
;
1263 entry
->manager
= this;
1269 entry
->dir_meta
= dir_meta
;
1270 entry
->log_op
= log_op
;
1271 entry
->bilog_op
= bilog_op
;
1274 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
1275 entry
->remove_objs
.push_back(*iter
);
1280 entry
->zones_trace
= *zones_trace
;
1282 entry
->zones_trace
.insert(store
->svc
.zone
->get_zone().id
);
1287 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, NULL
, obj_complete_cb
);
1289 Mutex::Locker
l(*locks
[shard_id
]);
1290 completions
[shard_id
].insert(entry
);
1293 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
1295 int shard_id
= arg
->manager_shard_id
;
1297 Mutex::Locker
l(*locks
[shard_id
]);
1299 auto& comps
= completions
[shard_id
];
1301 auto iter
= comps
.find(arg
);
1302 if (iter
== comps
.end()) {
1309 int r
= rados_aio_get_return_value(cb
);
1310 if (r
!= -ERR_BUSY_RESHARDING
) {
1313 completion_thread
->add_completion(arg
);
1317 void RGWRados::finalize()
1319 cct
->get_admin_socket()->unregister_commands(this);
1320 if (run_sync_thread
) {
1321 Mutex::Locker
l(meta_sync_thread_lock
);
1322 meta_sync_processor_thread
->stop();
1324 Mutex::Locker
dl(data_sync_thread_lock
);
1325 for (auto iter
: data_sync_processor_threads
) {
1326 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1329 if (sync_log_trimmer
) {
1330 sync_log_trimmer
->stop();
1334 async_rados
->stop();
1336 if (run_sync_thread
) {
1337 delete meta_sync_processor_thread
;
1338 meta_sync_processor_thread
= NULL
;
1339 Mutex::Locker
dl(data_sync_thread_lock
);
1340 for (auto iter
: data_sync_processor_threads
) {
1341 RGWDataSyncProcessorThread
*thread
= iter
.second
;
1344 data_sync_processor_threads
.clear();
1345 delete sync_log_trimmer
;
1346 sync_log_trimmer
= nullptr;
1347 bucket_trim
= boost::none
;
1349 if (meta_notifier
) {
1350 meta_notifier
->stop();
1351 delete meta_notifier
;
1353 if (data_notifier
) {
1354 data_notifier
->stop();
1355 delete data_notifier
;
1372 RGWQuotaHandler::free_handler(quota_handler
);
1381 delete obj_tombstone_cache
;
1383 if (reshard_wait
.get()) {
1384 reshard_wait
->stop();
1385 reshard_wait
.reset();
1388 if (run_reshard_thread
) {
1389 reshard
->stop_processor();
1392 delete index_completion_manager
;
1396 * Initialize the RADOS instance and prepare to do other ops
1397 * Returns 0 on success, -ERR# on failure.
// RGWRados::init_rados(): registers admin-socket commands, creates the
// configured number of librados::Rados handles, sets up the coroutine
// manager registry ("cr dump" admin command), the metadata manager and the
// data-changes log, then swaps the handle vector into the member `rados`.
// NOTE(review): extraction artifact — embedded numbering jumps (1404→1407,
// 1416→1426, ...), so error-handling/return lines are missing from this view.
1399 int RGWRados::init_rados()
1402 auto admin_socket
= cct
->get_admin_socket();
1403 for (auto cmd
: admin_commands
) {
1404 int r
= admin_socket
->register_command(cmd
[0], cmd
[1], this,
1407 lderr(cct
) << "ERROR: fail to register admin socket command (r=" << r
// One librados handle per rgw_num_rados_handles; each is initialized
// with this process's CephContext.
1413 auto handles
= std::vector
<librados::Rados
>{static_cast<size_t>(cct
->_conf
->rgw_num_rados_handles
)};
1415 for (auto& r
: handles
) {
1416 ret
= r
.init_with_context(cct
);
1426 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
1427 new RGWCoroutinesManagerRegistry(cct
)};
1428 ret
= crs
->hook_to_admin_command("cr dump");
1433 meta_mgr
= new RGWMetadataManager(cct
, this);
1434 data_log
= new RGWDataChangesLog(cct
, this);
1435 cr_registry
= crs
.release();
1437 std::swap(handles
, rados
);
// register_to_service_map(): builds a metadata map (handle count, zonegroup
// id/name, zone id/name, plus caller-provided entries), strips a leading
// "rgw." from the daemon name, and registers this daemon with the cluster
// service map via the first rados handle.
// NOTE(review): extraction artifact — lines 1442, 1452, 1454, 1456-1460
// (braces / error branch / return) are missing from this view.
1441 int RGWRados::register_to_service_map(const string
& daemon_type
, const map
<string
, string
>& meta
)
1443 map
<string
,string
> metadata
= meta
;
1444 metadata
["num_handles"] = stringify(rados
.size());
1445 metadata
["zonegroup_id"] = svc
.zone
->get_zonegroup().get_id();
1446 metadata
["zonegroup_name"] = svc
.zone
->get_zonegroup().get_name();
1447 metadata
["zone_name"] = svc
.zone
->zone_name();
1448 metadata
["zone_id"] = svc
.zone
->zone_id();
// Daemon name comes from the process's entity name; drop the "rgw." prefix.
1449 string name
= cct
->_conf
->name
.get_id();
1450 if (name
.compare(0, 4, "rgw.") == 0) {
1451 name
= name
.substr(4);
1453 int ret
= rados
[0].service_daemon_register(daemon_type
, name
, metadata
);
1455 ldout(cct
, 0) << "ERROR: service_daemon_register() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
// update_service_map(): pushes an updated status map to the cluster service
// map through the first rados handle; logs on failure.
// NOTE(review): extraction artifact — the braces, the `if (ret < 0)` guard
// around the log line, and the return are missing from this view.
1462 int RGWRados::update_service_map(std::map
<std::string
, std::string
>&& status
)
1464 int ret
= rados
[0].service_daemon_update_status(move(status
));
1466 ldout(cct
, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
1474 * Initialize the RADOS instance and prepare to do other ops
1475 * Returns 0 on success, -ERR# on failure.
// init_complete(): second-stage initialization after init_rados()/init_svc().
// Creates the sync-module instance, period puller/history, opens the core
// pools, starts GC/LC/object-expirer processors, spins up metadata and data
// sync threads (when this zone syncs), quota handling, bucket-index shard
// limits, the bucket-info cache, tombstone cache, reshard machinery and the
// index completion manager.
// NOTE(review): extraction artifact — statements are split across lines and
// the embedded numbering jumps (e.g. 1491→1497, 1501→1505), so error-return
// branches and braces are missing from this view.
1477 int RGWRados::init_complete()
1482 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1484 auto& zone_public_config
= svc
.zone
->get_zone();
1485 ret
= svc
.sync_modules
->get_manager()->create_instance(cct
, zone_public_config
.tier_type
, svc
.zone
->get_zone_params().tier_config
, &sync_module
);
1487 lderr(cct
) << "ERROR: failed to init sync module instance, ret=" << ret
<< dendl
;
1488 if (ret
== -ENOENT
) {
1489 lderr(cct
) << "ERROR: " << zone_public_config
.tier_type
1490 << " sync module does not exist. valid sync modules: "
1491 << svc
.sync_modules
->get_manager()->get_registered_module_names()
1497 period_puller
.reset(new RGWPeriodPuller(this));
1498 period_history
.reset(new RGWPeriodHistory(cct
, period_puller
.get(),
1499 svc
.zone
->get_current_period()));
// Open the fixed set of service pools (root/gc/lc/objexp/reshard).
1501 ret
= open_root_pool_ctx();
1505 ret
= open_gc_pool_ctx();
1509 ret
= open_lc_pool_ctx();
1513 ret
= open_objexp_pool_ctx();
1517 ret
= open_reshard_pool_ctx();
1521 pools_initialized
= true;
1524 gc
->initialize(cct
, this);
1526 obj_expirer
= new RGWObjectExpirer(this);
1528 if (use_gc_thread
) {
1529 gc
->start_processor();
1530 obj_expirer
->start_processor();
1533 auto& current_period
= svc
.zone
->get_current_period();
1534 auto& zonegroup
= svc
.zone
->get_zonegroup();
1535 auto& zone_params
= svc
.zone
->get_zone_params();
1536 auto& zone
= svc
.zone
->get_zone();
1538 /* no point of running sync thread if we don't have a master zone configured
1539 or there is no rest_master_conn */
1540 if (zonegroup
.master_zone
.empty() || !svc
.zone
->get_master_conn()
1541 || current_period
.get_id().empty()) {
1542 run_sync_thread
= false;
1545 if (run_sync_thread
) {
1546 // initialize the log period history
1547 meta_mgr
->init_oldest_log_period();
1550 async_rados
= new RGWAsyncRadosProcessor(this, cct
->_conf
->rgw_num_async_rados_threads
);
1551 async_rados
->start();
1553 ret
= meta_mgr
->init(current_period
.get_id());
1555 lderr(cct
) << "ERROR: failed to initialize metadata log: "
1556 << cpp_strerror(-ret
) << dendl
;
1560 if (svc
.zone
->is_meta_master()) {
1561 auto md_log
= meta_mgr
->get_log(current_period
.get_id());
1562 meta_notifier
= new RGWMetaNotifier(this, md_log
);
1563 meta_notifier
->start();
1566 /* init it anyway, might run sync through radosgw-admin explicitly */
1567 sync_tracer
= new RGWSyncTraceManager(cct
, cct
->_conf
->rgw_sync_trace_history_size
);
1568 sync_tracer
->init(this);
1569 ret
= sync_tracer
->hook_to_admin_command();
// Sync-thread startup: sanity-warn on placement targets, then start the
// meta sync thread, bucket trim manager, per-source-zone data sync
// threads, the sync log trimmer and the data notifier.
1574 if (run_sync_thread
) {
1575 for (const auto &pt
: zonegroup
.placement_targets
) {
1576 if (zone_params
.placement_pools
.find(pt
.second
.name
)
1577 == zone_params
.placement_pools
.end()){
1578 ldout(cct
, 0) << "WARNING: This zone does not contain the placement target "
1579 << pt
.second
.name
<< " present in zonegroup" << dendl
;
1582 Mutex::Locker
l(meta_sync_thread_lock
);
1583 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this, async_rados
);
1584 ret
= meta_sync_processor_thread
->init();
1586 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
1589 meta_sync_processor_thread
->start();
1591 // configure the bucket trim manager
1592 rgw::BucketTrimConfig config
;
1593 rgw::configure_bucket_trim(cct
, config
);
1595 bucket_trim
.emplace(this, config
);
1596 ret
= bucket_trim
->init();
1598 ldout(cct
, 0) << "ERROR: failed to start bucket trim manager" << dendl
;
1601 data_log
->set_observer(&*bucket_trim
);
1603 Mutex::Locker
dl(data_sync_thread_lock
);
1604 for (auto source_zone
: svc
.zone
->get_data_sync_source_zones()) {
1605 ldout(cct
, 5) << "starting data sync thread for zone " << source_zone
->name
<< dendl
;
1606 auto *thread
= new RGWDataSyncProcessorThread(this, async_rados
, source_zone
);
1607 ret
= thread
->init();
1609 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
1613 data_sync_processor_threads
[source_zone
->id
] = thread
;
1615 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
1617 sync_log_trimmer
= new RGWSyncLogTrimThread(this, &*bucket_trim
, interval
);
1618 ret
= sync_log_trimmer
->init();
1620 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
1623 sync_log_trimmer
->start();
1626 data_notifier
= new RGWDataNotifier(this);
1627 data_notifier
->start();
1630 lc
->initialize(cct
, this);
1633 lc
->start_processor();
1635 quota_handler
= RGWQuotaHandler::generate_handler(this, quota_threads
);
// Bucket index shard count: config override wins over the zone default,
// clamped to the implementation maximum.
1637 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
1638 zone
.bucket_index_max_shards
);
1639 if (bucket_index_max_shards
> get_max_bucket_shards()) {
1640 bucket_index_max_shards
= get_max_bucket_shards();
1641 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
1642 << get_max_bucket_shards() << dendl
;
1644 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
1646 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
1647 binfo_cache
->init(svc
.cache
);
1649 bool need_tombstone_cache
= !svc
.zone
->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1651 if (need_tombstone_cache
) {
1652 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
1655 reshard_wait
= std::make_shared
<RGWReshardWait
>();
1657 reshard
= new RGWReshard(this);
1659 /* only the master zone in the zonegroup reshards buckets */
1660 run_reshard_thread
= run_reshard_thread
&& (zonegroup
.master_zone
== zone
.id
);
1661 if (run_reshard_thread
) {
1662 reshard
->start_processor();
1665 index_completion_manager
= new RGWIndexCompletionManager(this);
1666 ret
= index_completion_manager
->start();
// init_svc(): initializes the service stack, either "raw" (init_raw) or
// full (init), both with the cache flag.
// NOTE(review): extraction artifact — the branch line selecting between the
// two returns (presumably `if (raw)`) is missing from this view (1671→1674).
1671 int RGWRados::init_svc(bool raw
)
1674 return svc
.init_raw(cct
, use_cache
);
1677 return svc
.init(cct
, use_cache
);
1681 * Initialize the RADOS instance and prepare to do other ops
1682 * Returns 0 on success, -ERR# on failure.
// initialize(): top-level entry — reads notify-related config values,
// runs full service init (init_svc(false)), generates a host id, and
// finishes with init_complete().
// NOTE(review): extraction artifact — braces and the early-return after the
// init_svc error log are missing from this view (1694→1698).
1684 int RGWRados::initialize()
1688 inject_notify_timeout_probability
=
1689 cct
->_conf
.get_val
<double>("rgw_inject_notify_timeout_probability");
1690 max_notify_retries
= cct
->_conf
.get_val
<uint64_t>("rgw_max_notify_retries");
1692 ret
= init_svc(false);
1694 ldout(cct
, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret
) << ")" << dendl
;
1698 host_id
= svc
.zone_utils
->gen_host_id();
1704 return init_complete();
1708 * Open the pool used as root for this gateway
1709 * Returns: 0 on success, -ERR# otherwise.
1711 int RGWRados::open_root_pool_ctx()
1713 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().domain_root
, root_pool_ctx
, true);
1716 int RGWRados::open_gc_pool_ctx()
1718 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().gc_pool
, gc_pool_ctx
, true);
1721 int RGWRados::open_lc_pool_ctx()
1723 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().lc_pool
, lc_pool_ctx
, true);
1726 int RGWRados::open_objexp_pool_ctx()
1728 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, objexp_pool_ctx
, true);
1731 int RGWRados::open_reshard_pool_ctx()
1733 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().reshard_pool
, reshard_pool_ctx
, true);
1736 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
)
1738 constexpr bool create
= true; // create the pool if it doesn't exist
1739 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
// build_bucket_index_marker(): composes a per-shard bucket index marker as
// "<shard_id><KEY_VALUE_SEPARATOR><shard_marker>" into the out parameter.
// NOTE(review): extraction artifact — the trailing signature line carrying
// the out-parameter declaration is missing from this view (1742→1745).
1742 void RGWRados::build_bucket_index_marker(const string
& shard_id_str
, const string
& shard_marker
,
1745 *marker
= shard_id_str
;
1746 marker
->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR
);
1747 marker
->append(shard_marker
);
// open_bucket_index_ctx(): opens an IoCtx on the bucket's index pool.
// An explicit per-bucket placement pool wins; otherwise the bucket's
// placement rule (falling back to the zonegroup default) is looked up in
// the zone's placement_pools map.
// NOTE(review): extraction artifact — braces, the error return after the
// "could not find placement rule" log, and the trailing error check/return
// after open_pool_ctx are missing from this view.
1751 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
)
1753 const rgw_pool
& explicit_pool
= bucket_info
.bucket
.explicit_placement
.index_pool
;
1755 if (!explicit_pool
.empty()) {
1756 return open_pool_ctx(explicit_pool
, index_ctx
);
1759 auto& zonegroup
= svc
.zone
->get_zonegroup();
1760 auto& zone_params
= svc
.zone
->get_zone_params();
1762 const rgw_placement_rule
*rule
= &bucket_info
.placement_rule
;
1763 if (rule
->empty()) {
1764 rule
= &zonegroup
.default_placement
;
1766 auto iter
= zone_params
.placement_pools
.find(rule
->name
);
1767 if (iter
== zone_params
.placement_pools
.end()) {
1768 ldout(cct
, 0) << "could not find placement rule " << *rule
<< " within zonegroup " << dendl
;
1772 int r
= open_pool_ctx(iter
->second
.index_pool
, index_ctx
);
// log_list_state: per-listing cursor handed out by log_list_init() as an
// opaque RGWAccessHandle: the log pool IoCtx plus an object iterator.
// NOTE(review): extraction artifact — at least one field (the prefix filter
// used by log_list_next) and the closing "};" are missing from this view.
1781 struct log_list_state
{
1783 librados::IoCtx io_ctx
;
1784 librados::NObjectIterator obit
;
// log_list_init(): allocates a log_list_state over the zone's log pool,
// starts object enumeration, and returns the state as an opaque handle.
// NOTE(review): extraction artifact — the error path after rgw_init_ioctx
// (1791-1794) and the final return are missing from this view.
1787 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
1789 log_list_state
*state
= new log_list_state
;
1790 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1795 state
->prefix
= prefix
;
1796 state
->obit
= state
->io_ctx
.nobjects_begin();
1797 *handle
= (RGWAccessHandle
)state
;
// log_list_next(): advances the enumeration held in the handle, skipping
// oids that do not start with the stored prefix, and returns the next
// matching oid via *name.
// NOTE(review): extraction artifact — the end-of-list return, the iterator
// increment, and the final return are missing from this view.
1801 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
1803 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
1805 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
1809 if (state
->prefix
.length() &&
1810 state
->obit
->get_oid().find(state
->prefix
) != 0) {
1814 *name
= state
->obit
->get_oid();
// log_remove(): deletes the named object from the zone's log pool.
// NOTE(review): extraction artifact — the error check between rgw_init_ioctx
// and the remove (1825-1826) is missing from this view.
1821 int RGWRados::log_remove(const string
& name
)
1823 librados::IoCtx io_ctx
;
1824 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
1827 return io_ctx
.remove(name
);
// log_show_state: cursor for streaming a log object — IoCtx, a bufferlist
// read iterator `p`, plus (per the constructor) `pos` and `eof` members.
// NOTE(review): extraction artifact — several field declarations (name, bl,
// pos, eof — all referenced by log_show_next) and "};" are missing here.
1830 struct log_show_state
{
1831 librados::IoCtx io_ctx
;
1833 bufferlist::const_iterator p
;
1837 log_show_state() : pos(0), eof(false) {}
// log_show_init(): allocates a log_show_state bound to the zone's log pool
// and hands it back as an opaque handle for log_show_next().
// NOTE(review): extraction artifact — the error path after rgw_init_ioctx
// and (per the numbering gap 1843→1849) the name assignment are missing.
1840 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
1842 log_show_state
*state
= new log_show_state
;
1843 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, state
->io_ctx
);
1849 *handle
= (RGWAccessHandle
)state
;
// log_show_next(): streams the next rgw_log_entry out of the log object.
// Refills the in-memory buffer in 1 MiB chunks when fewer than chunk/2
// unread bytes remain, splices the unread tail onto the new data, and
// decodes one entry from the combined buffer.
// NOTE(review): extraction artifact — the embedded numbering jumps
// (1856→1858, 1866→1872, 1880→1882, ...), so declarations (e.g. `more`,
// `old`), error checks and several returns are missing from this view.
1853 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
1855 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
1856 off_t off
= state
->p
.get_off();
1858 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
1860 << " eof " << (int)state
->eof
1863 unsigned chunk
= 1024*1024;
1864 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
1866 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
// Keep the unread tail of the old buffer, then append the new chunk.
1872 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
1873 } catch (buffer::error
& err
) {
1877 state
->bl
.claim(old
);
1878 state
->bl
.claim_append(more
);
1879 state
->p
= state
->bl
.cbegin();
1880 if ((unsigned)r
< chunk
)
1882 ldout(cct
, 10) << " read " << r
<< dendl
;
1886 return 0; // end of file
1888 decode(*entry
, state
->p
);
1890 catch (const buffer::error
&e
) {
1897 * usage_log_hash: get usage log key hash, based on name and index
1899 * Get the usage object name. Since a user may have more than 1
1900 * object holding that info (multiple shards), we use index to
1901 * specify that shard number. Once index exceeds max shards it
1903 * If name is not being set, results for all users will be returned
1904 * and index will wrap only after total shards number.
1906 * @param cct [in] ceph context
1907 * @param name [in] user name
1908 * @param hash [out] hash value
1909 * @param index [in] shard index number
// usage_log_hash(): maps (user name, shard index) to a usage-log object
// name. For a non-empty name the index is folded modulo
// rgw_usage_max_user_shards and offset by the name hash; the result is
// formatted modulo rgw_usage_max_shards with the RGW_USAGE_OBJ_PREFIX.
// NOTE(review): extraction artifact — the `buf` declaration and the final
// assignment of `hash` from `buf` are missing from this view (1918→1921,
// 1922→end).
1911 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
1913 uint32_t val
= index
;
1915 if (!name
.empty()) {
1916 int max_user_shards
= cct
->_conf
->rgw_usage_max_user_shards
;
1917 val
%= max_user_shards
;
1918 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
1921 int max_shards
= cct
->_conf
->rgw_usage_max_shards
;
1922 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
// log_usage(): regroups the per-(user,bucket) usage batches by usage-log
// object hash (skipping entries with an empty user), then writes each group
// via cls_obj_usage_log_add().
// NOTE(review): extraction artifact — declarations of `hash`, `last_user`
// and `index` (referenced below) and the error handling around the cls call
// are missing from this view.
1926 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
1930 map
<string
, rgw_usage_log_info
> log_objs
;
1935 /* restructure usage map, zone by object hash */
1936 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
1937 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
1938 const rgw_user_bucket
& ub
= iter
->first
;
1939 RGWUsageBatch
& info
= iter
->second
;
1941 if (ub
.user
.empty()) {
1942 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
// Recompute the target object only when the user changes.
1946 if (ub
.user
!= last_user
) {
1947 /* index *should* be random, but why waste extra cycles
1948 in most cases max user shards is not going to exceed 1,
1949 so just incrementing it */
1950 usage_log_hash(cct
, ub
.user
, hash
, index
++);
1952 last_user
= ub
.user
;
1953 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
1955 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
1956 v
.push_back(miter
->second
);
// Flush each grouped log object.
1960 map
<string
, rgw_usage_log_info
>::iterator liter
;
1962 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
1963 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
// read_usage(): iterates usage-log shard objects starting from the shard
// recorded in usage_iter (or shard 0), reading up to max_entries records
// with cls_obj_usage_log_read() and aggregating them into `usage`; wraps
// across shards until the budget is spent, the read is truncated, or the
// shard cycle returns to first_hash.
// NOTE(review): extraction artifact — the do-loop opener, the else branch
// assigning hash = first_hash, and the error handling after the cls read
// are missing from this view (1980→1988, 1992→1999).
1970 int RGWRados::read_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
,
1971 uint32_t max_entries
, bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
,
1972 rgw_usage_log_entry
>& usage
)
1974 uint32_t num
= max_entries
;
1975 string hash
, first_hash
;
1976 string user_str
= user
.to_str();
1977 usage_log_hash(cct
, user_str
, first_hash
, 0);
1979 if (usage_iter
.index
) {
1980 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
1988 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
1989 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
1991 int ret
= cls_obj_usage_log_read(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
, num
,
1992 usage_iter
.read_iter
, ret_usage
, is_truncated
);
1999 num
-= ret_usage
.size();
2001 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
2002 usage
[iter
->first
].aggregate(iter
->second
);
// Shard exhausted: reset the intra-shard cursor and advance to the next.
2006 if (!*is_truncated
) {
2007 usage_iter
.read_iter
.clear();
2008 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
2010 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
// trim_usage(): walks every usage-log shard once (starting from `index`),
// trimming the requested epoch range on each via cls_obj_usage_log_trim()
// and tolerating -ENOENT, until the shard cycle returns to first_hash.
// NOTE(review): extraction artifact — the `index` declaration, the do-loop
// opener, and the body of the error branch after the cls call are missing
// from this view (2019→2023, 2025→2028).
2014 int RGWRados::trim_usage(const rgw_user
& user
, const string
& bucket_name
, uint64_t start_epoch
, uint64_t end_epoch
)
2017 string hash
, first_hash
;
2018 string user_str
= user
.to_str();
2019 usage_log_hash(cct
, user_str
, first_hash
, index
);
2023 int ret
= cls_obj_usage_log_trim(hash
, user_str
, bucket_name
, start_epoch
, end_epoch
);
2025 if (ret
< 0 && ret
!= -ENOENT
)
2028 usage_log_hash(cct
, user_str
, hash
, ++index
);
2029 } while (hash
!= first_hash
);
// clear_usage(): clears every usage-log shard object
// (RGW_USAGE_OBJ_PREFIX + shard index), logging failures per shard.
// NOTE(review): extraction artifact — the `ret` declaration, loop/branch
// braces and the final return are missing from this view.
2035 int RGWRados::clear_usage()
2037 auto max_shards
= cct
->_conf
->rgw_usage_max_shards
;
2039 for (unsigned i
=0; i
< max_shards
; i
++){
2040 string oid
= RGW_USAGE_OBJ_PREFIX
+ to_string(i
);
2041 ret
= cls_obj_usage_log_clear(oid
);
2043 ldout(cct
,0) << "usage clear on oid="<< oid
<< "failed with ret=" << ret
<< dendl
;
2050 int RGWRados::key_to_shard_id(const string
& key
, int max_shards
)
2052 return rgw_shard_id(key
, max_shards
);
// shard_name(prefix, max_shards, key, name, shard_id): hashes the key,
// reports the shard number via *shard_id, and builds the shard object name
// as prefix + decimal shard number.
// NOTE(review): extraction artifact — the declaration of `buf` (used by
// snprintf below) is missing from this view (2057→2060→2062).
2055 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& key
, string
& name
, int *shard_id
)
2057 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
2060 *shard_id
= val
% max_shards
;
2062 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
2063 name
= prefix
+ buf
;
// shard_name(prefix, max_shards, section, key, name): XORs the hashes of
// key and section and builds the shard object name as prefix + decimal
// shard number.
// NOTE(review): extraction artifact — the `buf` declaration is missing from
// this view (2069→2071).
2066 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& section
, const string
& key
, string
& name
)
2068 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
2069 val
^= ceph_str_hash_linux(section
.c_str(), section
.size());
2071 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
2072 name
= prefix
+ buf
;
// shard_name(prefix, shard_id, name): formats an explicit shard number as
// prefix + decimal shard_id.
// NOTE(review): extraction artifact — the `buf` declaration is missing from
// this view (2075→2078).
2075 void RGWRados::shard_name(const string
& prefix
, unsigned shard_id
, string
& name
)
2078 snprintf(buf
, sizeof(buf
), "%u", shard_id
);
2079 name
= prefix
+ buf
;
2083 void RGWRados::time_log_prepare_entry(cls_log_entry
& entry
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
2085 cls_log_add_prepare_entry(entry
, utime_t(ut
), section
, key
, bl
);
2088 int RGWRados::time_log_add_init(librados::IoCtx
& io_ctx
)
2090 return rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
, true);
// time_log_add(): synchronously appends one cls_log entry (section/key/bl
// at timestamp ut) to the given log object in the log pool.
// NOTE(review): extraction artifact — the error check after
// time_log_add_init and the declaration of `t` (the utime_t passed to
// cls_log_add) are missing from this view (2098→2103→2105).
2094 int RGWRados::time_log_add(const string
& oid
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
2096 librados::IoCtx io_ctx
;
2098 int r
= time_log_add_init(io_ctx
);
2103 ObjectWriteOperation op
;
2105 cls_log_add(op
, t
, section
, key
, bl
);
2107 return io_ctx
.operate(oid
, &op
);
// time_log_add(list): appends a batch of cls_log entries to the log object,
// synchronously when no completion is supplied, otherwise via aio_operate.
// NOTE(review): extraction artifact — the error check after
// time_log_add_init, the branch selecting sync vs async (2124 vs 2126), and
// the final return are missing from this view.
2110 int RGWRados::time_log_add(const string
& oid
, list
<cls_log_entry
>& entries
,
2111 librados::AioCompletion
*completion
, bool monotonic_inc
)
2113 librados::IoCtx io_ctx
;
2115 int r
= time_log_add_init(io_ctx
);
2120 ObjectWriteOperation op
;
2121 cls_log_add(op
, entries
, monotonic_inc
);
2124 r
= io_ctx
.operate(oid
, &op
);
2126 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
// time_log_list(): reads up to max_entries cls_log entries from the log
// object in [start_time, end_time) starting at marker; out_marker and
// truncated (declared on signature lines missing here) are filled by
// cls_log_list.
// NOTE(review): extraction artifact — trailing signature lines (out_marker,
// truncated), the `obl` declaration, error checks and the final return are
// missing from this view.
2131 int RGWRados::time_log_list(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
2132 int max_entries
, list
<cls_log_entry
>& entries
,
2133 const string
& marker
,
2137 librados::IoCtx io_ctx
;
2139 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2142 librados::ObjectReadOperation op
;
2144 utime_t
st(start_time
);
2145 utime_t
et(end_time
);
2147 cls_log_list(op
, st
, et
, marker
, max_entries
, entries
,
2148 out_marker
, truncated
);
2152 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
// time_log_info(): synchronously fetches the cls_log header of the given
// log object from the log pool.
// NOTE(review): extraction artifact — error checks, the `obl` declaration
// and the final return are missing from this view.
2159 int RGWRados::time_log_info(const string
& oid
, cls_log_header
*header
)
2161 librados::IoCtx io_ctx
;
2163 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2166 librados::ObjectReadOperation op
;
2168 cls_log_info(op
, header
);
2172 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
// time_log_info_async(): like time_log_info() but asynchronous — binds the
// caller's io_ctx to the log pool and issues the header read through the
// supplied AioCompletion.
// NOTE(review): extraction artifact — error checks and the final return are
// missing from this view.
2179 int RGWRados::time_log_info_async(librados::IoCtx
& io_ctx
, const string
& oid
, cls_log_header
*header
, librados::AioCompletion
*completion
)
2181 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2185 librados::ObjectReadOperation op
;
2187 cls_log_info(op
, header
);
2189 int ret
= io_ctx
.aio_operate(oid
, completion
, &op
, NULL
);
// time_log_trim(): trims entries of the log object within the given time
// and marker ranges, synchronously when completion is null, otherwise via
// aio_operate.
// NOTE(review): extraction artifact — the error check after rgw_init_ioctx,
// the sync/async branch (2213 vs 2215), and the final return are missing
// from this view.
2196 int RGWRados::time_log_trim(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
2197 const string
& from_marker
, const string
& to_marker
,
2198 librados::AioCompletion
*completion
)
2200 librados::IoCtx io_ctx
;
2202 int r
= rgw_init_ioctx(get_rados_handle(), svc
.zone
->get_zone_params().log_pool
, io_ctx
);
2206 utime_t
st(start_time
);
2207 utime_t
et(end_time
);
2209 ObjectWriteOperation op
;
2210 cls_log_trim(op
, st
, et
, from_marker
, to_marker
);
2213 r
= io_ctx
.operate(oid
, &op
);
2215 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
// objexp_hint_get_shardname(): formats an object-expirer hint shard object
// name as "obj_delete_at_hint." followed by the zero-padded 10-digit shard
// number.
// NOTE(review): extraction artifact — the `buf` declaration used by
// snprintf is missing from this view (2220→2223).
2220 string
RGWRados::objexp_hint_get_shardname(int shard_num
)
2223 snprintf(buf
, sizeof(buf
), "%010u", (unsigned)shard_num
);
2225 string
objname("obj_delete_at_hint.");
2226 return objname
+ buf
;
2229 int RGWRados::objexp_key_shard(const rgw_obj_index_key
& key
)
2231 string obj_key
= key
.name
+ key
.instance
;
2232 int num_shards
= cct
->_conf
->rgw_objexp_hints_num_shards
;
2233 return rgw_bucket_shard_index(obj_key
, num_shards
);
2236 static string
objexp_hint_get_keyext(const string
& tenant_name
,
2237 const string
& bucket_name
,
2238 const string
& bucket_id
,
2239 const rgw_obj_key
& obj_key
)
2241 return tenant_name
+ (tenant_name
.empty() ? "" : ":") + bucket_name
+ ":" + bucket_id
+
2242 ":" + obj_key
.name
+ ":" + obj_key
.instance
;
// objexp_hint_add(): records an expiration hint — builds the composite key,
// fills an objexp_hint_entry, and cls_timeindex-adds it at delete_at into
// the shard object chosen by objexp_key_shard().
// NOTE(review): extraction artifact — the `.obj_key` designated initializer
// (numbering jumps 2256→2258), the `hebl` declaration and its encode() are
// missing from this view.
2245 int RGWRados::objexp_hint_add(const ceph::real_time
& delete_at
,
2246 const string
& tenant_name
,
2247 const string
& bucket_name
,
2248 const string
& bucket_id
,
2249 const rgw_obj_index_key
& obj_key
)
2251 const string keyext
= objexp_hint_get_keyext(tenant_name
, bucket_name
,
2252 bucket_id
, obj_key
);
2253 objexp_hint_entry he
= {
2254 .tenant
= tenant_name
,
2255 .bucket_name
= bucket_name
,
2256 .bucket_id
= bucket_id
,
2258 .exp_time
= delete_at
};
2261 ObjectWriteOperation op
;
2262 cls_timeindex_add(op
, utime_t(delete_at
), keyext
, hebl
);
2264 string shard_name
= objexp_hint_get_shardname(objexp_key_shard(obj_key
));
2265 return objexp_pool_ctx
.operate(shard_name
, &op
);
2268 void RGWRados::objexp_get_shard(int shard_num
,
2269 string
& shard
) /* out */
2271 shard
= objexp_hint_get_shardname(shard_num
);
// objexp_hint_list(): lists expiration hints from one shard object within
// [start_time, end_time) via cls_timeindex_list; -ENOENT is handled
// specially (a missing shard has simply never been written).
// NOTE(review): extraction artifact — the `obl` declaration, the bodies of
// the two trailing branches and the final return are missing from this
// view.
2274 int RGWRados::objexp_hint_list(const string
& oid
,
2275 const ceph::real_time
& start_time
,
2276 const ceph::real_time
& end_time
,
2277 const int max_entries
,
2278 const string
& marker
,
2279 list
<cls_timeindex_entry
>& entries
, /* out */
2280 string
*out_marker
, /* out */
2281 bool *truncated
) /* out */
2283 librados::ObjectReadOperation op
;
2284 cls_timeindex_list(op
, utime_t(start_time
), utime_t(end_time
), marker
, max_entries
, entries
,
2285 out_marker
, truncated
);
2288 int ret
= objexp_pool_ctx
.operate(oid
, &op
, &obl
);
2290 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
2294 if ((ret
== -ENOENT
) && truncated
) {
// objexp_hint_parse(): decodes an objexp_hint_entry out of a timeindex
// entry's value buffer, logging on decode failure.
// NOTE(review): extraction artifact — the try-block opener, the error
// return and the final return are missing from this view. The error text
// "couldn't decode avail_pools" looks copy-pasted from elsewhere — worth
// confirming upstream before relying on it in log triage.
2301 int RGWRados::objexp_hint_parse(cls_timeindex_entry
&ti_entry
, /* in */
2302 objexp_hint_entry
& hint_entry
) /* out */
2305 auto iter
= ti_entry
.value
.cbegin();
2306 decode(hint_entry
, iter
);
2307 } catch (buffer::error
& err
) {
2308 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
// objexp_hint_trim(): trims processed hints from one shard object via
// cls_timeindex_trim; -ENOENT (never-written shard) is not an error.
// NOTE(review): extraction artifact — the error-branch body and the final
// return are missing from this view.
2314 int RGWRados::objexp_hint_trim(const string
& oid
,
2315 const ceph::real_time
& start_time
,
2316 const ceph::real_time
& end_time
,
2317 const string
& from_marker
,
2318 const string
& to_marker
)
2320 int ret
= cls_timeindex_trim(objexp_pool_ctx
, oid
, utime_t(start_time
), utime_t(end_time
),
2321 from_marker
, to_marker
);
2322 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
// lock_exclusive(): takes a renewable exclusive cls lock (log_lock_name) on
// the given object, with the caller's owner_id as cookie and `duration` as
// the lock lifetime.
// NOTE(review): extraction artifact — the error check after rgw_init_ioctx
// and the lines consuming `ut`/`zone_id` (set_duration/set_tag, per the
// numbering gaps 2338→2340, 2342→2344) are missing from this view.
// NOTE(review): the utime_t is built as (msec/1000, msec%1000) — the second
// argument position usually carries nanoseconds; confirm upstream whether
// sub-second durations are intentionally truncated here.
2329 int RGWRados::lock_exclusive(const rgw_pool
& pool
, const string
& oid
, timespan
& duration
,
2330 string
& zone_id
, string
& owner_id
) {
2331 librados::IoCtx io_ctx
;
2333 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
2337 uint64_t msec
= std::chrono::duration_cast
<std::chrono::milliseconds
>(duration
).count();
2338 utime_t
ut(msec
/ 1000, msec
% 1000);
2340 rados::cls::lock::Lock
l(log_lock_name
);
2342 l
.set_cookie(owner_id
);
2344 l
.set_may_renew(true);
2346 return l
.lock_exclusive(&io_ctx
, oid
);
// unlock(): releases the cls lock previously taken by lock_exclusive(),
// matching on the owner_id cookie.
// NOTE(review): extraction artifact — the error check after rgw_init_ioctx
// and the line consuming `zone_id` (per the gap 2357→2359) are missing from
// this view.
2349 int RGWRados::unlock(const rgw_pool
& pool
, const string
& oid
, string
& zone_id
, string
& owner_id
) {
2350 librados::IoCtx io_ctx
;
2352 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
2357 rados::cls::lock::Lock
l(log_lock_name
);
2359 l
.set_cookie(owner_id
);
2361 return l
.unlock(&io_ctx
, oid
);
// decode_policy(): decodes only the owner portion of a serialized ACL
// policy from the bufferlist and copies it into *owner; logs and fails on
// a buffer decode error.
// NOTE(review): extraction artifact — the try opener, the error return and
// the final return are missing from this view.
2364 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
2366 auto i
= bl
.cbegin();
2367 RGWAccessControlPolicy
policy(cct
);
2369 policy
.decode_owner(i
);
2370 } catch (buffer::error
& err
) {
2371 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
2374 *owner
= policy
.get_owner();
// rgw_policy_from_attrset(): finds RGW_ATTR_ACL in the attribute map and
// decodes it into *policy; when subsystem debug level >= 15, dumps the
// policy as S3 XML into the debug log.
// NOTE(review): extraction artifact — the return for the missing-attr case,
// the try opener, the error return and the final return are missing from
// this view.
2378 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
2380 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
2381 if (aiter
== attrset
.end())
2384 bufferlist
& bl
= aiter
->second
;
2385 auto iter
= bl
.cbegin();
2387 policy
->decode(iter
);
2388 } catch (buffer::error
& err
) {
2389 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
2392 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 15>()) {
2393 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
2394 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
2395 s3policy
->to_xml(*_dout
);
// Bucket::update_bucket_id(): swaps in a new bucket instance id, clears the
// object-version tracker, and re-reads the bucket instance info for the
// updated bucket.
// NOTE(review): extraction artifact — the trailing error check and return
// after get_bucket_instance_info are missing from this view.
2402 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
)
2404 rgw_bucket bucket
= bucket_info
.bucket
;
2405 bucket
.update_bucket_id(new_bucket_id
);
2407 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
2409 bucket_info
.objv_tracker
.clear();
2410 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr);
2420 * Get ordered listing of the objects in a bucket.
2422 * max: maximum number of results to return
2423 * bucket: bucket to list contents of
2424 * prefix: only return results that match this prefix
2425 * delim: do not include results that match this string.
2426 * Any skipped results will have the matching portion of their name
2427 * inserted in common_prefixes with a "true" mark.
2428 * marker: if filled in, begin the listing with this object.
2429 * end_marker: if filled in, end the listing with this object.
2430 * result: the objects are put in here.
2431 * common_prefixes: if delim is filled in, any matching prefixes are
2433 * is_truncated: if number of objects in the bucket is bigger than
2434 * max, then truncated.
// after_delim(): builds the smallest string strictly greater than every key
// sharing the given delimiter prefix, by appending char(255) — used to
// fast-forward listing markers past a common prefix.
// NOTE(review): extraction artifact — the return statement is missing from
// this view (the numbering jumps 2440→2444).
2436 static inline std::string
after_delim(std::string_view delim
)
2438 // assert: ! delim.empty()
2439 std::string result
{delim
.data(), delim
.length()};
2440 result
+= char(255);
// Bucket::List::list_objects_ordered(): ordered bucket listing. Reads
// index entries in read-ahead batches via cls_bucket_list_ordered, filters
// by namespace / visibility / end-marker / custom filter / prefix, folds
// delimiter matches into common_prefixes (fast-forwarding the marker past a
// prefix via after_delim), and appends the surviving entries to *result.
// NOTE(review): extraction artifact — the embedded numbering jumps
// throughout (2452→2455, 2498→2502, 2584→2598, ...), so loop/branch
// braces, several `continue`/`break` statements, the count bookkeeping and
// the error handling are missing from this view.
2444 int RGWRados::Bucket::List::list_objects_ordered(
2446 vector
<rgw_bucket_dir_entry
> *result
,
2447 map
<string
, bool> *common_prefixes
,
2450 RGWRados
*store
= target
->get_store();
2451 CephContext
*cct
= store
->ctx();
2452 int shard_id
= target
->get_shard_id();
2455 bool truncated
= true;
2456 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
,max
);
// Convert caller markers/prefix into index-key form for the cls call.
2460 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
2461 rgw_obj_index_key cur_marker
;
2462 marker_obj
.get_index_key(&cur_marker
);
2464 rgw_obj_key
end_marker_obj(params
.end_marker
.name
, params
.end_marker
.instance
,
2466 rgw_obj_index_key cur_end_marker
;
2467 end_marker_obj
.get_index_key(&cur_end_marker
);
2468 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2470 rgw_obj_key
prefix_obj(params
.prefix
);
2471 prefix_obj
.ns
= params
.ns
;
2472 string cur_prefix
= prefix_obj
.get_index_key_name();
2473 string after_delim_s
; /* needed in !params.delim.empty() AND later */
2475 if (!params
.delim
.empty()) {
2476 after_delim_s
= after_delim(params
.delim
);
2477 /* if marker points at a common prefix, fast forward it into its
2478 * upper bound string */
2479 int delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
2480 if (delim_pos
>= 0) {
2481 string s
= cur_marker
.name
.substr(0, delim_pos
);
2482 s
.append(after_delim_s
);
// Main read loop: each pass fetches one batch from the bucket index.
2487 string skip_after_delim
;
2488 while (truncated
&& count
<= max
) {
2489 if (skip_after_delim
> cur_marker
.name
) {
2490 cur_marker
= skip_after_delim
;
2492 ldout(cct
, 20) << "setting cur_marker="
2494 << "[" << cur_marker
.instance
<< "]"
2497 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
2498 int r
= store
->cls_bucket_list_ordered(target
->get_bucket_info(),
2502 read_ahead
+ 1 - count
,
2503 params
.list_versions
,
2510 for (auto eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
2511 rgw_bucket_dir_entry
& entry
= eiter
->second
;
2512 rgw_obj_index_key index_key
= entry
.key
;
2514 rgw_obj_key
obj(index_key
);
2516 /* note that parse_raw_oid() here will not set the correct
2517 * object's instance, as rgw_obj_index_key encodes that
2518 * separately. We don't need to set the instance because it's
2519 * not needed for the checks here and we end up using the raw
2520 * entry for the return vector
2522 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2524 ldout(cct
, 0) << "ERROR: could not parse object name: " << obj
.name
<< dendl
;
// Per-entry filters: namespace, visibility, end marker, custom filter,
// prefix, then delimiter/common-prefix handling.
2528 bool check_ns
= (obj
.ns
== params
.ns
);
2529 if (!params
.list_versions
&& !entry
.is_visible()) {
2533 if (params
.enforce_ns
&& !check_ns
) {
2534 if (!params
.ns
.empty()) {
2535 /* we've iterated past the namespace we're searching -- done now */
2540 /* we're not looking at the namespace this object is in, next! */
2544 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2550 params
.marker
= index_key
;
2551 next_marker
= index_key
;
2554 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
2557 if (params
.prefix
.size() &&
2558 (obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
) != 0))
2561 if (!params
.delim
.empty()) {
2562 int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
2564 if (delim_pos
>= 0) {
2565 /* extract key -with trailing delimiter- for CommonPrefix */
2567 obj
.name
.substr(0, delim_pos
+ params
.delim
.length());
2569 if (common_prefixes
&&
2570 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
2575 next_marker
= prefix_key
;
2576 (*common_prefixes
)[prefix_key
] = true;
2578 int marker_delim_pos
= cur_marker
.name
.find(
2579 params
.delim
, cur_prefix
.size());
2581 skip_after_delim
= cur_marker
.name
.substr(0, marker_delim_pos
);
2582 skip_after_delim
.append(after_delim_s
);
2584 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
2598 result
->emplace_back(std::move(entry
));
2605 *is_truncated
= truncated
;
2608 } // list_objects_ordered
2612 * Get listing of the objects in a bucket and allow the results to be out
2615 * Even though there are key differences with the ordered counterpart,
2616 * the parameters are the same to maintain some compatability.
2618 * max: maximum number of results to return
2619 * bucket: bucket to list contents of
2620 * prefix: only return results that match this prefix
2621 * delim: should not be set; if it is we should have indicated an error
2622 * marker: if filled in, begin the listing with this object.
2623 * end_marker: if filled in, end the listing with this object.
2624 * result: the objects are put in here.
2625 * common_prefixes: this is never filled with an unordered list; the param
2626 * is maintained for compatibility
2627 * is_truncated: if number of objects in the bucket is bigger than max, then
2630 int RGWRados::Bucket::List::list_objects_unordered(int64_t max
,
2631 vector
<rgw_bucket_dir_entry
> *result
,
2632 map
<string
, bool> *common_prefixes
,
2635 RGWRados
*store
= target
->get_store();
2636 CephContext
*cct
= store
->ctx();
2637 int shard_id
= target
->get_shard_id();
2640 bool truncated
= true;
2642 // read a few extra in each call to cls_bucket_list_unordered in
2643 // case some are filtered out due to namespace matching, versioning,
2645 const int64_t max_read_ahead
= 100;
2646 const uint32_t read_ahead
= uint32_t(max
+ std::min(max
, max_read_ahead
));
2650 rgw_obj_key
marker_obj(params
.marker
.name
,
2651 params
.marker
.instance
,
2653 rgw_obj_index_key cur_marker
;
2654 marker_obj
.get_index_key(&cur_marker
);
2656 rgw_obj_key
end_marker_obj(params
.end_marker
.name
,
2657 params
.end_marker
.instance
,
2659 rgw_obj_index_key cur_end_marker
;
2660 end_marker_obj
.get_index_key(&cur_end_marker
);
2661 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
2663 rgw_obj_key
prefix_obj(params
.prefix
);
2664 prefix_obj
.ns
= params
.ns
;
2665 string cur_prefix
= prefix_obj
.get_index_key_name();
2667 while (truncated
&& count
<= max
) {
2668 std::vector
<rgw_bucket_dir_entry
> ent_list
;
2669 int r
= store
->cls_bucket_list_unordered(target
->get_bucket_info(),
2674 params
.list_versions
,
2681 // NB: while regions of ent_list will be sorted, we have no
2682 // guarantee that all items will be sorted since they can cross
2685 for (auto& entry
: ent_list
) {
2686 rgw_obj_index_key index_key
= entry
.key
;
2687 rgw_obj_key
obj(index_key
);
2689 /* note that parse_raw_oid() here will not set the correct
2690 * object's instance, as rgw_obj_index_key encodes that
2691 * separately. We don't need to set the instance because it's
2692 * not needed for the checks here and we end up using the raw
2693 * entry for the return vector
2695 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
2697 ldout(cct
, 0) << "ERROR: could not parse object name: " <<
2702 if (!params
.list_versions
&& !entry
.is_visible()) {
2706 if (params
.enforce_ns
&& obj
.ns
!= params
.ns
) {
2710 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
2711 // we're not guaranteed items will come in order, so we have
2712 // to loop through all
2717 params
.marker
.set(index_key
);
2718 next_marker
.set(index_key
);
2721 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
2724 if (params
.prefix
.size() &&
2725 (0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)))
2733 result
->emplace_back(std::move(entry
));
2735 } // for (auto& entry : ent_list)
2736 } // while (truncated && count <= max)
2740 *is_truncated
= truncated
;
2743 } // list_objects_unordered
2747 * create a rados pool, associated meta info
2748 * returns 0 on success, -ERR# otherwise.
2750 int RGWRados::create_pool(const rgw_pool
& pool
)
2752 librados::IoCtx io_ctx
;
2753 constexpr bool create
= true;
2754 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
2757 int RGWRados::init_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
2759 librados::IoCtx index_ctx
;
2761 string dir_oid
= dir_oid_prefix
;
2762 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
2767 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
2769 map
<int, string
> bucket_objs
;
2770 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
2772 return CLSRGWIssueBucketIndexInit(index_ctx
,
2774 cct
->_conf
->rgw_bucket_index_max_aio
)();
2777 int RGWRados::clean_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
2779 librados::IoCtx index_ctx
;
2781 std::string dir_oid
= dir_oid_prefix
;
2782 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
2787 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
2789 std::map
<int, std::string
> bucket_objs
;
2790 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
2792 return CLSRGWIssueBucketIndexClean(index_ctx
,
2794 cct
->_conf
->rgw_bucket_index_max_aio
)();
2797 void RGWRados::create_bucket_id(string
*bucket_id
)
2799 uint64_t iid
= instance_id();
2800 uint64_t bid
= next_bucket_id();
2801 char buf
[svc
.zone
->get_zone_params().get_id().size() + 48];
2802 snprintf(buf
, sizeof(buf
), "%s.%" PRIu64
".%" PRIu64
,
2803 svc
.zone
->get_zone_params().get_id().c_str(), iid
, bid
);
2807 int RGWRados::create_bucket(const RGWUserInfo
& owner
, rgw_bucket
& bucket
,
2808 const string
& zonegroup_id
,
2809 const rgw_placement_rule
& placement_rule
,
2810 const string
& swift_ver_location
,
2811 const RGWQuotaInfo
* pquota_info
,
2812 map
<std::string
, bufferlist
>& attrs
,
2813 RGWBucketInfo
& info
,
2815 obj_version
*pep_objv
,
2816 real_time creation_time
,
2817 rgw_bucket
*pmaster_bucket
,
2818 uint32_t *pmaster_num_shards
,
2821 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2822 rgw_placement_rule selected_placement_rule
;
2823 RGWZonePlacementInfo rule_info
;
2825 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
2827 ret
= svc
.zone
->select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
2828 &selected_placement_rule
, &rule_info
);
2832 if (!pmaster_bucket
) {
2833 create_bucket_id(&bucket
.marker
);
2834 bucket
.bucket_id
= bucket
.marker
;
2836 bucket
.marker
= pmaster_bucket
->marker
;
2837 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
2840 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
2843 objv_tracker
.write_version
= *pobjv
;
2845 objv_tracker
.generate_new_write_ver(cct
);
2848 info
.bucket
= bucket
;
2849 info
.owner
= owner
.user_id
;
2850 info
.zonegroup
= zonegroup_id
;
2851 info
.placement_rule
= selected_placement_rule
;
2852 info
.index_type
= rule_info
.index_type
;
2853 info
.swift_ver_location
= swift_ver_location
;
2854 info
.swift_versioning
= (!swift_ver_location
.empty());
2855 if (pmaster_num_shards
) {
2856 info
.num_shards
= *pmaster_num_shards
;
2858 info
.num_shards
= bucket_index_max_shards
;
2860 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
2861 info
.requester_pays
= false;
2862 if (real_clock::is_zero(creation_time
)) {
2863 info
.creation_time
= ceph::real_clock::now();
2865 info
.creation_time
= creation_time
;
2868 info
.quota
= *pquota_info
;
2871 int r
= init_bucket_index(info
, info
.num_shards
);
2876 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
2877 if (ret
== -EEXIST
) {
2878 librados::IoCtx index_ctx
;
2879 map
<int, string
> bucket_objs
;
2880 int r
= open_bucket_index(info
, index_ctx
, bucket_objs
);
2884 /* we need to reread the info and return it, caller will have a use for it */
2885 RGWObjVersionTracker instance_ver
= info
.objv_tracker
;
2886 info
.objv_tracker
.clear();
2887 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
2888 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, NULL
);
2893 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
2897 /* only remove it if it's a different bucket instance */
2898 if (info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
2899 /* remove bucket meta instance */
2900 r
= rgw_bucket_instance_remove_entry(this,
2906 /* remove bucket index objects asynchronously by best effort */
2907 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
2909 cct
->_conf
->rgw_bucket_index_max_aio
)();
2911 /* ret == -ENOENT here */
2916 /* this is highly unlikely */
2917 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
2921 bool RGWRados::get_obj_data_pool(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
2923 return rgw_get_obj_data_pool(svc
.zone
->get_zonegroup(), svc
.zone
->get_zone_params(), placement_rule
, obj
, pool
);
2926 bool RGWRados::obj_to_raw(const rgw_placement_rule
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
2928 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
2930 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
2933 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
2936 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
2939 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2940 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2944 int r
= open_pool_ctx(pool
, *ioctx
);
2949 ioctx
->locator_set_key(key
);
2954 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
2956 get_obj_bucket_and_oid_loc(obj
, ref
->obj
.oid
, ref
->obj
.loc
);
2959 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
2960 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
2964 int r
= open_pool_ctx(pool
, ref
->ioctx
);
2969 ref
->ioctx
.locator_set_key(ref
->obj
.loc
);
2974 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2980 if (ref
->obj
.oid
.empty()) {
2981 ref
->obj
.oid
= obj
.pool
.to_str();
2982 ref
->obj
.pool
= svc
.zone
->get_zone_params().domain_root
;
2984 r
= open_pool_ctx(ref
->obj
.pool
, ref
->ioctx
);
2988 ref
->ioctx
.locator_set_key(ref
->obj
.loc
);
2993 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
2995 return get_raw_obj_ref(obj
, ref
);
2999 * fixes an issue where head objects were supposed to have a locator created, but ended
3002 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
3004 const rgw_bucket
& bucket
= bucket_info
.bucket
;
3008 rgw_obj
obj(bucket
, key
);
3010 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
3012 if (locator
.empty()) {
3013 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
3017 librados::IoCtx ioctx
;
3019 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
3021 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
3024 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
3029 struct timespec mtime_ts
;
3030 map
<string
, bufferlist
> attrs
;
3031 librados::ObjectReadOperation op
;
3032 op
.getxattrs(&attrs
, NULL
);
3033 op
.stat2(&size
, &mtime_ts
, NULL
);
3034 #define HEAD_SIZE 512 * 1024
3035 op
.read(0, HEAD_SIZE
, &data
, NULL
);
3037 ret
= ioctx
.operate(oid
, &op
, NULL
);
3039 lderr(cct
) << "ERROR: ioctx.operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
3043 if (size
> HEAD_SIZE
) {
3044 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
3048 if (size
!= data
.length()) {
3049 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
3054 librados::ObjectWriteOperation wop
;
3056 wop
.mtime2(&mtime_ts
);
3058 map
<string
, bufferlist
>::iterator iter
;
3059 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3060 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
3065 ioctx
.locator_set_key(locator
);
3066 ioctx
.operate(oid
, &wop
);
3070 ioctx
.locator_set_key(string());
3072 ret
= ioctx
.remove(oid
);
3074 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
3082 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
3083 const string
& src_oid
, const string
& src_locator
,
3084 librados::IoCtx
& dst_ioctx
,
3085 const string
& dst_oid
, const string
& dst_locator
)
3088 #define COPY_BUF_SIZE (4 * 1024 * 1024)
3090 uint64_t chunk_size
= COPY_BUF_SIZE
;
3094 struct timespec mtime_ts
;
3097 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
3101 src_ioctx
.locator_set_key(src_locator
);
3102 dst_ioctx
.locator_set_key(dst_locator
);
3106 ObjectReadOperation rop
;
3107 ObjectWriteOperation wop
;
3110 rop
.stat2(&size
, &mtime_ts
, NULL
);
3111 mtime
= real_clock::from_timespec(mtime_ts
);
3113 rop
.read(ofs
, chunk_size
, &data
, NULL
);
3114 ret
= src_ioctx
.operate(src_oid
, &rop
, NULL
);
3119 if (data
.length() == 0) {
3124 wop
.create(true); /* make it exclusive */
3125 wop
.mtime2(&mtime_ts
);
3126 mtime
= real_clock::from_timespec(mtime_ts
);
3128 wop
.write(ofs
, data
);
3129 ret
= dst_ioctx
.operate(dst_oid
, &wop
);
3133 ofs
+= data
.length();
3134 done
= data
.length() != chunk_size
;
3138 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
3139 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
3144 src_ioctx
.remove(src_oid
);
3149 // TODO: clean up dst_oid if we created it
3150 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
3155 * fixes an issue where head objects were supposed to have a locator created, but ended
3158 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
)
3160 const rgw_bucket
& bucket
= bucket_info
.bucket
;
3161 rgw_obj
obj(bucket
, key
);
3168 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
3173 RGWObjState
*astate
= NULL
;
3174 RGWObjectCtx
rctx(this);
3175 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
3179 if (astate
->has_manifest
) {
3180 RGWObjManifest::obj_iterator miter
;
3181 RGWObjManifest
& manifest
= astate
->manifest
;
3182 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
3183 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
3188 rgw_raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
3190 if (loc
.key
.ns
.empty()) {
3191 /* continue, we're only interested in tail objects */
3195 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
3196 ref
.ioctx
.locator_set_key(locator
);
3198 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
3200 r
= ref
.ioctx
.stat(oid
, NULL
, NULL
);
3206 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
3208 /* create a new ioctx with the bad locator */
3209 librados::IoCtx src_ioctx
;
3210 src_ioctx
.dup(ref
.ioctx
);
3211 src_ioctx
.locator_set_key(bad_loc
);
3213 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
3215 /* cannot find a broken part */
3218 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
3223 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ref
.ioctx
, oid
, locator
);
3225 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
3234 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
3236 RGWBucketInfo
* bucket_info_out
)
3240 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
3242 RGWBucketInfo bucket_info
;
3243 RGWBucketInfo
* bucket_info_p
=
3244 bucket_info_out
? bucket_info_out
: &bucket_info
;
3246 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
3251 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
3253 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3256 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3261 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
3263 RGWBucketInfo
* bucket_info_out
)
3268 auto obj_ctx
= store
->svc
.sysobj
->init_obj_ctx();
3270 RGWBucketInfo bucket_info
;
3271 RGWBucketInfo
* bucket_info_p
=
3272 bucket_info_out
? bucket_info_out
: &bucket_info
;
3273 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
3278 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, shard_id
, &bucket_obj
);
3280 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3283 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3288 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
,
3291 bucket
= bucket_info
.bucket
;
3293 int ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
,
3294 obj
.get_hash_object(), &bucket_obj
,
3297 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3300 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3305 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
, int sid
)
3307 bucket
= bucket_info
.bucket
;
3310 int ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, shard_id
, &bucket_obj
);
3312 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
3315 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
3321 /* Execute @handler on last item in bucket listing for bucket specified
3322 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
3323 * to objects matching these criteria. */
3324 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
3325 const std::string
& obj_prefix
,
3326 const std::string
& obj_delim
,
3327 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
3329 RGWRados::Bucket
target(this, bucket_info
);
3330 RGWRados::Bucket::List
list_op(&target
);
3332 list_op
.params
.prefix
= obj_prefix
;
3333 list_op
.params
.delim
= obj_delim
;
3335 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
3336 << ", obj_prefix=" << obj_prefix
3337 << ", obj_delim=" << obj_delim
3340 bool is_truncated
= false;
3342 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
3343 /* We need to rewind to the last object in a listing. */
3345 /* List bucket entries in chunks. */
3346 static constexpr int MAX_LIST_OBJS
= 100;
3347 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
3349 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
3353 } else if (!entries
.empty()) {
3354 last_entry
= entries
.back();
3356 } while (is_truncated
);
3359 return handler(*last_entry
);
3362 /* Empty listing - no items we can run handler on. */
3367 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
3368 const rgw_user
& user
,
3369 RGWBucketInfo
& bucket_info
,
3372 if (! swift_versioning_enabled(bucket_info
)) {
3376 obj_ctx
.set_atomic(obj
);
3378 RGWObjState
* state
= nullptr;
3379 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false);
3384 if (!state
->exists
) {
3388 const string
& src_name
= obj
.get_oid();
3389 char buf
[src_name
.size() + 32];
3390 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
3391 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
3392 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
3394 RGWBucketInfo dest_bucket_info
;
3396 auto sysobj_ctx
= svc
.sysobj
->init_obj_ctx();
3398 r
= get_bucket_info(sysobj_ctx
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, NULL
);
3400 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
3402 return -ERR_PRECONDITION_FAILED
;
3407 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
3408 return -ERR_PRECONDITION_FAILED
;
3411 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
3413 if (dest_bucket_info
.versioning_enabled()){
3414 gen_rand_obj_instance_name(&dest_obj
);
3417 obj_ctx
.set_atomic(dest_obj
);
3421 r
= copy_obj(obj_ctx
,
3423 NULL
, /* req_info *info */
3429 bucket_info
.placement_rule
,
3430 NULL
, /* time_t *src_mtime */
3431 NULL
, /* time_t *mtime */
3432 NULL
, /* const time_t *mod_ptr */
3433 NULL
, /* const time_t *unmod_ptr */
3434 false, /* bool high_precision_time */
3435 NULL
, /* const char *if_match */
3436 NULL
, /* const char *if_nomatch */
3437 RGWRados::ATTRSMOD_NONE
,
3438 true, /* bool copy_if_newer */
3440 RGWObjCategory::Main
,
3441 0, /* uint64_t olh_epoch */
3442 real_time(), /* time_t delete_at */
3443 NULL
, /* string *version_id */
3444 NULL
, /* string *ptag */
3445 NULL
, /* string *petag */
3446 NULL
, /* void (*progress_cb)(off_t, void *) */
3447 NULL
); /* void *progress_data */
3448 if (r
== -ECANCELED
|| r
== -ENOENT
) {
3449 /* Has already been overwritten, meaning another rgw process already
3457 int RGWRados::swift_versioning_restore(RGWSysObjectCtx
& sysobj_ctx
,
3458 RGWObjectCtx
& obj_ctx
,
3459 const rgw_user
& user
,
3460 RGWBucketInfo
& bucket_info
,
3462 bool& restored
) /* out */
3464 if (! swift_versioning_enabled(bucket_info
)) {
3468 /* Bucket info of the bucket that stores previous versions of our object. */
3469 RGWBucketInfo archive_binfo
;
3471 int ret
= get_bucket_info(sysobj_ctx
, bucket_info
.bucket
.tenant
,
3472 bucket_info
.swift_ver_location
, archive_binfo
,
3478 /* Abort the operation if the bucket storing our archive belongs to someone
3479 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3480 * into consideration. For now we can live with that.
3482 * TODO: delegate this check to un upper layer and compare with ACLs. */
3483 if (bucket_info
.owner
!= archive_binfo
.owner
) {
3487 /* This code will be executed on latest version of the object. */
3488 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
3489 std::string no_zone
;
3491 /* We don't support object versioning of Swift API on those buckets that
3492 * are already versioned using the S3 mechanism. This affects also bucket
3493 * storing archived objects. Otherwise the delete operation would create
3494 * a deletion marker. */
3495 if (archive_binfo
.versioned()) {
3497 return -ERR_PRECONDITION_FAILED
;
3500 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3501 * irrelevant and may be safely skipped. */
3502 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
3504 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
3506 if (bucket_info
.versioning_enabled()){
3507 gen_rand_obj_instance_name(&obj
);
3510 obj_ctx
.set_atomic(archive_obj
);
3511 obj_ctx
.set_atomic(obj
);
3513 int ret
= copy_obj(obj_ctx
,
3515 nullptr, /* req_info *info */
3518 archive_obj
, /* src obj */
3519 bucket_info
, /* dest bucket info */
3520 archive_binfo
, /* src bucket info */
3521 bucket_info
.placement_rule
, /* placement_rule */
3522 nullptr, /* time_t *src_mtime */
3523 nullptr, /* time_t *mtime */
3524 nullptr, /* const time_t *mod_ptr */
3525 nullptr, /* const time_t *unmod_ptr */
3526 false, /* bool high_precision_time */
3527 nullptr, /* const char *if_match */
3528 nullptr, /* const char *if_nomatch */
3529 RGWRados::ATTRSMOD_NONE
,
3530 true, /* bool copy_if_newer */
3532 RGWObjCategory::Main
,
3533 0, /* uint64_t olh_epoch */
3534 real_time(), /* time_t delete_at */
3535 nullptr, /* string *version_id */
3536 nullptr, /* string *ptag */
3537 nullptr, /* string *petag */
3538 nullptr, /* void (*progress_cb)(off_t, void *) */
3539 nullptr); /* void *progress_data */
3540 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
3541 /* Has already been overwritten, meaning another rgw process already
3544 } else if (ret
< 0) {
3550 /* Need to remove the archived copy. */
3551 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
3552 archive_binfo
.versioning_status());
3557 const std::string
& obj_name
= obj
.get_oid();
3558 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
3561 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
3565 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
3566 map
<string
, bufferlist
>& attrs
,
3567 bool assume_noent
, bool modify_tail
,
3570 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
3571 RGWRados
*store
= target
->get_store();
3573 ObjectWriteOperation op
;
3575 const struct req_state
* s
= get_req_state();
3579 req_id
= store
->svc
.zone_utils
->unique_id(store
->get_new_req_id());
3586 int r
= target
->get_state(&state
, false, assume_noent
);
3590 rgw_obj
& obj
= target
->get_obj();
3592 if (obj
.get_oid().empty()) {
3593 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
3598 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
3602 bool is_olh
= state
->is_olh
;
3604 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
3606 const string
*ptag
= meta
.ptag
;
3607 if (!ptag
&& !index_op
->get_optag()->empty()) {
3608 ptag
= index_op
->get_optag();
3610 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false, modify_tail
);
3614 if (real_clock::is_zero(meta
.set_mtime
)) {
3615 meta
.set_mtime
= real_clock::now();
3618 if (state
->is_olh
) {
3619 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
3622 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
3623 op
.mtime2(&mtime_ts
);
3626 /* if we want to overwrite the data, we also want to overwrite the
3627 xattrs, so just remove the object */
3628 op
.write_full(*meta
.data
);
3632 string content_type
;
3634 string storage_class
;
3636 map
<string
, bufferlist
>::iterator iter
;
3638 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
3639 const string
& name
= iter
->first
;
3640 op
.rmxattr(name
.c_str());
3644 if (meta
.manifest
) {
3645 storage_class
= meta
.manifest
->get_tail_placement().placement_rule
.storage_class
;
3647 /* remove existing manifest attr */
3648 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
3649 if (iter
!= attrs
.end())
3653 encode(*meta
.manifest
, bl
);
3654 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
3657 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
3658 const string
& name
= iter
->first
;
3659 bufferlist
& bl
= iter
->second
;
3664 op
.setxattr(name
.c_str(), bl
);
3666 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
3667 etag
= rgw_bl_str(bl
);
3668 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
3669 content_type
= rgw_bl_str(bl
);
3670 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
3674 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
3675 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
3678 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
3680 encode(store
->svc
.zone
->get_zone_short_id(), bl
);
3681 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
3684 if (!storage_class
.empty()) {
3686 bl
.append(storage_class
);
3687 op
.setxattr(RGW_ATTR_STORAGE_CLASS
, bl
);
3698 if (!reset_obj
) { //Multipart upload, it has immutable head.
3699 orig_exists
= false;
3702 orig_exists
= state
->exists
;
3703 orig_size
= state
->accounted_size
;
3706 bool versioned_target
= (meta
.olh_epoch
&& *meta
.olh_epoch
> 0) ||
3707 !obj
.key
.instance
.empty();
3709 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
3712 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
3715 if (!index_op
->is_prepared()) {
3716 tracepoint(rgw_rados
, prepare_enter
, req_id
.c_str());
3717 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
3718 tracepoint(rgw_rados
, prepare_exit
, req_id
.c_str());
3723 tracepoint(rgw_rados
, operate_enter
, req_id
.c_str());
3724 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
3725 tracepoint(rgw_rados
, operate_exit
, req_id
.c_str());
3726 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
3727 or -ENOENT if was removed, or -EEXIST if it did not exist
3728 before and now it does */
3729 if (r
== -EEXIST
&& assume_noent
) {
3730 target
->invalidate_state();
3736 epoch
= ref
.ioctx
.get_last_version();
3737 poolid
= ref
.ioctx
.get_id();
3739 r
= target
->complete_atomic_modification();
3741 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
3744 tracepoint(rgw_rados
, complete_enter
, req_id
.c_str());
3745 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
3746 meta
.set_mtime
, etag
, content_type
,
3747 storage_class
, &acl_bl
,
3748 meta
.category
, meta
.remove_objs
, meta
.user_data
, meta
.appendable
);
3749 tracepoint(rgw_rados
, complete_exit
, req_id
.c_str());
3754 *meta
.mtime
= meta
.set_mtime
;
3757 /* note that index_op was using state so we couldn't invalidate it earlier */
3758 target
->invalidate_state();
3761 if (versioned_op
&& meta
.olh_epoch
) {
3762 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, *meta
.olh_epoch
, real_time(), false, meta
.zones_trace
);
3768 if (!real_clock::is_zero(meta
.delete_at
)) {
3769 rgw_obj_index_key obj_key
;
3770 obj
.key
.get_index_key(&obj_key
);
3772 r
= store
->objexp_hint_add(meta
.delete_at
,
3773 obj
.bucket
.tenant
, obj
.bucket
.name
, obj
.bucket
.bucket_id
, obj_key
);
3775 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
3776 /* ignoring error, nothing we can do at this point */
3779 meta
.canceled
= false;
3781 /* update quota cache */
3782 if (meta
.completeMultipart
){
3783 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3787 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
3788 accounted_size
, orig_size
);
3793 int ret
= index_op
->cancel();
3795 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
3798 meta
.canceled
= true;
3800 /* we lost in a race. There are a few options:
3801 * - existing object was rewritten (ECANCELED)
3802 * - non existing object was created (EEXIST)
3803 * - object was removed (ENOENT)
3804 * should treat it as a success
3806 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
3807 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
3811 if (meta
.if_match
!= NULL
) {
3812 // only overwrite existing object
3813 if (strcmp(meta
.if_match
, "*") == 0) {
3815 r
= -ERR_PRECONDITION_FAILED
;
3816 } else if (r
== -ECANCELED
) {
3822 if (meta
.if_nomatch
!= NULL
) {
3823 // only create a new object
3824 if (strcmp(meta
.if_nomatch
, "*") == 0) {
3826 r
= -ERR_PRECONDITION_FAILED
;
3827 } else if (r
== -ENOENT
) {
3837 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
3838 map
<string
, bufferlist
>& attrs
)
3840 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
3842 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
3843 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
3844 index_op
.set_zones_trace(meta
.zones_trace
);
3846 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
3849 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
3851 assume_noent
= false;
3854 if (!assume_noent
) {
3855 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
3860 class RGWRadosPutObj
: public RGWHTTPStreamRWRequest::ReceiveCB
3864 rgw::putobj::DataProcessor
*filter
;
3865 boost::optional
<RGWPutObj_Compress
>& compressor
;
3866 boost::optional
<rgw::putobj::ChunkProcessor
> buffering
;
3867 CompressorRef
& plugin
;
3868 rgw::putobj::ObjectProcessor
*processor
;
3869 void (*progress_cb
)(off_t
, void *);
3870 void *progress_data
;
3871 bufferlist extra_data_bl
;
3872 uint64_t extra_data_left
{0};
3873 bool need_to_process_attrs
{true};
3874 uint64_t data_len
{0};
3875 map
<string
, bufferlist
> src_attrs
;
3877 uint64_t lofs
{0}; /* logical ofs */
3878 std::function
<int(const map
<string
, bufferlist
>&)> attrs_handler
;
3880 RGWRadosPutObj(CephContext
* cct
,
3881 CompressorRef
& plugin
,
3882 boost::optional
<RGWPutObj_Compress
>& compressor
,
3883 rgw::putobj::ObjectProcessor
*p
,
3884 void (*_progress_cb
)(off_t
, void *),
3885 void *_progress_data
,
3886 std::function
<int(const map
<string
, bufferlist
>&)> _attrs_handler
) :
3889 compressor(compressor
),
3892 progress_cb(_progress_cb
),
3893 progress_data(_progress_data
),
3894 attrs_handler(_attrs_handler
) {}
3896 int process_attrs(void) {
3897 if (extra_data_bl
.length()) {
3899 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
3900 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
3904 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
3906 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
3907 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
3909 // filter out olh attributes
3910 auto iter
= src_attrs
.lower_bound(RGW_ATTR_OLH_PREFIX
);
3911 while (iter
!= src_attrs
.end()) {
3912 if (!boost::algorithm::starts_with(iter
->first
, RGW_ATTR_OLH_PREFIX
)) {
3915 iter
= src_attrs
.erase(iter
);
3919 int ret
= attrs_handler(src_attrs
);
3924 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
3925 //do not compress if object is encrypted
3926 compressor
= boost::in_place(cct
, plugin
, filter
);
3927 // add a filter that buffers data so we don't try to compress tiny blocks.
3928 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3929 // compression ratio
3930 constexpr unsigned buffer_size
= 512 * 1024;
3931 buffering
= boost::in_place(&*compressor
, buffer_size
);
3932 filter
= &*buffering
;
3935 need_to_process_attrs
= false;
3940 int handle_data(bufferlist
& bl
, bool *pause
) override
{
3942 progress_cb(data_len
, progress_data
);
3944 if (extra_data_left
) {
3945 uint64_t extra_len
= bl
.length();
3946 if (extra_len
> extra_data_left
)
3947 extra_len
= extra_data_left
;
3950 bl
.splice(0, extra_len
, &extra
);
3951 extra_data_bl
.append(extra
);
3953 extra_data_left
-= extra_len
;
3954 if (extra_data_left
== 0) {
3955 int res
= process_attrs();
3960 if (bl
.length() == 0) {
3964 if (need_to_process_attrs
) {
3965 /* need to call process_attrs() even if we don't get any attrs,
3966 * need it to call attrs_handler().
3968 int res
= process_attrs();
3974 ceph_assert(uint64_t(ofs
) >= extra_data_len
);
3976 uint64_t size
= bl
.length();
3979 const uint64_t lofs
= data_len
;
3982 return filter
->process(std::move(bl
), lofs
);
3986 return filter
->process({}, data_len
);
3989 bufferlist
& get_extra_data() { return extra_data_bl
; }
3991 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
3993 void set_extra_data_len(uint64_t len
) override
{
3994 extra_data_left
= len
;
3995 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len
);
3998 uint64_t get_data_len() {
4004 * prepare attrset depending on attrs_mod.
4006 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
4007 map
<string
, bufferlist
>& attrs
,
4008 RGWRados::AttrsMod attrs_mod
)
4010 switch (attrs_mod
) {
4011 case RGWRados::ATTRSMOD_NONE
:
4014 case RGWRados::ATTRSMOD_REPLACE
:
4015 if (!attrs
[RGW_ATTR_ETAG
].length()) {
4016 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
4018 if (!attrs
[RGW_ATTR_TAIL_TAG
].length()) {
4019 auto ttiter
= src_attrs
.find(RGW_ATTR_TAIL_TAG
);
4020 if (ttiter
!= src_attrs
.end()) {
4021 attrs
[RGW_ATTR_TAIL_TAG
] = src_attrs
[RGW_ATTR_TAIL_TAG
];
4025 case RGWRados::ATTRSMOD_MERGE
:
4026 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
4027 if (attrs
.find(it
->first
) == attrs
.end()) {
4028 attrs
[it
->first
] = it
->second
;
4035 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, const rgw_obj
& obj
)
4037 map
<string
, bufferlist
> attrset
;
4041 RGWObjectCtx
rctx(this);
4043 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
4044 RGWRados::Object::Read
read_op(&op_target
);
4046 read_op
.params
.attrs
= &attrset
;
4047 read_op
.params
.lastmod
= &mtime
;
4048 read_op
.params
.obj_size
= &obj_size
;
4050 int ret
= read_op
.prepare();
4054 attrset
.erase(RGW_ATTR_ID_TAG
);
4055 attrset
.erase(RGW_ATTR_TAIL_TAG
);
4057 return copy_obj_data(rctx
, dest_bucket_info
, dest_bucket_info
.placement_rule
,
4058 read_op
, obj_size
- 1, obj
, NULL
, mtime
, attrset
,
4059 0, real_time(), NULL
);
4062 struct obj_time_weight
{
4064 uint32_t zone_short_id
;
4066 bool high_precision
;
4068 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
4070 bool compare_low_precision(const obj_time_weight
& rhs
) {
4071 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
4072 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
4081 if (!zone_short_id
|| !rhs
.zone_short_id
) {
4082 /* don't compare zone ids, if one wasn't provided */
4085 if (zone_short_id
!= rhs
.zone_short_id
) {
4086 return (zone_short_id
< rhs
.zone_short_id
);
4088 return (pg_ver
< rhs
.pg_ver
);
4092 bool operator<(const obj_time_weight
& rhs
) {
4093 if (!high_precision
|| !rhs
.high_precision
) {
4094 return compare_low_precision(rhs
);
4096 if (mtime
> rhs
.mtime
) {
4099 if (mtime
< rhs
.mtime
) {
4102 if (!zone_short_id
|| !rhs
.zone_short_id
) {
4103 /* don't compare zone ids, if one wasn't provided */
4106 if (zone_short_id
!= rhs
.zone_short_id
) {
4107 return (zone_short_id
< rhs
.zone_short_id
);
4109 return (pg_ver
< rhs
.pg_ver
);
4112 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
4114 zone_short_id
= _short_id
;
4118 void init(RGWObjState
*state
) {
4119 mtime
= state
->mtime
;
4120 zone_short_id
= state
->zone_short_id
;
4121 pg_ver
= state
->pg_ver
;
4125 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
4128 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
4129 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
4135 class RGWGetExtraDataCB
: public RGWHTTPStreamRWRequest::ReceiveCB
{
4136 bufferlist extra_data
;
4138 RGWGetExtraDataCB() {}
4139 int handle_data(bufferlist
& bl
, bool *pause
) override
{
4140 int bl_len
= (int)bl
.length();
4141 if (extra_data
.length() < extra_data_len
) {
4142 off_t max
= extra_data_len
- extra_data
.length();
4146 bl
.splice(0, max
, &extra_data
);
4151 bufferlist
& get_extra_data() {
4156 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
4157 const rgw_user
& user_id
,
4159 const string
& source_zone
,
4161 RGWBucketInfo
& src_bucket_info
,
4162 real_time
*src_mtime
,
4164 const real_time
*mod_ptr
,
4165 const real_time
*unmod_ptr
,
4166 bool high_precision_time
,
4167 const char *if_match
,
4168 const char *if_nomatch
,
4169 map
<string
, bufferlist
> *pattrs
,
4170 map
<string
, string
> *pheaders
,
4175 /* source is in a different zonegroup, copy from there */
4177 RGWRESTStreamRWRequest
*in_stream_req
;
4179 map
<string
, bufferlist
> src_attrs
;
4180 append_rand_alpha(cct
, tag
, tag
, 32);
4181 obj_time_weight set_mtime_weight
;
4182 set_mtime_weight
.high_precision
= high_precision_time
;
4185 if (source_zone
.empty()) {
4186 if (src_bucket_info
.zonegroup
.empty()) {
4187 /* source is in the master zonegroup */
4188 conn
= svc
.zone
->get_master_conn();
4190 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
4191 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
4192 if (iter
== zonegroup_conn_map
.end()) {
4193 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
4196 conn
= iter
->second
;
4199 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
4200 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
4201 if (iter
== zone_conn_map
.end()) {
4202 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
4205 conn
= iter
->second
;
4208 RGWGetExtraDataCB cb
;
4209 map
<string
, string
> req_headers
;
4210 real_time set_mtime
;
4212 const real_time
*pmod
= mod_ptr
;
4214 obj_time_weight dest_mtime_weight
;
4216 constexpr bool prepend_meta
= true;
4217 constexpr bool get_op
= true;
4218 constexpr bool rgwx_stat
= true;
4219 constexpr bool sync_manifest
= true;
4220 constexpr bool skip_decrypt
= true;
4221 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
4222 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
4223 prepend_meta
, get_op
, rgwx_stat
,
4224 sync_manifest
, skip_decrypt
,
4225 true, &cb
, &in_stream_req
);
4230 ret
= conn
->complete_request(in_stream_req
, nullptr, &set_mtime
, psize
, nullptr, pheaders
);
4235 bufferlist
& extra_data_bl
= cb
.get_extra_data();
4236 if (extra_data_bl
.length()) {
4238 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
4239 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
4243 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
4245 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
4249 *src_mtime
= set_mtime
;
4253 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
4254 if (iter
!= src_attrs
.end()) {
4255 bufferlist
& etagbl
= iter
->second
;
4256 *petag
= etagbl
.to_str();
4257 while (petag
->size() > 0 && (*petag
)[petag
->size() - 1] == '\0') {
4258 *petag
= petag
->substr(0, petag
->size() - 1);
4264 *pattrs
= std::move(src_attrs
);
4270 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
4271 const rgw_user
& user_id
,
4273 const string
& source_zone
,
4274 const rgw_obj
& dest_obj
,
4275 const rgw_obj
& src_obj
,
4276 RGWBucketInfo
& dest_bucket_info
,
4277 RGWBucketInfo
& src_bucket_info
,
4278 std::optional
<rgw_placement_rule
> dest_placement_rule
,
4279 real_time
*src_mtime
,
4281 const real_time
*mod_ptr
,
4282 const real_time
*unmod_ptr
,
4283 bool high_precision_time
,
4284 const char *if_match
,
4285 const char *if_nomatch
,
4288 map
<string
, bufferlist
>& attrs
,
4289 RGWObjCategory category
,
4290 std::optional
<uint64_t> olh_epoch
,
4291 real_time delete_at
,
4294 void (*progress_cb
)(off_t
, void *),
4295 void *progress_data
,
4296 rgw_zone_set
*zones_trace
,
4297 std::optional
<uint64_t>* bytes_transferred
)
4299 /* source is in a different zonegroup, copy from there */
4301 RGWRESTStreamRWRequest
*in_stream_req
;
4304 append_rand_alpha(cct
, tag
, tag
, 32);
4305 obj_time_weight set_mtime_weight
;
4306 set_mtime_weight
.high_precision
= high_precision_time
;
4309 rgw::AioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4310 using namespace rgw::putobj
;
4311 const rgw_placement_rule
*ptail_rule
= (dest_placement_rule
? &(*dest_placement_rule
) : nullptr);
4312 AtomicObjectProcessor
processor(&aio
, this, dest_bucket_info
, ptail_rule
, user_id
,
4313 obj_ctx
, dest_obj
, olh_epoch
, tag
);
4315 auto& zone_conn_map
= svc
.zone
->get_zone_conn_map();
4316 auto& zonegroup_conn_map
= svc
.zone
->get_zonegroup_conn_map();
4317 if (source_zone
.empty()) {
4318 if (dest_bucket_info
.zonegroup
.empty()) {
4319 /* source is in the master zonegroup */
4320 conn
= svc
.zone
->get_master_conn();
4322 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
4323 if (iter
== zonegroup_conn_map
.end()) {
4324 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
4327 conn
= iter
->second
;
4330 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
4331 if (iter
== zone_conn_map
.end()) {
4332 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
4335 conn
= iter
->second
;
4338 string obj_name
= dest_obj
.bucket
.name
+ "/" + dest_obj
.get_oid();
4340 boost::optional
<RGWPutObj_Compress
> compressor
;
4341 CompressorRef plugin
;
4343 rgw_placement_rule dest_rule
;
4344 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, progress_cb
, progress_data
,
4345 [&](const map
<string
, bufferlist
>& obj_attrs
) {
4347 auto iter
= obj_attrs
.find(RGW_ATTR_STORAGE_CLASS
);
4348 if (iter
!= obj_attrs
.end()) {
4349 dest_rule
.storage_class
= iter
->second
.to_str();
4350 dest_rule
.inherit_from(dest_bucket_info
.placement_rule
);
4351 processor
.set_tail_placement(std::move(dest_rule
));
4352 ptail_rule
= &dest_rule
;
4354 ptail_rule
= &dest_bucket_info
.placement_rule
;
4357 const auto& compression_type
= svc
.zone
->get_zone_params().get_compression_type(*ptail_rule
);
4358 if (compression_type
!= "none") {
4359 plugin
= Compressor::create(cct
, compression_type
);
4361 ldout(cct
, 1) << "Cannot load plugin for compression type "
4362 << compression_type
<< dendl
;
4366 int ret
= processor
.prepare();
4374 real_time set_mtime
;
4375 uint64_t expected_size
= 0;
4377 RGWObjState
*dest_state
= NULL
;
4379 const real_time
*pmod
= mod_ptr
;
4381 obj_time_weight dest_mtime_weight
;
4383 if (copy_if_newer
) {
4384 /* need to get mtime for destination */
4385 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
4389 if (!real_clock::is_zero(dest_state
->mtime
)) {
4390 dest_mtime_weight
.init(dest_state
);
4391 pmod
= &dest_mtime_weight
.mtime
;
4395 static constexpr bool prepend_meta
= true;
4396 static constexpr bool get_op
= true;
4397 static constexpr bool rgwx_stat
= false;
4398 static constexpr bool sync_manifest
= true;
4399 static constexpr bool skip_decrypt
= true;
4400 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
4401 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
4402 prepend_meta
, get_op
, rgwx_stat
,
4403 sync_manifest
, skip_decrypt
,
4405 &cb
, &in_stream_req
);
4410 ret
= conn
->complete_request(in_stream_req
, &etag
, &set_mtime
,
4411 &expected_size
, nullptr, nullptr);
4419 if (cb
.get_data_len() != expected_size
) {
4421 ldout(cct
, 0) << "ERROR: object truncated during fetching, expected "
4422 << expected_size
<< " bytes but received " << cb
.get_data_len() << dendl
;
4425 if (compressor
&& compressor
->is_compressed()) {
4427 RGWCompressionInfo cs_info
;
4428 cs_info
.compression_type
= plugin
->get_type_name();
4429 cs_info
.orig_size
= cb
.get_data_len();
4430 cs_info
.blocks
= move(compressor
->get_compression_blocks());
4431 encode(cs_info
, tmp
);
4432 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
4435 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4436 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
4438 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
4439 if (iter
!= cb
.get_attrs().end()) {
4441 decode(delete_at
, iter
->second
);
4442 } catch (buffer::error
& err
) {
4443 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
4449 *src_mtime
= set_mtime
;
4453 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
4454 if (iter
!= cb
.get_attrs().end()) {
4455 *petag
= iter
->second
.to_str();
4459 //erase the append attr
4460 cb
.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM
);
4462 if (source_zone
.empty()) {
4463 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
4465 attrs
= cb
.get_attrs();
4468 if (copy_if_newer
) {
4469 uint64_t pg_ver
= 0;
4470 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
4471 if (i
!= attrs
.end() && i
->second
.length() > 0) {
4472 auto iter
= i
->second
.cbegin();
4474 decode(pg_ver
, iter
);
4475 } catch (buffer::error
& err
) {
4476 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
4477 /* non critical error */
4480 set_mtime_weight
.init(set_mtime
, svc
.zone
->get_zone_short_id(), pg_ver
);
4483 #define MAX_COMPLETE_RETRY 100
4484 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
4485 bool canceled
= false;
4486 ret
= processor
.complete(cb
.get_data_len(), etag
, mtime
, set_mtime
,
4487 attrs
, delete_at
, nullptr, nullptr, nullptr,
4488 zones_trace
, &canceled
);
4492 if (copy_if_newer
&& canceled
) {
4493 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
4494 obj_ctx
.invalidate(dest_obj
); /* object was overwritten */
4495 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
4497 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
4500 dest_mtime_weight
.init(dest_state
);
4501 dest_mtime_weight
.high_precision
= high_precision_time
;
4502 if (!dest_state
->exists
||
4503 dest_mtime_weight
< set_mtime_weight
) {
4504 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4507 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
4513 if (i
== MAX_COMPLETE_RETRY
) {
4514 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
4519 if (bytes_transferred
) {
4520 *bytes_transferred
= cb
.get_data_len();
4524 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
4525 // we may have already fetched during sync of OP_ADD, but were waiting
4526 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4527 if (olh_epoch
&& *olh_epoch
> 0) {
4528 constexpr bool log_data_change
= true;
4529 ret
= set_olh(obj_ctx
, dest_bucket_info
, dest_obj
, false, nullptr,
4530 *olh_epoch
, real_time(), false, zones_trace
, log_data_change
);
4532 // we already have the latest copy
4540 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
4541 map
<string
, bufferlist
>& src_attrs
,
4542 RGWRados::Object::Read
& read_op
,
4543 const rgw_user
& user_id
,
4549 RGWRESTStreamS3PutObj
*out_stream_req
;
4551 auto rest_master_conn
= svc
.zone
->get_master_conn();
4553 int ret
= rest_master_conn
->put_obj_async(user_id
, dest_obj
, astate
->size
, src_attrs
, true, &out_stream_req
);
4558 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb());
4560 delete out_stream_req
;
4564 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
4573 * dest_obj: the object to copy into
4574 * src_obj: the object to copy from
4575 * attrs: usage depends on attrs_mod parameter
4576 * attrs_mod: the modification mode of the attrs, may have the following values:
4577 * ATTRSMOD_NONE - the attributes of the source object will be
4578 * copied without modifications, attrs parameter is ignored;
4579 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4580 * parameter, source object attributes are not copied;
4581 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4582 * are overwritten by values contained in attrs parameter.
4583 * err: stores any errors resulting from the get of the original object
4584 * Returns: 0 on success, -ERR# otherwise.
4586 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
4587 const rgw_user
& user_id
,
4589 const string
& source_zone
,
4592 RGWBucketInfo
& dest_bucket_info
,
4593 RGWBucketInfo
& src_bucket_info
,
4594 const rgw_placement_rule
& dest_placement
,
4595 real_time
*src_mtime
,
4597 const real_time
*mod_ptr
,
4598 const real_time
*unmod_ptr
,
4599 bool high_precision_time
,
4600 const char *if_match
,
4601 const char *if_nomatch
,
4604 map
<string
, bufferlist
>& attrs
,
4605 RGWObjCategory category
,
4607 real_time delete_at
,
4611 void (*progress_cb
)(off_t
, void *),
4612 void *progress_data
)
4616 rgw_obj shadow_obj
= dest_obj
;
4622 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
4623 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
4625 auto& zonegroup
= svc
.zone
->get_zonegroup();
4627 remote_dest
= !zonegroup
.equals(dest_bucket_info
.zonegroup
);
4628 remote_src
= !zonegroup
.equals(src_bucket_info
.zonegroup
);
4630 if (remote_src
&& remote_dest
) {
4631 ldout(cct
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
4635 ldout(cct
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
4637 if (remote_src
|| !source_zone
.empty()) {
4638 return fetch_remote_obj(obj_ctx
, user_id
, info
, source_zone
,
4639 dest_obj
, src_obj
, dest_bucket_info
, src_bucket_info
,
4640 dest_placement
, src_mtime
, mtime
, mod_ptr
,
4641 unmod_ptr
, high_precision_time
,
4642 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
4643 olh_epoch
, delete_at
, ptag
, petag
, progress_cb
, progress_data
);
4646 map
<string
, bufferlist
> src_attrs
;
4647 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
4648 RGWRados::Object::Read
read_op(&src_op_target
);
4650 read_op
.conds
.mod_ptr
= mod_ptr
;
4651 read_op
.conds
.unmod_ptr
= unmod_ptr
;
4652 read_op
.conds
.high_precision_time
= high_precision_time
;
4653 read_op
.conds
.if_match
= if_match
;
4654 read_op
.conds
.if_nomatch
= if_nomatch
;
4655 read_op
.params
.attrs
= &src_attrs
;
4656 read_op
.params
.lastmod
= src_mtime
;
4657 read_op
.params
.obj_size
= &obj_size
;
4659 ret
= read_op
.prepare();
4663 if (src_attrs
.count(RGW_ATTR_CRYPT_MODE
)) {
4664 // Current implementation does not follow S3 spec and even
4665 // may result in data corruption silently when copying
4666 // multipart objects acorss pools. So reject COPY operations
4667 //on encrypted objects before it is fully functional.
4668 ldout(cct
, 0) << "ERROR: copy op for encrypted object " << src_obj
4669 << " has not been implemented." << dendl
;
4670 return -ERR_NOT_IMPLEMENTED
;
4673 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
4674 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
4676 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
4677 attrs
.erase(RGW_ATTR_ID_TAG
);
4678 attrs
.erase(RGW_ATTR_PG_VER
);
4679 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
4680 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
4681 if (cmp
!= src_attrs
.end())
4682 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
4684 RGWObjManifest manifest
;
4685 RGWObjState
*astate
= NULL
;
4687 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
);
4692 vector
<rgw_raw_obj
> ref_objs
;
4695 /* dest is in a different zonegroup, copy it there */
4696 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
4698 uint64_t max_chunk_size
;
4700 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
4702 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
4709 const rgw_placement_rule
*src_rule
{nullptr};
4711 if (astate
->has_manifest
) {
4712 src_rule
= &astate
->manifest
.get_tail_placement().placement_rule
;
4713 ldout(cct
, 20) << __func__
<< "(): manifest src_rule=" << src_rule
->to_str() << dendl
;
4716 if (!src_rule
|| src_rule
->empty()) {
4717 src_rule
= &src_bucket_info
.placement_rule
;
4720 if (!get_obj_data_pool(*src_rule
, src_obj
, &src_pool
)) {
4721 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
4725 if (!get_obj_data_pool(dest_placement
, dest_obj
, &dest_pool
)) {
4726 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
4730 ldout(cct
, 20) << __func__
<< "(): src_rule=" << src_rule
->to_str() << " src_pool=" << src_pool
4731 << " dest_rule=" << dest_placement
.to_str() << " dest_pool=" << dest_pool
<< dendl
;
4733 bool copy_data
= !astate
->has_manifest
||
4734 (*src_rule
!= dest_placement
) ||
4735 (src_pool
!= dest_pool
);
4737 bool copy_first
= false;
4738 if (astate
->has_manifest
) {
4739 if (!astate
->manifest
.has_tail()) {
4742 uint64_t head_size
= astate
->manifest
.get_head_size();
4744 if (head_size
> 0) {
4745 if (head_size
> max_chunk_size
) {
4755 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4756 if (iter
!= attrs
.end()) {
4757 *petag
= iter
->second
.to_str();
4761 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
4762 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4763 return copy_obj_data(obj_ctx
, dest_bucket_info
, dest_placement
, read_op
, obj_size
- 1, dest_obj
,
4764 mtime
, real_time(), attrs
, olh_epoch
, delete_at
, petag
);
4767 RGWObjManifest::obj_iterator miter
= astate
->manifest
.obj_begin();
4769 if (copy_first
) { // we need to copy first chunk, not increase refcount
4774 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
4779 bufferlist first_chunk
;
4781 bool copy_itself
= (dest_obj
== src_obj
);
4782 RGWObjManifest
*pmanifest
;
4783 ldout(cct
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
4785 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
4786 RGWRados::Object::Write
write_op(&dest_op_target
);
4795 append_rand_alpha(cct
, tag
, tag
, 32);
4799 attrs
.erase(RGW_ATTR_TAIL_TAG
);
4800 manifest
= astate
->manifest
;
4801 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
4802 if (tail_placement
.bucket
.name
.empty()) {
4803 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
4806 for (; miter
!= astate
->manifest
.obj_end(); ++miter
) {
4807 ObjectWriteOperation op
;
4808 ref_tag
= tag
+ '\0';
4809 cls_refcount_get(op
, ref_tag
, true);
4810 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
4811 ref
.ioctx
.locator_set_key(loc
.loc
);
4813 ret
= ref
.ioctx
.operate(loc
.oid
, &op
);
4818 ref_objs
.push_back(loc
);
4821 pmanifest
= &manifest
;
4823 pmanifest
= &astate
->manifest
;
4824 /* don't send the object's tail for garbage collection */
4825 astate
->keep_tail
= true;
4829 ret
= read_op
.read(0, max_chunk_size
, first_chunk
);
4834 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
4836 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
4839 write_op
.meta
.data
= &first_chunk
;
4840 write_op
.meta
.manifest
= pmanifest
;
4841 write_op
.meta
.ptag
= &tag
;
4842 write_op
.meta
.owner
= dest_bucket_info
.owner
;
4843 write_op
.meta
.mtime
= mtime
;
4844 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
4845 write_op
.meta
.category
= category
;
4846 write_op
.meta
.olh_epoch
= olh_epoch
;
4847 write_op
.meta
.delete_at
= delete_at
;
4848 write_op
.meta
.modify_tail
= !copy_itself
;
4850 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
);
4859 vector
<rgw_raw_obj
>::iterator riter
;
4861 /* rollback reference */
4862 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
4863 ObjectWriteOperation op
;
4864 cls_refcount_put(op
, tag
, true);
4866 ref
.ioctx
.locator_set_key(riter
->loc
);
4868 int r
= ref
.ioctx
.operate(riter
->oid
, &op
);
4870 ldout(cct
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
4878 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
4879 RGWBucketInfo
& dest_bucket_info
,
4880 const rgw_placement_rule
& dest_placement
,
4881 RGWRados::Object::Read
& read_op
, off_t end
,
4882 const rgw_obj
& dest_obj
,
4884 real_time set_mtime
,
4885 map
<string
, bufferlist
>& attrs
,
4887 real_time delete_at
,
4891 append_rand_alpha(cct
, tag
, tag
, 32);
4893 rgw::AioThrottle
aio(cct
->_conf
->rgw_put_obj_min_window_size
);
4894 using namespace rgw::putobj
;
4895 AtomicObjectProcessor
processor(&aio
, this, dest_bucket_info
, &dest_placement
,
4896 dest_bucket_info
.owner
, obj_ctx
,
4897 dest_obj
, olh_epoch
, tag
);
4898 int ret
= processor
.prepare();
4906 ret
= read_op
.read(ofs
, end
, bl
);
4908 ldout(cct
, 0) << "ERROR: fail to read object data, ret = " << ret
<< dendl
;
4912 uint64_t read_len
= ret
;
4913 ret
= processor
.process(std::move(bl
), ofs
);
4919 } while (ofs
<= end
);
4922 ret
= processor
.process({}, ofs
);
4928 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
4929 if (iter
!= attrs
.end()) {
4930 bufferlist
& bl
= iter
->second
;
4937 uint64_t accounted_size
;
4939 bool compressed
{false};
4940 RGWCompressionInfo cs_info
;
4941 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
4943 ldout(cct
, 0) << "ERROR: failed to read compression info" << dendl
;
4946 // pass original size if compressed
4947 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
4950 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
,
4951 nullptr, nullptr, nullptr, nullptr, nullptr);
4954 int RGWRados::transition_obj(RGWObjectCtx
& obj_ctx
,
4955 RGWBucketInfo
& bucket_info
,
4957 const rgw_placement_rule
& placement_rule
,
4958 const real_time
& mtime
,
4961 map
<string
, bufferlist
> attrs
;
4962 real_time read_mtime
;
4965 RGWRados::Object
op_target(this, bucket_info
, obj_ctx
, obj
);
4966 RGWRados::Object::Read
read_op(&op_target
);
4968 read_op
.params
.attrs
= &attrs
;
4969 read_op
.params
.lastmod
= &read_mtime
;
4970 read_op
.params
.obj_size
= &obj_size
;
4972 int ret
= read_op
.prepare();
4977 if (read_mtime
!= mtime
) {
4982 ret
= copy_obj_data(obj_ctx
,
4988 nullptr /* pmtime */,
4993 nullptr /* petag */);
5001 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
)
5003 std::vector
<rgw_bucket_dir_entry
> ent_list
;
5004 rgw_obj_index_key marker
;
5009 constexpr uint NUM_ENTRIES
= 1000u;
5010 int r
= cls_bucket_list_unordered(bucket_info
,
5023 for (auto const& dirent
: ent_list
) {
5026 if (rgw_obj_key::oid_to_key_in_ns(dirent
.key
.name
, &obj
, ns
))
5029 } while (is_truncated
);
5036 * bucket: the name of the bucket to delete
5037 * Returns 0 on success, -ERR# otherwise.
5039 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, bool check_empty
)
5041 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5042 librados::IoCtx index_ctx
;
5043 map
<int, string
> bucket_objs
;
5044 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5049 r
= check_bucket_empty(bucket_info
);
5055 r
= rgw_bucket_delete_bucket_obj(this, bucket
.tenant
, bucket
.name
, objv_tracker
);
5059 /* if the bucket is not synced we can remove the meta file */
5060 if (!svc
.zone
->is_syncing_bucket_meta(bucket
)) {
5061 RGWObjVersionTracker objv_tracker
;
5062 r
= rgw_bucket_instance_remove_entry(this, bucket
.get_key(), &objv_tracker
);
5067 /* remove bucket index objects asynchronously by best effort */
5068 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
5070 cct
->_conf
->rgw_bucket_index_max_aio
)();
5076 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
5079 map
<string
, bufferlist
> attrs
;
5080 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5082 if (bucket
.bucket_id
.empty()) {
5083 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
5085 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
);
5088 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
5092 info
.owner
= owner
.get_id();
5094 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
5096 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
5104 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
5108 vector
<rgw_bucket
>::iterator iter
;
5110 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
5111 rgw_bucket
& bucket
= *iter
;
5113 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
5115 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
5118 map
<string
, bufferlist
> attrs
;
5119 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5120 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
5122 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
5127 info
.flags
&= ~BUCKET_SUSPENDED
;
5129 info
.flags
|= BUCKET_SUSPENDED
;
5132 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
5134 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
5142 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
5144 RGWBucketInfo bucket_info
;
5145 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5146 int ret
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
);
5151 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
5155 int RGWRados::Object::complete_atomic_modification()
5157 if (!state
->has_manifest
|| state
->keep_tail
)
5160 cls_rgw_obj_chain chain
;
5161 store
->update_gc_chain(obj
, state
->manifest
, &chain
);
5163 if (chain
.empty()) {
5167 string tag
= (state
->tail_tag
.length() > 0 ? state
->tail_tag
.to_str() : state
->obj_tag
.to_str());
5168 return store
->gc
->send_chain(chain
, tag
, false); // do it async
5171 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
5173 RGWObjManifest::obj_iterator iter
;
5174 rgw_raw_obj raw_head
;
5175 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
5176 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
5177 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
5178 if (mobj
== raw_head
)
5180 cls_rgw_obj_key
key(mobj
.oid
);
5181 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
5185 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
, bool sync
)
5187 return gc
->send_chain(chain
, tag
, sync
);
5190 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
5191 librados::IoCtx
& index_ctx
,
5194 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5195 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5199 if (bucket
.bucket_id
.empty()) {
5200 ldout(cct
, 0) << "ERROR: empty bucket id for bucket operation" << dendl
;
5204 bucket_oid
= dir_oid_prefix
;
5205 bucket_oid
.append(bucket
.bucket_id
);
5210 int RGWRados::open_bucket_index_base(const RGWBucketInfo
& bucket_info
,
5211 librados::IoCtx
& index_ctx
,
5212 string
& bucket_oid_base
) {
5213 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5214 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5218 if (bucket
.bucket_id
.empty()) {
5219 ldout(cct
, 0) << "ERROR: empty bucket_id for bucket operation" << dendl
;
5223 bucket_oid_base
= dir_oid_prefix
;
5224 bucket_oid_base
.append(bucket
.bucket_id
);
5230 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
5231 librados::IoCtx
& index_ctx
,
5232 map
<int, string
>& bucket_objs
,
5234 map
<int, string
> *bucket_instance_ids
) {
5235 string bucket_oid_base
;
5236 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5241 get_bucket_index_objects(bucket_oid_base
, bucket_info
.num_shards
, bucket_objs
, shard_id
);
5242 if (bucket_instance_ids
) {
5243 get_bucket_instance_ids(bucket_info
, shard_id
, bucket_instance_ids
);
5248 template<typename T
>
5249 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5250 map
<int, string
>& oids
, map
<int, T
>& bucket_objs
,
5251 int shard_id
, map
<int, string
> *bucket_instance_ids
)
5253 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, bucket_instance_ids
);
5257 map
<int, string
>::const_iterator iter
= oids
.begin();
5258 for (; iter
!= oids
.end(); ++iter
) {
5259 bucket_objs
[iter
->first
] = T();
5264 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5265 const string
& obj_key
, string
*bucket_obj
, int *shard_id
)
5267 string bucket_oid_base
;
5268 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5272 RGWObjectCtx
obj_ctx(this);
5274 ret
= get_bucket_index_object(bucket_oid_base
, obj_key
, bucket_info
.num_shards
,
5275 (RGWBucketInfo::BIShardsHashType
)bucket_info
.bucket_index_shard_hash_type
, bucket_obj
, shard_id
);
5277 ldout(cct
, 10) << "get_bucket_index_object() returned ret=" << ret
<< dendl
;
5283 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
5284 int shard_id
, string
*bucket_obj
)
5286 string bucket_oid_base
;
5287 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
5291 RGWObjectCtx
obj_ctx(this);
5293 get_bucket_index_object(bucket_oid_base
, bucket_info
.num_shards
,
5294 shard_id
, bucket_obj
);
5298 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
5299 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
5301 for (const auto& pair
: header
.stats
) {
5302 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
5303 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
5305 RGWStorageStats
& s
= stats
[category
];
5307 s
.category
= category
;
5308 s
.size
+= header_stats
.total_size
;
5309 s
.size_rounded
+= header_stats
.total_size_rounded
;
5310 s
.size_utilized
+= header_stats
.actual_size
;
5311 s
.num_objects
+= header_stats
.num_entries
;
5315 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
5316 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
5317 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
5319 librados::IoCtx index_ctx
;
5320 // key - bucket index object id
5321 // value - bucket index check OP returned result with the given bucket index object (shard)
5322 map
<int, string
> oids
;
5323 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
5325 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, bucket_objs_ret
);
5330 ret
= CLSRGWIssueBucketCheck(index_ctx
, oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5335 // Aggregate results (from different shards if there is any)
5336 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
5337 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
5338 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
5339 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
5345 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
5347 librados::IoCtx index_ctx
;
5348 map
<int, string
> bucket_objs
;
5350 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5355 return CLSRGWIssueBucketRebuild(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5358 int RGWRados::bucket_set_reshard(const RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
5360 librados::IoCtx index_ctx
;
5361 map
<int, string
> bucket_objs
;
5363 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
5368 return CLSRGWIssueSetBucketResharding(index_ctx
, bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5371 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
5373 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
5374 std::string oid
, key
;
5375 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5379 RGWObjState
*state
= NULL
;
5381 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false);
5385 if (!state
->is_atomic
) {
5386 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
5392 if (state
->tail_tag
.length() > 0) {
5393 tag
= state
->tail_tag
.c_str();
5394 } else if (state
->obj_tag
.length() > 0) {
5395 tag
= state
->obj_tag
.c_str();
5397 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
5401 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
5403 return gc
->defer_chain(tag
, false);
5406 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
5408 list
<string
> prefixes
;
5409 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
5410 cls_rgw_remove_obj(op
, prefixes
);
5413 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
5415 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
5418 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
5420 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
5426 * bucket: name of the bucket storing the object
5427 * obj: name of the object to delete
5428 * Returns: 0 on success, -ERR# otherwise.
5430 int RGWRados::Object::Delete::delete_obj()
5432 RGWRados
*store
= target
->get_store();
5433 rgw_obj
& src_obj
= target
->get_obj();
5434 const string
& instance
= src_obj
.key
.instance
;
5435 rgw_obj obj
= src_obj
;
5437 if (instance
== "null") {
5438 obj
.key
.instance
.clear();
5441 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
5443 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
5444 if (instance
.empty() || explicit_marker_version
) {
5445 rgw_obj marker
= obj
;
5447 if (!params
.marker_version_id
.empty()) {
5448 if (params
.marker_version_id
!= "null") {
5449 marker
.key
.set_instance(params
.marker_version_id
);
5451 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
5452 store
->gen_rand_obj_instance_name(&marker
);
5455 result
.version_id
= marker
.key
.instance
;
5456 if (result
.version_id
.empty())
5457 result
.version_id
= "null";
5458 result
.delete_marker
= true;
5460 struct rgw_bucket_dir_entry_meta meta
;
5462 meta
.owner
= params
.obj_owner
.get_id().to_str();
5463 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
5465 if (real_clock::is_zero(params
.mtime
)) {
5466 meta
.mtime
= real_clock::now();
5468 meta
.mtime
= params
.mtime
;
5471 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, params
.zones_trace
);
5476 rgw_bucket_dir_entry dirent
;
5478 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
5482 result
.delete_marker
= dirent
.is_delete_marker();
5483 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, params
.zones_trace
);
5487 result
.version_id
= instance
;
5491 int r
= target
->get_bucket_shard(&bs
);
5493 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
5497 if (target
->bucket_info
.datasync_flag_enabled()) {
5498 r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
5500 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
5509 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
5515 r
= target
->get_state(&state
, false);
5519 ObjectWriteOperation op
;
5521 if (!real_clock::is_zero(params
.unmod_since
)) {
5522 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
5523 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
5524 if (!params
.high_precision_time
) {
5529 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
5530 if (ctime
> unmod
) {
5531 return -ERR_PRECONDITION_FAILED
;
5534 /* only delete object if mtime is less than or equal to params.unmod_since */
5535 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
5537 uint64_t obj_accounted_size
= state
->accounted_size
;
5539 if (!real_clock::is_zero(params
.expiration_time
)) {
5541 real_time delete_at
;
5543 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
5545 auto iter
= bl
.cbegin();
5546 decode(delete_at
, iter
);
5547 } catch (buffer::error
& err
) {
5548 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
5552 if (params
.expiration_time
!= delete_at
) {
5553 return -ERR_PRECONDITION_FAILED
;
5556 return -ERR_PRECONDITION_FAILED
;
5560 if (!state
->exists
) {
5561 target
->invalidate_state();
5565 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true, false);
5569 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
5571 RGWRados::Bucket
bop(store
, bucket_info
);
5572 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5574 index_op
.set_zones_trace(params
.zones_trace
);
5575 index_op
.set_bilog_flags(params
.bilog_flags
);
5577 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
);
5581 store
->remove_rgw_head_obj(op
);
5582 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
5584 /* raced with another operation, object state is indeterminate */
5585 const bool need_invalidate
= (r
== -ECANCELED
);
5587 int64_t poolid
= ref
.ioctx
.get_id();
5589 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
5590 if (obj_tombstone_cache
) {
5591 tombstone_entry entry
{*state
};
5592 obj_tombstone_cache
->add(obj
, entry
);
5594 r
= index_op
.complete_del(poolid
, ref
.ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
5596 int ret
= target
->complete_atomic_modification();
5598 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
5600 /* other than that, no need to propagate error */
5602 int ret
= index_op
.cancel();
5604 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
5608 if (need_invalidate
) {
5609 target
->invalidate_state();
5615 /* update quota cache */
5616 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_accounted_size
);
5621 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
5622 const RGWBucketInfo
& bucket_info
,
5624 int versioning_status
,
5625 uint16_t bilog_flags
,
5626 const real_time
& expiration_time
,
5627 rgw_zone_set
*zones_trace
)
5629 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
5630 RGWRados::Object::Delete
del_op(&del_target
);
5632 del_op
.params
.bucket_owner
= bucket_info
.owner
;
5633 del_op
.params
.versioning_status
= versioning_status
;
5634 del_op
.params
.bilog_flags
= bilog_flags
;
5635 del_op
.params
.expiration_time
= expiration_time
;
5636 del_op
.params
.zones_trace
= zones_trace
;
5638 return del_op
.delete_obj();
5641 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
5644 int r
= get_raw_obj_ref(obj
, &ref
);
5649 ObjectWriteOperation op
;
5652 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
5659 int RGWRados::delete_obj_index(const rgw_obj
& obj
)
5661 std::string oid
, key
;
5662 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5664 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
5666 RGWBucketInfo bucket_info
;
5667 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
);
5669 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
5673 RGWRados::Bucket
bop(this, bucket_info
);
5674 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
5676 real_time removed_mtime
;
5677 int r
= index_op
.complete_del(-1 /* pool */, 0, removed_mtime
, NULL
);
5682 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
5686 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
5687 if (mi
!= manifest
.obj_end()) {
5688 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
5690 tag
= mi
.get_location().get_raw_obj(store
).oid
;
5694 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
5695 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
5697 hash
.Update((const unsigned char *)manifest_bl
.c_str(), manifest_bl
.length());
5699 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
5700 if (iter
!= attrset
.end()) {
5701 bufferlist
& bl
= iter
->second
;
5702 hash
.Update((const unsigned char *)bl
.c_str(), bl
.length());
5706 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
5707 tag
.append(md5_str
);
5709 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
5711 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
5714 static bool is_olh(map
<string
, bufferlist
>& attrs
)
5716 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
5717 return (iter
!= attrs
.end());
5720 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
5722 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
5723 return (iter
!= attrs
.end());
5726 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5727 RGWObjState
*olh_state
, RGWObjState
**target_state
)
5729 ceph_assert(olh_state
->is_olh
);
5732 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
5736 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false);
5744 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
5745 RGWObjState
**state
, bool follow_olh
, bool assume_noent
)
5751 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
5753 RGWObjState
*s
= rctx
->get_state(obj
);
5754 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
5757 if (s
->is_olh
&& need_follow_olh
) {
5758 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
5765 rgw_raw_obj raw_obj
;
5766 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
5770 if (!assume_noent
) {
5771 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
);
5776 s
->has_attrs
= true;
5777 tombstone_entry entry
;
5778 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
5779 s
->mtime
= entry
.mtime
;
5780 s
->zone_short_id
= entry
.zone_short_id
;
5781 s
->pg_ver
= entry
.pg_ver
;
5782 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
5783 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
5785 s
->mtime
= real_time();
5793 s
->has_attrs
= true;
5794 s
->accounted_size
= s
->size
;
5796 auto iter
= s
->attrset
.find(RGW_ATTR_ETAG
);
5797 if (iter
!= s
->attrset
.end()) {
5798 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5799 bufferlist
& bletag
= iter
->second
;
5800 if (bletag
.length() > 0 && bletag
[bletag
.length() - 1] == '\0') {
5802 bletag
.splice(0, bletag
.length() - 1, &newbl
);
5803 bletag
.claim(newbl
);
5807 iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
5808 const bool compressed
= (iter
!= s
->attrset
.end());
5810 // use uncompressed size for accounted_size
5812 RGWCompressionInfo info
;
5813 auto p
= iter
->second
.cbegin();
5815 s
->accounted_size
= info
.orig_size
;
5816 } catch (buffer::error
&) {
5817 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
5822 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
5823 if (iter
!= s
->attrset
.end()) {
5824 bufferlist bl
= iter
->second
;
5825 bufferlist::iterator it
= bl
.begin();
5826 it
.copy(bl
.length(), s
->shadow_obj
);
5827 s
->shadow_obj
[bl
.length()] = '\0';
5829 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
5830 auto ttiter
= s
->attrset
.find(RGW_ATTR_TAIL_TAG
);
5831 if (ttiter
!= s
->attrset
.end()) {
5832 s
->tail_tag
= s
->attrset
[RGW_ATTR_TAIL_TAG
];
5835 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
5836 if (manifest_bl
.length()) {
5837 auto miter
= manifest_bl
.cbegin();
5839 decode(s
->manifest
, miter
);
5840 s
->has_manifest
= true;
5841 s
->manifest
.set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
5842 broken due to old bugs */
5843 s
->size
= s
->manifest
.get_obj_size();
5845 s
->accounted_size
= s
->size
;
5846 } catch (buffer::error
& err
) {
5847 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
5850 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
.get_obj_size() << dendl
;
5851 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>() && \
5852 s
->manifest
.has_explicit_objs()) {
5853 RGWObjManifest::obj_iterator mi
;
5854 for (mi
= s
->manifest
.obj_begin(); mi
!= s
->manifest
.obj_end(); ++mi
) {
5855 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
5859 if (!s
->obj_tag
.length()) {
5861 * Uh oh, something's wrong, object with manifest should have tag. Let's
5862 * create one out of the manifest, would be unique
5864 generate_fake_tag(this, s
->attrset
, s
->manifest
, manifest_bl
, s
->obj_tag
);
5868 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
5869 if (aiter
!= s
->attrset
.end()) {
5870 bufferlist
& pg_ver_bl
= aiter
->second
;
5871 if (pg_ver_bl
.length()) {
5872 auto pgbl
= pg_ver_bl
.cbegin();
5874 decode(s
->pg_ver
, pgbl
);
5875 } catch (buffer::error
& err
) {
5876 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5880 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
5881 if (aiter
!= s
->attrset
.end()) {
5882 bufferlist
& zone_short_id_bl
= aiter
->second
;
5883 if (zone_short_id_bl
.length()) {
5884 auto zbl
= zone_short_id_bl
.cbegin();
5886 decode(s
->zone_short_id
, zbl
);
5887 } catch (buffer::error
& err
) {
5888 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
5892 if (s
->obj_tag
.length())
5893 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
5895 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
5897 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5898 * it exist, and not only if is_olh() returns true
5900 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
5901 if (iter
!= s
->attrset
.end()) {
5902 s
->olh_tag
= iter
->second
;
5905 if (is_olh(s
->attrset
)) {
5908 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
5910 if (need_follow_olh
) {
5911 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
5912 } else if (obj
.key
.have_null_instance() && !s
->has_manifest
) {
5913 // read null version, and the head object only have olh info
5922 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
5923 bool follow_olh
, bool assume_noent
)
5928 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, assume_noent
);
5929 } while (ret
== -EAGAIN
);
5934 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
)
5936 RGWObjState
*astate
;
5937 int r
= get_state(&astate
, true);
5942 *pmanifest
= &astate
->manifest
;
5947 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
)
5950 int r
= source
->get_state(&state
, true);
5955 if (!state
->get_attr(name
, dest
))
5962 int RGWRados::Object::Stat::stat_async()
5964 RGWObjectCtx
& ctx
= source
->get_ctx();
5965 rgw_obj
& obj
= source
->get_obj();
5966 RGWRados
*store
= source
->get_store();
5968 RGWObjState
*s
= ctx
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
5972 result
.size
= s
->size
;
5973 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
5974 result
.attrs
= s
->attrset
;
5975 result
.has_manifest
= s
->has_manifest
;
5976 result
.manifest
= s
->manifest
;
5982 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
5984 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
5989 librados::ObjectReadOperation op
;
5990 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
5991 op
.getxattrs(&result
.attrs
, NULL
);
5992 state
.completion
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
5993 state
.io_ctx
.locator_set_key(loc
);
5994 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
5996 ldout(store
->ctx(), 5) << __func__
5997 << ": ERROR: aio_operate() returned ret=" << r
6006 int RGWRados::Object::Stat::wait()
6008 if (!state
.completion
) {
6012 state
.completion
->wait_for_safe();
6013 state
.ret
= state
.completion
->get_return_value();
6014 state
.completion
->release();
6016 if (state
.ret
!= 0) {
6023 int RGWRados::Object::Stat::finish()
6025 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
6026 if (iter
!= result
.attrs
.end()) {
6027 bufferlist
& bl
= iter
->second
;
6028 auto biter
= bl
.cbegin();
6030 decode(result
.manifest
, biter
);
6031 } catch (buffer::error
& err
) {
6032 RGWRados
*store
= source
->get_store();
6033 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
6036 result
.has_manifest
= true;
6042 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
6043 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6044 ObjectOperation
& op
, RGWObjState
**pstate
)
6049 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false);
6053 return append_atomic_test(*pstate
, op
);
6056 int RGWRados::append_atomic_test(const RGWObjState
* state
,
6057 librados::ObjectOperation
& op
)
6059 if (!state
->is_atomic
) {
6060 ldout(cct
, 20) << "state for obj=" << state
->obj
<< " is not atomic, not appending atomic test" << dendl
;
6064 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
6065 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
6067 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
6072 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, bool assume_noent
)
6074 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, assume_noent
);
6077 void RGWRados::Object::invalidate_state()
6079 ctx
.invalidate(obj
);
6082 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
6083 const char *if_match
, const char *if_nomatch
, bool removal_op
,
6086 int r
= get_state(&state
, false);
6090 bool need_guard
= (state
->has_manifest
|| (state
->obj_tag
.length() != 0) ||
6091 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
6094 if (!state
->is_atomic
) {
6095 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
6099 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
6106 /* first verify that the object wasn't replaced under */
6107 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
6108 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
6109 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
6113 if (strcmp(if_match
, "*") == 0) {
6114 // test the object is existing
6115 if (!state
->exists
) {
6116 return -ERR_PRECONDITION_FAILED
;
6120 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
6121 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
6122 return -ERR_PRECONDITION_FAILED
;
6128 if (strcmp(if_nomatch
, "*") == 0) {
6129 // test the object is NOT existing
6130 if (state
->exists
) {
6131 return -ERR_PRECONDITION_FAILED
;
6135 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
6136 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
6137 return -ERR_PRECONDITION_FAILED
;
6144 if (state
->exists
) {
6146 store
->remove_rgw_head_obj(op
);
6153 /* the object is being removed, no need to update its tag */
6158 state
->write_tag
= *ptag
;
6160 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
6163 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
6165 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
6167 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6169 op
.setxattr(RGW_ATTR_TAIL_TAG
, bl
);
6176 * Set an attr on an object.
6177 * bucket: name of the bucket holding the object
6178 * obj: name of the object to set the attr on
6179 * name: the attr to set
6180 * bl: the contents of the attr
6181 * Returns: 0 on success, -ERR# otherwise.
6183 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
6185 map
<string
, bufferlist
> attrs
;
6187 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
);
6190 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
,
6191 map
<string
, bufferlist
>& attrs
,
6192 map
<string
, bufferlist
>* rmattrs
)
6195 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6199 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
6201 ObjectWriteOperation op
;
6202 RGWObjState
*state
= NULL
;
6204 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
);
6208 map
<string
, bufferlist
>::iterator iter
;
6210 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
6211 const string
& name
= iter
->first
;
6212 op
.rmxattr(name
.c_str());
6216 const rgw_bucket
& bucket
= obj
.bucket
;
6218 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6219 const string
& name
= iter
->first
;
6220 bufferlist
& bl
= iter
->second
;
6225 op
.setxattr(name
.c_str(), bl
);
6227 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
6232 rgw_obj_index_key obj_key
;
6233 obj
.key
.get_index_key(&obj_key
);
6235 objexp_hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
6236 } catch (buffer::error
& err
) {
6237 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
6245 RGWObjectCtx
obj_ctx(this);
6248 RGWRados::Bucket
bop(this, bucket_info
);
6249 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
6253 append_rand_alpha(cct
, tag
, tag
, 32);
6254 state
->write_tag
= tag
;
6255 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
6260 bl
.append(tag
.c_str(), tag
.size() + 1);
6261 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
6265 real_time mtime
= real_clock::now();
6266 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
6267 op
.mtime2(&mtime_ts
);
6268 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
6271 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
6272 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
6273 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
6274 string etag
= rgw_bl_str(etag_bl
);
6275 string content_type
= rgw_bl_str(content_type_bl
);
6276 string storage_class
;
6277 auto iter
= attrs
.find(RGW_ATTR_STORAGE_CLASS
);
6278 if (iter
!= attrs
.end()) {
6279 storage_class
= rgw_bl_str(iter
->second
);
6281 uint64_t epoch
= ref
.ioctx
.get_last_version();
6282 int64_t poolid
= ref
.ioctx
.get_id();
6283 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
6284 mtime
, etag
, content_type
, storage_class
, &acl_bl
,
6285 RGWObjCategory::Main
, NULL
);
6287 int ret
= index_op
.cancel();
6289 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
6297 state
->obj_tag
.swap(bl
);
6299 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
6300 state
->attrset
.erase(iter
->first
);
6303 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6304 state
->attrset
[iter
->first
] = iter
->second
;
6311 int RGWRados::Object::Read::prepare()
6313 RGWRados
*store
= source
->get_store();
6314 CephContext
*cct
= store
->ctx();
6318 map
<string
, bufferlist
>::iterator iter
;
6320 RGWObjState
*astate
;
6321 int r
= source
->get_state(&astate
, true);
6325 if (!astate
->exists
) {
6329 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
6331 state
.obj
= astate
->obj
;
6332 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
6334 state
.cur_pool
= state
.head_obj
.pool
;
6335 state
.cur_ioctx
= &state
.io_ctxs
[state
.cur_pool
];
6337 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, state
.cur_ioctx
);
6342 *params
.attrs
= astate
->attrset
;
6343 if (cct
->_conf
->subsys
.should_gather
<ceph_subsys_rgw
, 20>()) {
6344 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
6345 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
6350 /* Convert all times go GMT to make them compatible */
6351 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
6352 obj_time_weight src_weight
;
6353 src_weight
.init(astate
);
6354 src_weight
.high_precision
= conds
.high_precision_time
;
6356 obj_time_weight dest_weight
;
6357 dest_weight
.high_precision
= conds
.high_precision_time
;
6359 if (conds
.mod_ptr
) {
6360 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6361 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6362 if (!(dest_weight
< src_weight
)) {
6363 return -ERR_NOT_MODIFIED
;
6367 if (conds
.unmod_ptr
) {
6368 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
6369 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
6370 if (dest_weight
< src_weight
) {
6371 return -ERR_PRECONDITION_FAILED
;
6375 if (conds
.if_match
|| conds
.if_nomatch
) {
6376 r
= get_attr(RGW_ATTR_ETAG
, etag
);
6382 if (conds
.if_match
) {
6383 string if_match_str
= rgw_string_unquote(conds
.if_match
);
6384 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-Match: " << if_match_str
<< dendl
;
6385 if (if_match_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) != 0) {
6386 return -ERR_PRECONDITION_FAILED
;
6390 if (conds
.if_nomatch
) {
6391 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
6392 ldout(cct
, 10) << "ETag: " << string(etag
.c_str(), etag
.length()) << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
6393 if (if_nomatch_str
.compare(0, etag
.length(), etag
.c_str(), etag
.length()) == 0) {
6394 return -ERR_NOT_MODIFIED
;
6399 if (params
.obj_size
)
6400 *params
.obj_size
= astate
->size
;
6402 *params
.lastmod
= astate
->mtime
;
6407 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
6414 } else if (end
< 0) {
6419 if (ofs
>= (off_t
)obj_size
) {
6422 if (end
>= (off_t
)obj_size
) {
6429 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
6431 RGWRados
*store
= target
->get_store();
6435 #define NUM_RESHARD_RETRIES 10
6436 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
6437 int ret
= get_bucket_shard(&bs
);
6439 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6443 if (r
!= -ERR_BUSY_RESHARDING
) {
6446 ldout(store
->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
6447 string new_bucket_id
;
6448 r
= store
->block_while_resharding(bs
, &new_bucket_id
,
6449 target
->bucket_info
, null_yield
);
6450 if (r
== -ERR_BUSY_RESHARDING
) {
6456 ldout(store
->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
6457 i
= 0; /* resharding is finished, make sure we can retry */
6458 r
= target
->update_bucket_id(new_bucket_id
);
6460 ldout(store
->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
6477 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
)
6482 RGWRados
*store
= target
->get_store();
6484 if (write_tag
&& write_tag
->length()) {
6485 optag
= string(write_tag
->c_str(), write_tag
->length());
6487 if (optag
.empty()) {
6488 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
6492 int r
= guard_reshard(nullptr, [&](BucketShard
*bs
) -> int {
6493 return store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
, zones_trace
);
6504 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
6505 uint64_t size
, uint64_t accounted_size
,
6506 ceph::real_time
& ut
, const string
& etag
,
6507 const string
& content_type
, const string
& storage_class
,
6509 RGWObjCategory category
,
6510 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
,
6516 RGWRados
*store
= target
->get_store();
6519 int ret
= get_bucket_shard(&bs
);
6521 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6525 rgw_bucket_dir_entry ent
;
6526 obj
.key
.get_index_key(&ent
.key
);
6527 ent
.meta
.size
= size
;
6528 ent
.meta
.accounted_size
= accounted_size
;
6529 ent
.meta
.mtime
= ut
;
6530 ent
.meta
.etag
= etag
;
6531 ent
.meta
.storage_class
= storage_class
;
6533 ent
.meta
.user_data
= *user_data
;
6536 if (acl_bl
&& acl_bl
->length()) {
6537 int ret
= store
->decode_policy(*acl_bl
, &owner
);
6539 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
6542 ent
.meta
.owner
= owner
.get_id().to_str();
6543 ent
.meta
.owner_display_name
= owner
.get_display_name();
6544 ent
.meta
.content_type
= content_type
;
6545 ent
.meta
.appendable
= appendable
;
6547 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
6549 if (target
->bucket_info
.datasync_flag_enabled()) {
6550 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6552 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6559 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
6560 real_time
& removed_mtime
,
6561 list
<rgw_obj_index_key
> *remove_objs
)
6566 RGWRados
*store
= target
->get_store();
6569 int ret
= get_bucket_shard(&bs
);
6571 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
6575 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
6577 if (target
->bucket_info
.datasync_flag_enabled()) {
6578 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6580 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6588 int RGWRados::Bucket::UpdateIndex::cancel()
6593 RGWRados
*store
= target
->get_store();
6596 int ret
= guard_reshard(&bs
, [&](BucketShard
*bs
) -> int {
6597 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
, zones_trace
);
6601 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6602 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6603 * have no way to tell that they're all caught up
6605 if (target
->bucket_info
.datasync_flag_enabled()) {
6606 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
6608 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
6615 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
)
6617 RGWRados
*store
= source
->get_store();
6618 CephContext
*cct
= store
->ctx();
6620 rgw_raw_obj read_obj
;
6621 uint64_t read_ofs
= ofs
;
6622 uint64_t len
, read_len
;
6623 bool reading_from_head
= true;
6624 ObjectReadOperation op
;
6626 bool merge_bl
= false;
6627 bufferlist
*pbl
= &bl
;
6629 uint64_t max_chunk_size
;
6631 RGWObjState
*astate
;
6632 int r
= source
->get_state(&astate
, true);
6636 if (astate
->size
== 0) {
6638 } else if (end
>= (int64_t)astate
->size
) {
6639 end
= astate
->size
- 1;
6645 len
= end
- ofs
+ 1;
6647 if (astate
->has_manifest
&& astate
->manifest
.has_tail()) {
6648 /* now get the relevant object part */
6649 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
6651 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
6652 read_obj
= iter
.get_location().get_raw_obj(store
);
6653 len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6654 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6655 reading_from_head
= (read_obj
== state
.head_obj
);
6657 read_obj
= state
.head_obj
;
6660 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
6662 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
6666 if (len
> max_chunk_size
)
6667 len
= max_chunk_size
;
6672 if (reading_from_head
) {
6673 /* only when reading from the head object do we need to do the atomic test */
6674 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
);
6678 if (astate
&& astate
->prefetch_data
) {
6679 if (!ofs
&& astate
->data
.length() >= len
) {
6684 if (ofs
< astate
->data
.length()) {
6685 unsigned copy_len
= std::min((uint64_t)astate
->data
.length() - ofs
, len
);
6686 astate
->data
.copy(ofs
, copy_len
, bl
);
6687 read_len
-= copy_len
;
6688 read_ofs
+= copy_len
;
6698 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
6699 op
.read(read_ofs
, read_len
, pbl
, NULL
);
6701 if (state
.cur_pool
!= read_obj
.pool
) {
6702 auto iter
= state
.io_ctxs
.find(read_obj
.pool
);
6703 if (iter
== state
.io_ctxs
.end()) {
6704 state
.cur_ioctx
= &state
.io_ctxs
[read_obj
.pool
];
6705 r
= store
->open_pool_ctx(read_obj
.pool
, *state
.cur_ioctx
);
6707 ldout(cct
, 20) << "ERROR: failed to open pool context for pool=" << read_obj
.pool
<< " r=" << r
<< dendl
;
6711 state
.cur_ioctx
= &iter
->second
;
6713 state
.cur_pool
= read_obj
.pool
;
6716 state
.cur_ioctx
->locator_set_key(read_obj
.loc
);
6718 r
= state
.cur_ioctx
->operate(read_obj
.oid
, &op
, NULL
);
6719 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
6732 struct get_obj_data
{
6734 RGWGetDataCB
* client_cb
;
6736 uint64_t offset
; // next offset to write to client
6737 rgw::AioResultList completed
; // completed read results, sorted by offset
6739 get_obj_data(RGWRados
* store
, RGWGetDataCB
* cb
, rgw::Aio
* aio
, uint64_t offset
)
6740 : store(store
), client_cb(cb
), aio(aio
), offset(offset
) {}
6742 int flush(rgw::AioResultList
&& results
) {
6743 int r
= rgw::check_for_errors(results
);
6748 auto cmp
= [](const auto& lhs
, const auto& rhs
) { return lhs
.id
< rhs
.id
; };
6749 results
.sort(cmp
); // merge() requires results to be sorted first
6750 completed
.merge(results
, cmp
); // merge results in sorted order
6752 while (!completed
.empty() && completed
.front().id
== offset
) {
6753 auto bl
= std::move(completed
.front().data
);
6754 completed
.pop_front_and_dispose(std::default_delete
<rgw::AioResultEntry
>{});
6756 offset
+= bl
.length();
6757 int r
= client_cb
->handle_data(bl
, 0, bl
.length());
6766 // wait for all completions to drain and ignore the results
6771 auto c
= aio
->wait();
6772 while (!c
.empty()) {
6773 int r
= flush(std::move(c
));
6780 return flush(std::move(c
));
6784 static int _get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6785 off_t read_ofs
, off_t len
, bool is_head_obj
,
6786 RGWObjState
*astate
, void *arg
)
6788 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6790 return d
->store
->get_obj_iterate_cb(read_obj
, obj_ofs
, read_ofs
, len
,
6791 is_head_obj
, astate
, arg
);
6794 int RGWRados::get_obj_iterate_cb(const rgw_raw_obj
& read_obj
, off_t obj_ofs
,
6795 off_t read_ofs
, off_t len
, bool is_head_obj
,
6796 RGWObjState
*astate
, void *arg
)
6798 ObjectReadOperation op
;
6799 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
6803 /* only when reading from the head object do we need to do the atomic test */
6804 int r
= append_atomic_test(astate
, op
);
6809 obj_ofs
< astate
->data
.length()) {
6810 unsigned chunk_len
= std::min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
6812 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
6817 d
->offset
+= chunk_len
;
6818 read_ofs
+= chunk_len
;
6819 obj_ofs
+= chunk_len
;
6825 auto obj
= d
->store
->svc
.rados
->obj(read_obj
);
6828 ldout(cct
, 4) << "failed to open rados context for " << read_obj
<< dendl
;
6832 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
6833 op
.read(read_ofs
, len
, nullptr, nullptr);
6835 const uint64_t cost
= len
;
6836 const uint64_t id
= obj_ofs
; // use logical object offset for sorting replies
6838 auto completed
= d
->aio
->submit(obj
, &op
, cost
, id
);
6840 return d
->flush(std::move(completed
));
6843 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
)
6845 RGWRados
*store
= source
->get_store();
6846 CephContext
*cct
= store
->ctx();
6847 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
6848 const uint64_t chunk_size
= cct
->_conf
->rgw_get_obj_max_req_size
;
6849 const uint64_t window_size
= cct
->_conf
->rgw_get_obj_window_size
;
6851 rgw::AioThrottle
aio(window_size
);
6852 get_obj_data
data(store
, cb
, &aio
, ofs
);
6854 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
,
6855 ofs
, end
, chunk_size
, _get_obj_iterate_cb
, &data
);
6857 ldout(cct
, 0) << "iterate_obj() failed with " << r
<< dendl
;
6858 data
.cancel(); // drain completions without writing back to client
6862 return data
.drain();
6865 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
6866 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
6867 off_t ofs
, off_t end
, uint64_t max_chunk_size
,
6868 iterate_obj_cb cb
, void *arg
)
6870 rgw_raw_obj head_obj
;
6871 rgw_raw_obj read_obj
;
6872 uint64_t read_ofs
= ofs
;
6874 bool reading_from_head
= true;
6875 RGWObjState
*astate
= NULL
;
6877 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
6879 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false);
6887 len
= end
- ofs
+ 1;
6889 if (astate
->has_manifest
) {
6890 /* now get the relevant object stripe */
6891 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
6893 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
.obj_end();
6895 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
6896 off_t stripe_ofs
= iter
.get_stripe_ofs();
6897 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
6899 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
6900 read_obj
= iter
.get_location().get_raw_obj(this);
6901 uint64_t read_len
= std::min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
6902 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
6904 if (read_len
> max_chunk_size
) {
6905 read_len
= max_chunk_size
;
6908 reading_from_head
= (read_obj
== head_obj
);
6909 r
= cb(read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
6919 while (ofs
<= end
) {
6920 read_obj
= head_obj
;
6921 uint64_t read_len
= std::min(len
, max_chunk_size
);
6923 r
= cb(read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
6936 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
6939 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6944 return ref
.ioctx
.operate(ref
.obj
.oid
, op
);
6947 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
6950 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6957 return ref
.ioctx
.operate(ref
.obj
.oid
, op
, &outbl
);
6960 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
6962 ObjectWriteOperation op
;
6964 ceph_assert(olh_obj
.key
.instance
.empty());
6966 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
6968 if (!state
.exists
) {
6972 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
6973 op
.mtime2(&mtime_ts
);
6977 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6978 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6979 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6980 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6981 * log will reflect that.
6983 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6984 * is used for object data instance, olh_tag for olh instance.
6987 /* guard against racing writes */
6988 bucket_index_guard_olh_op(state
, op
);
6994 gen_rand_alphanumeric_lower(cct
, &obj_tag
, 32);
6997 bl
.append(obj_tag
.c_str(), obj_tag
.size());
6998 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
7000 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
7005 gen_rand_alphanumeric_lower(cct
, &olh_tag
, 32);
7008 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
7009 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
7011 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
7012 state
.olh_tag
= olh_bl
;
7013 state
.is_olh
= true;
7016 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
7020 RGWOLHPendingInfo pending_info
;
7021 pending_info
.time
= real_clock::now();
7022 encode(pending_info
, bl
);
7024 #define OLH_PENDING_TAG_LEN 32
7025 /* tag will start with current time epoch, this so that entries are sorted by time */
7027 utime_t
ut(pending_info
.time
);
7028 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
7032 gen_rand_alphanumeric_lower(cct
, &s
, OLH_PENDING_TAG_LEN
- op_tag
->size());
7036 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7037 attr_name
.append(*op_tag
);
7039 op
.setxattr(attr_name
.c_str(), bl
);
7041 int ret
= obj_operate(bucket_info
, olh_obj
, &op
);
7046 state
.exists
= true;
7047 state
.attrset
[attr_name
] = bl
;
7052 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
7056 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
7057 if (ret
== -EEXIST
) {
7064 int RGWRados::guard_reshard(BucketShard
*bs
,
7065 const rgw_obj
& obj_instance
,
7066 const RGWBucketInfo
& bucket_info
,
7067 std::function
<int(BucketShard
*)> call
)
7070 const rgw_obj
*pobj
= &obj_instance
;
7073 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
7074 r
= bs
->init(pobj
->bucket
, *pobj
, nullptr /* no RGWBucketInfo */);
7076 ldout(cct
, 5) << "bs.init() returned ret=" << r
<< dendl
;
7080 if (r
!= -ERR_BUSY_RESHARDING
) {
7083 ldout(cct
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
7084 string new_bucket_id
;
7085 r
= block_while_resharding(bs
, &new_bucket_id
, bucket_info
, null_yield
);
7086 if (r
== -ERR_BUSY_RESHARDING
) {
7092 ldout(cct
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
7093 i
= 0; /* resharding is finished, make sure we can retry */
7096 obj
.bucket
.update_bucket_id(new_bucket_id
);
7107 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
,
7108 string
*new_bucket_id
,
7109 const RGWBucketInfo
& bucket_info
,
7113 cls_rgw_bucket_instance_entry entry
;
7115 // since we want to run this recovery code from two distinct places,
7116 // let's just put it in a lambda so we can easily re-use; if the
7117 // lambda successfully fetches a new bucket id, it sets
7118 // new_bucket_id and returns 0, otherwise it returns a negative
7120 auto fetch_new_bucket_id
=
7121 [this, bucket_info
](const std::string
& log_tag
,
7122 std::string
* new_bucket_id
) -> int {
7123 RGWBucketInfo fresh_bucket_info
= bucket_info
;
7124 int ret
= try_refresh_bucket_info(fresh_bucket_info
, nullptr);
7126 ldout(cct
, 0) << __func__
<<
7127 " ERROR: failed to refresh bucket info after reshard at " <<
7128 log_tag
<< ": " << cpp_strerror(-ret
) << dendl
;
7131 *new_bucket_id
= fresh_bucket_info
.bucket
.bucket_id
;
7135 constexpr int num_retries
= 10;
7136 for (int i
= 1; i
<= num_retries
; i
++) { // nb: 1-based for loop
7137 ret
= cls_rgw_get_bucket_resharding(bs
->index_ctx
, bs
->bucket_obj
, &entry
);
7138 if (ret
== -ENOENT
) {
7139 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id
);
7140 } else if (ret
< 0) {
7141 ldout(cct
, 0) << __func__
<<
7142 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret
) <<
7147 if (!entry
.resharding_in_progress()) {
7148 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
7152 ldout(cct
, 20) << "NOTICE: reshard still in progress; " <<
7153 (i
< num_retries
? "retrying" : "too many retries") << dendl
;
7155 if (i
== num_retries
) {
7159 // If bucket is erroneously marked as resharding (e.g., crash or
7160 // other error) then fix it. If we can take the bucket reshard
7161 // lock then it means no other resharding should be taking place,
7162 // and we're free to clear the flags.
7164 // since we expect to do this rarely, we'll do our work in a
7165 // block and erase our work after each try
7167 RGWObjectCtx
obj_ctx(this);
7168 const rgw_bucket
& b
= bs
->bucket
;
7169 std::string bucket_id
= b
.get_key();
7170 RGWBucketReshardLock
reshard_lock(this, bucket_info
, true);
7171 ret
= reshard_lock
.lock();
7173 ldout(cct
, 20) << __func__
<<
7174 " INFO: failed to take reshard lock for bucket " <<
7175 bucket_id
<< "; expected if resharding underway" << dendl
;
7177 ldout(cct
, 10) << __func__
<<
7178 " INFO: was able to take reshard lock for bucket " <<
7180 ret
= RGWBucketReshard::clear_resharding(this, bucket_info
);
7182 reshard_lock
.unlock();
7183 ldout(cct
, 0) << __func__
<<
7184 " ERROR: failed to clear resharding flags for bucket " <<
7187 reshard_lock
.unlock();
7188 ldout(cct
, 5) << __func__
<<
7189 " INFO: apparently successfully cleared resharding flags for "
7190 "bucket " << bucket_id
<< dendl
;
7191 continue; // if we apparently succeed immediately test again
7192 } // if clear resharding succeeded
7193 } // if taking of lock succeeded
7194 } // block to encapsulate recovery from incomplete reshard
7196 ret
= reshard_wait
->wait(y
);
7198 ldout(cct
, 0) << __func__
<<
7199 " ERROR: bucket is still resharding, please retry" << dendl
;
7204 ldout(cct
, 0) << __func__
<<
7205 " ERROR: bucket is still resharding, please retry" << dendl
;
7206 return -ERR_BUSY_RESHARDING
;
7209 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
7211 const string
& op_tag
,
7212 struct rgw_bucket_dir_entry_meta
*meta
,
7214 real_time unmod_since
, bool high_precision_time
,
7215 rgw_zone_set
*_zones_trace
, bool log_data_change
)
7218 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7223 rgw_zone_set zones_trace
;
7225 zones_trace
= *_zones_trace
;
7227 zones_trace
.insert(svc
.zone
->get_zone().id
);
7229 BucketShard
bs(this);
7231 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
7232 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7233 [&](BucketShard
*bs
) -> int {
7234 librados::ObjectWriteOperation op
;
7235 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7236 return cls_rgw_bucket_link_olh(bs
->index_ctx
, op
,
7237 bs
->bucket_obj
, key
, olh_state
.olh_tag
, delete_marker
, op_tag
, meta
, olh_epoch
,
7238 unmod_since
, high_precision_time
,
7239 svc
.zone
->get_zone().log_data
, zones_trace
);
7242 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
7246 if (log_data_change
&& bucket_info
.datasync_flag_enabled()) {
7247 data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
7253 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
7255 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
7256 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
7259 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
7260 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
7263 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7268 rgw_zone_set zones_trace
;
7270 zones_trace
= *_zones_trace
;
7272 zones_trace
.insert(svc
.zone
->get_zone().id
);
7274 BucketShard
bs(this);
7276 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
7277 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7278 [&](BucketShard
*bs
) -> int {
7279 librados::ObjectWriteOperation op
;
7280 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7281 return cls_rgw_bucket_unlink_instance(bs
->index_ctx
, op
, bs
->bucket_obj
, key
, op_tag
,
7282 olh_tag
, olh_epoch
, svc
.zone
->get_zone().log_data
, zones_trace
);
7285 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
7292 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
7293 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
7294 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
7298 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7303 BucketShard
bs(this);
7305 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
7307 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7311 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7313 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7315 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7316 [&](BucketShard
*bs
) -> int {
7317 ObjectReadOperation op
;
7318 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7319 return cls_rgw_get_olh_log(bs
->index_ctx
, bs
->bucket_obj
, op
,
7320 key
, ver_marker
, olh_tag
, log
, is_truncated
);
7323 ldout(cct
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
7330 // a multisite sync bug resulted in the OLH head attributes being overwritten by
7331 // the attributes from another zone, causing link_olh() to fail endlessly due to
7332 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7333 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7334 int RGWRados::repair_olh(RGWObjState
* state
, const RGWBucketInfo
& bucket_info
,
7337 // fetch the current olh entry from the bucket index
7338 rgw_bucket_olh_entry olh
;
7339 int r
= bi_get_olh(bucket_info
, obj
, &olh
);
7341 ldout(cct
, 0) << "repair_olh failed to read olh entry for " << obj
<< dendl
;
7344 if (olh
.tag
== rgw_bl_str(state
->olh_tag
)) { // mismatch already resolved?
7348 ldout(cct
, 4) << "repair_olh setting olh_tag=" << olh
.tag
7349 << " key=" << olh
.key
<< " delete_marker=" << olh
.delete_marker
<< dendl
;
7351 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7352 ObjectWriteOperation op
;
7353 // assert this is the same olh tag we think we're fixing
7354 bucket_index_guard_olh_op(*state
, op
);
7355 // preserve existing mtime
7356 struct timespec mtime_ts
= ceph::real_clock::to_timespec(state
->mtime
);
7357 op
.mtime2(&mtime_ts
);
7360 bl
.append(olh
.tag
.c_str(), olh
.tag
.size());
7361 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, bl
);
7365 info
.target
= rgw_obj(bucket_info
.bucket
, olh
.key
);
7366 info
.removed
= olh
.delete_marker
;
7369 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7372 r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
7376 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7378 ldout(cct
, 0) << "repair_olh failed to write olh attributes with "
7379 << cpp_strerror(r
) << dendl
;
7385 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
7388 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7393 BucketShard
bs(this);
7395 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
7397 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
7401 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7403 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7405 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7406 [&](BucketShard
*pbs
) -> int {
7407 ObjectWriteOperation op
;
7408 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7409 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
7410 return pbs
->index_ctx
.operate(pbs
->bucket_obj
, &op
);
7413 ldout(cct
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
7420 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
7423 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
7428 BucketShard
bs(this);
7430 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
7432 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
7434 int ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
7435 [&](BucketShard
*pbs
) -> int {
7436 ObjectWriteOperation op
;
7437 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
7438 return cls_rgw_clear_olh(pbs
->index_ctx
, op
, pbs
->bucket_obj
, key
, olh_tag
);
7441 ldout(cct
, 5) << "cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
7448 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
7449 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
7450 uint64_t *plast_ver
, rgw_zone_set
* zones_trace
)
7456 librados::ObjectWriteOperation op
;
7458 uint64_t last_ver
= log
.rbegin()->first
;
7459 *plast_ver
= last_ver
;
7461 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
7463 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7464 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
7467 string last_ver_s
= to_string(last_ver
);
7468 ver_bl
.append(last_ver_s
.c_str(), last_ver_s
.size());
7469 op
.setxattr(RGW_ATTR_OLH_VER
, ver_bl
);
7471 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
7472 op
.mtime2(&mtime_ts
);
7474 bool need_to_link
= false;
7475 cls_rgw_obj_key key
;
7476 bool delete_marker
= false;
7477 list
<cls_rgw_obj_key
> remove_instances
;
7478 bool need_to_remove
= false;
7480 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
7481 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
7482 for (; viter
!= iter
->second
.end(); ++viter
) {
7483 rgw_bucket_olh_log_entry
& entry
= *viter
;
7485 ldout(cct
, 20) << "olh_log_entry: op=" << (int)entry
.op
7486 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
7487 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
7489 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
7490 remove_instances
.push_back(entry
.key
);
7492 case CLS_RGW_OLH_OP_LINK_OLH
:
7493 need_to_link
= true;
7494 need_to_remove
= false;
7496 delete_marker
= entry
.delete_marker
;
7498 case CLS_RGW_OLH_OP_UNLINK_OLH
:
7499 need_to_remove
= true;
7500 need_to_link
= false;
7503 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
7506 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
7507 attr_name
.append(entry
.op_tag
);
7508 op
.rmxattr(attr_name
.c_str());
7513 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
7518 const rgw_bucket
& bucket
= obj
.bucket
;
7521 rgw_obj
target(bucket
, key
);
7523 info
.target
= target
;
7524 info
.removed
= delete_marker
;
7527 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
7530 /* first remove object instances */
7531 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
7532 liter
!= remove_instances
.end(); ++liter
) {
7533 cls_rgw_obj_key
& key
= *liter
;
7534 rgw_obj
obj_instance(bucket
, key
);
7535 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
7536 if (ret
< 0 && ret
!= -ENOENT
) {
7537 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
7542 /* update olh object */
7543 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7544 if (r
== -ECANCELED
) {
7548 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7552 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
7554 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
7558 if (need_to_remove
) {
7559 ObjectWriteOperation rm_op
;
7561 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
7562 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_EQ
, last_ver
);
7563 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
7566 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &rm_op
);
7567 if (r
== -ECANCELED
) {
7568 return 0; /* someone else won this race */
7571 * only clear if was successful, otherwise we might clobber pending operations on this object
7573 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
7575 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
7585 * read olh log and apply it
7587 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
7589 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
7591 uint64_t ver_marker
= 0;
7594 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
7598 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
7602 } while (is_truncated
);
7607 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
7608 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
,
7609 rgw_zone_set
*zones_trace
, bool log_data_change
)
7613 rgw_obj olh_obj
= target_obj
;
7614 olh_obj
.key
.instance
.clear();
7616 RGWObjState
*state
= NULL
;
7621 #define MAX_ECANCELED_RETRY 100
7622 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7623 if (ret
== -ECANCELED
) {
7624 obj_ctx
.invalidate(olh_obj
);
7627 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
7632 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7634 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7635 if (ret
== -ECANCELED
) {
7640 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
,
7641 op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
,
7642 zones_trace
, log_data_change
);
7644 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
7645 if (ret
== -ECANCELED
) {
7646 // the bucket index rejected the link_olh() due to olh tag mismatch;
7647 // attempt to reconstruct olh head attributes based on the bucket index
7648 int r2
= repair_olh(state
, bucket_info
, olh_obj
);
7649 if (r2
< 0 && r2
!= -ECANCELED
) {
7659 if (i
== MAX_ECANCELED_RETRY
) {
7660 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7664 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7665 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7669 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7676 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
7677 uint64_t olh_epoch
, rgw_zone_set
*zones_trace
)
7681 rgw_obj olh_obj
= target_obj
;
7682 olh_obj
.key
.instance
.clear();
7684 RGWObjState
*state
= NULL
;
7689 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
7690 if (ret
== -ECANCELED
) {
7691 obj_ctx
.invalidate(olh_obj
);
7694 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
7698 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
7700 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7701 if (ret
== -ECANCELED
) {
7707 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
7709 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
7711 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7712 if (ret
== -ECANCELED
) {
7720 if (i
== MAX_ECANCELED_RETRY
) {
7721 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
7725 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
7726 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
7730 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
7737 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key
*target_key
)
7739 #define OBJ_INSTANCE_LEN 32
7740 char buf
[OBJ_INSTANCE_LEN
+ 1];
7742 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
7743 no underscore for instance name due to the way we encode the raw keys */
7745 target_key
->set_instance(buf
);
7748 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
7750 gen_rand_obj_instance_name(&target_obj
->key
);
7753 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
7755 map
<string
, bufferlist
> unfiltered_attrset
;
7757 ObjectReadOperation op
;
7758 op
.getxattrs(&unfiltered_attrset
, NULL
);
7761 int r
= obj_operate(bucket_info
, obj
, &op
);
7766 map
<string
, bufferlist
> attrset
;
7768 rgw_filter_attrset(unfiltered_attrset
, RGW_ATTR_OLH_PREFIX
, &attrset
);
7770 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
7771 if (iter
== attrset
.end()) { /* not an olh */
7776 auto biter
= iter
->second
.cbegin();
7777 decode(*olh
, biter
);
7778 } catch (buffer::error
& err
) {
7779 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
7786 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
7787 map
<string
, bufferlist
> *rm_pending_entries
)
7789 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
7791 real_time now
= real_clock::now();
7793 while (iter
!= pending_entries
.end()) {
7794 auto biter
= iter
->second
.cbegin();
7795 RGWOLHPendingInfo pending_info
;
7797 decode(pending_info
, biter
);
7798 } catch (buffer::error
& err
) {
7799 /* skipping bad entry, we could remove it but it might hide a bug */
7800 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
7805 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
7807 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
7808 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
7809 pending_entries
.erase(cur_iter
);
7811 /* entries names are sorted by time (rounded to a second) */
7817 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
7820 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
7825 // trim no more than 1000 entries per osd op
7826 constexpr int max_entries
= 1000;
7828 auto i
= pending_attrs
.begin();
7829 while (i
!= pending_attrs
.end()) {
7830 ObjectWriteOperation op
;
7831 bucket_index_guard_olh_op(state
, op
);
7833 for (int n
= 0; n
< max_entries
&& i
!= pending_attrs
.end(); ++n
, ++i
) {
7834 op
.rmxattr(i
->first
.c_str());
7837 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
7838 if (r
== -ENOENT
|| r
== -ECANCELED
) {
7839 /* raced with some other change, shouldn't sweat about it */
7843 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
7850 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
7852 map
<string
, bufferlist
> pending_entries
;
7853 rgw_filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
7855 map
<string
, bufferlist
> rm_pending_entries
;
7856 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
7858 if (!rm_pending_entries
.empty()) {
7859 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
7861 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
7865 if (!pending_entries
.empty()) {
7866 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
7868 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
7874 map
<string
, bufferlist
>::iterator iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
7875 ceph_assert(iter
!= state
->attrset
.end());
7878 auto biter
= iter
->second
.cbegin();
7880 } catch (buffer::error
& err
) {
7881 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
7889 *target
= olh
.target
;
7894 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
7895 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
7896 RGWObjVersionTracker
*objv_tracker
)
7899 int r
= get_raw_obj_ref(obj
, &ref
);
7904 map
<string
, bufferlist
> unfiltered_attrset
;
7906 struct timespec mtime_ts
;
7908 ObjectReadOperation op
;
7910 objv_tracker
->prepare_op_for_read(&op
);
7913 op
.getxattrs(&unfiltered_attrset
, NULL
);
7915 if (psize
|| pmtime
) {
7916 op
.stat2(&size
, &mtime_ts
, NULL
);
7919 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
7922 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &outbl
);
7925 *epoch
= ref
.ioctx
.get_last_version();
7934 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
7936 rgw_filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
7942 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
7943 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
, bool *syncstopped
)
7945 vector
<rgw_bucket_dir_header
> headers
;
7946 map
<int, string
> bucket_instance_ids
;
7947 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
7952 ceph_assert(headers
.size() == bucket_instance_ids
.size());
7954 auto iter
= headers
.begin();
7955 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
7956 BucketIndexShardsManager ver_mgr
;
7957 BucketIndexShardsManager master_ver_mgr
;
7958 BucketIndexShardsManager marker_mgr
;
7960 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
7961 accumulate_raw_stats(*iter
, stats
);
7962 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->ver
);
7963 ver_mgr
.add(viter
->first
, string(buf
));
7964 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->master_ver
);
7965 master_ver_mgr
.add(viter
->first
, string(buf
));
7966 if (shard_id
>= 0) {
7967 *max_marker
= iter
->max_marker
;
7969 marker_mgr
.add(viter
->first
, iter
->max_marker
);
7971 if (syncstopped
!= NULL
)
7972 *syncstopped
= iter
->syncstopped
;
7974 ver_mgr
.to_string(bucket_ver
);
7975 master_ver_mgr
.to_string(master_ver
);
7977 marker_mgr
.to_string(max_marker
);
7982 int RGWRados::get_bi_log_status(RGWBucketInfo
& bucket_info
, int shard_id
,
7983 map
<int, string
>& markers
)
7985 vector
<rgw_bucket_dir_header
> headers
;
7986 map
<int, string
> bucket_instance_ids
;
7987 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
7991 ceph_assert(headers
.size() == bucket_instance_ids
.size());
7993 auto iter
= headers
.begin();
7994 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
7996 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
7997 if (shard_id
>= 0) {
7998 markers
[shard_id
] = iter
->max_marker
;
8000 markers
[viter
->first
] = iter
->max_marker
;
8006 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
8007 RGWGetBucketStats_CB
*cb
;
8009 map
<RGWObjCategory
, RGWStorageStats
> stats
;
8015 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
8016 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true),
8017 lock("RGWGetBucketStatsContext") {}
8019 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
8020 Mutex::Locker
l(lock
);
8023 accumulate_raw_stats(header
, stats
);
8029 if (--pendings
== 0) {
8031 cb
->set_response(&stats
);
8033 cb
->handle_response(ret_code
);
8040 Mutex::Locker
l(lock
);
8045 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
8048 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
? : 1);
8049 ceph_assert(get_ctx
);
8050 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
8054 get_ctx
->unset_cb();
8061 class RGWGetUserStatsContext
: public RGWGetUserHeader_CB
{
8062 RGWGetUserStats_CB
*cb
;
8065 explicit RGWGetUserStatsContext(RGWGetUserStats_CB
* const cb
)
8068 void handle_response(int r
, cls_user_header
& header
) override
{
8069 const cls_user_stats
& hs
= header
.stats
;
8071 RGWStorageStats stats
;
8073 stats
.size
= hs
.total_bytes
;
8074 stats
.size_rounded
= hs
.total_bytes_rounded
;
8075 stats
.num_objects
= hs
.total_entries
;
8077 cb
->set_response(stats
);
8080 cb
->handle_response(r
);
8086 int RGWRados::get_user_stats(const rgw_user
& user
, RGWStorageStats
& stats
)
8088 string user_str
= user
.to_str();
8090 cls_user_header header
;
8091 int r
= cls_user_get_header(user_str
, &header
);
8095 const cls_user_stats
& hs
= header
.stats
;
8097 stats
.size
= hs
.total_bytes
;
8098 stats
.size_rounded
= hs
.total_bytes_rounded
;
8099 stats
.num_objects
= hs
.total_entries
;
8104 int RGWRados::get_user_stats_async(const rgw_user
& user
, RGWGetUserStats_CB
*ctx
)
8106 string user_str
= user
.to_str();
8108 RGWGetUserStatsContext
*get_ctx
= new RGWGetUserStatsContext(ctx
);
8109 int r
= cls_user_get_header_async(user_str
, get_ctx
);
8119 void RGWRados::get_bucket_meta_oid(const rgw_bucket
& bucket
, string
& oid
)
8121 oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ bucket
.get_key(':');
8124 void RGWRados::get_bucket_instance_obj(const rgw_bucket
& bucket
, rgw_raw_obj
& obj
)
8126 if (!bucket
.oid
.empty()) {
8127 obj
.init(svc
.zone
->get_zone_params().domain_root
, bucket
.oid
);
8130 get_bucket_meta_oid(bucket
, oid
);
8131 obj
.init(svc
.zone
->get_zone_params().domain_root
, oid
);
8135 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
8136 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8138 size_t pos
= meta_key
.find(':');
8139 if (pos
== string::npos
) {
8142 string oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ meta_key
;
8143 rgw_bucket_instance_key_to_oid(oid
);
8145 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
8148 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
8149 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8152 if (bucket
.oid
.empty()) {
8153 get_bucket_meta_oid(bucket
, oid
);
8158 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
8161 int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx
& obj_ctx
, const string
& oid
, RGWBucketInfo
& info
,
8162 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
,
8163 rgw_cache_entry_info
*cache_info
,
8164 boost::optional
<obj_version
> refresh_version
)
8166 auto& domain_root
= svc
.zone
->get_zone_params().domain_root
;
8168 ldout(cct
, 20) << "reading from " << domain_root
<< ":" << oid
<< dendl
;
8172 int ret
= rgw_get_system_obj(this, obj_ctx
, domain_root
,
8173 oid
, epbl
, &info
.objv_tracker
, pmtime
, pattrs
,
8174 cache_info
, refresh_version
);
8179 auto iter
= epbl
.cbegin();
8182 } catch (buffer::error
& err
) {
8183 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
8186 info
.bucket
.oid
= oid
;
8190 int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx
& obj_ctx
,
8191 const string
& tenant_name
,
8192 const string
& bucket_name
,
8193 RGWBucketEntryPoint
& entry_point
,
8194 RGWObjVersionTracker
*objv_tracker
,
8196 map
<string
, bufferlist
> *pattrs
,
8197 rgw_cache_entry_info
*cache_info
,
8198 boost::optional
<obj_version
> refresh_version
)
8201 string bucket_entry
;
8203 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
8204 int ret
= rgw_get_system_obj(this, obj_ctx
, svc
.zone
->get_zone_params().domain_root
,
8205 bucket_entry
, bl
, objv_tracker
, pmtime
, pattrs
,
8206 cache_info
, refresh_version
);
8211 auto iter
= bl
.cbegin();
8213 decode(entry_point
, iter
);
8214 } catch (buffer::error
& err
) {
8215 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
8221 int RGWRados::convert_old_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8222 const string
& tenant_name
,
8223 const string
& bucket_name
)
8225 RGWBucketEntryPoint entry_point
;
8227 RGWObjVersionTracker ot
;
8228 map
<string
, bufferlist
> attrs
;
8231 ldout(cct
, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name
<< dendl
;
8233 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant_name
, bucket_name
, entry_point
, &ot
, &ep_mtime
, &attrs
);
8235 ldout(cct
, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret
<< " bucket=" << bucket_name
<< dendl
;
8239 if (!entry_point
.has_bucket_info
) {
8240 /* already converted! */
8244 info
= entry_point
.old_bucket_info
;
8245 info
.bucket
.oid
= bucket_name
;
8246 info
.ep_objv
= ot
.read_version
;
8248 ot
.generate_new_write_ver(cct
);
8250 ret
= put_linked_bucket_info(info
, false, ep_mtime
, &ot
.write_version
, &attrs
, true);
8252 ldout(cct
, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret
<< dendl
;
8259 int RGWRados::_get_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8260 const string
& tenant
,
8261 const string
& bucket_name
,
8262 RGWBucketInfo
& info
,
8264 map
<string
, bufferlist
> *pattrs
,
8265 boost::optional
<obj_version
> refresh_version
)
8267 string bucket_entry
;
8268 rgw_make_bucket_entry_name(tenant
, bucket_name
, bucket_entry
);
8271 if (auto e
= binfo_cache
->find(bucket_entry
)) {
8272 if (refresh_version
&&
8273 e
->info
.objv_tracker
.read_version
.compare(&(*refresh_version
))) {
8274 lderr(cct
) << "WARNING: The bucket info cache is inconsistent. This is "
8275 << "a failure that should be debugged. I am a nice machine, "
8276 << "so I will try to recover." << dendl
;
8277 binfo_cache
->invalidate(bucket_entry
);
8288 bucket_info_entry e
;
8289 RGWBucketEntryPoint entry_point
;
8291 RGWObjVersionTracker ot
;
8292 rgw_cache_entry_info entry_cache_info
;
8293 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant
, bucket_name
,
8294 entry_point
, &ot
, &ep_mtime
, pattrs
,
8295 &entry_cache_info
, refresh_version
);
8297 /* only init these fields */
8298 info
.bucket
.tenant
= tenant
;
8299 info
.bucket
.name
= bucket_name
;
8303 if (entry_point
.has_bucket_info
) {
8304 info
= entry_point
.old_bucket_info
;
8305 info
.bucket
.oid
= bucket_name
;
8306 info
.bucket
.tenant
= tenant
;
8307 info
.ep_objv
= ot
.read_version
;
8308 ldout(cct
, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info
.bucket
<< " owner " << info
.owner
<< dendl
;
8312 /* data is in the bucket instance object, we need to get attributes from there, clear everything
8319 ldout(cct
, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point
.bucket
<< dendl
;
8322 /* read bucket instance info */
8325 get_bucket_meta_oid(entry_point
.bucket
, oid
);
8327 rgw_cache_entry_info cache_info
;
8329 ret
= get_bucket_instance_from_oid(obj_ctx
, oid
, e
.info
, &e
.mtime
, &e
.attrs
,
8330 &cache_info
, refresh_version
);
8331 e
.info
.ep_objv
= ot
.read_version
;
8334 lderr(cct
) << "ERROR: get_bucket_instance_from_oid failed: " << ret
<< dendl
;
8335 info
.bucket
.tenant
= tenant
;
8336 info
.bucket
.name
= bucket_name
;
8337 // XXX and why return anything in case of an error anyway?
8346 /* chain to both bucket entry point and bucket instance */
8347 if (!binfo_cache
->put(svc
.cache
, bucket_entry
, &e
, {&entry_cache_info
, &cache_info
})) {
8348 ldout(cct
, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl
;
8351 if (refresh_version
&&
8352 refresh_version
->compare(&info
.objv_tracker
.read_version
)) {
8353 lderr(cct
) << "WARNING: The OSD has the same version I have. Something may "
8354 << "have gone squirrelly. An administrator may have forced a "
8355 << "change; otherwise there is a problem somewhere." << dendl
;
8361 int RGWRados::get_bucket_info(RGWSysObjectCtx
& obj_ctx
,
8362 const string
& tenant
, const string
& bucket_name
,
8363 RGWBucketInfo
& info
,
8364 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
8366 return _get_bucket_info(obj_ctx
, tenant
, bucket_name
, info
, pmtime
,
8367 pattrs
, boost::none
);
8370 int RGWRados::try_refresh_bucket_info(RGWBucketInfo
& info
,
8371 ceph::real_time
*pmtime
,
8372 map
<string
, bufferlist
> *pattrs
)
8374 RGWSysObjectCtx obj_ctx
= svc
.sysobj
->init_obj_ctx();
8376 return _get_bucket_info(obj_ctx
, info
.bucket
.tenant
, info
.bucket
.name
,
8377 info
, pmtime
, pattrs
, info
.objv_tracker
.read_version
);
8380 int RGWRados::put_bucket_entrypoint_info(const string
& tenant_name
, const string
& bucket_name
, RGWBucketEntryPoint
& entry_point
,
8381 bool exclusive
, RGWObjVersionTracker
& objv_tracker
, real_time mtime
,
8382 map
<string
, bufferlist
> *pattrs
)
8385 encode(entry_point
, epbl
);
8386 string bucket_entry
;
8387 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
8388 return rgw_bucket_store_info(this, bucket_entry
, epbl
, exclusive
, pattrs
, &objv_tracker
, mtime
);
8391 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
8392 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
8394 info
.has_instance_obj
= true;
8399 string key
= info
.bucket
.get_key(); /* when we go through meta api, we don't use oid directly */
8400 int ret
= rgw_bucket_instance_store_info(this, key
, bl
, exclusive
, pattrs
, &info
.objv_tracker
, mtime
);
8401 if (ret
== -EEXIST
) {
8402 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
8403 * bucket operation on this specific bucket (e.g., being synced from the master), but
8404 * since bucket instace meta object is unique for this specific bucket instace, we don't
8405 * need to return an error.
8406 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
8407 * master, creating a bucket, sending bucket creation to the master, we create the bucket
8408 * locally, while in the sync thread we sync the new bucket.
8415 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
8416 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
8418 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
8420 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
8426 return 0; /* done! */
8428 RGWBucketEntryPoint entry_point
;
8429 entry_point
.bucket
= info
.bucket
;
8430 entry_point
.owner
= info
.owner
;
8431 entry_point
.creation_time
= info
.creation_time
;
8432 entry_point
.linked
= true;
8433 RGWObjVersionTracker ot
;
8434 if (pep_objv
&& !pep_objv
->tag
.empty()) {
8435 ot
.write_version
= *pep_objv
;
8437 ot
.generate_new_write_ver(cct
);
8439 *pep_objv
= ot
.write_version
;
8442 ret
= put_bucket_entrypoint_info(info
.bucket
.tenant
, info
.bucket
.name
, entry_point
, exclusive
, ot
, mtime
, NULL
);
8449 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
8451 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
8453 map
<string
, RGWBucketEnt
>::iterator iter
;
8454 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
8455 RGWBucketEnt
& ent
= iter
->second
;
8456 rgw_bucket
& bucket
= ent
.bucket
;
8459 ent
.size_rounded
= 0;
8461 vector
<rgw_bucket_dir_header
> headers
;
8463 RGWBucketInfo bucket_info
;
8464 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
8469 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
8473 auto hiter
= headers
.begin();
8474 for (; hiter
!= headers
.end(); ++hiter
) {
8475 RGWObjCategory category
= main_category
;
8476 auto iter
= (hiter
->stats
).find(category
);
8477 if (iter
!= hiter
->stats
.end()) {
8478 struct rgw_bucket_category_stats
& stats
= iter
->second
;
8479 ent
.count
+= stats
.num_entries
;
8480 ent
.size
+= stats
.total_size
;
8481 ent
.size_rounded
+= stats
.total_size_rounded
;
8485 // fill in placement_rule from the bucket instance for use in swift's
8486 // per-storage policy statistics
8487 ent
.placement_rule
= std::move(bucket_info
.placement_rule
);
8493 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
8496 int r
= get_raw_obj_ref(obj
, &ref
);
8500 librados::Rados
*rad
= get_rados_handle();
8501 librados::AioCompletion
*completion
= rad
->aio_create_completion(NULL
, NULL
, NULL
);
8503 r
= ref
.ioctx
.aio_append(ref
.obj
.oid
, completion
, bl
, size
);
8504 completion
->release();
8508 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
8510 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8511 librados::NObjectIterator
& iter
= ctx
.iter
;
8513 int r
= open_pool_ctx(pool
, io_ctx
);
8517 iter
= io_ctx
.nobjects_begin();
8522 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, const string
& cursor
, RGWPoolIterCtx
& ctx
)
8524 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8525 librados::NObjectIterator
& iter
= ctx
.iter
;
8527 int r
= open_pool_ctx(pool
, io_ctx
);
8531 librados::ObjectCursor oc
;
8532 if (!oc
.from_str(cursor
)) {
8533 ldout(cct
, 10) << "failed to parse cursor: " << cursor
<< dendl
;
8538 iter
= io_ctx
.nobjects_begin(oc
);
8540 } catch (const std::system_error
& e
) {
8541 r
= -e
.code().value();
8542 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
8543 << ", returning " << r
<< dendl
;
8545 } catch (const std::exception
& e
) {
8546 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
8547 << ", returning -5" << dendl
;
8552 string
RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx
& ctx
)
8554 return ctx
.iter
.get_cursor().to_str();
8557 static int do_pool_iterate(CephContext
* cct
, RGWPoolIterCtx
& ctx
, uint32_t num
,
8558 vector
<rgw_bucket_dir_entry
>& objs
,
8559 bool *is_truncated
, RGWAccessListFilter
*filter
)
8561 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
8562 librados::NObjectIterator
& iter
= ctx
.iter
;
8564 if (iter
== io_ctx
.nobjects_end())
8569 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
8570 rgw_bucket_dir_entry e
;
8572 string oid
= iter
->get_oid();
8573 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
8575 // fill it in with initial values; we may correct later
8576 if (filter
&& !filter
->filter(oid
, oid
))
8584 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
8589 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
8590 bool *is_truncated
, RGWAccessListFilter
*filter
)
8592 // catch exceptions from NObjectIterator::operator++()
8594 return do_pool_iterate(cct
, ctx
, num
, objs
, is_truncated
, filter
);
8595 } catch (const std::system_error
& e
) {
8596 int r
= -e
.code().value();
8597 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
8598 << ", returning " << r
<< dendl
;
8600 } catch (const std::exception
& e
) {
8601 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
8602 << ", returning -5" << dendl
;
8607 int RGWRados::list_raw_objects_init(const rgw_pool
& pool
, const string
& marker
, RGWListRawObjsCtx
*ctx
)
8609 if (!ctx
->initialized
) {
8610 int r
= pool_iterate_begin(pool
, marker
, ctx
->iter_ctx
);
8612 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
8615 ctx
->initialized
= true;
8620 int RGWRados::list_raw_objects_next(const string
& prefix_filter
, int max
,
8621 RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8624 if (!ctx
.initialized
) {
8627 RGWAccessListFilterPrefix
filter(prefix_filter
);
8628 vector
<rgw_bucket_dir_entry
> objs
;
8629 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
8632 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
8636 vector
<rgw_bucket_dir_entry
>::iterator iter
;
8637 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
8638 oids
.push_back(iter
->key
.name
);
8644 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
8645 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
8648 if (!ctx
.initialized
) {
8649 int r
= list_raw_objects_init(pool
, string(), &ctx
);
8655 return list_raw_objects_next(prefix_filter
, max
, ctx
, oids
, is_truncated
);
8658 string
RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx
& ctx
)
8660 return pool_iterate_get_cursor(ctx
.iter_ctx
);
8663 int RGWRados::list_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& marker
, uint32_t max
,
8664 std::list
<rgw_bi_log_entry
>& result
, bool *truncated
)
8666 ldout(cct
, 20) << __func__
<< ": " << bucket_info
.bucket
<< " marker " << marker
<< " shard_id=" << shard_id
<< " max " << max
<< dendl
;
8669 librados::IoCtx index_ctx
;
8670 map
<int, string
> oids
;
8671 map
<int, cls_rgw_bi_log_list_ret
> bi_log_lists
;
8672 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
8676 BucketIndexShardsManager marker_mgr
;
8677 bool has_shards
= (oids
.size() > 1 || shard_id
>= 0);
8678 // If there are multiple shards for the bucket index object, the marker
8679 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
8680 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
8681 // only contain one record, and the key is the bucket instance id.
8682 r
= marker_mgr
.from_string(marker
, shard_id
);
8686 r
= CLSRGWIssueBILogList(index_ctx
, marker_mgr
, max
, oids
, bi_log_lists
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8690 map
<int, list
<rgw_bi_log_entry
>::iterator
> vcurrents
;
8691 map
<int, list
<rgw_bi_log_entry
>::iterator
> vends
;
8695 map
<int, cls_rgw_bi_log_list_ret
>::iterator miter
= bi_log_lists
.begin();
8696 for (; miter
!= bi_log_lists
.end(); ++miter
) {
8697 int shard_id
= miter
->first
;
8698 vcurrents
[shard_id
] = miter
->second
.entries
.begin();
8699 vends
[shard_id
] = miter
->second
.entries
.end();
8701 *truncated
= (*truncated
|| miter
->second
.truncated
);
8706 bool has_more
= true;
8707 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator viter
;
8708 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator eiter
;
8709 while (total
< max
&& has_more
) {
8712 viter
= vcurrents
.begin();
8713 eiter
= vends
.begin();
8715 for (; total
< max
&& viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
8716 assert (eiter
!= vends
.end());
8718 int shard_id
= viter
->first
;
8719 list
<rgw_bi_log_entry
>::iterator
& liter
= viter
->second
;
8721 if (liter
== eiter
->second
){
8724 rgw_bi_log_entry
& entry
= *(liter
);
8727 snprintf(buf
, sizeof(buf
), "%d", shard_id
);
8729 build_bucket_index_marker(buf
, entry
.id
, &tmp_id
);
8730 entry
.id
.swap(tmp_id
);
8732 marker_mgr
.add(shard_id
, entry
.id
);
8733 result
.push_back(entry
);
8741 for (viter
= vcurrents
.begin(), eiter
= vends
.begin(); viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
8742 assert (eiter
!= vends
.end());
8743 *truncated
= (*truncated
|| (viter
->second
!= eiter
->second
));
8747 // Refresh marker, if there are multiple shards, the output will look like
8748 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
8749 // if there is no sharding, the simply marker (without oid) is returned
8751 marker_mgr
.to_string(&marker
);
8753 if (!result
.empty()) {
8754 marker
= result
.rbegin()->id
;
// Trim bucket-index log (bilog) entries in the range [start_marker,
// end_marker] for this bucket, optionally restricted to one shard.
// The composite markers are decomposed into per-shard markers by
// BucketIndexShardsManager, then the trim is issued against every shard
// object with AIO bounded by rgw_bucket_index_max_aio.
// NOTE(review): lossy extract — braces and the error-return checks
// between the numbered statements are elided from this view.
8761 int RGWRados::trim_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& start_marker
, string
& end_marker
)
// Resolve the bucket's index shard objects (all shards, or just
// shard_id when >= 0).
8763 librados::IoCtx index_ctx
;
8764 map
<int, string
> bucket_objs
;
8766 BucketIndexShardsManager start_marker_mgr
;
8767 BucketIndexShardsManager end_marker_mgr
;
8769 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
// Parse the composite start marker into per-shard markers.
8774 r
= start_marker_mgr
.from_string(start_marker
, shard_id
);
// Parse the composite end marker likewise.
8779 r
= end_marker_mgr
.from_string(end_marker
, shard_id
);
// Issue the trim to all shard objects; the trailing () runs the
// (bounded-concurrency) operation and returns its result.
8784 return CLSRGWIssueBILogTrim(index_ctx
, start_marker_mgr
, end_marker_mgr
, bucket_objs
,
8785 cct
->_conf
->rgw_bucket_index_max_aio
)();
// Mark the bucket's index log for resync across its shard objects
// (all shards, or only shard_id when >= 0).
// NOTE(review): lossy extract — the brace lines and the error check
// after open_bucket_index() are elided from this view.
8788 int RGWRados::resync_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
8790 librados::IoCtx index_ctx
;
8791 map
<int, string
> bucket_objs
;
8792 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
// Fan the resync request out to every shard object with AIO bounded
// by rgw_bucket_index_max_aio; the trailing () executes and returns.
8796 return CLSRGWIssueResyncBucketBILog(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
// Stop bucket-index logging on this bucket's index shard objects
// (all shards, or only shard_id when >= 0).
// NOTE(review): lossy extract — brace lines and the error check after
// open_bucket_index() are elided from this view.
8799 int RGWRados::stop_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
8801 librados::IoCtx index_ctx
;
8802 map
<int, string
> bucket_objs
;
8803 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
// Issue the stop to every shard object with bounded AIO.
8807 return CLSRGWIssueBucketBILogStop(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
// Fetch the bucket-index "Instance" entry for obj and decode it into
// *dirent. -ENOENT from bi_get() is not treated as an error here.
// NOTE(review): lossy extract — braces, return statements and the
// `try {` opener matching the catch below are elided from this view.
8810 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8811 rgw_bucket_dir_entry
*dirent
)
// Raw bucket-index lookup for the instance-typed entry.
8813 rgw_cls_bi_entry bi_entry
;
8814 int r
= bi_get(bucket_info
, obj
, BIIndexType::Instance
, &bi_entry
);
8815 if (r
< 0 && r
!= -ENOENT
) {
8816 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
// Decode the raw entry payload into the caller's dirent.
8821 auto iter
= bi_entry
.data
.cbegin();
8823 decode(*dirent
, iter
);
// Decode failures are logged; the surrounding error return is elided
// from this view.
8824 } catch (buffer::error
& err
) {
8825 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
// Fetch the bucket-index OLH (object logical head) entry for obj and
// decode it into *olh. -ENOENT from bi_get() is not an error here.
// NOTE(review): lossy extract — braces, the `try {` opener, and the
// decode statement between lines 8843 and 8846 are elided from view.
8832 int RGWRados::bi_get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8833 rgw_bucket_olh_entry
*olh
)
// Raw bucket-index lookup for the OLH-typed entry.
8835 rgw_cls_bi_entry bi_entry
;
8836 int r
= bi_get(bucket_info
, obj
, BIIndexType::OLH
, &bi_entry
);
8837 if (r
< 0 && r
!= -ENOENT
) {
8838 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
// Prepare to decode the raw entry payload (decode call elided).
8843 auto iter
= bi_entry
.data
.cbegin();
8846 } catch (buffer::error
& err
) {
8847 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
// Read a single raw bucket-index entry of the requested index_type for
// obj, via the cls_rgw bi_get call on the object's bucket index shard.
// NOTE(review): lossy extract — braces and the error-return after
// bs.init() are elided from this view.
8854 int RGWRados::bi_get(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8855 BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
// Resolve the index shard (BucketShard) that holds this object's entry.
8857 BucketShard
bs(this);
8858 int ret
= bs
.init(bucket_info
, obj
);
8860 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
// Index key = (index key name, instance) derived from the object key.
8864 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
8866 return cls_rgw_bi_get(bs
.index_ctx
, bs
.bucket_obj
, index_type
, key
, entry
);
// Queue a bucket-index entry write onto an existing write operation;
// the caller is responsible for executing `op` against bs.
8869 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8871 cls_rgw_bi_put(op
, bs
.bucket_obj
, entry
);
// Synchronously write a raw bucket-index entry to the given shard.
// NOTE(review): lossy extract — the return statement after the call is
// elided from this view.
8874 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
8876 int ret
= cls_rgw_bi_put(bs
.index_ctx
, bs
.bucket_obj
, entry
);
// Convenience overload: resolve the shard for (bucket, obj) and write
// the raw bucket-index entry through bi_put(BucketShard&, ...).
// NOTE(review): lossy extract — braces and the error-return after
// bs.init() are elided from this view.
8883 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
8885 BucketShard
bs(this);
8886 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
8888 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8892 return bi_put(bs
, entry
);
// List raw bucket-index entries on the shard that owns obj_name,
// starting after `marker`, up to `max` entries. On -ENOENT the listing
// is reported as not truncated.
// NOTE(review): lossy extract — braces and intermediate return
// statements are elided from this view.
8895 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
// Build an rgw_obj only to locate the owning index shard.
8897 rgw_obj
obj(bucket
, obj_name
);
8898 BucketShard
bs(this);
8899 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
8901 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
// obj_name doubles as the listing filter.
8905 ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, obj_name
, marker
, max
, entries
, is_truncated
);
// A missing shard object means there is nothing (more) to list.
8906 if (ret
== -ENOENT
) {
8907 *is_truncated
= false;
// List raw bucket-index entries directly from a resolved shard,
// filtered by filter_obj, starting after `marker`, up to `max`.
// NOTE(review): lossy extract — the return statement after the call is
// elided from this view.
8915 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
8917 int ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, filter_obj
, marker
, max
, entries
, is_truncated
)
;
// Remove the shard's bucket-index object entirely. -ENOENT is handled
// specially (index object already gone); other failures are logged.
// NOTE(review): lossy extract — braces and return statements are
// elided from this view.
8924 int RGWRados::bi_remove(BucketShard
& bs
)
8926 int ret
= bs
.index_ctx
.remove(bs
.bucket_obj
);
8927 if (ret
== -ENOENT
) {
8931 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
// List raw bucket-index entries from an explicit shard_id of `bucket`,
// delegating to the BucketShard overload above.
// NOTE(review): lossy extract — braces and the error-return after
// bs.init() are elided from this view.
8938 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
// Resolve the shard by numeric id rather than by object name.
8940 BucketShard
bs(this);
8941 int ret
= bs
.init(bucket
, shard_id
, nullptr /* no RGWBucketInfo */);
8943 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
8947 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
// Execute a synchronous write operation against `oid` in the garbage
// collection pool.
8950 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
8952 return gc_pool_ctx
.operate(oid
, op
);
// Execute an asynchronous write operation against `oid` in the GC pool.
// NOTE(review): lossy extract — the handling of `pc` (handing the
// completion back to the caller vs. releasing it) and the return are
// elided from this view.
8955 int RGWRados::gc_aio_operate(string
& oid
, librados::ObjectWriteOperation
*op
, AioCompletion
**pc
)
// Completion with no callbacks; ownership is resolved in the elided
// tail of the function.
8957 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
8958 int r
= gc_pool_ctx
.aio_operate(oid
, c
, op
);
// Execute a synchronous read operation against `oid` in the GC pool,
// placing any returned payload in *pbl.
8967 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
8969 return gc_pool_ctx
.operate(oid
, op
, pbl
);
// List garbage-collection entries, delegating to the RGWGC instance.
// `index`/`marker` carry the resume position across calls;
// expired_only restricts output to already-expired chains.
8972 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
)
8974 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
);
// Run one garbage-collection processing pass via the RGWGC instance;
// expired_only limits the pass to chains whose deferral time elapsed.
8977 int RGWRados::process_gc(bool expired_only
)
8979 return gc
->process(expired_only
);
// List lifecycle (LC) processing progress per bucket, delegating to
// the RGWLC instance; fills *progress_map keyed by bucket entry.
8982 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
8984 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
// Run one lifecycle processing pass via the RGWLC instance.
8987 int RGWRados::process_lc()
8989 return lc
->process();
// Sweep all object-expirer shards from the epoch up to the current
// time; returns the expirer's success indication.
8992 bool RGWRados::process_expire_objects()
8994 return obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
8997 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
8998 rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
9000 rgw_zone_set zones_trace
;
9002 zones_trace
= *_zones_trace
;
9004 zones_trace
.insert(svc
.zone
->get_zone().id
);
9006 ObjectWriteOperation o
;
9007 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
9008 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
9009 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), svc
.zone
->get_zone().log_data
, bilog_flags
, zones_trace
);
9010 return bs
.index_ctx
.operate(bs
.bucket_obj
, &o
);
9013 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
9014 int64_t pool
, uint64_t epoch
,
9015 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
9016 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
9018 ObjectWriteOperation o
;
9019 rgw_bucket_dir_entry_meta dir_meta
;
9020 dir_meta
= ent
.meta
;
9021 dir_meta
.category
= category
;
9023 rgw_zone_set zones_trace
;
9025 zones_trace
= *_zones_trace
;
9027 zones_trace
.insert(svc
.zone
->get_zone().id
);
9029 rgw_bucket_entry_ver ver
;
9032 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
9033 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
9034 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
9035 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
);
9036 complete_op_data
*arg
;
9037 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
9038 svc
.zone
->get_zone().log_data
, bilog_flags
, &zones_trace
, &arg
);
9039 librados::AioCompletion
*completion
= arg
->rados_completion
;
9040 int ret
= bs
.index_ctx
.aio_operate(bs
.bucket_obj
, arg
->rados_completion
, &o
);
9041 completion
->release(); /* can't reference arg here, as it might have already been released */
9045 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
9046 int64_t pool
, uint64_t epoch
,
9047 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
9048 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
9050 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
9053 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
9054 int64_t pool
, uint64_t epoch
,
9056 real_time
& removed_mtime
,
9057 list
<rgw_obj_index_key
> *remove_objs
,
9058 uint16_t bilog_flags
,
9059 rgw_zone_set
*zones_trace
)
9061 rgw_bucket_dir_entry ent
;
9062 ent
.meta
.mtime
= removed_mtime
;
9063 obj
.key
.get_index_key(&ent
.key
);
9064 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
,
9065 ent
, RGWObjCategory::None
, remove_objs
,
9066 bilog_flags
, zones_trace
);
9069 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
9071 rgw_bucket_dir_entry ent
;
9072 obj
.key
.get_index_key(&ent
.key
);
9073 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
,
9074 -1 /* pool id */, 0, ent
,
9075 RGWObjCategory::None
, NULL
, bilog_flags
,
9079 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
9081 librados::IoCtx index_ctx
;
9082 map
<int, string
> bucket_objs
;
9083 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
9087 return CLSRGWIssueSetTagTimeout(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
9091 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo
& bucket_info
,
9093 const rgw_obj_index_key
& start
,
9094 const string
& prefix
,
9095 uint32_t num_entries
,
9097 map
<string
, rgw_bucket_dir_entry
>& m
,
9099 rgw_obj_index_key
*last_entry
,
9100 bool (*force_check_filter
)(const string
& name
))
9102 ldout(cct
, 10) << "cls_bucket_list_ordered " << bucket_info
.bucket
<<
9103 " start " << start
.name
<< "[" << start
.instance
<< "] num_entries " <<
9104 num_entries
<< dendl
;
9106 librados::IoCtx index_ctx
;
9107 // key - oid (for different shards if there is any)
9108 // value - list result for the corresponding oid (shard), it is filled by
9110 map
<int, string
> oids
;
9111 map
<int, struct rgw_cls_list_ret
> list_results
;
9112 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
9116 cls_rgw_obj_key
start_key(start
.name
, start
.instance
);
9117 r
= CLSRGWIssueBucketList(index_ctx
, start_key
, prefix
, num_entries
,
9118 list_versions
, oids
, list_results
,
9119 cct
->_conf
->rgw_bucket_index_max_aio
)();
9123 // Create a list of iterators that are used to iterate each shard
9124 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vcurrents
;
9125 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vends
;
9126 vector
<string
> vnames
;
9127 vcurrents
.reserve(list_results
.size());
9128 vends
.reserve(list_results
.size());
9129 vnames
.reserve(list_results
.size());
9130 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
9131 *is_truncated
= false;
9132 for (; iter
!= list_results
.end(); ++iter
) {
9133 vcurrents
.push_back(iter
->second
.dir
.m
.begin());
9134 vends
.push_back(iter
->second
.dir
.m
.end());
9135 vnames
.push_back(oids
[iter
->first
]);
9136 *is_truncated
= (*is_truncated
|| iter
->second
.is_truncated
);
9139 // Create a map to track the next candidate entry from each shard, if the entry
9140 // from a specified shard is selected/erased, the next entry from that shard will
9141 // be inserted for next round selection
9142 map
<string
, size_t> candidates
;
9143 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
9144 if (vcurrents
[i
] != vends
[i
]) {
9145 candidates
[vcurrents
[i
]->first
] = i
;
9149 map
<string
, bufferlist
> updates
;
9151 while (count
< num_entries
&& !candidates
.empty()) {
9153 // Select the next one
9154 int pos
= candidates
.begin()->second
;
9155 const string
& name
= vcurrents
[pos
]->first
;
9156 struct rgw_bucket_dir_entry
& dirent
= vcurrents
[pos
]->second
;
9158 bool force_check
= force_check_filter
&&
9159 force_check_filter(dirent
.key
.name
);
9160 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
9161 !dirent
.pending_map
.empty() ||
9163 /* there are uncommitted ops. We need to check the current state,
9164 * and if the tags are old we need to do cleanup as well. */
9165 librados::IoCtx sub_ctx
;
9166 sub_ctx
.dup(index_ctx
);
9167 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
,
9168 updates
[vnames
[pos
]]);
9169 if (r
< 0 && r
!= -ENOENT
) {
9174 ldout(cct
, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
9175 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
9176 m
[name
] = std::move(dirent
);
9180 // Refresh the candidates map
9181 candidates
.erase(candidates
.begin());
9183 if (vcurrents
[pos
] != vends
[pos
]) {
9184 candidates
[vcurrents
[pos
]->first
] = pos
;
9188 // Suggest updates if there is any
9189 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
9190 for (; miter
!= updates
.end(); ++miter
) {
9191 if (miter
->second
.length()) {
9192 ObjectWriteOperation o
;
9193 cls_rgw_suggest_changes(o
, miter
->second
);
9194 // we don't care if we lose suggested updates, send them off blindly
9195 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9196 index_ctx
.aio_operate(miter
->first
, c
, &o
);
9201 // Check if all the returned entries are consumed or not
9202 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
9203 if (vcurrents
[i
] != vends
[i
]) {
9204 *is_truncated
= true;
9209 *last_entry
= m
.rbegin()->first
;
9215 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo
& bucket_info
,
9217 const rgw_obj_index_key
& start
,
9218 const string
& prefix
,
9219 uint32_t num_entries
,
9221 std::vector
<rgw_bucket_dir_entry
>& ent_list
,
9223 rgw_obj_index_key
*last_entry
,
9224 bool (*force_check_filter
)(const string
& name
)) {
9225 ldout(cct
, 10) << "cls_bucket_list_unordered " << bucket_info
.bucket
<<
9226 " start " << start
.name
<< "[" << start
.instance
<<
9227 "] num_entries " << num_entries
<< dendl
;
9229 static MultipartMetaFilter multipart_meta_filter
;
9231 *is_truncated
= false;
9232 librados::IoCtx index_ctx
;
9234 map
<int, string
> oids
;
9235 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
9238 const uint32_t num_shards
= oids
.size();
9240 rgw_obj_index_key marker
= start
;
9241 uint32_t current_shard
;
9242 if (shard_id
>= 0) {
9243 current_shard
= shard_id
;
9244 } else if (start
.empty()) {
9247 // at this point we have a marker (start) that has something in
9248 // it, so we need to get to the bucket shard index, so we can
9249 // start reading from there
9252 // test whether object name is a multipart meta name
9253 if(! multipart_meta_filter
.filter(start
.name
, key
)) {
9254 // if multipart_meta_filter fails, must be "regular" (i.e.,
9255 // unadorned) and the name is the key
9259 // now convert the key (oid) to an rgw_obj_key since that will
9260 // separate out the namespace, name, and instance
9261 rgw_obj_key obj_key
;
9262 bool parsed
= rgw_obj_key::parse_raw_oid(key
, &obj_key
);
9265 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9266 "start marker: '" << start
<< "'" << dendl
;
9268 } else if (obj_key
.name
.empty()) {
9269 // if the name is empty that means the object name came in with
9270 // a namespace only, and therefore we need to start our scan at
9271 // the first bucket index shard
9274 // so now we have the key used to compute the bucket index shard
9275 // and can extract the specific shard from it
9276 current_shard
= rgw_bucket_shard_index(obj_key
.name
, num_shards
);
9280 uint32_t count
= 0u;
9281 map
<string
, bufferlist
> updates
;
9282 rgw_obj_index_key last_added_entry
;
9283 while (count
<= num_entries
&&
9284 ((shard_id
>= 0 && current_shard
== uint32_t(shard_id
)) ||
9285 current_shard
< num_shards
)) {
9286 const std::string
& oid
= oids
[current_shard
];
9287 rgw_cls_list_ret result
;
9289 librados::ObjectReadOperation op
;
9290 cls_rgw_bucket_list_op(op
, marker
, prefix
, num_entries
,
9291 list_versions
, &result
);
9292 r
= index_ctx
.operate(oid
, &op
, nullptr);
9296 for (auto& entry
: result
.dir
.m
) {
9297 rgw_bucket_dir_entry
& dirent
= entry
.second
;
9299 bool force_check
= force_check_filter
&&
9300 force_check_filter(dirent
.key
.name
);
9301 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
9302 !dirent
.pending_map
.empty() ||
9304 /* there are uncommitted ops. We need to check the current state,
9305 * and if the tags are old we need to do cleanup as well. */
9306 librados::IoCtx sub_ctx
;
9307 sub_ctx
.dup(index_ctx
);
9308 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[oid
]);
9309 if (r
< 0 && r
!= -ENOENT
) {
9314 // at this point either r >=0 or r == -ENOENT
9315 if (r
>= 0) { // i.e., if r != -ENOENT
9316 ldout(cct
, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
9317 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
9319 if (count
< num_entries
) {
9320 marker
= last_added_entry
= dirent
.key
; // double assign
9321 ent_list
.emplace_back(std::move(dirent
));
9324 *is_truncated
= true;
9327 } else { // r == -ENOENT
9328 // in the case of -ENOENT, make sure we're advancing marker
9329 // for possible next call to CLSRGWIssueBucketList
9330 marker
= dirent
.key
;
9334 if (!result
.is_truncated
) {
9335 // if we reached the end of the shard read next shard
9337 marker
= rgw_obj_index_key();
9343 // suggest updates if there is any
9344 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
9345 for (; miter
!= updates
.end(); ++miter
) {
9346 if (miter
->second
.length()) {
9347 ObjectWriteOperation o
;
9348 cls_rgw_suggest_changes(o
, miter
->second
);
9349 // we don't care if we lose suggested updates, send them off blindly
9350 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9351 index_ctx
.aio_operate(miter
->first
, c
, &o
);
9356 if (last_entry
&& !ent_list
.empty()) {
9357 *last_entry
= last_added_entry
;
9361 } // RGWRados::cls_bucket_list_unordered
9364 int RGWRados::cls_obj_usage_log_add(const string
& oid
,
9365 rgw_usage_log_info
& info
)
9367 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9370 int r
= get_raw_obj_ref(obj
, &ref
);
9375 ObjectWriteOperation op
;
9376 cls_rgw_usage_log_add(op
, info
);
9378 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
9382 int RGWRados::cls_obj_usage_log_read(const string
& oid
, const string
& user
, const string
& bucket
,
9383 uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
9384 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
,
9387 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9390 int r
= get_raw_obj_ref(obj
, &ref
);
9395 *is_truncated
= false;
9397 r
= cls_rgw_usage_log_read(ref
.ioctx
, ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
,
9398 max_entries
, read_iter
, usage
, is_truncated
);
9403 int RGWRados::cls_obj_usage_log_trim(const string
& oid
, const string
& user
, const string
& bucket
,
9404 uint64_t start_epoch
, uint64_t end_epoch
)
9406 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9409 int r
= get_raw_obj_ref(obj
, &ref
);
9414 r
= cls_rgw_usage_log_trim(ref
.ioctx
, ref
.obj
.oid
, user
, bucket
, start_epoch
, end_epoch
);
9418 int RGWRados::cls_obj_usage_log_clear(string
& oid
)
9420 rgw_raw_obj
obj(svc
.zone
->get_zone_params().usage_log_pool
, oid
);
9423 int r
= get_raw_obj_ref(obj
, &ref
);
9427 librados::ObjectWriteOperation op
;
9428 cls_rgw_usage_log_clear(op
);
9429 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
9434 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
9436 librados::IoCtx index_ctx
;
9439 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
9441 int r
= open_bucket_index(bucket_info
, index_ctx
, dir_oid
);
9447 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
9448 rgw_bucket_dir_entry entry
;
9450 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
9451 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9452 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
9453 encode(entry
, updates
);
9458 r
= index_ctx
.exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
9463 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
9464 const RGWBucketInfo
& bucket_info
,
9465 rgw_bucket_dir_entry
& list_state
,
9466 rgw_bucket_dir_entry
& object
,
9467 bufferlist
& suggested_updates
)
9469 const rgw_bucket
& bucket
= bucket_info
.bucket
;
9470 uint8_t suggest_flag
= (svc
.zone
->get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
9474 rgw_obj
obj(bucket
, list_state
.key
);
9477 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
9479 if (loc
!= list_state
.locator
) {
9480 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
9483 io_ctx
.locator_set_key(list_state
.locator
);
9485 RGWObjState
*astate
= NULL
;
9486 RGWObjectCtx
rctx(this);
9487 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
9491 list_state
.pending_map
.clear(); // we don't need this and it inflates size
9492 if (!astate
->exists
) {
9493 /* object doesn't exist right now -- hopefully because it's
9494 * marked as !exists and got deleted */
9495 if (list_state
.exists
) {
9496 /* FIXME: what should happen now? Work out if there are any
9497 * non-bad ways this could happen (there probably are, but annoying
9500 // encode a suggested removal of that key
9501 list_state
.ver
.epoch
= io_ctx
.get_last_version();
9502 list_state
.ver
.pool
= io_ctx
.get_id();
9503 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
9508 string content_type
;
9511 object
.meta
.size
= astate
->size
;
9512 object
.meta
.accounted_size
= astate
->accounted_size
;
9513 object
.meta
.mtime
= astate
->mtime
;
9515 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
9516 if (iter
!= astate
->attrset
.end()) {
9517 etag
= rgw_bl_str(iter
->second
);
9519 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
9520 if (iter
!= astate
->attrset
.end()) {
9521 content_type
= rgw_bl_str(iter
->second
);
9523 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
9524 if (iter
!= astate
->attrset
.end()) {
9525 r
= decode_policy(iter
->second
, &owner
);
9527 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
9531 if (astate
->has_manifest
) {
9532 RGWObjManifest::obj_iterator miter
;
9533 RGWObjManifest
& manifest
= astate
->manifest
;
9534 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
9535 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
9537 rgw_raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
9539 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
9540 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
9541 r
= delete_obj_index(loc
);
9543 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
9549 object
.meta
.etag
= etag
;
9550 object
.meta
.content_type
= content_type
;
9551 object
.meta
.owner
= owner
.get_id().to_str();
9552 object
.meta
.owner_display_name
= owner
.get_display_name();
9554 // encode suggested updates
9555 list_state
.ver
.pool
= io_ctx
.get_id();
9556 list_state
.ver
.epoch
= astate
->epoch
;
9557 list_state
.meta
.size
= object
.meta
.size
;
9558 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
9559 list_state
.meta
.mtime
= object
.meta
.mtime
;
9560 list_state
.meta
.category
= main_category
;
9561 list_state
.meta
.etag
= etag
;
9562 list_state
.meta
.content_type
= content_type
;
9563 if (astate
->obj_tag
.length() > 0)
9564 list_state
.tag
= astate
->obj_tag
.c_str();
9565 list_state
.meta
.owner
= owner
.get_id().to_str();
9566 list_state
.meta
.owner_display_name
= owner
.get_display_name();
9568 list_state
.exists
= true;
9569 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
9573 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, vector
<rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
9575 librados::IoCtx index_ctx
;
9576 map
<int, string
> oids
;
9577 map
<int, struct rgw_cls_list_ret
> list_results
;
9578 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, list_results
, shard_id
, bucket_instance_ids
);
9582 r
= CLSRGWIssueGetDirHeader(index_ctx
, oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
9586 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
9587 for(; iter
!= list_results
.end(); ++iter
) {
9588 headers
.push_back(std::move(iter
->second
.dir
.header
));
9593 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
9595 librados::IoCtx index_ctx
;
9596 map
<int, string
> bucket_objs
;
9597 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
9601 map
<int, string
>::iterator iter
= bucket_objs
.begin();
9602 for (; iter
!= bucket_objs
.end(); ++iter
) {
9603 r
= cls_rgw_get_dir_header_async(index_ctx
, iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
9614 int RGWRados::cls_user_get_header(const string
& user_id
, cls_user_header
*header
)
9616 string buckets_obj_id
;
9617 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9618 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9621 int r
= get_raw_obj_ref(obj
, &ref
);
9626 librados::ObjectReadOperation op
;
9628 ::cls_user_get_header(op
, header
, &rc
);
9630 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &ibl
);
9639 int RGWRados::cls_user_reset_stats(const string
& user_id
)
9641 string buckets_obj_id
;
9642 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9643 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9646 int r
= get_raw_obj_ref(obj
, &ref
);
9651 librados::ObjectWriteOperation op
;
9652 ::cls_user_reset_stats(op
);
9653 return ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
9656 int RGWRados::cls_user_get_header_async(const string
& user_id
, RGWGetUserHeader_CB
*ctx
)
9658 string buckets_obj_id
;
9659 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9660 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9663 int r
= get_raw_obj_ref(obj
, &ref
);
9668 r
= ::cls_user_get_header_async(ref
.ioctx
, ref
.obj
.oid
, ctx
);
9675 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj
& user_obj
,
9676 const RGWBucketInfo
& bucket_info
)
9678 vector
<rgw_bucket_dir_header
> headers
;
9679 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
9681 ldout(cct
, 20) << "cls_bucket_header() returned " << r
<< dendl
;
9685 cls_user_bucket_entry entry
;
9687 bucket_info
.bucket
.convert(&entry
.bucket
);
9689 for (const auto& hiter
: headers
) {
9690 for (const auto& iter
: hiter
.stats
) {
9691 if (RGWObjCategory::Main
== iter
.first
||
9692 RGWObjCategory::MultiMeta
== iter
.first
) {
9693 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
9694 entry
.size
+= header_stats
.total_size
;
9695 entry
.size_rounded
+= header_stats
.total_size_rounded
;
9696 entry
.count
+= header_stats
.num_entries
;
9701 list
<cls_user_bucket_entry
> entries
;
9702 entries
.push_back(entry
);
9704 r
= cls_user_update_buckets(user_obj
, entries
, false);
9706 ldout(cct
, 20) << "cls_user_update_buckets() returned " << r
<< dendl
;
9713 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket
& bucket
, cls_user_bucket_entry
& entry
)
9715 vector
<rgw_bucket_dir_header
> headers
;
9716 RGWBucketInfo bucket_info
;
9717 auto obj_ctx
= svc
.sysobj
->init_obj_ctx();
9718 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
9723 ret
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
9725 ldout(cct
, 20) << "cls_bucket_header() returned " << ret
<< dendl
;
9729 bucket
.convert(&entry
.bucket
);
9731 for (const auto& hiter
: headers
) {
9732 for (const auto& iter
: hiter
.stats
) {
9733 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
9734 entry
.size
+= header_stats
.total_size
;
9735 entry
.size_rounded
+= header_stats
.total_size_rounded
;
9736 entry
.count
+= header_stats
.num_entries
;
9743 int RGWRados::cls_user_list_buckets(rgw_raw_obj
& obj
,
9744 const string
& in_marker
,
9745 const string
& end_marker
,
9746 const int max_entries
,
9747 list
<cls_user_bucket_entry
>& entries
,
9748 string
* const out_marker
,
9749 bool * const truncated
)
9752 int r
= get_raw_obj_ref(obj
, &ref
);
9757 librados::ObjectReadOperation op
;
9760 cls_user_bucket_list(op
, in_marker
, end_marker
, max_entries
, entries
, out_marker
, truncated
, &rc
);
9762 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
, &ibl
);
9771 int RGWRados::cls_user_update_buckets(rgw_raw_obj
& obj
, list
<cls_user_bucket_entry
>& entries
, bool add
)
9774 int r
= get_raw_obj_ref(obj
, &ref
);
9779 librados::ObjectWriteOperation op
;
9780 cls_user_set_buckets(op
, entries
, add
);
9781 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
// Mark a user's stats sync as complete: resolve the user's buckets
// object in the uid pool and invoke the cls_user completion on it.
// NOTE(review): lossy extract — brace lines are elided from this view.
9788 int RGWRados::complete_sync_user_stats(const rgw_user
& user_id
)
// The per-user buckets object id is derived from the user id.
9790 string buckets_obj_id
;
9791 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
9792 rgw_raw_obj
obj(svc
.zone
->get_zone_params().user_uid_pool
, buckets_obj_id
);
9793 return cls_user_complete_stats_sync(obj
);
9796 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj
& obj
)
9799 int r
= get_raw_obj_ref(obj
, &ref
);
9804 librados::ObjectWriteOperation op
;
9805 ::cls_user_complete_stats_sync(op
);
9806 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
9813 int RGWRados::cls_user_add_bucket(rgw_raw_obj
& obj
, const cls_user_bucket_entry
& entry
)
9815 list
<cls_user_bucket_entry
> l
;
9818 return cls_user_update_buckets(obj
, l
, true);
9821 int RGWRados::cls_user_remove_bucket(rgw_raw_obj
& obj
, const cls_user_bucket
& bucket
)
9824 int r
= get_system_obj_ref(obj
, &ref
);
9829 librados::ObjectWriteOperation op
;
9830 ::cls_user_remove_bucket(op
, bucket
);
9831 r
= ref
.ioctx
.operate(ref
.obj
.oid
, &op
);
9838 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
, const rgw_bucket
& bucket
,
9839 RGWQuotaInfo
& bucket_quota
)
9841 if (! cct
->_conf
.get_val
<bool>("rgw_dynamic_resharding")) {
9845 bool need_resharding
= false;
9846 int num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
9847 uint32_t suggested_num_shards
;
9849 const uint64_t max_objs_per_shard
=
9850 cct
->_conf
.get_val
<uint64_t>("rgw_max_objs_per_shard");
9852 quota_handler
->check_bucket_shards(max_objs_per_shard
, num_source_shards
,
9853 bucket_info
.owner
, bucket
, bucket_quota
,
9854 1, need_resharding
, &suggested_num_shards
);
9859 if (need_resharding
) {
9860 ldout(cct
, 20) << __func__
<< " bucket " << bucket
.name
<< " need resharding " <<
9861 " old num shards " << bucket_info
.num_shards
<< " new num shards " << suggested_num_shards
<<
9863 return add_bucket_to_reshard(bucket_info
, suggested_num_shards
);
// Queue a bucket on the reshard log with a target shard count.  The target is
// clamped to get_max_bucket_shards(); if it does not exceed the current count
// the request is logged and not queued.
// NOTE(review): the braces and the early-exit `return` of the "not resharding"
// branch are missing from this view.
int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
  RGWReshard reshard(this);

  // an unsharded bucket counts as one source shard
  uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);

  new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
  if (new_num_shards <= num_source_shards) {
    ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;

  // describe the reshard job; picked up later by the reshard worker
  cls_rgw_reshard_entry entry;
  entry.time = real_clock::now();
  entry.tenant = bucket_info.owner.tenant;
  entry.bucket_name = bucket_info.bucket.name;
  entry.bucket_id = bucket_info.bucket.bucket_id;
  entry.old_num_shards = num_source_shards;
  entry.new_num_shards = new_num_shards;

  return reshard.add(entry);
// Check user and bucket quota before accepting `obj_size` more bytes.
// Delegates to the shared quota handler; the object-count delta is 0 when
// only size is being checked, otherwise 1.
// NOTE(review): the `if (check_size_only)` guard selecting between the two
// returns is missing from this view -- only the two return statements
// survived extraction.
int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
                          RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
  // if we only check size, then num_objs will be set to 0
    return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);

  return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
// Compute bucket-index object names: the bare oid when unsharded,
// "<oid>.<n>" per shard otherwise; a targeted shard id limits the output to
// that one shard.
// NOTE(review): the trailing shard-id parameter line, the
// unsharded/all-shards/single-shard branch structure and the early return
// for an out-of-range shard id are missing from this view.
void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
                                        uint32_t num_shards,
                                        map<int, string>& bucket_objects,
    // unsharded: the base oid itself, keyed by 0
    bucket_objects[0] = bucket_oid_base;

    char buf[bucket_oid_base.size() + 32];  // VLA: base oid + ".<shard>" suffix
    for (uint32_t i = 0; i < num_shards; ++i) {
      snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
      bucket_objects[i] = buf;

    // targeted shard beyond the shard count is rejected
    if ((uint32_t)shard_id > num_shards) {
    snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
    bucket_objects[shard_id] = buf;
// Build bucket-instance id strings ("<name>:<bucket_id>", plus a ":<shard>"
// suffix when sharded) for every shard, or for one shard when a specific
// shard_id is targeted.
// NOTE(review): the declaration of `buf`, the branch structure and the early
// returns are missing from this view.
void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
  const rgw_bucket& bucket = bucket_info.bucket;
  string plain_id = bucket.name + ":" + bucket.bucket_id;

  if (!bucket_info.num_shards) {
    // unsharded bucket: single entry keyed by 0, no shard suffix
    (*result)[0] = plain_id;

      for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
        snprintf(buf, sizeof(buf), ":%d", i);
        (*result)[i] = plain_id + buf;

      // targeted shard beyond the shard count is rejected
      if ((uint32_t)shard_id > bucket_info.num_shards) {
      snprintf(buf, sizeof(buf), ":%d", shard_id);
      (*result)[shard_id] = plain_id + buf;
// Map an object key to its bucket-index shard id using the bucket's
// configured shard hash type (only the MOD case is visible here).
// NOTE(review): the output parameter (presumably `int *shard_id`, written
// below), the unsharded early-out body, the switch's default case and the
// returns are missing from this view.
int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
  switch (bucket_info.bucket_index_shard_hash_type) {
    case RGWBucketInfo::MOD:
      // unsharded bucket: no shard to compute
      if (!bucket_info.num_shards) {
      uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
      *shard_id = (int)sid;
// Resolve the index object name for a known shard id: the bare oid when
// unsharded, otherwise "<oid>.<shard_id>".
// NOTE(review): the if/else structure separating the two assignments is
// missing from this view -- as written both would execute.
void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
                                       int shard_id, string *bucket_obj)
    // By default with no sharding, we use the bucket oid as itself
    (*bucket_obj) = bucket_oid_base;

    char buf[bucket_oid_base.size() + 32];  // VLA: base oid + ".<shard>" suffix
    snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
    (*bucket_obj) = buf;
// Resolve the index object name (and shard id) for an object key, hashing
// the key over num_shards when sharded (only the MOD hash type is visible).
// NOTE(review): the unsharded/sharded branch structure, the switch's default
// case and the return(s) are missing from this view.
int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
                                      uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
  switch (hash_type) {
    case RGWBucketInfo::MOD:
        // By default with no sharding, we use the bucket oid as itself
        (*bucket_obj) = bucket_oid_base;

        uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
        char buf[bucket_oid_base.size() + 32];  // VLA: base oid + ".<shard>"
        snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
        (*bucket_obj) = buf;
        *shard_id = (int)sid;
// Unique id of this process's rados client instance (delegates to librados).
uint64_t RGWRados::instance_id()
  return get_rados_handle()->get_instance_id();
// Hand out the next bucket id under bucket_id_lock; pre-increment means the
// first value returned is max_bucket_id + 1.
uint64_t RGWRados::next_bucket_id()
  Mutex::Locker l(bucket_id_lock);
  return ++max_bucket_id;
// Factory: build and initialize a full RGWRados store with the requested
// background threads.  The set_* calls are chained because each returns the
// store itself.
// NOTE(review): the failure-branch body (presumably deleting the store and
// returning null) and the final `return store;` are missing from this view.
RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
                                                 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
  RGWRados *store = new RGWRados;

  if ((*store).set_use_cache(use_cache)
              .set_run_gc_thread(use_gc_thread)
              .set_run_lc_thread(use_lc_thread)
              .set_run_quota_threads(quota_threads)
              .set_run_sync_thread(run_sync_thread)
              .set_run_reshard_thread(run_reshard_thread)
              .initialize(cct) < 0) {
// Factory: build a "raw" store -- services and rados connection only, no
// background threads -- for tooling that bypasses full initialization.
// NOTE(review): the error-branch structure around the log line, the cleanup
// on failure and the returns are missing from this view.
RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
  RGWRados *store = NULL;
  store = new RGWRados;

  store->set_context(cct);

  int ret = store->init_svc(true);  // raw=true: initialize core services only
    ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;

  if (store->init_rados() < 0) {
// Shut down and release a store previously returned by one of the
// init_*_storage_provider() factories.
// NOTE(review): the entire body is missing from this view -- only the
// signature survived extraction; consult upstream for the implementation.
void RGWStoreManager::close_storage(RGWRados *store)
// Pick the librados handle for the calling thread.  With a single handle it
// is shared by everyone; otherwise each thread is assigned a slot in `rados`
// round-robin, with the thread->slot mapping cached in rados_map under
// handle_lock (read lock for lookup, write lock for assignment).
// NOTE(review): the single-handle branch body, the surrounding else/braces
// are missing from this view.  Also note the visible unlock/relock gap
// between put_read() and get_write(); the map does not appear to be
// re-checked after reacquiring -- confirm against upstream.
librados::Rados* RGWRados::get_rados_handle()
  if (rados.size() == 1) {

  handle_lock.get_read();
  pthread_t id = pthread_self();
  std::map<pthread_t, int>::iterator it = rados_map.find(id);

  if (it != rados_map.end()) {
    // fast path: this thread already has a slot
    handle_lock.put_read();
    return &rados[it->second];

  handle_lock.put_read();
  handle_lock.get_write();
  const uint32_t handle = next_rados_handle;
  rados_map[id] = handle;
  if (++next_rados_handle == rados.size()) {
    // wrap the round-robin cursor
    next_rados_handle = 0;
  handle_lock.put_write();
  return &rados[handle];
// Asynchronously delete a raw rados object via a cls_rgw_remove_obj write op
// (issued with an empty prefix list).  The AioCompletion is appended to
// `handles` for the caller to wait on and release.
// NOTE(review): the `ref` declaration, the error-branch structure around
// each lderr() and the returns are missing from this view.
int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
  int ret = get_raw_obj_ref(obj, &ref);
    lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;

  ObjectWriteOperation op;
  list<string> prefixes;
  cls_rgw_remove_obj(op, prefixes);

  AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
  ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
    lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;

  // caller owns the completion from here on
  handles.push_back(c);
// Asynchronously delete an object's head object.  When
// keep_index_consistent is set, the delete is bracketed by a bucket-index
// prepare (CLS_RGW_OP_DEL, tagged with the object's write tag) before the
// aio op and by delete_obj_index() after it.
// NOTE(review): the `ref` declaration, the error-branch structure around
// each lderr() and the returns are missing from this view.  `astate` is
// dereferenced on the keep_index_consistent path, so it must be non-null
// there.
int RGWRados::delete_obj_aio(const rgw_obj& obj,
                             RGWBucketInfo& bucket_info, RGWObjState *astate,
                             list<librados::AioCompletion *>& handles, bool keep_index_consistent)
  int ret = get_obj_head_ref(bucket_info, obj, &ref);
    lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;

  if (keep_index_consistent) {
    RGWRados::Bucket bop(this, bucket_info);
    RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

    // mark the index entry pending-delete before touching the head object
    ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
      lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;

  ObjectWriteOperation op;
  list<string> prefixes;
  cls_rgw_remove_obj(op, prefixes);

  AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
  ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
    lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;

  // caller owns the completion from here on
  handles.push_back(c);

  if (keep_index_consistent) {
    // complete the bucket-index side of the delete
    ret = delete_obj_index(obj);
      lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
// Decode the RGW_ATTR_COMPRESSION xattr from an attribute map into cs_info.
// On success, need_decompress tells the caller whether the object data is
// actually compressed (a decoded compression_type other than "none").
// NOTE(review): the `try {` opening the decode, the else keywords between
// the need_decompress assignments, the error returns (decode failure /
// empty block list) and the final return are missing from this view.
int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
  map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
  if (value != attrs.end()) {
    auto bliter = value->second.cbegin();
      decode(cs_info, bliter);
    } catch (buffer::error& err) {

    // a compression attr that decodes to zero blocks is malformed
    if (cs_info.blocks.size() == 0) {

    if (cs_info.compression_type != "none")
      need_decompress = true;
      need_decompress = false;

    // no compression attribute at all: plain, uncompressed data
    need_decompress = false;
// Admin-socket command dispatcher for the RGW cache: "cache list"
// (optionally filtered), "cache inspect <target>", "cache erase <target>"
// and "cache zap".  Results and diagnostics are appended to `out`.
// NOTE(review): the success/failure branch structure (returns, Formatter
// null checks, output flushing, and the "cache zap" body) is missing from
// this view; the "Unable to create Formatter" appends belong to the
// null-Formatter branches.
bool RGWRados::call(std::string_view command, const cmdmap_t& cmdmap,
                    std::string_view format, bufferlist& out)
  if (command == "cache list"sv) {
    // optional "filter" argument restricts the listing
    std::optional<std::string> filter;
    if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) {
      filter = boost::get<std::string>(i->second);

    std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
      f->open_array_section("cache_entries");
      call_list(filter, f.get());
      f->close_section();
      out.append("Unable to create Formatter.\n");
  } else if (command == "cache inspect"sv) {
    std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
      const auto& target = boost::get<std::string>(cmdmap.at("target"));
      if (call_inspect(target, f.get())) {
        out.append("Unable to find entry "s + target + ".\n");
      out.append("Unable to create Formatter.\n");
  } else if (command == "cache erase"sv) {
    const auto& target = boost::get<std::string>(cmdmap.at("target"));
    if (call_erase(target)) {
      out.append("Unable to find entry "s + target + ".\n");
  } else if (command == "cache zap"sv) {
// Forward a cache-list admin request (optional name filter + output
// formatter) to the sysobj cache service.
void RGWRados::call_list(const std::optional<std::string>& s,
                         ceph::Formatter *f)
  svc.cache->call_list(s, f);
// Forward a cache-inspect admin request to the sysobj cache service;
// returns whatever the service reports (entry found / dumped).
bool RGWRados::call_inspect(const std::string& s, Formatter *f)
  return svc.cache->call_inspect(s, f);
// Forward a cache-erase admin request for entry `s` to the sysobj cache
// service.
bool RGWRados::call_erase(const std::string& s) {
  return svc.cache->call_erase(s);
// Forward a cache-zap (clear everything) admin request to the sysobj cache
// service.
void RGWRados::call_zap() {
  svc.cache->call_zap();
// Object name of the per-user OTP (MFA) rados object: "user:<user>".
string RGWRados::get_mfa_oid(const rgw_user& user)
  return string("user:") + user.to_str();
// Resolve the rados ref for a user's MFA object, located in the zone's OTP
// pool under the name produced by get_mfa_oid().
int RGWRados::get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref)
  string oid = get_mfa_oid(user);
  rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
  return get_system_obj_ref(obj, ref);
// Verify an MFA pin for one of the user's OTP devices via the cls_otp check
// operation; returns 0 on a successful check, -EACCES otherwise.
// NOTE(review): the `ref` declaration and the error-return checks after
// get_mfa_ref()/OTP::check() are missing from this view.
int RGWRados::check_mfa(const rgw_user& user, const string& otp_id, const string& pin)
  int r = get_mfa_ref(user, &ref);

  rados::cls::otp::otp_check_t result;

  r = rados::cls::otp::OTP::check(cct, ref.ioctx, ref.obj.oid, otp_id, pin, &result);

  ldout(cct, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl;

  return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES);
// Arm a write op on an MFA object with a version assertion and an mtime.
// Works on a local copy of the caller's version tracker: when no write
// version is set, either generate a fresh one (no read version either) or
// carry the read version forward and bump its number, then add the version
// check to `op` and stamp the given mtime.
// NOTE(review): the else keywords and closing braces of the nested version
// logic are missing from this view; the indentation below reflects the
// apparent intent -- confirm against upstream.
void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation *op,
                                 RGWObjVersionTracker *objv_tracker,
                                 const ceph::real_time& mtime)
  RGWObjVersionTracker ot;

  if (objv_tracker) {
    ot = *objv_tracker;

  if (ot.write_version.tag.empty()) {
    if (ot.read_version.tag.empty()) {
      ot.generate_new_write_ver(cct);
      // reuse the read tag, advance the version number
      ot.write_version = ot.read_version;
      ot.write_version.ver++;

  ot.prepare_op_for_write(op);
  struct timespec mtime_ts = real_clock::to_timespec(mtime);
  op->mtime2(&mtime_ts);
// Create a new MFA (OTP) device record on the user's OTP object, applying
// version tracking and mtime via prepare_mfa_write().
// NOTE(review): the `ref` declaration, the error checks and the final
// return are missing from this view.
int RGWRados::create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config,
                         RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime)
  int r = get_mfa_ref(user, &ref);

  librados::ObjectWriteOperation op;
  prepare_mfa_write(&op, objv_tracker, mtime);
  rados::cls::otp::OTP::create(&op, config);
  r = ref.ioctx.operate(ref.obj.oid, &op);
    ldout(cct, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl;
// Remove an MFA (OTP) device record by id from the user's OTP object,
// applying version tracking and mtime via prepare_mfa_write().
// NOTE(review): the `ref` declaration, the error checks and the final
// return are missing from this view.
int RGWRados::remove_mfa(const rgw_user& user, const string& id,
                         RGWObjVersionTracker *objv_tracker,
                         const ceph::real_time& mtime)
  int r = get_mfa_ref(user, &ref);

  librados::ObjectWriteOperation op;
  prepare_mfa_write(&op, objv_tracker, mtime);
  rados::cls::otp::OTP::remove(&op, id);
  r = ref.ioctx.operate(ref.obj.oid, &op);
    ldout(cct, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl;
// Fetch a single MFA (OTP) device record by id from the user's OTP object.
// NOTE(review): the `ref` declaration, error checks and final return are
// missing from this view.
int RGWRados::get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result)
  int r = get_mfa_ref(user, &ref);

  r = rados::cls::otp::OTP::get(nullptr, ref.ioctx, ref.obj.oid, id, result);
// List all MFA (OTP) device records stored on the user's OTP object.
// NOTE(review): the `ref` declaration, error checks and final return are
// missing from this view.
int RGWRados::list_mfa(const rgw_user& user, list<rados::cls::otp::otp_info_t> *result)
  int r = get_mfa_ref(user, &ref);

  r = rados::cls::otp::OTP::get_all(nullptr, ref.ioctx, ref.obj.oid, result);
// Query the OSD-side current time used by cls_otp for this user's OTP
// object (so pin checks and the server share a clock).
// NOTE(review): the `ref` declaration, error checks and final return are
// missing from this view.
int RGWRados::otp_get_current_time(const rgw_user& user, ceph::real_time *result)
  int r = get_mfa_ref(user, &ref);

  r = rados::cls::otp::OTP::get_current_time(ref.ioctx, ref.obj.oid, result);
// Bulk-replace the MFA (OTP) device list stored on OTP object `oid`, with
// version tracking and mtime applied via prepare_mfa_write().
// NOTE(review): the `ref` declaration, the error checks, the final return
// and the `reset_obj` branch to which the FAILOK flag belongs (presumably
// an object remove whose failure is tolerated) are missing from this view;
// confirm against upstream.
int RGWRados::set_mfa(const string& oid, const list<rados::cls::otp::otp_info_t>& entries,
                      bool reset_obj, RGWObjVersionTracker *objv_tracker,
                      const real_time& mtime)
  rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);

  int r = get_system_obj_ref(obj, &ref);

  librados::ObjectWriteOperation op;

    // tolerate failure of the preceding (not visible here) reset step
    op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);

  prepare_mfa_write(&op, objv_tracker, mtime);
  rados::cls::otp::OTP::set(&op, entries);
  r = ref.ioctx.operate(ref.obj.oid, &op);
    ldout(cct, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl;
10454 int RGWRados::list_mfa(const string
& oid
, list
<rados::cls::otp::otp_info_t
> *result
,
10455 RGWObjVersionTracker
*objv_tracker
, ceph::real_time
*pmtime
)
10457 rgw_raw_obj
obj(svc
.zone
->get_zone_params().otp_pool
, oid
);
10459 int r
= get_system_obj_ref(obj
, &ref
);
10463 librados::ObjectReadOperation op
;
10464 struct timespec mtime_ts
;
10466 op
.stat2(nullptr, &mtime_ts
, nullptr);
10468 objv_tracker
->prepare_op_for_read(&op
);
10469 r
= rados::cls::otp::OTP::get_all(&op
, ref
.ioctx
, ref
.obj
.oid
, result
);
10474 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);