1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "include/compat.h"
8 #include <boost/algorithm/string.hpp>
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
44 #include "rgw_tools.h"
45 #include "rgw_coroutine.h"
46 #include "rgw_compression.h"
48 #undef fork // fails to compile RGWPeriod::fork() below
50 #include "common/Clock.h"
52 #include "include/rados/librados.hpp"
53 using namespace librados
;
61 #include "auth/Crypto.h" // get_random_bytes()
68 #include "rgw_object_expirer_core.h"
70 #include "rgw_data_sync.h"
71 #include "rgw_realm_watcher.h"
72 #include "rgw_reshard.h"
74 #include "compressor/Compressor.h"
76 #define dout_context g_ceph_context
77 #define dout_subsys ceph_subsys_rgw
81 static string notify_oid_prefix
= "notify";
82 static string
*notify_oids
= NULL
;
83 static string shadow_ns
= "shadow";
84 static string dir_oid_prefix
= ".dir.";
85 static string default_storage_pool_suffix
= "rgw.buckets.data";
86 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
87 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
88 static string avail_pools
= ".pools.avail";
90 static string zone_info_oid_prefix
= "zone_info.";
91 static string zone_names_oid_prefix
= "zone_names.";
92 static string region_info_oid_prefix
= "region_info.";
93 static string zone_group_info_oid_prefix
= "zonegroup_info.";
94 static string realm_names_oid_prefix
= "realms_names.";
95 static string realm_info_oid_prefix
= "realms.";
96 static string default_region_info_oid
= "default.region";
97 static string default_zone_group_info_oid
= "default.zonegroup";
98 static string period_info_oid_prefix
= "periods.";
99 static string period_latest_epoch_info_oid
= ".latest_epoch";
100 static string region_map_oid
= "region_map";
101 static string zonegroup_map_oid
= "zonegroup_map";
102 static string log_lock_name
= "rgw_log_lock";
103 static string default_realm_info_oid
= "default.realm";
104 const string default_zonegroup_name
= "default";
105 const string default_zone_name
= "default";
106 static string zonegroup_names_oid_prefix
= "zonegroups_names.";
107 static RGWObjCategory main_category
= RGW_OBJ_CATEGORY_MAIN
;
108 #define RGW_USAGE_OBJ_PREFIX "usage."
109 #define FIRST_EPOCH 1
110 static string RGW_DEFAULT_ZONE_ROOT_POOL
= "rgw.root";
111 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL
= "rgw.root";
112 static string RGW_DEFAULT_REALM_ROOT_POOL
= "rgw.root";
113 static string RGW_DEFAULT_PERIOD_ROOT_POOL
= "rgw.root";
115 #define RGW_STATELOG_OBJ_PREFIX "statelog."
117 #define dout_subsys ceph_subsys_rgw
120 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
121 const string
& placement_id
, const rgw_obj
& obj
, rgw_pool
*pool
)
123 if (!zone_params
.get_head_data_pool(placement_id
, obj
, pool
)) {
124 RGWZonePlacementInfo placement
;
125 if (!zone_params
.get_placement(zonegroup
.default_placement
, &placement
)) {
129 if (!obj
.in_extra_data
) {
130 *pool
= placement
.data_pool
;
132 *pool
= placement
.get_data_extra_pool();
139 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
140 const string
& placement_id
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
142 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
144 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_id
, obj
, &raw_obj
->pool
);
147 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
151 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
157 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
161 store
->obj_to_raw(placement_rule
, obj
, &r
);
167 int rgw_init_ioctx(librados::Rados
*rados
, const rgw_pool
& pool
, IoCtx
& ioctx
, bool create
)
169 int r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
170 if (r
== -ENOENT
&& create
) {
171 r
= rados
->pool_create(pool
.name
.c_str());
172 if (r
< 0 && r
!= -EEXIST
) {
176 r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
181 if (!pool
.ns
.empty()) {
182 ioctx
.set_namespace(pool
.ns
);
188 void RGWObjectCtxImpl
<rgw_obj
, RGWObjState
>::invalidate(rgw_obj
& obj
) {
189 RWLock::WLocker
wl(lock
);
190 auto iter
= objs_state
.find(obj
);
191 if (iter
== objs_state
.end()) {
194 bool is_atomic
= iter
->second
.is_atomic
;
195 bool prefetch_data
= iter
->second
.prefetch_data
;
197 objs_state
.erase(iter
);
199 if (is_atomic
|| prefetch_data
) {
200 auto& s
= objs_state
[obj
];
201 s
.is_atomic
= is_atomic
;
202 s
.prefetch_data
= prefetch_data
;
207 void RGWObjectCtxImpl
<rgw_raw_obj
, RGWRawObjState
>::invalidate(rgw_raw_obj
& obj
) {
208 RWLock::WLocker
wl(lock
);
209 auto iter
= objs_state
.find(obj
);
210 if (iter
== objs_state
.end()) {
214 objs_state
.erase(iter
);
217 void RGWDefaultZoneGroupInfo::dump(Formatter
*f
) const {
218 encode_json("default_zonegroup", default_zonegroup
, f
);
221 void RGWDefaultZoneGroupInfo::decode_json(JSONObj
*obj
) {
223 JSONDecoder::decode_json("default_zonegroup", default_zonegroup
, obj
);
224 /* backward compatability with region */
225 if (default_zonegroup
.empty()) {
226 JSONDecoder::decode_json("default_region", default_zonegroup
, obj
);
230 rgw_pool
RGWZoneGroup::get_pool(CephContext
*cct_
)
232 if (cct_
->_conf
->rgw_zonegroup_root_pool
.empty()) {
233 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL
);
236 return rgw_pool(cct_
->_conf
->rgw_zonegroup_root_pool
);
239 int RGWZoneGroup::create_default(bool old_format
)
241 name
= default_zonegroup_name
;
244 RGWZoneGroupPlacementTarget placement_target
;
245 placement_target
.name
= "default-placement";
246 placement_targets
[placement_target
.name
] = placement_target
;
247 default_placement
= "default-placement";
249 RGWZoneParams
zone_params(default_zone_name
);
251 int r
= zone_params
.init(cct
, store
, false);
253 ldout(cct
, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r
) << dendl
;
257 r
= zone_params
.create_default();
258 if (r
< 0 && r
!= -EEXIST
) {
259 ldout(cct
, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r
) << dendl
;
261 } else if (r
== -EEXIST
) {
262 ldout(cct
, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl
;
263 zone_params
.clear_id();
264 r
= zone_params
.init(cct
, store
);
266 ldout(cct
, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r
) << dendl
;
269 ldout(cct
, 20) << "zone_params::create_default() " << zone_params
.get_name() << " id " << zone_params
.get_id()
273 RGWZone
& default_zone
= zones
[zone_params
.get_id()];
274 default_zone
.name
= zone_params
.get_name();
275 default_zone
.id
= zone_params
.get_id();
276 master_zone
= default_zone
.id
;
279 if (r
< 0 && r
!= -EEXIST
) {
280 ldout(cct
, 0) << "error storing zone group info: " << cpp_strerror(-r
) << dendl
;
285 ldout(cct
, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl
;
287 r
= init(cct
, store
);
297 post_process_params();
302 const string
RGWZoneGroup::get_default_oid(bool old_region_format
)
304 if (old_region_format
) {
305 if (cct
->_conf
->rgw_default_region_info_oid
.empty()) {
306 return default_region_info_oid
;
308 return cct
->_conf
->rgw_default_region_info_oid
;
311 string default_oid
= cct
->_conf
->rgw_default_zonegroup_info_oid
;
313 if (cct
->_conf
->rgw_default_zonegroup_info_oid
.empty()) {
314 default_oid
= default_zone_group_info_oid
;
317 default_oid
+= "." + realm_id
;
322 const string
& RGWZoneGroup::get_info_oid_prefix(bool old_region_format
)
324 if (old_region_format
) {
325 return region_info_oid_prefix
;
327 return zone_group_info_oid_prefix
;
330 const string
& RGWZoneGroup::get_names_oid_prefix()
332 return zonegroup_names_oid_prefix
;
335 const string
& RGWZoneGroup::get_predefined_name(CephContext
*cct
) {
336 return cct
->_conf
->rgw_zonegroup
;
339 int RGWZoneGroup::equals(const string
& other_zonegroup
) const
341 if (is_master
&& other_zonegroup
.empty())
344 return (id
== other_zonegroup
);
347 int RGWZoneGroup::add_zone(const RGWZoneParams
& zone_params
, bool *is_master
, bool *read_only
,
348 const list
<string
>& endpoints
, const string
*ptier_type
,
349 bool *psync_from_all
, list
<string
>& sync_from
, list
<string
>& sync_from_rm
)
351 auto& zone_id
= zone_params
.get_id();
352 auto& zone_name
= zone_params
.get_name();
354 // check for duplicate zone name on insert
355 if (!zones
.count(zone_id
)) {
356 for (const auto& zone
: zones
) {
357 if (zone
.second
.name
== zone_name
) {
358 ldout(cct
, 0) << "ERROR: found existing zone name " << zone_name
359 << " (" << zone
.first
<< ") in zonegroup " << get_name() << dendl
;
367 if (!master_zone
.empty() && master_zone
!= zone_params
.get_id()) {
368 ldout(cct
, 0) << "NOTICE: overriding master zone: " << master_zone
<< dendl
;
370 master_zone
= zone_params
.get_id();
371 } else if (master_zone
== zone_params
.get_id()) {
376 RGWZone
& zone
= zones
[zone_params
.get_id()];
377 zone
.name
= zone_params
.get_name();
378 zone
.id
= zone_params
.get_id();
379 if (!endpoints
.empty()) {
380 zone
.endpoints
= endpoints
;
383 zone
.read_only
= *read_only
;
386 zone
.tier_type
= *ptier_type
;
389 if (psync_from_all
) {
390 zone
.sync_from_all
= *psync_from_all
;
393 for (auto add
: sync_from
) {
394 zone
.sync_from
.insert(add
);
397 for (auto rm
: sync_from_rm
) {
398 zone
.sync_from
.erase(rm
);
401 post_process_params();
407 int RGWZoneGroup::rename_zone(const RGWZoneParams
& zone_params
)
409 RGWZone
& zone
= zones
[zone_params
.get_id()];
410 zone
.name
= zone_params
.get_name();
415 void RGWZoneGroup::post_process_params()
417 bool log_data
= zones
.size() > 1;
419 if (master_zone
.empty()) {
420 map
<string
, RGWZone
>::iterator iter
= zones
.begin();
421 if (iter
!= zones
.end()) {
422 master_zone
= iter
->first
;
426 for (map
<string
, RGWZone
>::iterator iter
= zones
.begin(); iter
!= zones
.end(); ++iter
) {
427 RGWZone
& zone
= iter
->second
;
428 zone
.log_data
= log_data
;
429 zone
.log_meta
= (is_master
&& zone
.id
== master_zone
);
431 RGWZoneParams
zone_params(zone
.id
, zone
.name
);
432 int ret
= zone_params
.init(cct
, store
);
434 ldout(cct
, 0) << "WARNING: could not read zone params for zone id=" << zone
.id
<< " name=" << zone
.name
<< dendl
;
438 for (map
<string
, RGWZonePlacementInfo
>::iterator iter
= zone_params
.placement_pools
.begin();
439 iter
!= zone_params
.placement_pools
.end(); ++iter
) {
440 const string
& placement_name
= iter
->first
;
441 if (placement_targets
.find(placement_name
) == placement_targets
.end()) {
442 RGWZoneGroupPlacementTarget placement_target
;
443 placement_target
.name
= placement_name
;
444 placement_targets
[placement_name
] = placement_target
;
449 if (default_placement
.empty() && !placement_targets
.empty()) {
450 default_placement
= placement_targets
.begin()->first
;
454 int RGWZoneGroup::remove_zone(const std::string
& zone_id
)
456 map
<string
, RGWZone
>::iterator iter
= zones
.find(zone_id
);
457 if (iter
== zones
.end()) {
458 ldout(cct
, 0) << "zone id " << zone_id
<< " is not a part of zonegroup "
465 post_process_params();
470 int RGWZoneGroup::read_default_id(string
& default_id
, bool old_format
)
472 if (realm_id
.empty()) {
473 /* try using default realm */
475 int ret
= realm
.init(cct
, store
);
477 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
480 realm_id
= realm
.get_id();
483 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
486 int RGWZoneGroup::set_as_default(bool exclusive
)
488 if (realm_id
.empty()) {
489 /* try using default realm */
491 int ret
= realm
.init(cct
, store
);
493 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
496 realm_id
= realm
.get_id();
499 return RGWSystemMetaObj::set_as_default(exclusive
);
502 int RGWSystemMetaObj::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
, bool old_format
)
510 if (old_format
&& id
.empty()) {
517 name
= get_predefined_name(cct
);
520 r
= use_default(old_format
);
524 } else if (!old_format
) {
525 r
= read_id(name
, id
);
528 ldout(cct
, 0) << "error in read_id for object name: " << name
<< " : " << cpp_strerror(-r
) << dendl
;
535 return read_info(id
, old_format
);
538 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo
& default_info
, const string
& oid
)
540 auto pool
= get_pool(cct
);
542 RGWObjectCtx
obj_ctx(store
);
543 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
548 bufferlist::iterator iter
= bl
.begin();
549 ::decode(default_info
, iter
);
550 } catch (buffer::error
& err
) {
551 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
558 int RGWSystemMetaObj::read_default_id(string
& default_id
, bool old_format
)
560 RGWDefaultSystemMetaObjInfo default_info
;
562 int ret
= read_default(default_info
, get_default_oid(old_format
));
567 default_id
= default_info
.default_id
;
572 int RGWSystemMetaObj::use_default(bool old_format
)
574 return read_default_id(id
, old_format
);
577 int RGWSystemMetaObj::set_as_default(bool exclusive
)
579 string oid
= get_default_oid();
581 rgw_pool
pool(get_pool(cct
));
584 RGWDefaultSystemMetaObjInfo default_info
;
585 default_info
.default_id
= id
;
587 ::encode(default_info
, bl
);
589 int ret
= rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
590 exclusive
, NULL
, real_time(), NULL
);
597 int RGWSystemMetaObj::read_id(const string
& obj_name
, string
& object_id
)
599 rgw_pool
pool(get_pool(cct
));
602 string oid
= get_names_oid_prefix() + obj_name
;
604 RGWObjectCtx
obj_ctx(store
);
605 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
610 RGWNameToId nameToId
;
612 bufferlist::iterator iter
= bl
.begin();
613 ::decode(nameToId
, iter
);
614 } catch (buffer::error
& err
) {
615 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
618 object_id
= nameToId
.obj_id
;
622 int RGWSystemMetaObj::delete_obj(bool old_format
)
624 rgw_pool
pool(get_pool(cct
));
626 /* check to see if obj is the default */
627 RGWDefaultSystemMetaObjInfo default_info
;
628 int ret
= read_default(default_info
, get_default_oid(old_format
));
629 if (ret
< 0 && ret
!= -ENOENT
)
631 if (default_info
.default_id
== id
|| (old_format
&& default_info
.default_id
== name
)) {
632 string oid
= get_default_oid(old_format
);
633 rgw_raw_obj
default_named_obj(pool
, oid
);
634 ret
= store
->delete_system_obj(default_named_obj
);
636 ldout(cct
, 0) << "Error delete default obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
641 string oid
= get_names_oid_prefix() + name
;
642 rgw_raw_obj
object_name(pool
, oid
);
643 ret
= store
->delete_system_obj(object_name
);
645 ldout(cct
, 0) << "Error delete obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
650 string oid
= get_info_oid_prefix(old_format
);
657 rgw_raw_obj
object_id(pool
, oid
);
658 ret
= store
->delete_system_obj(object_id
);
660 ldout(cct
, 0) << "Error delete object id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
666 int RGWSystemMetaObj::store_name(bool exclusive
)
668 rgw_pool
pool(get_pool(cct
));
669 string oid
= get_names_oid_prefix() + name
;
671 RGWNameToId nameToId
;
672 nameToId
.obj_id
= id
;
675 ::encode(nameToId
, bl
);
676 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
679 int RGWSystemMetaObj::rename(const string
& new_name
)
682 int ret
= read_id(new_name
, new_id
);
686 if (ret
< 0 && ret
!= -ENOENT
) {
687 ldout(cct
, 0) << "Error read_id " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
690 string old_name
= name
;
694 ldout(cct
, 0) << "Error storing new obj info " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
697 ret
= store_name(true);
699 ldout(cct
, 0) << "Error storing new name " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
702 /* delete old name */
703 rgw_pool
pool(get_pool(cct
));
704 string oid
= get_names_oid_prefix() + old_name
;
705 rgw_raw_obj
old_name_obj(pool
, oid
);
706 ret
= store
->delete_system_obj(old_name_obj
);
708 ldout(cct
, 0) << "Error delete old obj name " << old_name
<< ": " << cpp_strerror(-ret
) << dendl
;
715 int RGWSystemMetaObj::read_info(const string
& obj_id
, bool old_format
)
717 rgw_pool
pool(get_pool(cct
));
721 string oid
= get_info_oid_prefix(old_format
) + obj_id
;
723 RGWObjectCtx
obj_ctx(store
);
724 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
726 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << oid
<< ": " << cpp_strerror(-ret
) << dendl
;
731 bufferlist::iterator iter
= bl
.begin();
732 ::decode(*this, iter
);
733 } catch (buffer::error
& err
) {
734 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
741 int RGWSystemMetaObj::read()
743 int ret
= read_id(name
, id
);
748 return read_info(id
);
751 int RGWSystemMetaObj::create(bool exclusive
)
755 /* check to see the name is not used */
756 ret
= read_id(name
, id
);
757 if (exclusive
&& ret
== 0) {
758 ldout(cct
, 10) << "ERROR: name " << name
<< " already in use for obj id " << id
<< dendl
;
760 } else if ( ret
< 0 && ret
!= -ENOENT
) {
761 ldout(cct
, 0) << "failed reading obj id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
766 /* create unique id */
769 new_uuid
.generate_random();
770 new_uuid
.print(uuid_str
);
774 ret
= store_info(exclusive
);
776 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
780 return store_name(exclusive
);
783 int RGWSystemMetaObj::store_info(bool exclusive
)
785 rgw_pool
pool(get_pool(cct
));
787 string oid
= get_info_oid_prefix() + id
;
791 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
794 int RGWSystemMetaObj::write(bool exclusive
)
796 int ret
= store_info(exclusive
);
798 ldout(cct
, 20) << __func__
<< "(): store_info() returned ret=" << ret
<< dendl
;
801 ret
= store_name(exclusive
);
803 ldout(cct
, 20) << __func__
<< "(): store_name() returned ret=" << ret
<< dendl
;
810 const string
& RGWRealm::get_predefined_name(CephContext
*cct
) {
811 return cct
->_conf
->rgw_realm
;
814 int RGWRealm::create(bool exclusive
)
816 int ret
= RGWSystemMetaObj::create(exclusive
);
818 ldout(cct
, 0) << "ERROR creating new realm object " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
821 // create the control object for watch/notify
822 ret
= create_control(exclusive
);
824 ldout(cct
, 0) << "ERROR creating control for new realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
828 if (current_period
.empty()) {
829 /* create new period for the realm */
830 ret
= period
.init(cct
, store
, id
, name
, false);
834 ret
= period
.create(true);
836 ldout(cct
, 0) << "ERROR: creating new period for realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
840 period
= RGWPeriod(current_period
, 0);
841 int ret
= period
.init(cct
, store
, id
, name
);
843 ldout(cct
, 0) << "ERROR: failed to init period " << current_period
<< dendl
;
847 ret
= set_current_period(period
);
849 ldout(cct
, 0) << "ERROR: failed set current period " << current_period
<< dendl
;
852 // try to set as default. may race with another create, so pass exclusive=true
853 // so we don't override an existing default
854 ret
= set_as_default(true);
855 if (ret
< 0 && ret
!= -EEXIST
) {
856 ldout(cct
, 0) << "WARNING: failed to set realm as default realm, ret=" << ret
<< dendl
;
862 int RGWRealm::delete_obj()
864 int ret
= RGWSystemMetaObj::delete_obj();
868 return delete_control();
871 int RGWRealm::create_control(bool exclusive
)
873 auto pool
= rgw_pool
{get_pool(cct
)};
874 auto oid
= get_control_oid();
875 return rgw_put_system_obj(store
, pool
, oid
, nullptr, 0, exclusive
,
876 nullptr, real_time(), nullptr);
879 int RGWRealm::delete_control()
881 auto pool
= rgw_pool
{get_pool(cct
)};
882 auto obj
= rgw_raw_obj
{pool
, get_control_oid()};
883 return store
->delete_system_obj(obj
);
886 rgw_pool
RGWRealm::get_pool(CephContext
*cct
)
888 if (cct
->_conf
->rgw_realm_root_pool
.empty()) {
889 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL
);
891 return rgw_pool(cct
->_conf
->rgw_realm_root_pool
);
894 const string
RGWRealm::get_default_oid(bool old_format
)
896 if (cct
->_conf
->rgw_default_realm_info_oid
.empty()) {
897 return default_realm_info_oid
;
899 return cct
->_conf
->rgw_default_realm_info_oid
;
902 const string
& RGWRealm::get_names_oid_prefix()
904 return realm_names_oid_prefix
;
907 const string
& RGWRealm::get_info_oid_prefix(bool old_format
)
909 return realm_info_oid_prefix
;
912 int RGWRealm::set_current_period(RGWPeriod
& period
)
914 // update realm epoch to match the period's
915 if (epoch
> period
.get_realm_epoch()) {
916 ldout(cct
, 0) << "ERROR: set_current_period with old realm epoch "
917 << period
.get_realm_epoch() << ", current epoch=" << epoch
<< dendl
;
920 if (epoch
== period
.get_realm_epoch() && current_period
!= period
.get_id()) {
921 ldout(cct
, 0) << "ERROR: set_current_period with same realm epoch "
922 << period
.get_realm_epoch() << ", but different period id "
923 << period
.get_id() << " != " << current_period
<< dendl
;
927 epoch
= period
.get_realm_epoch();
928 current_period
= period
.get_id();
932 ldout(cct
, 0) << "ERROR: period update: " << cpp_strerror(-ret
) << dendl
;
936 ret
= period
.reflect();
938 ldout(cct
, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret
) << dendl
;
945 string
RGWRealm::get_control_oid()
947 return get_info_oid_prefix() + id
+ ".control";
950 int RGWRealm::notify_zone(bufferlist
& bl
)
952 // open a context on the realm's pool
953 rgw_pool pool
{get_pool(cct
)};
955 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ctx
);
957 ldout(cct
, 0) << "Failed to open pool " << pool
<< dendl
;
960 // send a notify on the realm object
961 r
= ctx
.notify2(get_control_oid(), bl
, 0, nullptr);
963 ldout(cct
, 0) << "Realm notify failed with " << r
<< dendl
;
969 int RGWRealm::notify_new_period(const RGWPeriod
& period
)
972 // push the period to dependent zonegroups/zones
973 ::encode(RGWRealmNotify::ZonesNeedPeriod
, bl
);
974 ::encode(period
, bl
);
975 // reload the gateway with the new period
976 ::encode(RGWRealmNotify::Reload
, bl
);
978 return notify_zone(bl
);
981 std::string
RGWPeriodConfig::get_oid(const std::string
& realm_id
)
983 if (realm_id
.empty()) {
984 return "period_config.default";
986 return "period_config." + realm_id
;
989 rgw_pool
RGWPeriodConfig::get_pool(CephContext
*cct
)
991 const auto& pool_name
= cct
->_conf
->rgw_period_root_pool
;
992 if (pool_name
.empty()) {
993 return {RGW_DEFAULT_PERIOD_ROOT_POOL
};
998 int RGWPeriodConfig::read(RGWRados
*store
, const std::string
& realm_id
)
1000 RGWObjectCtx
obj_ctx(store
);
1001 const auto& pool
= get_pool(store
->ctx());
1002 const auto& oid
= get_oid(realm_id
);
1005 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, nullptr, nullptr);
1010 bufferlist::iterator iter
= bl
.begin();
1011 ::decode(*this, iter
);
1012 } catch (buffer::error
& err
) {
1018 int RGWPeriodConfig::write(RGWRados
*store
, const std::string
& realm_id
)
1020 const auto& pool
= get_pool(store
->ctx());
1021 const auto& oid
= get_oid(realm_id
);
1023 ::encode(*this, bl
);
1024 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1025 false, nullptr, real_time(), nullptr);
1028 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, const string
& period_realm_id
,
1029 const string
& period_realm_name
, bool setup_obj
)
1033 realm_id
= period_realm_id
;
1034 realm_name
= period_realm_name
;
1039 return init(_cct
, _store
, setup_obj
);
1043 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
)
1052 RGWRealm
realm(realm_id
, realm_name
);
1053 int ret
= realm
.init(cct
, store
);
1055 ldout(cct
, 0) << "RGWPeriod::init failed to init realm " << realm_name
<< " id " << realm_id
<< " : " <<
1056 cpp_strerror(-ret
) << dendl
;
1059 id
= realm
.get_current_period();
1060 realm_id
= realm
.get_id();
1064 int ret
= use_latest_epoch();
1066 ldout(cct
, 0) << "failed to use_latest_epoch period id " << id
<< " realm " << realm_name
<< " id " << realm_id
1067 << " : " << cpp_strerror(-ret
) << dendl
;
1076 int RGWPeriod::get_zonegroup(RGWZoneGroup
& zonegroup
, const string
& zonegroup_id
) {
1077 map
<string
, RGWZoneGroup
>::const_iterator iter
;
1078 if (!zonegroup_id
.empty()) {
1079 iter
= period_map
.zonegroups
.find(zonegroup_id
);
1081 iter
= period_map
.zonegroups
.find("default");
1083 if (iter
!= period_map
.zonegroups
.end()) {
1084 zonegroup
= iter
->second
;
1091 bool RGWPeriod::is_single_zonegroup(CephContext
*cct
, RGWRados
*store
)
1093 return (period_map
.zonegroups
.size() == 1);
1096 const string
& RGWPeriod::get_latest_epoch_oid()
1098 if (cct
->_conf
->rgw_period_latest_epoch_info_oid
.empty()) {
1099 return period_latest_epoch_info_oid
;
1101 return cct
->_conf
->rgw_period_latest_epoch_info_oid
;
1104 const string
& RGWPeriod::get_info_oid_prefix()
1106 return period_info_oid_prefix
;
1109 const string
RGWPeriod::get_period_oid_prefix()
1111 return get_info_oid_prefix() + id
;
1114 const string
RGWPeriod::get_period_oid()
1116 std::ostringstream oss
;
1117 oss
<< get_period_oid_prefix();
1118 // skip the epoch for the staging period
1119 if (id
!= get_staging_id(realm_id
))
1120 oss
<< "." << epoch
;
1124 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo
& info
)
1126 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1128 rgw_pool
pool(get_pool(cct
));
1130 RGWObjectCtx
obj_ctx(store
);
1131 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
1133 ldout(cct
, 1) << "error read_lastest_epoch " << pool
<< ":" << oid
<< dendl
;
1137 bufferlist::iterator iter
= bl
.begin();
1138 ::decode(info
, iter
);
1139 } catch (buffer::error
& err
) {
1140 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
1147 int RGWPeriod::get_latest_epoch(epoch_t
& latest_epoch
)
1149 RGWPeriodLatestEpochInfo info
;
1151 int ret
= read_latest_epoch(info
);
1156 latest_epoch
= info
.epoch
;
1161 int RGWPeriod::use_latest_epoch()
1163 RGWPeriodLatestEpochInfo info
;
1164 int ret
= read_latest_epoch(info
);
1174 int RGWPeriod::set_latest_epoch(epoch_t epoch
, bool exclusive
)
1176 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1178 rgw_pool
pool(get_pool(cct
));
1181 RGWPeriodLatestEpochInfo info
;
1186 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1187 exclusive
, NULL
, real_time(), NULL
);
1190 int RGWPeriod::delete_obj()
1192 rgw_pool
pool(get_pool(cct
));
1194 // delete the object for each period epoch
1195 for (epoch_t e
= 1; e
<= epoch
; e
++) {
1196 RGWPeriod p
{get_id(), e
};
1197 rgw_raw_obj oid
{pool
, p
.get_period_oid()};
1198 int ret
= store
->delete_system_obj(oid
);
1200 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1201 << ": " << cpp_strerror(-ret
) << dendl
;
1205 // delete the .latest_epoch object
1206 rgw_raw_obj oid
{pool
, get_period_oid_prefix() + get_latest_epoch_oid()};
1207 int ret
= store
->delete_system_obj(oid
);
1209 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1210 << ": " << cpp_strerror(-ret
) << dendl
;
1215 int RGWPeriod::read_info()
1217 rgw_pool
pool(get_pool(cct
));
1221 RGWObjectCtx
obj_ctx(store
);
1222 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, get_period_oid(), bl
, NULL
, NULL
);
1224 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << get_period_oid() << ": " << cpp_strerror(-ret
) << dendl
;
1229 bufferlist::iterator iter
= bl
.begin();
1230 ::decode(*this, iter
);
1231 } catch (buffer::error
& err
) {
1232 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << get_period_oid() << dendl
;
1239 int RGWPeriod::create(bool exclusive
)
1243 /* create unique id */
1246 new_uuid
.generate_random();
1247 new_uuid
.print(uuid_str
);
1250 epoch
= FIRST_EPOCH
;
1254 ret
= store_info(exclusive
);
1256 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1259 ret
= set_latest_epoch(epoch
);
1261 ldout(cct
, 0) << "ERROR: setting latest epoch " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1267 int RGWPeriod::store_info(bool exclusive
)
1269 epoch_t latest_epoch
= FIRST_EPOCH
- 1;
1270 int ret
= get_latest_epoch(latest_epoch
);
1271 if (ret
< 0 && ret
!= -ENOENT
) {
1272 ldout(cct
, 0) << "ERROR: RGWPeriod::get_latest_epoch() returned " << cpp_strerror(-ret
) << dendl
;
1276 rgw_pool
pool(get_pool(cct
));
1278 string oid
= get_period_oid();
1280 ::encode(*this, bl
);
1281 ret
= rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
1283 ldout(cct
, 0) << "ERROR: rgw_put_system_obj(" << pool
<< ":" << oid
<< "): " << cpp_strerror(-ret
) << dendl
;
1286 if (latest_epoch
< epoch
) {
1287 ret
= set_latest_epoch(epoch
);
1289 ldout(cct
, 0) << "ERROR: RGWPeriod::set_latest_epoch() returned " << cpp_strerror(-ret
) << dendl
;
1296 rgw_pool
RGWPeriod::get_pool(CephContext
*cct
)
1298 if (cct
->_conf
->rgw_period_root_pool
.empty()) {
1299 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL
);
1301 return rgw_pool(cct
->_conf
->rgw_period_root_pool
);
1304 int RGWPeriod::use_next_epoch()
1306 epoch_t latest_epoch
;
1307 int ret
= get_latest_epoch(latest_epoch
);
1311 epoch
= latest_epoch
+ 1;
1313 if (ret
< 0 && ret
!= -ENOENT
) {
1316 if (ret
== -ENOENT
) {
1319 ldout(cct
, 0) << "Error creating new epoch " << epoch
<< dendl
;
1326 int RGWPeriod::add_zonegroup(const RGWZoneGroup
& zonegroup
)
1328 if (zonegroup
.realm_id
!= realm_id
) {
1331 int ret
= period_map
.update(zonegroup
, cct
);
1333 ldout(cct
, 0) << "ERROR: updating period map: " << cpp_strerror(-ret
) << dendl
;
1337 return store_info(false);
1340 int RGWPeriod::update()
1342 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << get_id() << dendl
;
1343 list
<string
> zonegroups
;
1344 int ret
= store
->list_zonegroups(zonegroups
);
1346 ldout(cct
, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret
) << dendl
;
1350 // clear zone short ids of removed zones. period_map.update() will add the
1351 // remaining zones back
1352 period_map
.short_zone_ids
.clear();
1354 for (auto& iter
: zonegroups
) {
1355 RGWZoneGroup
zg(string(), iter
);
1356 ret
= zg
.init(cct
, store
);
1358 ldout(cct
, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret
) << dendl
;
1362 if (zg
.realm_id
!= realm_id
) {
1363 ldout(cct
, 20) << "skipping zonegroup " << zg
.get_name() << " zone realm id " << zg
.realm_id
<< ", not on our realm " << realm_id
<< dendl
;
1367 if (zg
.master_zone
.empty()) {
1368 ldout(cct
, 0) << "ERROR: zonegroup " << zg
.get_name() << " should have a master zone " << dendl
;
1372 if (zg
.is_master_zonegroup()) {
1373 master_zonegroup
= zg
.get_id();
1374 master_zone
= zg
.master_zone
;
1377 int ret
= period_map
.update(zg
, cct
);
1383 ret
= period_config
.read(store
, realm_id
);
1384 if (ret
< 0 && ret
!= -ENOENT
) {
1385 ldout(cct
, 0) << "ERROR: failed to read period config: "
1386 << cpp_strerror(ret
) << dendl
;
1392 int RGWPeriod::reflect()
1394 for (auto& iter
: period_map
.zonegroups
) {
1395 RGWZoneGroup
& zg
= iter
.second
;
1396 zg
.reinit_instance(cct
, store
);
1397 int r
= zg
.write(false);
1399 ldout(cct
, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter
.first
<< ": " << cpp_strerror(-r
) << dendl
;
1402 if (zg
.is_master_zonegroup()) {
1403 // set master as default if no default exists
1404 r
= zg
.set_as_default(true);
1406 ldout(cct
, 1) << "Set the period's master zonegroup " << zg
.get_id()
1407 << " as the default" << dendl
;
1412 int r
= period_config
.write(store
, realm_id
);
1414 ldout(cct
, 0) << "ERROR: failed to store period config: "
1415 << cpp_strerror(-r
) << dendl
;
1421 void RGWPeriod::fork()
1423 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << id
<< dendl
;
1424 predecessor_uuid
= id
;
1425 id
= get_staging_id(realm_id
);
1430 static int read_sync_status(RGWRados
*store
, rgw_meta_sync_status
*sync_status
)
1432 // initialize a sync status manager to read the status
1433 RGWMetaSyncStatusManager
mgr(store
, store
->get_async_rados());
1438 r
= mgr
.read_sync_status(sync_status
);
1443 int RGWPeriod::update_sync_status(const RGWPeriod
¤t_period
,
1444 std::ostream
& error_stream
,
1445 bool force_if_stale
)
1447 rgw_meta_sync_status status
;
1448 int r
= read_sync_status(store
, &status
);
1450 ldout(cct
, 0) << "period failed to read sync status: "
1451 << cpp_strerror(-r
) << dendl
;
1455 std::vector
<std::string
> markers
;
1457 const auto current_epoch
= current_period
.get_realm_epoch();
1458 if (current_epoch
!= status
.sync_info
.realm_epoch
) {
1459 // no sync status markers for the current period
1460 assert(current_epoch
> status
.sync_info
.realm_epoch
);
1461 const int behind
= current_epoch
- status
.sync_info
.realm_epoch
;
1462 if (!force_if_stale
&& current_epoch
> 1) {
1463 error_stream
<< "ERROR: This zone is " << behind
<< " period(s) behind "
1464 "the current master zone in metadata sync. If this zone is promoted "
1465 "to master, any metadata changes during that time are likely to "
1467 "Waiting for this zone to catch up on metadata sync (see "
1468 "'radosgw-admin sync status') is recommended.\n"
1469 "To promote this zone to master anyway, add the flag "
1470 "--yes-i-really-mean-it." << std::endl
;
1473 // empty sync status markers - other zones will skip this period during
1474 // incremental metadata sync
1475 markers
.resize(status
.sync_info
.num_shards
);
1477 markers
.reserve(status
.sync_info
.num_shards
);
1478 for (auto& i
: status
.sync_markers
) {
1479 auto& marker
= i
.second
;
1480 // filter out markers from other periods
1481 if (marker
.realm_epoch
!= current_epoch
) {
1482 marker
.marker
.clear();
1484 markers
.emplace_back(std::move(marker
.marker
));
1488 std::swap(sync_status
, markers
);
1492 int RGWPeriod::commit(RGWRealm
& realm
, const RGWPeriod
& current_period
,
1493 std::ostream
& error_stream
, bool force_if_stale
)
1495 ldout(cct
, 20) << __func__
<< " realm " << realm
.get_id() << " period " << current_period
.get_id() << dendl
;
1496 // gateway must be in the master zone to commit
1497 if (master_zone
!= store
->get_zone_params().get_id()) {
1498 error_stream
<< "Cannot commit period on zone "
1499 << store
->get_zone_params().get_id() << ", it must be sent to "
1500 "the period's master zone " << master_zone
<< '.' << std::endl
;
1503 // period predecessor must match current period
1504 if (predecessor_uuid
!= current_period
.get_id()) {
1505 error_stream
<< "Period predecessor " << predecessor_uuid
1506 << " does not match current period " << current_period
.get_id()
1507 << ". Use 'period pull' to get the latest period from the master, "
1508 "reapply your changes, and try again." << std::endl
;
1511 // realm epoch must be 1 greater than current period
1512 if (realm_epoch
!= current_period
.get_realm_epoch() + 1) {
1513 error_stream
<< "Period's realm epoch " << realm_epoch
1514 << " does not come directly after current realm epoch "
1515 << current_period
.get_realm_epoch() << ". Use 'realm pull' to get the "
1516 "latest realm and period from the master zone, reapply your changes, "
1517 "and try again." << std::endl
;
1520 // did the master zone change?
1521 if (master_zone
!= current_period
.get_master_zone()) {
1522 // store the current metadata sync status in the period
1523 int r
= update_sync_status(current_period
, error_stream
, force_if_stale
);
1525 ldout(cct
, 0) << "failed to update metadata sync status: "
1526 << cpp_strerror(-r
) << dendl
;
1529 // create an object with a new period id
1532 ldout(cct
, 0) << "failed to create new period: " << cpp_strerror(-r
) << dendl
;
1535 // set as current period
1536 r
= realm
.set_current_period(*this);
1538 ldout(cct
, 0) << "failed to update realm's current period: "
1539 << cpp_strerror(-r
) << dendl
;
1542 ldout(cct
, 4) << "Promoted to master zone and committed new period "
1544 realm
.notify_new_period(*this);
1547 // period must be based on current epoch
1548 if (epoch
!= current_period
.get_epoch()) {
1549 error_stream
<< "Period epoch " << epoch
<< " does not match "
1550 "predecessor epoch " << current_period
.get_epoch()
1551 << ". Use 'period pull' to get the latest epoch from the master zone, "
1552 "reapply your changes, and try again." << std::endl
;
1555 // set period as next epoch
1556 set_id(current_period
.get_id());
1557 set_epoch(current_period
.get_epoch() + 1);
1558 set_predecessor(current_period
.get_predecessor());
1559 realm_epoch
= current_period
.get_realm_epoch();
1560 // write the period to rados
1561 int r
= store_info(false);
1563 ldout(cct
, 0) << "failed to store period: " << cpp_strerror(-r
) << dendl
;
1566 // set as latest epoch
1567 r
= set_latest_epoch(epoch
);
1569 ldout(cct
, 0) << "failed to set latest epoch: " << cpp_strerror(-r
) << dendl
;
1574 ldout(cct
, 0) << "failed to update local objects: " << cpp_strerror(-r
) << dendl
;
1577 ldout(cct
, 4) << "Committed new epoch " << epoch
1578 << " for period " << id
<< dendl
;
1579 realm
.notify_new_period(*this);
1583 int RGWZoneParams::create_default(bool old_format
)
1585 name
= default_zone_name
;
1600 int get_zones_pool_set(CephContext
* cct
,
1602 const list
<string
>& zones
,
1603 const string
& my_zone_id
,
1604 set
<rgw_pool
>& pool_names
)
1606 for(auto const& iter
: zones
) {
1607 RGWZoneParams
zone(iter
);
1608 int r
= zone
.init(cct
, store
);
1610 ldout(cct
, 0) << "Error: init zone " << iter
<< ":" << cpp_strerror(-r
) << dendl
;
1613 if (zone
.get_id() != my_zone_id
) {
1614 pool_names
.insert(zone
.domain_root
);
1615 pool_names
.insert(zone
.metadata_heap
);
1616 pool_names
.insert(zone
.control_pool
);
1617 pool_names
.insert(zone
.gc_pool
);
1618 pool_names
.insert(zone
.log_pool
);
1619 pool_names
.insert(zone
.intent_log_pool
);
1620 pool_names
.insert(zone
.usage_log_pool
);
1621 pool_names
.insert(zone
.user_keys_pool
);
1622 pool_names
.insert(zone
.user_email_pool
);
1623 pool_names
.insert(zone
.user_swift_pool
);
1624 pool_names
.insert(zone
.user_uid_pool
);
1625 pool_names
.insert(zone
.roles_pool
);
1626 pool_names
.insert(zone
.reshard_pool
);
1627 for(auto& iter
: zone
.placement_pools
) {
1628 pool_names
.insert(iter
.second
.index_pool
);
1629 pool_names
.insert(iter
.second
.data_pool
);
1630 pool_names
.insert(iter
.second
.data_extra_pool
);
1637 rgw_pool
fix_zone_pool_dup(set
<rgw_pool
> pools
,
1638 const string
& default_prefix
,
1639 const string
& default_suffix
,
1640 const rgw_pool
& suggested_pool
)
1642 string suggested_name
= suggested_pool
.to_str();
1644 string prefix
= default_prefix
;
1645 string suffix
= default_suffix
;
1647 if (!suggested_pool
.empty()) {
1648 prefix
= suggested_name
.substr(0, suggested_name
.find("."));
1649 suffix
= suggested_name
.substr(prefix
.length());
1652 rgw_pool
pool(prefix
+ suffix
);
1654 if (pools
.find(pool
) == pools
.end()) {
1658 pool
= prefix
+ "_" + std::to_string(std::rand()) + suffix
;
1659 if (pools
.find(pool
) == pools
.end()) {
1666 int RGWZoneParams::fix_pool_names()
1670 int r
= store
->list_zones(zones
);
1672 ldout(cct
, 10) << "WARNING: store->list_zones() returned r=" << r
<< dendl
;
1675 set
<rgw_pool
> pools
;
1676 r
= get_zones_pool_set(cct
, store
, zones
, id
, pools
);
1678 ldout(cct
, 0) << "Error: get_zones_pool_names" << r
<< dendl
;
1682 domain_root
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:root", domain_root
);
1683 if (!metadata_heap
.name
.empty()) {
1684 metadata_heap
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:heap", metadata_heap
);
1686 control_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.control", control_pool
);
1687 gc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:gc", gc_pool
);
1688 lc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:lc", lc_pool
);
1689 log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log", log_pool
);
1690 intent_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:intent", intent_log_pool
);
1691 usage_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:usage", usage_log_pool
);
1692 user_keys_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.keys", user_keys_pool
);
1693 user_email_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.email", user_email_pool
);
1694 user_swift_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.swift", user_swift_pool
);
1695 user_uid_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.uid", user_uid_pool
);
1696 roles_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:roles", roles_pool
);
1697 reshard_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:reshard", reshard_pool
);
1699 for(auto& iter
: placement_pools
) {
1700 iter
.second
.index_pool
= fix_zone_pool_dup(pools
, name
, "." + default_bucket_index_pool_suffix
,
1701 iter
.second
.index_pool
);
1702 iter
.second
.data_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_pool_suffix
,
1703 iter
.second
.data_pool
);
1704 iter
.second
.data_extra_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_extra_pool_suffix
,
1705 iter
.second
.data_extra_pool
);
1711 int RGWZoneParams::create(bool exclusive
)
1713 /* check for old pools config */
1714 rgw_raw_obj
obj(domain_root
, avail_pools
);
1715 int r
= store
->raw_obj_stat(obj
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1717 ldout(store
->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl
;
1718 /* a new system, let's set new placement info */
1719 RGWZonePlacementInfo default_placement
;
1720 default_placement
.index_pool
= name
+ "." + default_bucket_index_pool_suffix
;
1721 default_placement
.data_pool
= name
+ "." + default_storage_pool_suffix
;
1722 default_placement
.data_extra_pool
= name
+ "." + default_storage_extra_pool_suffix
;
1723 placement_pools
["default-placement"] = default_placement
;
1726 r
= fix_pool_names();
1728 ldout(cct
, 0) << "ERROR: fix_pool_names returned r=" << r
<< dendl
;
1732 r
= RGWSystemMetaObj::create(exclusive
);
1737 // try to set as default. may race with another create, so pass exclusive=true
1738 // so we don't override an existing default
1739 r
= set_as_default(true);
1740 if (r
< 0 && r
!= -EEXIST
) {
1741 ldout(cct
, 10) << "WARNING: failed to set zone as default, r=" << r
<< dendl
;
1747 rgw_pool
RGWZoneParams::get_pool(CephContext
*cct
)
1749 if (cct
->_conf
->rgw_zone_root_pool
.empty()) {
1750 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL
);
1753 return rgw_pool(cct
->_conf
->rgw_zone_root_pool
);
1756 const string
RGWZoneParams::get_default_oid(bool old_format
)
1759 return cct
->_conf
->rgw_default_zone_info_oid
;
1762 return cct
->_conf
->rgw_default_zone_info_oid
+ "." + realm_id
;
1765 const string
& RGWZoneParams::get_names_oid_prefix()
1767 return zone_names_oid_prefix
;
1770 const string
& RGWZoneParams::get_info_oid_prefix(bool old_format
)
1772 return zone_info_oid_prefix
;
1775 const string
& RGWZoneParams::get_predefined_name(CephContext
*cct
) {
1776 return cct
->_conf
->rgw_zone
;
1779 int RGWZoneParams::init(CephContext
*cct
, RGWRados
*store
, bool setup_obj
, bool old_format
)
1782 name
= cct
->_conf
->rgw_zone
;
1785 return RGWSystemMetaObj::init(cct
, store
, setup_obj
, old_format
);
1788 int RGWZoneParams::read_default_id(string
& default_id
, bool old_format
)
1790 if (realm_id
.empty()) {
1791 /* try using default realm */
1793 int ret
= realm
.init(cct
, store
);
1795 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
1798 realm_id
= realm
.get_id();
1801 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
1805 int RGWZoneParams::set_as_default(bool exclusive
)
1807 if (realm_id
.empty()) {
1808 /* try using default realm */
1810 int ret
= realm
.init(cct
, store
);
1812 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
1815 realm_id
= realm
.get_id();
1818 return RGWSystemMetaObj::set_as_default(exclusive
);
1821 const string
& RGWZoneParams::get_compression_type(const string
& placement_rule
) const
1823 static const std::string NONE
{"none"};
1824 auto p
= placement_pools
.find(placement_rule
);
1825 if (p
== placement_pools
.end()) {
1828 const auto& type
= p
->second
.compression_type
;
1829 return !type
.empty() ? type
: NONE
;
1832 void RGWPeriodMap::encode(bufferlist
& bl
) const {
1833 ENCODE_START(2, 1, bl
);
1835 ::encode(zonegroups
, bl
);
1836 ::encode(master_zonegroup
, bl
);
1837 ::encode(short_zone_ids
, bl
);
1841 void RGWPeriodMap::decode(bufferlist::iterator
& bl
) {
1842 DECODE_START(2, bl
);
1844 ::decode(zonegroups
, bl
);
1845 ::decode(master_zonegroup
, bl
);
1846 if (struct_v
>= 2) {
1847 ::decode(short_zone_ids
, bl
);
1851 zonegroups_by_api
.clear();
1852 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
1853 iter
!= zonegroups
.end(); ++iter
) {
1854 RGWZoneGroup
& zonegroup
= iter
->second
;
1855 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1856 if (zonegroup
.is_master_zonegroup()) {
1857 master_zonegroup
= zonegroup
.get_id();
1862 // run an MD5 hash on the zone_id and return the first 32 bits
1863 static uint32_t gen_short_zone_id(const std::string zone_id
)
1865 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
1867 hash
.Update((const byte
*)zone_id
.c_str(), zone_id
.size());
1871 memcpy((char *)&short_id
, md5
, sizeof(short_id
));
1872 return std::max(short_id
, 1u);
1875 int RGWPeriodMap::update(const RGWZoneGroup
& zonegroup
, CephContext
*cct
)
1877 if (zonegroup
.is_master_zonegroup() && (!master_zonegroup
.empty() && zonegroup
.get_id() != master_zonegroup
)) {
1878 ldout(cct
,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl
;
1879 ldout(cct
,0) << "master zonegroup: " << master_zonegroup
<< " and " << zonegroup
.get_id() <<dendl
;
1882 map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.find(zonegroup
.get_id());
1883 if (iter
!= zonegroups
.end()) {
1884 RGWZoneGroup
& old_zonegroup
= iter
->second
;
1885 if (!old_zonegroup
.api_name
.empty()) {
1886 zonegroups_by_api
.erase(old_zonegroup
.api_name
);
1889 zonegroups
[zonegroup
.get_id()] = zonegroup
;
1891 if (!zonegroup
.api_name
.empty()) {
1892 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1895 if (zonegroup
.is_master_zonegroup()) {
1896 master_zonegroup
= zonegroup
.get_id();
1897 } else if (master_zonegroup
== zonegroup
.get_id()) {
1898 master_zonegroup
= "";
1901 for (auto& i
: zonegroup
.zones
) {
1902 auto& zone
= i
.second
;
1903 if (short_zone_ids
.find(zone
.id
) != short_zone_ids
.end()) {
1906 // calculate the zone's short id
1907 uint32_t short_id
= gen_short_zone_id(zone
.id
);
1909 // search for an existing zone with the same short id
1910 for (auto& s
: short_zone_ids
) {
1911 if (s
.second
== short_id
) {
1912 ldout(cct
, 0) << "New zone '" << zone
.name
<< "' (" << zone
.id
1913 << ") generates the same short_zone_id " << short_id
1914 << " as existing zone id " << s
.first
<< dendl
;
1919 short_zone_ids
[zone
.id
] = short_id
;
1925 uint32_t RGWPeriodMap::get_zone_short_id(const string
& zone_id
) const
1927 auto i
= short_zone_ids
.find(zone_id
);
1928 if (i
== short_zone_ids
.end()) {
1934 int RGWZoneGroupMap::read(CephContext
*cct
, RGWRados
*store
)
1938 int ret
= period
.init(cct
, store
);
1940 cerr
<< "failed to read current period info: " << cpp_strerror(ret
);
1944 bucket_quota
= period
.get_config().bucket_quota
;
1945 user_quota
= period
.get_config().user_quota
;
1946 zonegroups
= period
.get_map().zonegroups
;
1947 zonegroups_by_api
= period
.get_map().zonegroups_by_api
;
1948 master_zonegroup
= period
.get_map().master_zonegroup
;
1953 void RGWRegionMap::encode(bufferlist
& bl
) const {
1954 ENCODE_START( 3, 1, bl
);
1955 ::encode(regions
, bl
);
1956 ::encode(master_region
, bl
);
1957 ::encode(bucket_quota
, bl
);
1958 ::encode(user_quota
, bl
);
1962 void RGWRegionMap::decode(bufferlist::iterator
& bl
) {
1963 DECODE_START(3, bl
);
1964 ::decode(regions
, bl
);
1965 ::decode(master_region
, bl
);
1967 ::decode(bucket_quota
, bl
);
1969 ::decode(user_quota
, bl
);
1973 void RGWZoneGroupMap::encode(bufferlist
& bl
) const {
1974 ENCODE_START( 3, 1, bl
);
1975 ::encode(zonegroups
, bl
);
1976 ::encode(master_zonegroup
, bl
);
1977 ::encode(bucket_quota
, bl
);
1978 ::encode(user_quota
, bl
);
1982 void RGWZoneGroupMap::decode(bufferlist::iterator
& bl
) {
1983 DECODE_START(3, bl
);
1984 ::decode(zonegroups
, bl
);
1985 ::decode(master_zonegroup
, bl
);
1987 ::decode(bucket_quota
, bl
);
1989 ::decode(user_quota
, bl
);
1992 zonegroups_by_api
.clear();
1993 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
1994 iter
!= zonegroups
.end(); ++iter
) {
1995 RGWZoneGroup
& zonegroup
= iter
->second
;
1996 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1997 if (zonegroup
.is_master_zonegroup()) {
1998 master_zonegroup
= zonegroup
.get_name();
2003 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
2005 obj_version
*check_objv
= version_for_check();
2008 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2011 cls_version_read(*op
, &read_version
);
2014 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
2016 obj_version
*check_objv
= version_for_check();
2017 obj_version
*modify_version
= version_for_write();
2020 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2023 if (modify_version
) {
2024 cls_version_set(*op
, *modify_version
);
2026 cls_version_inc(*op
);
2030 void RGWObjManifest::obj_iterator::operator++()
2032 if (manifest
->explicit_objs
) {
2035 if (explicit_iter
== manifest
->objs
.end()) {
2036 ofs
= manifest
->obj_size
;
2040 update_explicit_pos();
2046 uint64_t obj_size
= manifest
->get_obj_size();
2047 uint64_t head_size
= manifest
->get_head_size();
2049 if (ofs
== obj_size
) {
2053 if (manifest
->rules
.empty()) {
2057 /* are we still pointing at the head? */
2058 if (ofs
< head_size
) {
2059 rule_iter
= manifest
->rules
.begin();
2060 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2061 ofs
= MIN(head_size
, obj_size
);
2064 stripe_size
= MIN(obj_size
- ofs
, rule
->stripe_max_size
);
2065 if (rule
->part_size
> 0) {
2066 stripe_size
= MIN(stripe_size
, rule
->part_size
);
2072 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2074 stripe_ofs
+= rule
->stripe_max_size
;
2076 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule
->part_size
<< " rules.size()=" << manifest
->rules
.size() << dendl
;
2078 if (rule
->part_size
> 0) {
2079 /* multi part, multi stripes object */
2081 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2083 if (stripe_ofs
>= part_ofs
+ rule
->part_size
) {
2084 /* moved to the next part */
2086 part_ofs
+= rule
->part_size
;
2087 stripe_ofs
= part_ofs
;
2089 bool last_rule
= (next_rule_iter
== manifest
->rules
.end());
2090 /* move to the next rule? */
2091 if (!last_rule
&& stripe_ofs
>= next_rule_iter
->second
.start_ofs
) {
2092 rule_iter
= next_rule_iter
;
2093 last_rule
= (next_rule_iter
== manifest
->rules
.end());
2097 cur_part_id
= rule_iter
->second
.start_part_num
;
2102 rule
= &rule_iter
->second
;
2105 stripe_size
= MIN(rule
->part_size
- (stripe_ofs
- part_ofs
), rule
->stripe_max_size
);
2108 cur_override_prefix
= rule
->override_prefix
;
2111 if (ofs
> obj_size
) {
2117 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs
<< " stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2121 int RGWObjManifest::generator::create_begin(CephContext
*cct
, RGWObjManifest
*_m
, const string
& placement_rule
, rgw_bucket
& _b
, rgw_obj
& _obj
)
2125 manifest
->set_tail_placement(placement_rule
, _b
);
2126 manifest
->set_head(placement_rule
, _obj
, 0);
2129 if (manifest
->get_prefix().empty()) {
2131 gen_rand_alphanumeric(cct
, buf
, sizeof(buf
) - 1);
2133 string oid_prefix
= ".";
2134 oid_prefix
.append(buf
);
2135 oid_prefix
.append("_");
2137 manifest
->set_prefix(oid_prefix
);
2140 bool found
= manifest
->get_rule(0, &rule
);
2142 derr
<< "ERROR: manifest->get_rule() could not find rule" << dendl
;
2146 uint64_t head_size
= manifest
->get_head_size();
2148 if (head_size
> 0) {
2149 cur_stripe_size
= head_size
;
2151 cur_stripe_size
= rule
.stripe_max_size
;
2154 cur_part_id
= rule
.start_part_num
;
2156 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, 0, NULL
, &cur_obj
);
2158 // Normal object which not generated through copy operation
2159 manifest
->set_tail_instance(_obj
.key
.instance
);
2161 manifest
->update_iterators();
2166 int RGWObjManifest::generator::create_next(uint64_t ofs
)
2168 if (ofs
< last_ofs
) /* only going forward */
2171 uint64_t max_head_size
= manifest
->get_max_head_size();
2173 if (ofs
< max_head_size
) {
2174 manifest
->set_head_size(ofs
);
2177 if (ofs
>= max_head_size
) {
2178 manifest
->set_head_size(max_head_size
);
2179 cur_stripe
= (ofs
- max_head_size
) / rule
.stripe_max_size
;
2180 cur_stripe_size
= rule
.stripe_max_size
;
2182 if (cur_part_id
== 0 && max_head_size
> 0) {
2188 manifest
->set_obj_size(ofs
);
2190 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, ofs
, NULL
, &cur_obj
);
2192 manifest
->update_iterators();
2197 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_begin()
2202 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_end()
2207 RGWObjManifest::obj_iterator
RGWObjManifest::obj_find(uint64_t ofs
)
2209 if (ofs
> obj_size
) {
2212 RGWObjManifest::obj_iterator
iter(this);
2217 int RGWObjManifest::append(RGWObjManifest
& m
, RGWZoneGroup
& zonegroup
, RGWZoneParams
& zone_params
)
2219 if (explicit_objs
|| m
.explicit_objs
) {
2220 return append_explicit(m
, zonegroup
, zone_params
);
2223 if (rules
.empty()) {
2228 string override_prefix
;
2230 if (prefix
.empty()) {
2234 if (prefix
!= m
.prefix
) {
2235 override_prefix
= m
.prefix
;
2238 map
<uint64_t, RGWObjManifestRule
>::iterator miter
= m
.rules
.begin();
2239 if (miter
== m
.rules
.end()) {
2240 return append_explicit(m
, zonegroup
, zone_params
);
2243 for (; miter
!= m
.rules
.end(); ++miter
) {
2244 map
<uint64_t, RGWObjManifestRule
>::reverse_iterator last_rule
= rules
.rbegin();
2246 RGWObjManifestRule
& rule
= last_rule
->second
;
2248 if (rule
.part_size
== 0) {
2249 rule
.part_size
= obj_size
- rule
.start_ofs
;
2252 RGWObjManifestRule
& next_rule
= miter
->second
;
2253 if (!next_rule
.part_size
) {
2254 next_rule
.part_size
= m
.obj_size
- next_rule
.start_ofs
;
2257 string rule_prefix
= prefix
;
2258 if (!rule
.override_prefix
.empty()) {
2259 rule_prefix
= rule
.override_prefix
;
2262 string next_rule_prefix
= m
.prefix
;
2263 if (!next_rule
.override_prefix
.empty()) {
2264 next_rule_prefix
= next_rule
.override_prefix
;
2267 if (rule
.part_size
!= next_rule
.part_size
||
2268 rule
.stripe_max_size
!= next_rule
.stripe_max_size
||
2269 rule_prefix
!= next_rule_prefix
) {
2270 if (next_rule_prefix
!= prefix
) {
2271 append_rules(m
, miter
, &next_rule_prefix
);
2273 append_rules(m
, miter
, NULL
);
2278 uint64_t expected_part_num
= rule
.start_part_num
+ 1;
2279 if (rule
.part_size
> 0) {
2280 expected_part_num
= rule
.start_part_num
+ (obj_size
+ next_rule
.start_ofs
- rule
.start_ofs
) / rule
.part_size
;
2283 if (expected_part_num
!= next_rule
.start_part_num
) {
2284 append_rules(m
, miter
, NULL
);
2289 set_obj_size(obj_size
+ m
.obj_size
);
2294 int RGWObjManifest::append(RGWObjManifest
& m
, RGWRados
*store
)
2296 return append(m
, store
->get_zonegroup(), store
->get_zone_params());
2299 void RGWObjManifest::append_rules(RGWObjManifest
& m
, map
<uint64_t, RGWObjManifestRule
>::iterator
& miter
,
2300 string
*override_prefix
)
2302 for (; miter
!= m
.rules
.end(); ++miter
) {
2303 RGWObjManifestRule rule
= miter
->second
;
2304 rule
.start_ofs
+= obj_size
;
2305 if (override_prefix
)
2306 rule
.override_prefix
= *override_prefix
;
2307 rules
[rule
.start_ofs
] = rule
;
2311 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2313 if (explicit_objs
) {
2316 obj_iterator iter
= obj_begin();
2318 while (iter
!= obj_end()) {
2319 RGWObjManifestPart
& part
= objs
[iter
.get_stripe_ofs()];
2320 const rgw_obj_select
& os
= iter
.get_location();
2321 const rgw_raw_obj
& raw_loc
= os
.get_raw_obj(zonegroup
, zone_params
);
2324 uint64_t ofs
= iter
.get_stripe_ofs();
2329 rgw_raw_obj_to_obj(tail_placement
.bucket
, raw_loc
, &part
.loc
);
2332 uint64_t next_ofs
= iter
.get_stripe_ofs();
2334 part
.size
= next_ofs
- ofs
;
2337 explicit_objs
= true;
2342 int RGWObjManifest::append_explicit(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2344 if (!explicit_objs
) {
2345 convert_to_explicit(zonegroup
, zone_params
);
2347 if (!m
.explicit_objs
) {
2348 m
.convert_to_explicit(zonegroup
, zone_params
);
2350 map
<uint64_t, RGWObjManifestPart
>::iterator iter
;
2351 uint64_t base
= obj_size
;
2352 for (iter
= m
.objs
.begin(); iter
!= m
.objs
.end(); ++iter
) {
2353 RGWObjManifestPart
& part
= iter
->second
;
2354 objs
[base
+ iter
->first
] = part
;
2356 obj_size
+= m
.obj_size
;
2361 bool RGWObjManifest::get_rule(uint64_t ofs
, RGWObjManifestRule
*rule
)
2363 if (rules
.empty()) {
2367 map
<uint64_t, RGWObjManifestRule
>::iterator iter
= rules
.upper_bound(ofs
);
2368 if (iter
!= rules
.begin()) {
2372 *rule
= iter
->second
;
2377 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
2379 write_version
.ver
= 1;
2382 write_version
.tag
.clear();
2383 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
2386 int RGWPutObjProcessor::complete(size_t accounted_size
, const string
& etag
,
2387 real_time
*mtime
, real_time set_mtime
,
2388 map
<string
, bufferlist
>& attrs
, real_time delete_at
,
2389 const char *if_match
, const char *if_nomatch
, const string
*user_data
,
2390 rgw_zone_set
*zones_trace
)
2392 int r
= do_complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
, if_match
, if_nomatch
, user_data
, zones_trace
);
2396 is_complete
= !canceled
;
2400 CephContext
*RGWPutObjProcessor::ctx()
2402 return store
->ctx();
2405 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2412 set
<rgw_raw_obj
>::iterator iter
;
2413 bool need_to_remove_head
= false;
2414 rgw_raw_obj raw_head
;
2416 if (!head_obj
.empty()) {
2417 store
->obj_to_raw(bucket_info
.placement_rule
, head_obj
, &raw_head
);
2421 * We should delete the object in the "multipart" namespace to avoid race condition.
2422 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2423 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2424 * written by the second upload may be deleted by the first upload.
2425 * details is describled on #11749
2427 * The above comment still stands, but instead of searching for a specific object in the multipart
2428 * namespace, we just make sure that we remove the object that is marked as the head object after
2429 * we remove all the other raw objects. Note that we use different call to remove the head object,
2430 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2432 for (iter
= written_objs
.begin(); iter
!= written_objs
.end(); ++iter
) {
2433 const rgw_raw_obj
& obj
= *iter
;
2434 if (!head_obj
.empty() && obj
== raw_head
) {
2435 ldout(store
->ctx(), 5) << "NOTE: we should not process the head object (" << obj
<< ") here" << dendl
;
2436 need_to_remove_head
= true;
2440 int r
= store
->delete_raw_obj(obj
);
2441 if (r
< 0 && r
!= -ENOENT
) {
2442 ldout(store
->ctx(), 5) << "WARNING: failed to remove obj (" << obj
<< "), leaked" << dendl
;
2446 if (need_to_remove_head
) {
2447 ldout(store
->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head
<< ")" << dendl
;
2448 int r
= store
->delete_obj(obj_ctx
, bucket_info
, head_obj
, 0, 0);
2449 if (r
< 0 && r
!= -ENOENT
) {
2450 ldout(store
->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head
<< "), leaked" << dendl
;
2455 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj
& obj
, bufferlist
& bl
, off_t ofs
, off_t abs_ofs
, void **phandle
, bool exclusive
)
2457 if ((uint64_t)abs_ofs
+ bl
.length() > obj_len
)
2458 obj_len
= abs_ofs
+ bl
.length();
2460 if (!(obj
== last_written_obj
)) {
2461 last_written_obj
= obj
;
2464 // For the first call pass -1 as the offset to
2466 return store
->aio_put_obj_data(NULL
, obj
, bl
, ((ofs
!= 0) ? ofs
: -1), exclusive
, phandle
);
2469 struct put_obj_aio_info
RGWPutObjProcessor_Aio::pop_pending()
2471 struct put_obj_aio_info info
;
2472 info
= pending
.front();
2473 pending
.pop_front();
2474 pending_size
-= info
.size
;
2478 int RGWPutObjProcessor_Aio::wait_pending_front()
2480 if (pending
.empty()) {
2483 struct put_obj_aio_info info
= pop_pending();
2484 int ret
= store
->aio_wait(info
.handle
);
2487 add_written_obj(info
.obj
);
2493 bool RGWPutObjProcessor_Aio::pending_has_completed()
2495 if (pending
.empty())
2498 struct put_obj_aio_info
& info
= pending
.front();
2499 return store
->aio_completed(info
.handle
);
2502 int RGWPutObjProcessor_Aio::drain_pending()
2505 while (!pending
.empty()) {
2506 int r
= wait_pending_front();
2513 int RGWPutObjProcessor_Aio::throttle_data(void *handle
, const rgw_raw_obj
& obj
, uint64_t size
, bool need_to_wait
)
2515 bool _wait
= need_to_wait
;
2518 struct put_obj_aio_info info
;
2519 info
.handle
= handle
;
2522 pending_size
+= size
;
2523 pending
.push_back(info
);
2525 size_t orig_size
= pending_size
;
2527 /* first drain complete IOs */
2528 while (pending_has_completed()) {
2529 int r
= wait_pending_front();
2536 /* resize window in case messages are draining too fast */
2537 if (orig_size
- pending_size
>= window_size
) {
2538 window_size
+= store
->ctx()->_conf
->rgw_max_chunk_size
;
2539 uint64_t max_window_size
= store
->ctx()->_conf
->rgw_put_obj_max_window_size
;
2540 if (window_size
> max_window_size
) {
2541 window_size
= max_window_size
;
2545 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2546 if (pending_size
> window_size
|| _wait
) {
2547 int r
= wait_pending_front();
2554 int RGWPutObjProcessor_Atomic::write_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool exclusive
)
2556 if (ofs
>= next_part_ofs
) {
2557 int r
= prepare_next_part(ofs
);
2568 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj
, bl
, ofs
- cur_part_ofs
, ofs
, phandle
, exclusive
);
2571 int RGWPutObjProcessor_Aio::prepare(RGWRados
*store
, string
*oid_rand
)
2573 RGWPutObjProcessor::prepare(store
, oid_rand
);
2575 window_size
= store
->ctx()->_conf
->rgw_put_obj_min_window_size
;
2580 int RGWPutObjProcessor_Atomic::handle_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool *again
)
2583 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2585 pending_data_bl
.claim_append(bl
);
2586 if (pending_data_bl
.length() < max_write_size
) {
2591 pending_data_bl
.splice(0, max_write_size
, &bl
);
2593 /* do we have enough data pending accumulated that needs to be written? */
2594 *again
= (pending_data_bl
.length() >= max_chunk_size
);
2596 if (!data_ofs
&& !immutable_head()) {
2597 first_chunk
.claim(bl
);
2598 obj_len
= (uint64_t)first_chunk
.length();
2599 int r
= prepare_next_part(obj_len
);
2606 off_t write_ofs
= data_ofs
;
2607 data_ofs
= write_ofs
+ bl
.length();
2608 bool exclusive
= (!write_ofs
&& immutable_head()); /* immutable head object, need to verify nothing exists there
2609 we could be racing with another upload, to the same
2610 object and cleanup can be messy */
2611 int ret
= write_data(bl
, write_ofs
, phandle
, pobj
, exclusive
);
2612 if (ret
>= 0) { /* we might return, need to clear bl as it was already sent */
2619 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados
*store
, string
*oid_rand
)
2621 RGWPutObjProcessor_Aio::prepare(store
, oid_rand
);
2623 int r
= store
->get_max_chunk_size(bucket_info
.placement_rule
, head_obj
, &max_chunk_size
);
2631 int RGWPutObjProcessor_Atomic::prepare(RGWRados
*store
, string
*oid_rand
)
2633 head_obj
.init(bucket
, obj_str
);
2635 int r
= prepare_init(store
, oid_rand
);
2640 if (!version_id
.empty()) {
2641 head_obj
.key
.set_instance(version_id
);
2642 } else if (versioned_object
) {
2643 store
->gen_rand_obj_instance_name(&head_obj
);
2646 manifest
.set_trivial_rule(max_chunk_size
, store
->ctx()->_conf
->rgw_obj_stripe_size
);
2648 r
= manifest_gen
.create_begin(store
->ctx(), &manifest
, bucket_info
.placement_rule
, head_obj
.bucket
, head_obj
);
2656 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs
) {
2658 int ret
= manifest_gen
.create_next(ofs
);
2660 lderr(store
->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret
<< dendl
;
2664 next_part_ofs
= ofs
+ manifest_gen
.cur_stripe_max_size();
2665 cur_obj
= manifest_gen
.get_cur_obj(store
);
2670 int RGWPutObjProcessor_Atomic::complete_parts()
2672 if (obj_len
> (uint64_t)cur_part_ofs
) {
2673 return prepare_next_part(obj_len
);
2678 int RGWPutObjProcessor_Atomic::complete_writing_data()
2680 if (!data_ofs
&& !immutable_head()) {
2681 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2682 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2683 * clobber first_chunk
2685 if (pending_data_bl
.length() > 0) {
2686 first_chunk
.claim(pending_data_bl
);
2688 obj_len
= (uint64_t)first_chunk
.length();
2690 while (pending_data_bl
.length()) {
2693 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2694 if (max_write_size
> pending_data_bl
.length()) {
2695 max_write_size
= pending_data_bl
.length();
2698 pending_data_bl
.splice(0, max_write_size
, &bl
);
2699 uint64_t write_len
= bl
.length();
2700 int r
= write_data(bl
, data_ofs
, &handle
, &obj
, false);
2702 ldout(store
->ctx(), 0) << "ERROR: write_data() returned " << r
<< dendl
;
2705 data_ofs
+= write_len
;
2706 r
= throttle_data(handle
, obj
, write_len
, false);
2708 ldout(store
->ctx(), 0) << "ERROR: throttle_data() returned " << r
<< dendl
;
2712 if (data_ofs
>= next_part_ofs
) {
2713 r
= prepare_next_part(data_ofs
);
2715 ldout(store
->ctx(), 0) << "ERROR: prepare_next_part() returned " << r
<< dendl
;
2720 int r
= complete_parts();
2725 r
= drain_pending();
2732 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size
, const string
& etag
,
2733 real_time
*mtime
, real_time set_mtime
,
2734 map
<string
, bufferlist
>& attrs
,
2735 real_time delete_at
,
2736 const char *if_match
,
2737 const char *if_nomatch
, const string
*user_data
,
2738 rgw_zone_set
*zones_trace
) {
2739 int r
= complete_writing_data();
2743 obj_ctx
.obj
.set_atomic(head_obj
);
2745 RGWRados::Object
op_target(store
, bucket_info
, obj_ctx
, head_obj
);
2747 /* some object types shouldn't be versioned, e.g., multipart parts */
2748 op_target
.set_versioning_disabled(!versioned_object
);
2750 RGWRados::Object::Write
obj_op(&op_target
);
2752 obj_op
.meta
.data
= &first_chunk
;
2753 obj_op
.meta
.manifest
= &manifest
;
2754 obj_op
.meta
.ptag
= &unique_tag
; /* use req_id as operation tag */
2755 obj_op
.meta
.if_match
= if_match
;
2756 obj_op
.meta
.if_nomatch
= if_nomatch
;
2757 obj_op
.meta
.mtime
= mtime
;
2758 obj_op
.meta
.set_mtime
= set_mtime
;
2759 obj_op
.meta
.owner
= bucket_info
.owner
;
2760 obj_op
.meta
.flags
= PUT_OBJ_CREATE
;
2761 obj_op
.meta
.olh_epoch
= olh_epoch
;
2762 obj_op
.meta
.delete_at
= delete_at
;
2763 obj_op
.meta
.user_data
= user_data
;
2764 obj_op
.meta
.zones_trace
= zones_trace
;
2766 r
= obj_op
.write_meta(obj_len
, accounted_size
, attrs
);
2771 canceled
= obj_op
.meta
.canceled
;
2776 int RGWRados::watch(const string
& oid
, uint64_t *watch_handle
, librados::WatchCtx2
*ctx
) {
2777 int r
= control_pool_ctx
.watch2(oid
, watch_handle
, ctx
);
2783 int RGWRados::unwatch(uint64_t watch_handle
)
2785 int r
= control_pool_ctx
.unwatch2(watch_handle
);
2787 ldout(cct
, 0) << "ERROR: rados->unwatch2() returned r=" << r
<< dendl
;
2790 r
= rados
[0].watch_flush();
2792 ldout(cct
, 0) << "ERROR: rados->watch_flush() returned r=" << r
<< dendl
;
2798 void RGWRados::add_watcher(int i
)
2800 ldout(cct
, 20) << "add_watcher() i=" << i
<< dendl
;
2801 Mutex::Locker
l(watchers_lock
);
2802 watchers_set
.insert(i
);
2803 if (watchers_set
.size() == (size_t)num_watchers
) {
2804 ldout(cct
, 2) << "all " << num_watchers
<< " watchers are set, enabling cache" << dendl
;
2805 set_cache_enabled(true);
2809 void RGWRados::remove_watcher(int i
)
2811 ldout(cct
, 20) << "remove_watcher() i=" << i
<< dendl
;
2812 Mutex::Locker
l(watchers_lock
);
2813 size_t orig_size
= watchers_set
.size();
2814 watchers_set
.erase(i
);
2815 if (orig_size
== (size_t)num_watchers
&&
2816 watchers_set
.size() < orig_size
) { /* actually removed */
2817 ldout(cct
, 2) << "removed watcher, disabling cache" << dendl
;
2818 set_cache_enabled(false);
2822 class RGWWatcher
: public librados::WatchCtx2
{
2826 uint64_t watch_handle
;
2828 class C_ReinitWatch
: public Context
{
2829 RGWWatcher
*watcher
;
2831 explicit C_ReinitWatch(RGWWatcher
*_watcher
) : watcher(_watcher
) {}
2832 void finish(int r
) override
{
2837 RGWWatcher(RGWRados
*r
, int i
, const string
& o
) : rados(r
), index(i
), oid(o
), watch_handle(0) {}
2838 void handle_notify(uint64_t notify_id
,
2840 uint64_t notifier_id
,
2841 bufferlist
& bl
) override
{
2842 ldout(rados
->ctx(), 10) << "RGWWatcher::handle_notify() "
2843 << " notify_id " << notify_id
2844 << " cookie " << cookie
2845 << " notifier " << notifier_id
2846 << " bl.length()=" << bl
.length() << dendl
;
2847 rados
->watch_cb(notify_id
, cookie
, notifier_id
, bl
);
2849 bufferlist reply_bl
; // empty reply payload
2850 rados
->control_pool_ctx
.notify_ack(oid
, notify_id
, cookie
, reply_bl
);
2852 void handle_error(uint64_t cookie
, int err
) override
{
2853 lderr(rados
->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2854 << " err " << cpp_strerror(err
) << dendl
;
2855 rados
->remove_watcher(index
);
2856 rados
->schedule_context(new C_ReinitWatch(this));
2860 int ret
= unregister_watch();
2862 ldout(rados
->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret
<< dendl
;
2865 ret
= register_watch();
2867 ldout(rados
->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret
<< dendl
;
2872 int unregister_watch() {
2873 int r
= rados
->unwatch(watch_handle
);
2877 rados
->remove_watcher(index
);
2881 int register_watch() {
2882 int r
= rados
->watch(oid
, &watch_handle
, this);
2886 rados
->add_watcher(index
);
2891 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
2893 RGWHTTPManager http_manager
;
2896 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2897 http_manager(store
->ctx(), completion_mgr
) {
2898 http_manager
.set_threaded();
2901 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
2902 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
2906 list
<RGWCoroutinesStack
*> stacks
;
2907 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2908 RGWRESTConn
*conn
= iter
->second
;
2909 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2910 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2912 stacks
.push_back(stack
);
2918 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
2920 RGWHTTPManager http_manager
;
2923 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2924 http_manager(store
->ctx(), completion_mgr
) {
2925 http_manager
.set_threaded();
2928 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
2929 rgw_http_param_pair pairs
[] = { { "type", "data" },
2931 { "source-zone", store
->get_zone_params().get_id().c_str() },
2934 list
<RGWCoroutinesStack
*> stacks
;
2935 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2936 RGWRESTConn
*conn
= iter
->second
;
2937 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2938 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2940 stacks
.push_back(stack
);
2946 class RGWRadosThread
{
2947 class Worker
: public Thread
{
2949 RGWRadosThread
*processor
;
2954 Mutex::Locker
l(lock
);
2958 void wait_interval(const utime_t
& wait_time
) {
2959 Mutex::Locker
l(lock
);
2960 cond
.WaitInterval(lock
, wait_time
);
2964 Worker(CephContext
*_cct
, RGWRadosThread
*_p
) : cct(_cct
), processor(_p
), lock("RGWRadosThread::Worker") {}
2965 void *entry() override
;
2967 Mutex::Locker
l(lock
);
2978 std::atomic
<bool> down_flag
= { false };
2982 virtual uint64_t interval_msec() = 0;
2983 virtual void stop_process() {}
2985 RGWRadosThread(RGWRados
*_store
, const string
& thread_name
= "radosgw")
2986 : worker(NULL
), cct(_store
->ctx()), store(_store
), thread_name(thread_name
) {}
2987 virtual ~RGWRadosThread() {
2991 virtual int init() { return 0; }
2992 virtual int process() = 0;
2994 bool going_down() { return down_flag
; }
3006 void RGWRadosThread::start()
3008 worker
= new Worker(cct
, this);
3009 worker
->create(thread_name
.c_str());
3012 void RGWRadosThread::stop()
3024 void *RGWRadosThread::Worker::entry() {
3025 uint64_t msec
= processor
->interval_msec();
3026 utime_t interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3029 utime_t start
= ceph_clock_now();
3030 int r
= processor
->process();
3032 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
3035 if (processor
->going_down())
3038 utime_t end
= ceph_clock_now();
3041 uint64_t cur_msec
= processor
->interval_msec();
3042 if (cur_msec
!= msec
) { /* was it reconfigured? */
3044 interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3048 if (interval
<= end
)
3049 continue; // next round
3051 utime_t wait_time
= interval
;
3054 wait_interval(wait_time
);
3058 } while (!processor
->going_down());
3063 class RGWMetaNotifier
: public RGWRadosThread
{
3064 RGWMetaNotifierManager notify_mgr
;
3065 RGWMetadataLog
*const log
;
3067 uint64_t interval_msec() override
{
3068 return cct
->_conf
->rgw_md_notify_interval_msec
;
3071 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
3072 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
3074 int process() override
;
3077 int RGWMetaNotifier::process()
3081 log
->read_clear_modified(shards
);
3083 if (shards
.empty()) {
3087 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3088 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
3091 notify_mgr
.notify_all(store
->zone_conn_map
, shards
);
3096 class RGWDataNotifier
: public RGWRadosThread
{
3097 RGWDataNotifierManager notify_mgr
;
3099 uint64_t interval_msec() override
{
3100 return cct
->_conf
->rgw_md_notify_interval_msec
;
3103 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
3105 int process() override
;
3108 int RGWDataNotifier::process()
3110 if (!store
->data_log
) {
3114 map
<int, set
<string
> > shards
;
3116 store
->data_log
->read_clear_modified(shards
);
3118 if (shards
.empty()) {
3122 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3123 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
3126 notify_mgr
.notify_all(store
->zone_data_notify_to_map
, shards
);
3131 class RGWSyncProcessorThread
: public RGWRadosThread
{
3133 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
3134 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
3135 ~RGWSyncProcessorThread() override
{}
3136 int init() override
= 0 ;
3137 int process() override
= 0;
3140 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
3142 RGWMetaSyncStatusManager sync
;
3144 uint64_t interval_msec() override
{
3145 return 0; /* no interval associated, it'll run once until stopped */
3147 void stop_process() override
{
3151 RGWMetaSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
)
3152 : RGWSyncProcessorThread(_store
, "meta-sync"), sync(_store
, async_rados
) {}
3154 void wakeup_sync_shards(set
<int>& shard_ids
) {
3155 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3159 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
3161 int init() override
{
3162 int ret
= sync
.init();
3164 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
3170 int process() override
{
3176 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
3178 RGWDataSyncStatusManager sync
;
3181 uint64_t interval_msec() override
{
3183 return 0; /* no interval associated, it'll run once until stopped */
3185 #define DATA_SYNC_INIT_WAIT_SEC 20
3186 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
3189 void stop_process() override
{
3193 RGWDataSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
,
3194 const string
& _source_zone
)
3195 : RGWSyncProcessorThread(_store
, "data-sync"), sync(_store
, async_rados
, _source_zone
),
3196 initialized(false) {}
3198 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
3199 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3200 sync
.wakeup(iter
->first
, iter
->second
);
3203 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
3205 int init() override
{
3209 int process() override
{
3210 while (!initialized
) {
3214 int ret
= sync
.init();
3219 /* we'll be back! */
3227 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
3229 RGWCoroutinesManager crs
;
3231 RGWHTTPManager http
;
3232 const utime_t trim_interval
;
3234 uint64_t interval_msec() override
{ return 0; }
3235 void stop_process() override
{ crs
.stop(); }
3237 RGWSyncLogTrimThread(RGWRados
*store
, int interval
)
3238 : RGWSyncProcessorThread(store
, "sync-log-trim"),
3239 crs(store
->ctx(), store
->get_cr_registry()), store(store
),
3240 http(store
->ctx(), crs
.get_completion_mgr()),
3241 trim_interval(interval
, 0)
3244 int init() override
{
3245 return http
.set_threaded();
3247 int process() override
{
3248 list
<RGWCoroutinesStack
*> stacks
;
3249 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3250 meta
->call(create_meta_log_trim_cr(store
, &http
,
3251 cct
->_conf
->rgw_md_log_max_shards
,
3253 stacks
.push_back(meta
);
3255 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3256 data
->call(create_data_log_trim_cr(store
, &http
,
3257 cct
->_conf
->rgw_data_log_num_shards
,
3259 stacks
.push_back(data
);
3266 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
3268 Mutex::Locker
l(meta_sync_thread_lock
);
3269 if (meta_sync_processor_thread
) {
3270 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
3274 void RGWRados::wakeup_data_sync_shards(const string
& source_zone
, map
<int, set
<string
> >& shard_ids
)
3276 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
3277 Mutex::Locker
l(data_sync_thread_lock
);
3278 map
<string
, RGWDataSyncProcessorThread
*>::iterator iter
= data_sync_processor_threads
.find(source_zone
);
3279 if (iter
== data_sync_processor_threads
.end()) {
3280 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
3284 RGWDataSyncProcessorThread
*thread
= iter
->second
;
3286 thread
->wakeup_sync_shards(shard_ids
);
3289 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
3291 Mutex::Locker
l(meta_sync_thread_lock
);
3292 if (meta_sync_processor_thread
) {
3293 return meta_sync_processor_thread
->get_manager();
3298 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const std::string
& source_zone
)
3300 Mutex::Locker
l(data_sync_thread_lock
);
3301 auto thread
= data_sync_processor_threads
.find(source_zone
);
3302 if (thread
== data_sync_processor_threads
.end()) {
3305 return thread
->second
->get_manager();
3308 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
3311 int r
= open_pool_ctx(pool
, ioctx
);
3313 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
3318 r
= ioctx
.pool_requires_alignment2(&requires
);
3320 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3331 r
= ioctx
.pool_required_alignment2(&align
);
3333 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3338 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
3344 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
)
3347 int r
= get_required_alignment(pool
, &alignment
);
3352 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
3354 if (alignment
== 0) {
3355 *max_chunk_size
= config_chunk_size
;
3359 if (config_chunk_size
<= alignment
) {
3360 *max_chunk_size
= alignment
;
3364 *max_chunk_size
= config_chunk_size
- (config_chunk_size
% alignment
);
3366 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
3371 int RGWRados::get_max_chunk_size(const string
& placement_rule
, const rgw_obj
& obj
, uint64_t *max_chunk_size
)
3374 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
3375 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
3378 return get_max_chunk_size(pool
, max_chunk_size
);
3381 class RGWIndexCompletionManager
;
3383 struct complete_op_data
{
3384 Mutex lock
{"complete_op_data"};
3385 AioCompletion
*rados_completion
{nullptr};
3386 int manager_shard_id
{-1};
3387 RGWIndexCompletionManager
*manager
{nullptr};
3391 rgw_bucket_entry_ver ver
;
3392 cls_rgw_obj_key key
;
3393 rgw_bucket_dir_entry_meta dir_meta
;
3394 list
<cls_rgw_obj_key
> remove_objs
;
3397 rgw_zone_set zones_trace
;
3399 bool stopped
{false};
3402 Mutex::Locker
l(lock
);
3407 class RGWIndexCompletionThread
: public RGWRadosThread
{
3410 uint64_t interval_msec() override
{
3414 list
<complete_op_data
*> completions
;
3416 Mutex completions_lock
;
3418 RGWIndexCompletionThread(RGWRados
*_store
)
3419 : RGWRadosThread(_store
, "index-complete"), store(_store
), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3421 int process() override
;
3423 void add_completion(complete_op_data
*completion
) {
3425 Mutex::Locker
l(completions_lock
);
3426 completions
.push_back(completion
);
3433 int RGWIndexCompletionThread::process()
3435 list
<complete_op_data
*> comps
;
3438 Mutex::Locker
l(completions_lock
);
3439 completions
.swap(comps
);
3442 for (auto c
: comps
) {
3443 std::unique_ptr
<complete_op_data
> up
{c
};
3448 ldout(store
->ctx(), 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
3450 RGWRados::BucketShard
bs(store
);
3452 int r
= bs
.init(c
->obj
.bucket
, c
->obj
);
3454 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
3455 /* not much to do */
3459 r
= store
->guard_reshard(&bs
, c
->obj
, [&](RGWRados::BucketShard
*bs
) -> int {
3460 librados::ObjectWriteOperation o
;
3461 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
3462 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
3463 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
3465 return bs
->index_ctx
.operate(bs
->bucket_obj
, &o
);
3468 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
3469 /* ignoring error, can't do anything about it */
3472 r
= store
->data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
3474 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
3481 class RGWIndexCompletionManager
{
3482 RGWRados
*store
{nullptr};
3483 vector
<Mutex
*> locks
;
3484 vector
<set
<complete_op_data
*> > completions
;
3486 RGWIndexCompletionThread
*completion_thread
{nullptr};
3490 std::atomic
<int> cur_shard
{0};
3494 RGWIndexCompletionManager(RGWRados
*_store
) : store(_store
) {
3495 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
3497 for (int i
= 0; i
< num_shards
; i
++) {
3499 snprintf(buf
, sizeof(buf
), "RGWIndexCompletionManager::lock::%d", i
);
3500 locks
.push_back(new Mutex(buf
));
3503 completions
.resize(num_shards
);
3505 ~RGWIndexCompletionManager() {
3508 for (auto l
: locks
) {
3514 int result
= cur_shard
% num_shards
;
3519 void create_completion(const rgw_obj
& obj
,
3520 RGWModifyOp op
, string
& tag
,
3521 rgw_bucket_entry_ver
& ver
,
3522 const cls_rgw_obj_key
& key
,
3523 rgw_bucket_dir_entry_meta
& dir_meta
,
3524 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
3526 rgw_zone_set
*zones_trace
,
3527 complete_op_data
**result
);
3528 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
3531 completion_thread
= new RGWIndexCompletionThread(store
);
3532 int ret
= completion_thread
->init();
3536 completion_thread
->start();
3540 if (completion_thread
) {
3541 completion_thread
->stop();
3542 delete completion_thread
;
3545 for (int i
= 0; i
< num_shards
; ++i
) {
3546 Mutex::Locker
l(*locks
[i
]);
3547 for (auto c
: completions
[i
]) {
3548 Mutex::Locker
cl(c
->lock
);
3552 completions
.clear();
3556 static void obj_complete_cb(completion_t cb
, void *arg
)
3558 complete_op_data
*completion
= (complete_op_data
*)arg
;
3559 completion
->lock
.Lock();
3560 if (completion
->stopped
) {
3561 completion
->lock
.Unlock(); /* can drop lock, no one else is referencing us */
3565 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
3566 completion
->lock
.Unlock();
3573 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
3574 RGWModifyOp op
, string
& tag
,
3575 rgw_bucket_entry_ver
& ver
,
3576 const cls_rgw_obj_key
& key
,
3577 rgw_bucket_dir_entry_meta
& dir_meta
,
3578 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
3580 rgw_zone_set
*zones_trace
,
3581 complete_op_data
**result
)
3583 complete_op_data
*entry
= new complete_op_data
;
3585 int shard_id
= next_shard();
3587 entry
->manager_shard_id
= shard_id
;
3588 entry
->manager
= this;
3594 entry
->dir_meta
= dir_meta
;
3595 entry
->log_op
= log_op
;
3596 entry
->bilog_op
= bilog_op
;
3599 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
3600 entry
->remove_objs
.push_back(*iter
);
3605 entry
->zones_trace
= *zones_trace
;
3607 entry
->zones_trace
.insert(store
->get_zone().id
);
3612 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, NULL
, obj_complete_cb
);
3614 Mutex::Locker
l(*locks
[shard_id
]);
3615 completions
[shard_id
].insert(entry
);
3618 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
3620 int shard_id
= arg
->manager_shard_id
;
3622 Mutex::Locker
l(*locks
[shard_id
]);
3624 auto& comps
= completions
[shard_id
];
3626 auto iter
= comps
.find(arg
);
3627 if (iter
== comps
.end()) {
3634 int r
= rados_aio_get_return_value(cb
);
3635 if (r
!= -ERR_BUSY_RESHARDING
) {
3638 completion_thread
->add_completion(arg
);
3642 void RGWRados::finalize()
3644 if (run_sync_thread
) {
3645 Mutex::Locker
l(meta_sync_thread_lock
);
3646 meta_sync_processor_thread
->stop();
3648 Mutex::Locker
dl(data_sync_thread_lock
);
3649 for (auto iter
: data_sync_processor_threads
) {
3650 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3653 if (sync_log_trimmer
) {
3654 sync_log_trimmer
->stop();
3658 async_rados
->stop();
3660 if (run_sync_thread
) {
3661 delete meta_sync_processor_thread
;
3662 meta_sync_processor_thread
= NULL
;
3663 Mutex::Locker
dl(data_sync_thread_lock
);
3664 for (auto iter
: data_sync_processor_threads
) {
3665 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3668 data_sync_processor_threads
.clear();
3669 delete sync_log_trimmer
;
3670 sync_log_trimmer
= nullptr;
3675 if (need_watch_notify()) {
3679 /* delete finisher only after cleaning up watches, as watch error path might call
3680 * into finisher. We stop finisher before finalizing watch to make sure we don't
3681 * actually handle any racing work
3685 if (meta_notifier
) {
3686 meta_notifier
->stop();
3687 delete meta_notifier
;
3689 if (data_notifier
) {
3690 data_notifier
->stop();
3691 delete data_notifier
;
3697 if (use_gc_thread
) {
3698 gc
->stop_processor();
3699 obj_expirer
->stop_processor();
3704 if (use_lc_thread
) {
3705 lc
->stop_processor();
3713 delete rest_master_conn
;
3715 map
<string
, RGWRESTConn
*>::iterator iter
;
3716 for (iter
= zone_conn_map
.begin(); iter
!= zone_conn_map
.end(); ++iter
) {
3717 RGWRESTConn
*conn
= iter
->second
;
3721 for (iter
= zonegroup_conn_map
.begin(); iter
!= zonegroup_conn_map
.end(); ++iter
) {
3722 RGWRESTConn
*conn
= iter
->second
;
3725 RGWQuotaHandler::free_handler(quota_handler
);
3731 delete obj_tombstone_cache
;
3732 delete sync_modules_manager
;
3734 if (reshard_wait
.get()) {
3735 reshard_wait
->stop();
3736 reshard_wait
.reset();
3739 if (run_reshard_thread
) {
3740 reshard
->stop_processor();
3743 delete index_completion_manager
;
3747 * Initialize the RADOS instance and prepare to do other ops
3748 * Returns 0 on success, -ERR# on failure.
3750 int RGWRados::init_rados()
3753 auto handles
= std::vector
<librados::Rados
>{cct
->_conf
->rgw_num_rados_handles
};
3755 for (auto& r
: handles
) {
3756 ret
= r
.init_with_context(cct
);
3767 sync_modules_manager
= new RGWSyncModulesManager();
3769 rgw_register_sync_modules(sync_modules_manager
);
3771 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
3772 new RGWCoroutinesManagerRegistry(cct
)};
3773 ret
= crs
->hook_to_admin_command("cr dump");
3778 meta_mgr
= new RGWMetadataManager(cct
, this);
3779 data_log
= new RGWDataChangesLog(cct
, this);
3780 cr_registry
= crs
.release();
3782 std::swap(handles
, rados
);
3787 * Add new connection to connections map
3788 * @param zonegroup_conn_map map which new connection will be added to
3789 * @param zonegroup zonegroup which new connection will connect to
3790 * @param new_connection pointer to new connection instance
3792 static void add_new_connection_to_map(map
<string
, RGWRESTConn
*> &zonegroup_conn_map
,
3793 const RGWZoneGroup
&zonegroup
, RGWRESTConn
*new_connection
)
3795 // Delete if connection is already exists
3796 map
<string
, RGWRESTConn
*>::iterator iterZoneGroup
= zonegroup_conn_map
.find(zonegroup
.get_id());
3797 if (iterZoneGroup
!= zonegroup_conn_map
.end()) {
3798 delete iterZoneGroup
->second
;
3801 // Add new connection to connections map
3802 zonegroup_conn_map
[zonegroup
.get_id()] = new_connection
;
3805 int RGWRados::convert_regionmap()
3807 RGWZoneGroupMap zonegroupmap
;
3809 string pool_name
= cct
->_conf
->rgw_zone_root_pool
;
3810 if (pool_name
.empty()) {
3811 pool_name
= RGW_DEFAULT_ZONE_ROOT_POOL
;
3813 string oid
= region_map_oid
;
3815 rgw_pool
pool(pool_name
);
3817 RGWObjectCtx
obj_ctx(this);
3818 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
3819 if (ret
< 0 && ret
!= -ENOENT
) {
3821 } else if (ret
== -ENOENT
) {
3826 bufferlist::iterator iter
= bl
.begin();
3827 ::decode(zonegroupmap
, iter
);
3828 } catch (buffer::error
& err
) {
3829 ldout(cct
, 0) << "error decoding regionmap from " << pool
<< ":" << oid
<< dendl
;
3833 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroupmap
.zonegroups
.begin();
3834 iter
!= zonegroupmap
.zonegroups
.end(); ++iter
) {
3835 RGWZoneGroup
& zonegroup
= iter
->second
;
3836 ret
= zonegroup
.init(cct
, this, false);
3837 ret
= zonegroup
.update();
3838 if (ret
< 0 && ret
!= -ENOENT
) {
3839 ldout(cct
, 0) << "Error could not update zonegroup " << zonegroup
.get_name() << ": " <<
3840 cpp_strerror(-ret
) << dendl
;
3842 } else if (ret
== -ENOENT
) {
3843 ret
= zonegroup
.create();
3845 ldout(cct
, 0) << "Error could not create " << zonegroup
.get_name() << ": " <<
3846 cpp_strerror(-ret
) << dendl
;
3852 current_period
.set_user_quota(zonegroupmap
.user_quota
);
3853 current_period
.set_bucket_quota(zonegroupmap
.bucket_quota
);
3855 // remove the region_map so we don't try to convert again
3856 rgw_raw_obj
obj(pool
, oid
);
3857 ret
= delete_system_obj(obj
);
3859 ldout(cct
, 0) << "Error could not remove " << obj
3860 << " after upgrading to zonegroup map: " << cpp_strerror(ret
) << dendl
;
3868 * Replace all region configuration with zonegroup for
3869 * backward compatability
3870 * Returns 0 on success, -ERR# on failure.
3872 int RGWRados::replace_region_with_zonegroup()
3874 /* copy default region */
3875 /* convert default region to default zonegroup */
3876 string default_oid
= cct
->_conf
->rgw_default_region_info_oid
;
3877 if (default_oid
.empty()) {
3878 default_oid
= default_region_info_oid
;
3882 RGWZoneGroup default_zonegroup
;
3883 rgw_pool pool
{default_zonegroup
.get_pool(cct
)};
3884 string oid
= "converted";
3886 RGWObjectCtx
obj_ctx(this);
3888 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
,oid
, bl
, NULL
, NULL
);
3889 if (ret
< 0 && ret
!= -ENOENT
) {
3890 ldout(cct
, 0) << __func__
<< " failed to read converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
3893 } else if (ret
!= -ENOENT
) {
3894 ldout(cct
, 20) << "System already converted " << dendl
;
3898 string default_region
;
3899 ret
= default_zonegroup
.init(cct
, this, false, true);
3901 ldout(cct
, 0) << __func__
<< " failed init default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3904 ret
= default_zonegroup
.read_default_id(default_region
, true);
3905 if (ret
< 0 && ret
!= -ENOENT
) {
3906 ldout(cct
, 0) << __func__
<< " failed reading old default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3910 /* convert regions to zonegroups */
3911 list
<string
> regions
;
3912 ret
= list_regions(regions
);
3913 if (ret
< 0 && ret
!= -ENOENT
) {
3914 ldout(cct
, 0) << __func__
<< " failed to list regions: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3916 } else if (ret
== -ENOENT
|| regions
.empty()) {
3917 RGWZoneParams
zoneparams(default_zone_name
);
3918 int ret
= zoneparams
.init(cct
, this);
3919 if (ret
< 0 && ret
!= -ENOENT
) {
3920 ldout(cct
, 0) << __func__
<< ": error initializing default zone params: " << cpp_strerror(-ret
) << dendl
;
3923 /* update master zone */
3924 RGWZoneGroup
default_zg(default_zonegroup_name
);
3925 ret
= default_zg
.init(cct
, this);
3926 if (ret
< 0 && ret
!= -ENOENT
) {
3927 ldout(cct
, 0) << __func__
<< ": error in initializing default zonegroup: " << cpp_strerror(-ret
) << dendl
;
3930 if (ret
!= -ENOENT
&& default_zg
.master_zone
.empty()) {
3931 default_zg
.master_zone
= zoneparams
.get_id();
3932 return default_zg
.update();
3937 string master_region
, master_zone
;
3938 for (list
<string
>::iterator iter
= regions
.begin(); iter
!= regions
.end(); ++iter
) {
3939 if (*iter
!= default_zonegroup_name
){
3940 RGWZoneGroup
region(*iter
);
3941 int ret
= region
.init(cct
, this, true, true);
3943 ldout(cct
, 0) << __func__
<< " failed init region "<< *iter
<< ": " << cpp_strerror(-ret
) << dendl
;
3946 if (region
.is_master_zonegroup()) {
3947 master_region
= region
.get_id();
3948 master_zone
= region
.master_zone
;
3953 /* create realm if there is none.
3954 The realm name will be the region and zone concatenated
3955 realm id will be mds of its name */
3956 if (realm
.get_id().empty() && !master_region
.empty() && !master_zone
.empty()) {
3957 string new_realm_name
= master_region
+ "." + master_zone
;
3958 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
3959 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
3961 hash
.Update((const byte
*)new_realm_name
.c_str(), new_realm_name
.length());
3963 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
3964 string
new_realm_id(md5_str
);
3965 RGWRealm
new_realm(new_realm_id
,new_realm_name
);
3966 ret
= new_realm
.init(cct
, this, false);
3968 ldout(cct
, 0) << __func__
<< " Error initing new realm: " << cpp_strerror(-ret
) << dendl
;
3971 ret
= new_realm
.create();
3972 if (ret
< 0 && ret
!= -EEXIST
) {
3973 ldout(cct
, 0) << __func__
<< " Error creating new realm: " << cpp_strerror(-ret
) << dendl
;
3976 ret
= new_realm
.set_as_default();
3978 ldout(cct
, 0) << __func__
<< " Error setting realm as default: " << cpp_strerror(-ret
) << dendl
;
3981 ret
= realm
.init(cct
, this);
3983 ldout(cct
, 0) << __func__
<< " Error initing realm: " << cpp_strerror(-ret
) << dendl
;
3986 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
3988 ldout(cct
, 0) << __func__
<< " Error initing current period: " << cpp_strerror(-ret
) << dendl
;
3993 list
<string
>::iterator iter
;
3994 /* create zonegroups */
3995 for (iter
= regions
.begin(); iter
!= regions
.end(); ++iter
)
3997 ldout(cct
, 0) << __func__
<< "Converting " << *iter
<< dendl
;
3998 /* check to see if we don't have already a zonegroup with this name */
3999 RGWZoneGroup
new_zonegroup(*iter
);
4000 ret
= new_zonegroup
.init(cct
, this);
4001 if (ret
== 0 && new_zonegroup
.get_id() != *iter
) {
4002 ldout(cct
, 0) << __func__
<< " zonegroup "<< *iter
<< " already exists id " << new_zonegroup
.get_id () <<
4003 " skipping conversion " << dendl
;
4006 RGWZoneGroup
zonegroup(*iter
);
4007 zonegroup
.set_id(*iter
);
4008 int ret
= zonegroup
.init(cct
, this, true, true);
4010 ldout(cct
, 0) << __func__
<< " failed init zonegroup: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4013 zonegroup
.realm_id
= realm
.get_id();
4014 /* fix default region master zone */
4015 if (*iter
== default_zonegroup_name
&& zonegroup
.master_zone
.empty()) {
4016 ldout(cct
, 0) << __func__
<< " Setting default zone as master for default region" << dendl
;
4017 zonegroup
.master_zone
= default_zone_name
;
4019 ret
= zonegroup
.update();
4020 if (ret
< 0 && ret
!= -EEXIST
) {
4021 ldout(cct
, 0) << __func__
<< " failed to update zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4025 ret
= zonegroup
.update_name();
4026 if (ret
< 0 && ret
!= -EEXIST
) {
4027 ldout(cct
, 0) << __func__
<< " failed to update_name for zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4031 if (zonegroup
.get_name() == default_region
) {
4032 ret
= zonegroup
.set_as_default();
4034 ldout(cct
, 0) << __func__
<< " failed to set_as_default " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4039 for (map
<string
, RGWZone
>::const_iterator iter
= zonegroup
.zones
.begin(); iter
!= zonegroup
.zones
.end();
4041 ldout(cct
, 0) << __func__
<< " Converting zone" << iter
->first
<< dendl
;
4042 RGWZoneParams
zoneparams(iter
->first
, iter
->first
);
4043 zoneparams
.set_id(iter
->first
);
4044 zoneparams
.realm_id
= realm
.get_id();
4045 ret
= zoneparams
.init(cct
, this);
4046 if (ret
< 0 && ret
!= -ENOENT
) {
4047 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4049 } else if (ret
== -ENOENT
) {
4050 ldout(cct
, 0) << __func__
<< " zone is part of another cluster " << iter
->first
<< " skipping " << dendl
;
4053 zonegroup
.realm_id
= realm
.get_id();
4054 ret
= zoneparams
.update();
4055 if (ret
< 0 && ret
!= -EEXIST
) {
4056 ldout(cct
, 0) << __func__
<< " failed to update zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4059 ret
= zoneparams
.update_name();
4060 if (ret
< 0 && ret
!= -EEXIST
) {
4061 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4066 if (!current_period
.get_id().empty()) {
4067 ret
= current_period
.add_zonegroup(zonegroup
);
4069 ldout(cct
, 0) << __func__
<< " failed to add zonegroup to current_period: " << cpp_strerror(-ret
) << dendl
;
4075 if (!current_period
.get_id().empty()) {
4076 ret
= current_period
.update();
4078 ldout(cct
, 0) << __func__
<< " failed to update new period: " << cpp_strerror(-ret
) << dendl
;
4081 ret
= current_period
.store_info(false);
4083 ldout(cct
, 0) << __func__
<< " failed to store new period: " << cpp_strerror(-ret
) << dendl
;
4086 ret
= current_period
.reflect();
4088 ldout(cct
, 0) << __func__
<< " failed to update local objects: " << cpp_strerror(-ret
) << dendl
;
4093 for (auto const& iter
: regions
) {
4094 RGWZoneGroup
zonegroup(iter
);
4095 int ret
= zonegroup
.init(cct
, this, true, true);
4097 ldout(cct
, 0) << __func__
<< " failed init zonegroup" << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4100 ret
= zonegroup
.delete_obj(true);
4101 if (ret
< 0 && ret
!= -ENOENT
) {
4102 ldout(cct
, 0) << __func__
<< " failed to delete region " << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4108 /* mark as converted */
4109 ret
= rgw_put_system_obj(this, pool
, oid
, bl
.c_str(), bl
.length(),
4110 true, NULL
, real_time(), NULL
);
4112 ldout(cct
, 0) << __func__
<< " failed to mark cluster as converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
4120 int RGWRados::init_zg_from_period(bool *initialized
)
4122 *initialized
= false;
4124 if (current_period
.get_id().empty()) {
4128 int ret
= zonegroup
.init(cct
, this);
4129 ldout(cct
, 20) << "period zonegroup init ret " << ret
<< dendl
;
4130 if (ret
== -ENOENT
) {
4134 ldout(cct
, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret
) << dendl
;
4137 ldout(cct
, 20) << "period zonegroup name " << zonegroup
.get_name() << dendl
;
4139 map
<string
, RGWZoneGroup
>::const_iterator iter
=
4140 current_period
.get_map().zonegroups
.find(zonegroup
.get_id());
4142 if (iter
!= current_period
.get_map().zonegroups
.end()) {
4143 ldout(cct
, 20) << "using current period zonegroup " << zonegroup
.get_name() << dendl
;
4144 zonegroup
= iter
->second
;
4145 ret
= zonegroup
.init(cct
, this, false);
4147 ldout(cct
, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret
) << dendl
;
4150 ret
= zone_params
.init(cct
, this);
4151 if (ret
< 0 && ret
!= -ENOENT
) {
4152 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
4154 } if (ret
==-ENOENT
&& zonegroup
.get_name() == default_zonegroup_name
) {
4155 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
4156 zone_params
.set_name(default_zone_name
);
4157 ret
= zone_params
.init(cct
, this);
4158 if (ret
< 0 && ret
!= -ENOENT
) {
4159 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
4164 for (iter
= current_period
.get_map().zonegroups
.begin();
4165 iter
!= current_period
.get_map().zonegroups
.end(); ++iter
){
4166 const RGWZoneGroup
& zg
= iter
->second
;
4167 // use endpoints from the zonegroup's master zone
4168 auto master
= zg
.zones
.find(zg
.master_zone
);
4169 if (master
== zg
.zones
.end()) {
4170 // fix missing master zone for a single zone zonegroup
4171 if (zg
.master_zone
.empty() && zg
.zones
.size() == 1) {
4172 master
= zg
.zones
.begin();
4173 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing master_zone, setting zone " <<
4174 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
4175 if (zonegroup
.get_id() == zg
.get_id()) {
4176 zonegroup
.master_zone
= master
->second
.id
;
4177 ret
= zonegroup
.update();
4179 ldout(cct
, 0) << "error updating zonegroup : " << cpp_strerror(-ret
) << dendl
;
4183 RGWZoneGroup
fixed_zg(zg
.get_id(),zg
.get_name());
4184 ret
= fixed_zg
.init(cct
, this);
4186 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4189 fixed_zg
.master_zone
= master
->second
.id
;
4190 ret
= fixed_zg
.update();
4192 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4197 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing zone for master_zone=" <<
4198 zg
.master_zone
<< dendl
;
4202 const auto& endpoints
= master
->second
.endpoints
;
4203 add_new_connection_to_map(zonegroup_conn_map
, zg
, new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
));
4204 if (!current_period
.get_master_zonegroup().empty() &&
4205 zg
.get_id() == current_period
.get_master_zonegroup()) {
4206 rest_master_conn
= new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
);
4210 *initialized
= true;
4215 int RGWRados::init_zg_from_local(bool *creating_defaults
)
4217 int ret
= zonegroup
.init(cct
, this);
4218 if ( (ret
< 0 && ret
!= -ENOENT
) || (ret
== -ENOENT
&& !cct
->_conf
->rgw_zonegroup
.empty())) {
4219 ldout(cct
, 0) << "failed reading zonegroup info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4221 } else if (ret
== -ENOENT
) {
4222 *creating_defaults
= true;
4223 ldout(cct
, 10) << "Creating default zonegroup " << dendl
;
4224 ret
= zonegroup
.create_default();
4226 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
4230 ret
= zonegroup
.init(cct
, this);
4232 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
4237 ldout(cct
, 20) << "zonegroup " << zonegroup
.get_name() << dendl
;
4238 if (zonegroup
.is_master_zonegroup()) {
4239 // use endpoints from the zonegroup's master zone
4240 auto master
= zonegroup
.zones
.find(zonegroup
.master_zone
);
4241 if (master
== zonegroup
.zones
.end()) {
4242 // fix missing master zone for a single zone zonegroup
4243 if (zonegroup
.master_zone
.empty() && zonegroup
.zones
.size() == 1) {
4244 master
= zonegroup
.zones
.begin();
4245 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing master_zone, setting zone " <<
4246 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
4247 zonegroup
.master_zone
= master
->second
.id
;
4248 ret
= zonegroup
.update();
4250 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4254 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing zone for "
4255 "master_zone=" << zonegroup
.master_zone
<< dendl
;
4259 const auto& endpoints
= master
->second
.endpoints
;
4260 rest_master_conn
= new RGWRESTConn(cct
, this, zonegroup
.get_id(), endpoints
);
4267 bool RGWRados::zone_syncs_from(RGWZone
& target_zone
, RGWZone
& source_zone
)
4269 return target_zone
.syncs_from(source_zone
.name
) &&
4270 sync_modules_manager
->supports_data_export(source_zone
.tier_type
);
4274 * Initialize the RADOS instance and prepare to do other ops
4275 * Returns 0 on success, -ERR# on failure.
4277 int RGWRados::init_complete()
4279 int ret
= realm
.init(cct
, this);
4280 if (ret
< 0 && ret
!= -ENOENT
) {
4281 ldout(cct
, 0) << "failed reading realm info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4283 } else if (ret
!= -ENOENT
) {
4284 ldout(cct
, 20) << "realm " << realm
.get_name() << " " << realm
.get_id() << dendl
;
4285 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
4286 if (ret
< 0 && ret
!= -ENOENT
) {
4287 ldout(cct
, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret
) << dendl
;
4290 ldout(cct
, 20) << "current period " << current_period
.get_id() << dendl
;
4293 ret
= replace_region_with_zonegroup();
4295 lderr(cct
) << "failed converting region to zonegroup : ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4299 ret
= convert_regionmap();
4301 lderr(cct
) << "failed converting regionmap: " << cpp_strerror(-ret
) << dendl
;
4305 bool zg_initialized
= false;
4307 if (!current_period
.get_id().empty()) {
4308 ret
= init_zg_from_period(&zg_initialized
);
4314 bool creating_defaults
= false;
4315 bool using_local
= (!zg_initialized
);
4317 ldout(cct
, 10) << " cannot find current period zonegroup using local zonegroup" << dendl
;
4318 ret
= init_zg_from_local(&creating_defaults
);
4322 // read period_config into current_period
4323 auto& period_config
= current_period
.get_config();
4324 ret
= period_config
.read(this, zonegroup
.realm_id
);
4325 if (ret
< 0 && ret
!= -ENOENT
) {
4326 ldout(cct
, 0) << "ERROR: failed to read period config: "
4327 << cpp_strerror(ret
) << dendl
;
4332 ldout(cct
, 10) << "Cannot find current period zone using local zone" << dendl
;
4333 if (creating_defaults
&& cct
->_conf
->rgw_zone
.empty()) {
4334 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
4335 zone_params
.set_name(default_zone_name
);
4338 ret
= zone_params
.init(cct
, this);
4339 if (ret
< 0 && ret
!= -ENOENT
) {
4340 lderr(cct
) << "failed reading zone info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4343 map
<string
, RGWZone
>::iterator zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4344 if (zone_iter
== get_zonegroup().zones
.end()) {
4346 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4349 ldout(cct
, 1) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << "), switching to local zonegroup configuration" << dendl
;
4350 ret
= init_zg_from_local(&creating_defaults
);
4354 zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4356 if (zone_iter
!= get_zonegroup().zones
.end()) {
4357 zone_public_config
= zone_iter
->second
;
4358 ldout(cct
, 20) << "zone " << zone_params
.get_name() << dendl
;
4360 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4364 zone_short_id
= current_period
.get_map().get_zone_short_id(zone_params
.get_id());
4366 if (run_sync_thread
) {
4367 ret
= sync_modules_manager
->create_instance(cct
, zone_public_config
.tier_type
, zone_params
.tier_config
, &sync_module
);
4369 lderr(cct
) << "ERROR: failed to init sync module instance, ret=" << ret
<< dendl
;
4374 writeable_zone
= (zone_public_config
.tier_type
.empty() || zone_public_config
.tier_type
== "rgw");
4376 init_unique_trans_id_deps();
4378 finisher
= new Finisher(cct
);
4381 period_puller
.reset(new RGWPeriodPuller(this));
4382 period_history
.reset(new RGWPeriodHistory(cct
, period_puller
.get(),
4385 if (need_watch_notify()) {
4388 lderr(cct
) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret
) << dendl
;
4393 /* first build all zones index */
4394 for (auto ziter
: get_zonegroup().zones
) {
4395 const string
& id
= ziter
.first
;
4396 RGWZone
& z
= ziter
.second
;
4397 zone_id_by_name
[z
.name
] = id
;
4401 if (zone_by_id
.find(zone_id()) == zone_by_id
.end()) {
4402 ldout(cct
, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl
;
4404 zone_public_config
= zone_by_id
[zone_id()];
4405 for (auto ziter
: get_zonegroup().zones
) {
4406 const string
& id
= ziter
.first
;
4407 RGWZone
& z
= ziter
.second
;
4408 if (id
== zone_id()) {
4411 if (z
.endpoints
.empty()) {
4412 ldout(cct
, 0) << "WARNING: can't generate connection for zone " << z
.id
<< " id " << z
.name
<< ": no endpoints defined" << dendl
;
4415 ldout(cct
, 20) << "generating connection object for zone " << z
.name
<< " id " << z
.id
<< dendl
;
4416 RGWRESTConn
*conn
= new RGWRESTConn(cct
, this, z
.id
, z
.endpoints
);
4417 zone_conn_map
[id
] = conn
;
4418 if (zone_syncs_from(zone_public_config
, z
) ||
4419 zone_syncs_from(z
, zone_public_config
)) {
4420 if (zone_syncs_from(zone_public_config
, z
)) {
4421 zone_data_sync_from_map
[id
] = conn
;
4423 if (zone_syncs_from(z
, zone_public_config
)) {
4424 zone_data_notify_to_map
[id
] = conn
;
4427 ldout(cct
, 20) << "NOTICE: not syncing to/from zone " << z
.name
<< " id " << z
.id
<< dendl
;
4431 ret
= open_root_pool_ctx();
4435 ret
= open_gc_pool_ctx();
4439 ret
= open_lc_pool_ctx();
4443 ret
= open_objexp_pool_ctx();
4447 ret
= open_reshard_pool_ctx();
4451 pools_initialized
= true;
4454 gc
->initialize(cct
, this);
4456 obj_expirer
= new RGWObjectExpirer(this);
4458 if (use_gc_thread
) {
4459 gc
->start_processor();
4460 obj_expirer
->start_processor();
4463 if (run_sync_thread
) {
4464 // initialize the log period history. we want to do this any time we're not
4465 // running under radosgw-admin, so we check run_sync_thread here before
4466 // disabling it based on the zone/zonegroup setup
4467 meta_mgr
->init_oldest_log_period();
4470 /* no point of running sync thread if we don't have a master zone configured
4471 or there is no rest_master_conn */
4472 if (get_zonegroup().master_zone
.empty() || !rest_master_conn
4473 || current_period
.get_id().empty()) {
4474 run_sync_thread
= false;
4477 async_rados
= new RGWAsyncRadosProcessor(this, cct
->_conf
->rgw_num_async_rados_threads
);
4478 async_rados
->start();
4480 ret
= meta_mgr
->init(current_period
.get_id());
4482 lderr(cct
) << "ERROR: failed to initialize metadata log: "
4483 << cpp_strerror(-ret
) << dendl
;
4487 if (is_meta_master()) {
4488 auto md_log
= meta_mgr
->get_log(current_period
.get_id());
4489 meta_notifier
= new RGWMetaNotifier(this, md_log
);
4490 meta_notifier
->start();
4493 if (run_sync_thread
) {
4494 Mutex::Locker
l(meta_sync_thread_lock
);
4495 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this, async_rados
);
4496 ret
= meta_sync_processor_thread
->init();
4498 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
4501 meta_sync_processor_thread
->start();
4503 Mutex::Locker
dl(data_sync_thread_lock
);
4504 for (auto iter
: zone_data_sync_from_map
) {
4505 ldout(cct
, 5) << "starting data sync thread for zone " << iter
.first
<< dendl
;
4506 RGWDataSyncProcessorThread
*thread
= new RGWDataSyncProcessorThread(this, async_rados
, iter
.first
);
4507 ret
= thread
->init();
4509 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
4513 data_sync_processor_threads
[iter
.first
] = thread
;
4515 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
4517 sync_log_trimmer
= new RGWSyncLogTrimThread(this, interval
);
4518 ret
= sync_log_trimmer
->init();
4520 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
4523 sync_log_trimmer
->start();
4526 data_notifier
= new RGWDataNotifier(this);
4527 data_notifier
->start();
4530 lc
->initialize(cct
, this);
4533 lc
->start_processor();
4535 quota_handler
= RGWQuotaHandler::generate_handler(this, quota_threads
);
4537 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
4538 get_zone().bucket_index_max_shards
);
4539 if (bucket_index_max_shards
> get_max_bucket_shards()) {
4540 bucket_index_max_shards
= get_max_bucket_shards();
4541 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
4542 << get_max_bucket_shards() << dendl
;
4544 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
4546 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
4547 binfo_cache
->init(this);
4549 bool need_tombstone_cache
= !zone_data_notify_to_map
.empty(); /* have zones syncing from us */
4551 if (need_tombstone_cache
) {
4552 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
4555 reshard_wait
= std::make_shared
<RGWReshardWait
>(this);
4557 reshard
= new RGWReshard(this);
4559 /* only the master zone in the zonegroup reshards buckets */
4560 run_reshard_thread
= run_reshard_thread
&& (get_zonegroup().master_zone
== zone_public_config
.id
);
4561 if (run_reshard_thread
) {
4562 reshard
->start_processor();
4565 index_completion_manager
= new RGWIndexCompletionManager(this);
4566 ret
= index_completion_manager
->start();
4572 * Initialize the RADOS instance and prepare to do other ops
4573 * Returns 0 on success, -ERR# on failure.
4575 int RGWRados::initialize()
4583 return init_complete();
4586 void RGWRados::finalize_watch()
4588 for (int i
= 0; i
< num_watchers
; i
++) {
4589 RGWWatcher
*watcher
= watchers
[i
];
4590 watcher
->unregister_watch();
4594 delete[] notify_oids
;
4598 void RGWRados::schedule_context(Context
*c
) {
4602 int RGWRados::list_raw_prefixed_objs(const rgw_pool
& pool
, const string
& prefix
, list
<string
>& result
)
4605 RGWListRawObjsCtx ctx
;
4608 int r
= list_raw_objects(pool
, prefix
, 1000,
4609 ctx
, oids
, &is_truncated
);
4613 list
<string
>::iterator iter
;
4614 for (iter
= oids
.begin(); iter
!= oids
.end(); ++iter
) {
4615 string
& val
= *iter
;
4616 if (val
.size() > prefix
.size())
4617 result
.push_back(val
.substr(prefix
.size()));
4619 } while (is_truncated
);
4624 int RGWRados::list_regions(list
<string
>& regions
)
4626 RGWZoneGroup zonegroup
;
4628 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), region_info_oid_prefix
, regions
);
4631 int RGWRados::list_zonegroups(list
<string
>& zonegroups
)
4633 RGWZoneGroup zonegroup
;
4635 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), zonegroup_names_oid_prefix
, zonegroups
);
4638 int RGWRados::list_zones(list
<string
>& zones
)
4640 RGWZoneParams zoneparams
;
4642 return list_raw_prefixed_objs(zoneparams
.get_pool(cct
), zone_names_oid_prefix
, zones
);
4645 int RGWRados::list_realms(list
<string
>& realms
)
4647 RGWRealm
realm(cct
, this);
4648 return list_raw_prefixed_objs(realm
.get_pool(cct
), realm_names_oid_prefix
, realms
);
4651 int RGWRados::list_periods(list
<string
>& periods
)
4654 list
<string
> raw_periods
;
4655 int ret
= list_raw_prefixed_objs(period
.get_pool(cct
), period
.get_info_oid_prefix(), raw_periods
);
4659 for (const auto& oid
: raw_periods
) {
4660 size_t pos
= oid
.find(".");
4661 if (pos
!= std::string::npos
) {
4662 periods
.push_back(oid
.substr(0, pos
));
4664 periods
.push_back(oid
);
4667 periods
.sort(); // unique() only detects duplicates if they're adjacent
4673 int RGWRados::list_periods(const string
& current_period
, list
<string
>& periods
)
4676 string period_id
= current_period
;
4677 while(!period_id
.empty()) {
4678 RGWPeriod
period(period_id
);
4679 ret
= period
.init(cct
, this);
4683 periods
.push_back(period
.get_id());
4684 period_id
= period
.get_predecessor();
4691 * Open the pool used as root for this gateway
4692 * Returns: 0 on success, -ERR# otherwise.
4694 int RGWRados::open_root_pool_ctx()
4696 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root
, root_pool_ctx
, true);
4699 int RGWRados::open_gc_pool_ctx()
4701 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool
, gc_pool_ctx
, true);
4704 int RGWRados::open_lc_pool_ctx()
4706 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool
, lc_pool_ctx
, true);
4709 int RGWRados::open_objexp_pool_ctx()
4711 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, objexp_pool_ctx
, true);
4714 int RGWRados::open_reshard_pool_ctx()
4716 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool
, reshard_pool_ctx
, true);
4719 int RGWRados::init_watch()
4721 int r
= rgw_init_ioctx(&rados
[0], get_zone_params().control_pool
, control_pool_ctx
, true);
4726 num_watchers
= cct
->_conf
->rgw_num_control_oids
;
4728 bool compat_oid
= (num_watchers
== 0);
4730 if (num_watchers
<= 0)
4733 notify_oids
= new string
[num_watchers
];
4734 watchers
= new RGWWatcher
*[num_watchers
];
4736 for (int i
=0; i
< num_watchers
; i
++) {
4737 string
& notify_oid
= notify_oids
[i
];
4738 notify_oid
= notify_oid_prefix
;
4741 snprintf(buf
, sizeof(buf
), ".%d", i
);
4742 notify_oid
.append(buf
);
4744 r
= control_pool_ctx
.create(notify_oid
, false);
4745 if (r
< 0 && r
!= -EEXIST
)
4748 RGWWatcher
*watcher
= new RGWWatcher(this, i
, notify_oid
);
4749 watchers
[i
] = watcher
;
4751 r
= watcher
->register_watch();
4756 watch_initialized
= true;
4758 set_cache_enabled(true);
4763 void RGWRados::pick_control_oid(const string
& key
, string
& notify_oid
)
4765 uint32_t r
= ceph_str_hash_linux(key
.c_str(), key
.size());
4767 int i
= r
% num_watchers
;
4769 snprintf(buf
, sizeof(buf
), ".%d", i
);
4771 notify_oid
= notify_oid_prefix
;
4772 notify_oid
.append(buf
);
4775 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
)
4777 librados::Rados
*rad
= get_rados_handle();
4778 int r
= rgw_init_ioctx(rad
, pool
, io_ctx
);
4782 if (!pools_initialized
)
4785 r
= rad
->pool_create(pool
.name
.c_str());
4786 if (r
< 0 && r
!= -EEXIST
)
4789 return rgw_init_ioctx(rad
, pool
, io_ctx
);
4792 void RGWRados::build_bucket_index_marker(const string
& shard_id_str
, const string
& shard_marker
,
4795 *marker
= shard_id_str
;
4796 marker
->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR
);
4797 marker
->append(shard_marker
);
4801 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
)
4803 const string
*rule
= &bucket_info
.placement_rule
;
4804 if (rule
->empty()) {
4805 rule
= &zonegroup
.default_placement
;
4807 auto iter
= zone_params
.placement_pools
.find(*rule
);
4808 if (iter
== zone_params
.placement_pools
.end()) {
4809 ldout(cct
, 0) << "could not find placement rule " << *rule
<< " within zonegroup " << dendl
;
4813 int r
= open_pool_ctx(iter
->second
.index_pool
, index_ctx
);
4821 * set up a bucket listing.
4822 * handle is filled in.
4823 * Returns 0 on success, -ERR# otherwise.
4825 int RGWRados::list_buckets_init(RGWAccessHandle
*handle
)
4827 librados::NObjectIterator
*state
= new librados::NObjectIterator(root_pool_ctx
.nobjects_begin());
4828 *handle
= (RGWAccessHandle
)state
;
4833 * get the next bucket in the listing.
4835 * handle is updated.
4836 * returns 0 on success, -ERR# otherwise.
4838 int RGWRados::list_buckets_next(rgw_bucket_dir_entry
& obj
, RGWAccessHandle
*handle
)
4840 librados::NObjectIterator
*state
= (librados::NObjectIterator
*)*handle
;
4843 if (*state
== root_pool_ctx
.nobjects_end()) {
4848 obj
.key
.name
= (*state
)->get_oid();
4849 if (obj
.key
.name
[0] == '_') {
4850 obj
.key
.name
= obj
.key
.name
.substr(1);
4854 } while (obj
.key
.name
[0] == '.'); /* skip all entries starting with '.' */
4862 struct log_list_state
{
4864 librados::IoCtx io_ctx
;
4865 librados::NObjectIterator obit
;
4868 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
4870 log_list_state
*state
= new log_list_state
;
4871 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
4876 state
->prefix
= prefix
;
4877 state
->obit
= state
->io_ctx
.nobjects_begin();
4878 *handle
= (RGWAccessHandle
)state
;
4882 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
4884 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
4886 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
4890 if (state
->prefix
.length() &&
4891 state
->obit
->get_oid().find(state
->prefix
) != 0) {
4895 *name
= state
->obit
->get_oid();
// Remove a single log object ("name") from this zone's log pool.
// Opens a fresh IoCtx on the log pool, then issues a synchronous remove.
4902 int RGWRados::log_remove(const string
& name
)
4904 librados::IoCtx io_ctx
;
4905 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
// NOTE(review): the original lines between the init call and the remove
// (orig 4906-4907, presumably "if (r < 0) return r;") appear elided in this
// extraction — confirm the early-return guard against the upstream source.
4908 return io_ctx
.remove(name
);
4911 struct log_show_state
{
4912 librados::IoCtx io_ctx
;
4914 bufferlist::iterator p
;
4918 log_show_state() : pos(0), eof(false) {}
4921 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
4923 log_show_state
*state
= new log_show_state
;
4924 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
4930 *handle
= (RGWAccessHandle
)state
;
4934 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
4936 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
4937 off_t off
= state
->p
.get_off();
4939 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
4941 << " eof " << (int)state
->eof
4944 unsigned chunk
= 1024*1024;
4945 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
4947 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
4953 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
4954 } catch (buffer::error
& err
) {
4958 state
->bl
.claim(old
);
4959 state
->bl
.claim_append(more
);
4960 state
->p
= state
->bl
.begin();
4961 if ((unsigned)r
< chunk
)
4963 ldout(cct
, 10) << " read " << r
<< dendl
;
4967 return 0; // end of file
4969 ::decode(*entry
, state
->p
);
4971 catch (const buffer::error
&e
) {
4978 * usage_log_hash: get usage log key hash, based on name and index
4980 * Get the usage object name. Since a user may have more than 1
4981 * object holding that info (multiple shards), we use index to
4982 * specify that shard number. Once index exceeds max shards it
4984 * If name is not being set, results for all users will be returned
4985 * and index will wrap only after total shards number.
4987 * @param cct [in] ceph context
4988 * @param name [in] user name
4989 * @param hash [out] hash value
4990 * @param index [in] shard index number
4992 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
4994 uint32_t val
= index
;
4996 if (!name
.empty()) {
4997 int max_user_shards
= max(cct
->_conf
->rgw_usage_max_user_shards
, 1);
4998 val
%= max_user_shards
;
4999 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
5002 int max_shards
= max(cct
->_conf
->rgw_usage_max_shards
, 1);
5003 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
5007 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
5011 map
<string
, rgw_usage_log_info
> log_objs
;
5016 /* restructure usage map, zone by object hash */
5017 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
5018 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
5019 const rgw_user_bucket
& ub
= iter
->first
;
5020 RGWUsageBatch
& info
= iter
->second
;
5022 if (ub
.user
.empty()) {
5023 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
5027 if (ub
.user
!= last_user
) {
5028 /* index *should* be random, but why waste extra cycles
5029 in most cases max user shards is not going to exceed 1,
5030 so just incrementing it */
5031 usage_log_hash(cct
, ub
.user
, hash
, index
++);
5033 last_user
= ub
.user
;
5034 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
5036 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
5037 v
.push_back(miter
->second
);
5041 map
<string
, rgw_usage_log_info
>::iterator liter
;
5043 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
5044 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
5051 int RGWRados::read_usage(const rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
5052 bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
)
5054 uint32_t num
= max_entries
;
5055 string hash
, first_hash
;
5056 string user_str
= user
.to_str();
5057 usage_log_hash(cct
, user_str
, first_hash
, 0);
5059 if (usage_iter
.index
) {
5060 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
5068 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
5069 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
5071 int ret
= cls_obj_usage_log_read(hash
, user_str
, start_epoch
, end_epoch
, num
,
5072 usage_iter
.read_iter
, ret_usage
, is_truncated
);
5079 num
-= ret_usage
.size();
5081 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
5082 usage
[iter
->first
].aggregate(iter
->second
);
5086 if (!*is_truncated
) {
5087 usage_iter
.read_iter
.clear();
5088 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
5090 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
5094 int RGWRados::trim_usage(rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
5097 string hash
, first_hash
;
5098 string user_str
= user
.to_str();
5099 usage_log_hash(cct
, user_str
, first_hash
, index
);
5104 int ret
= cls_obj_usage_log_trim(hash
, user_str
, start_epoch
, end_epoch
);
5112 usage_log_hash(cct
, user_str
, hash
, ++index
);
5113 } while (hash
!= first_hash
);
5118 int RGWRados::key_to_shard_id(const string
& key
, int max_shards
)
5120 return rgw_shards_hash(key
, max_shards
);
5123 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& key
, string
& name
, int *shard_id
)
5125 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
5128 *shard_id
= val
% max_shards
;
5130 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
5131 name
= prefix
+ buf
;
5134 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& section
, const string
& key
, string
& name
)
5136 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
5137 val
^= ceph_str_hash_linux(section
.c_str(), section
.size());
5139 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
5140 name
= prefix
+ buf
;
5143 void RGWRados::shard_name(const string
& prefix
, unsigned shard_id
, string
& name
)
5146 snprintf(buf
, sizeof(buf
), "%u", shard_id
);
5147 name
= prefix
+ buf
;
5151 void RGWRados::time_log_prepare_entry(cls_log_entry
& entry
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
5153 cls_log_add_prepare_entry(entry
, utime_t(ut
), section
, key
, bl
);
5156 int RGWRados::time_log_add_init(librados::IoCtx
& io_ctx
)
5158 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
, true);
5162 int RGWRados::time_log_add(const string
& oid
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
5164 librados::IoCtx io_ctx
;
5166 int r
= time_log_add_init(io_ctx
);
5171 ObjectWriteOperation op
;
5173 cls_log_add(op
, t
, section
, key
, bl
);
5175 return io_ctx
.operate(oid
, &op
);
5178 int RGWRados::time_log_add(const string
& oid
, list
<cls_log_entry
>& entries
,
5179 librados::AioCompletion
*completion
, bool monotonic_inc
)
5181 librados::IoCtx io_ctx
;
5183 int r
= time_log_add_init(io_ctx
);
5188 ObjectWriteOperation op
;
5189 cls_log_add(op
, entries
, monotonic_inc
);
5192 r
= io_ctx
.operate(oid
, &op
);
5194 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
5199 int RGWRados::time_log_list(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
5200 int max_entries
, list
<cls_log_entry
>& entries
,
5201 const string
& marker
,
5205 librados::IoCtx io_ctx
;
5207 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5210 librados::ObjectReadOperation op
;
5212 utime_t
st(start_time
);
5213 utime_t
et(end_time
);
5215 cls_log_list(op
, st
, et
, marker
, max_entries
, entries
,
5216 out_marker
, truncated
);
5220 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
5227 int RGWRados::time_log_info(const string
& oid
, cls_log_header
*header
)
5229 librados::IoCtx io_ctx
;
5231 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5234 librados::ObjectReadOperation op
;
5236 cls_log_info(op
, header
);
5240 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
5247 int RGWRados::time_log_info_async(librados::IoCtx
& io_ctx
, const string
& oid
, cls_log_header
*header
, librados::AioCompletion
*completion
)
5249 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5253 librados::ObjectReadOperation op
;
5255 cls_log_info(op
, header
);
5257 int ret
= io_ctx
.aio_operate(oid
, completion
, &op
, NULL
);
5264 int RGWRados::time_log_trim(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
5265 const string
& from_marker
, const string
& to_marker
,
5266 librados::AioCompletion
*completion
)
5268 librados::IoCtx io_ctx
;
5270 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5274 utime_t
st(start_time
);
5275 utime_t
et(end_time
);
5277 ObjectWriteOperation op
;
5278 cls_log_trim(op
, st
, et
, from_marker
, to_marker
);
5281 r
= io_ctx
.operate(oid
, &op
);
5283 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
5288 string
RGWRados::objexp_hint_get_shardname(int shard_num
)
5291 snprintf(buf
, sizeof(buf
), "%010u", (unsigned)shard_num
);
5293 string
objname("obj_delete_at_hint.");
5294 return objname
+ buf
;
5297 int RGWRados::objexp_key_shard(const rgw_obj_index_key
& key
)
5299 string obj_key
= key
.name
+ key
.instance
;
5300 int num_shards
= cct
->_conf
->rgw_objexp_hints_num_shards
;
5301 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
5302 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
5303 sid
= rgw_shards_mod(sid2
, num_shards
);
5307 static string
objexp_hint_get_keyext(const string
& tenant_name
,
5308 const string
& bucket_name
,
5309 const string
& bucket_id
,
5310 const rgw_obj_key
& obj_key
)
5312 return tenant_name
+ (tenant_name
.empty() ? "" : ":") + bucket_name
+ ":" + bucket_id
+
5313 ":" + obj_key
.name
+ ":" + obj_key
.instance
;
5316 int RGWRados::objexp_hint_add(const ceph::real_time
& delete_at
,
5317 const string
& tenant_name
,
5318 const string
& bucket_name
,
5319 const string
& bucket_id
,
5320 const rgw_obj_index_key
& obj_key
)
5322 const string keyext
= objexp_hint_get_keyext(tenant_name
, bucket_name
,
5323 bucket_id
, obj_key
);
5324 objexp_hint_entry he
= {
5325 .tenant
= tenant_name
,
5326 .bucket_name
= bucket_name
,
5327 .bucket_id
= bucket_id
,
5329 .exp_time
= delete_at
};
5332 ObjectWriteOperation op
;
5333 cls_timeindex_add(op
, utime_t(delete_at
), keyext
, hebl
);
5335 string shard_name
= objexp_hint_get_shardname(objexp_key_shard(obj_key
));
5336 return objexp_pool_ctx
.operate(shard_name
, &op
);
5339 void RGWRados::objexp_get_shard(int shard_num
,
5340 string
& shard
) /* out */
5342 shard
= objexp_hint_get_shardname(shard_num
);
5345 int RGWRados::objexp_hint_list(const string
& oid
,
5346 const ceph::real_time
& start_time
,
5347 const ceph::real_time
& end_time
,
5348 const int max_entries
,
5349 const string
& marker
,
5350 list
<cls_timeindex_entry
>& entries
, /* out */
5351 string
*out_marker
, /* out */
5352 bool *truncated
) /* out */
5354 librados::ObjectReadOperation op
;
5355 cls_timeindex_list(op
, utime_t(start_time
), utime_t(end_time
), marker
, max_entries
, entries
,
5356 out_marker
, truncated
);
5359 int ret
= objexp_pool_ctx
.operate(oid
, &op
, &obl
);
5361 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5365 if ((ret
== -ENOENT
) && truncated
) {
5372 int RGWRados::objexp_hint_parse(cls_timeindex_entry
&ti_entry
, /* in */
5373 objexp_hint_entry
& hint_entry
) /* out */
5376 bufferlist::iterator iter
= ti_entry
.value
.begin();
5377 ::decode(hint_entry
, iter
);
5378 } catch (buffer::error
& err
) {
5379 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
5385 int RGWRados::objexp_hint_trim(const string
& oid
,
5386 const ceph::real_time
& start_time
,
5387 const ceph::real_time
& end_time
,
5388 const string
& from_marker
,
5389 const string
& to_marker
)
5391 int ret
= cls_timeindex_trim(objexp_pool_ctx
, oid
, utime_t(start_time
), utime_t(end_time
),
5392 from_marker
, to_marker
);
5393 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5400 int RGWRados::lock_exclusive(rgw_pool
& pool
, const string
& oid
, timespan
& duration
,
5401 string
& zone_id
, string
& owner_id
) {
5402 librados::IoCtx io_ctx
;
5404 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5408 uint64_t msec
= std::chrono::duration_cast
<std::chrono::milliseconds
>(duration
).count();
5409 utime_t
ut(msec
/ 1000, msec
% 1000);
5411 rados::cls::lock::Lock
l(log_lock_name
);
5413 l
.set_cookie(owner_id
);
5417 return l
.lock_exclusive(&io_ctx
, oid
);
5420 int RGWRados::unlock(rgw_pool
& pool
, const string
& oid
, string
& zone_id
, string
& owner_id
) {
5421 librados::IoCtx io_ctx
;
5423 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5428 rados::cls::lock::Lock
l(log_lock_name
);
5430 l
.set_cookie(owner_id
);
5432 return l
.unlock(&io_ctx
, oid
);
5435 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
5437 bufferlist::iterator i
= bl
.begin();
5438 RGWAccessControlPolicy
policy(cct
);
5440 policy
.decode_owner(i
);
5441 } catch (buffer::error
& err
) {
5442 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5445 *owner
= policy
.get_owner();
5449 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
5451 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
5452 if (aiter
== attrset
.end())
5455 bufferlist
& bl
= aiter
->second
;
5456 bufferlist::iterator iter
= bl
.begin();
5458 policy
->decode(iter
);
5459 } catch (buffer::error
& err
) {
5460 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5463 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 15)) {
5464 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
5465 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
5466 s3policy
->to_xml(*_dout
);
5473 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
)
5475 rgw_bucket bucket
= bucket_info
.bucket
;
5476 bucket
.update_bucket_id(new_bucket_id
);
5478 RGWObjectCtx
obj_ctx(store
);
5480 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr);
5489 * get listing of the objects in a bucket.
5491 * max: maximum number of results to return
5492 * bucket: bucket to list contents of
5493 * prefix: only return results that match this prefix
5494 * delim: do not include results that match this string.
5495 * Any skipped results will have the matching portion of their name
5496 * inserted in common_prefixes with a "true" mark.
5497 * marker: if filled in, begin the listing with this object.
5498 * end_marker: if filled in, end the listing with this object.
5499 * result: the objects are put in here.
5500 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5501 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5503 int RGWRados::Bucket::List::list_objects(int max
, vector
<rgw_bucket_dir_entry
> *result
,
5504 map
<string
, bool> *common_prefixes
,
5507 RGWRados
*store
= target
->get_store();
5508 CephContext
*cct
= store
->ctx();
5509 int shard_id
= target
->get_shard_id();
5512 bool truncated
= true;
5513 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
,max
);
5517 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
5519 rgw_obj_key end_marker_obj
;
5520 rgw_obj_index_key cur_end_marker
;
5521 if (!params
.ns
.empty()) {
5522 end_marker_obj
= rgw_obj_key(params
.end_marker
.name
, params
.end_marker
.instance
, params
.ns
);
5523 end_marker_obj
.ns
= params
.ns
;
5524 end_marker_obj
.get_index_key(&cur_end_marker
);
5526 rgw_obj_index_key cur_marker
;
5527 marker_obj
.get_index_key(&cur_marker
);
5529 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
5531 rgw_obj_key
prefix_obj(params
.prefix
);
5532 prefix_obj
.ns
= params
.ns
;
5533 string cur_prefix
= prefix_obj
.get_index_key_name();
5535 string bigger_than_delim
;
5537 if (!params
.delim
.empty()) {
5538 unsigned long val
= decode_utf8((unsigned char *)params
.delim
.c_str(), params
.delim
.size());
5539 char buf
[params
.delim
.size() + 16];
5540 int r
= encode_utf8(val
+ 1, (unsigned char *)buf
);
5542 ldout(cct
,0) << "ERROR: encode_utf8() failed" << dendl
;
5547 bigger_than_delim
= buf
;
5549 /* if marker points at a common prefix, fast forward it into its upperbound string */
5550 int delim_pos
= cur_marker
.name
.find(params
.delim
, params
.prefix
.size());
5551 if (delim_pos
>= 0) {
5552 string s
= cur_marker
.name
.substr(0, delim_pos
);
5553 s
.append(bigger_than_delim
);
5558 string skip_after_delim
;
5559 while (truncated
&& count
<= max
) {
5560 if (skip_after_delim
> cur_marker
.name
) {
5561 cur_marker
= skip_after_delim
;
5562 ldout(cct
, 20) << "setting cur_marker=" << cur_marker
.name
<< "[" << cur_marker
.instance
<< "]" << dendl
;
5564 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
5565 int r
= store
->cls_bucket_list(target
->get_bucket_info(), shard_id
, cur_marker
, cur_prefix
,
5566 read_ahead
+ 1 - count
, params
.list_versions
, ent_map
,
5567 &truncated
, &cur_marker
);
5571 std::map
<string
, rgw_bucket_dir_entry
>::iterator eiter
;
5572 for (eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
5573 rgw_bucket_dir_entry
& entry
= eiter
->second
;
5574 rgw_obj_index_key index_key
= entry
.key
;
5576 rgw_obj_key
obj(index_key
);
5578 /* note that parse_raw_oid() here will not set the correct object's instance, as
5579 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5580 * not needed for the checks here and we end up using the raw entry for the return vector
5582 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
5584 ldout(cct
, 0) << "ERROR: could not parse object name: " << obj
.name
<< dendl
;
5587 bool check_ns
= (obj
.ns
== params
.ns
);
5588 if (!params
.list_versions
&& !entry
.is_visible()) {
5592 if (params
.enforce_ns
&& !check_ns
) {
5593 if (!params
.ns
.empty()) {
5594 /* we've iterated past the namespace we're searching -- done now */
5599 /* we're not looking at the namespace this object is in, next! */
5603 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
5609 params
.marker
= index_key
;
5610 next_marker
= index_key
;
5613 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
5616 if (params
.prefix
.size() && (obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
) != 0))
5619 if (!params
.delim
.empty()) {
5620 int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
5622 if (delim_pos
>= 0) {
5623 string prefix_key
= obj
.name
.substr(0, delim_pos
+ 1);
5625 if (common_prefixes
&&
5626 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
5631 next_marker
= prefix_key
;
5632 (*common_prefixes
)[prefix_key
] = true;
5634 skip_after_delim
= obj
.name
.substr(0, delim_pos
);
5635 skip_after_delim
.append(bigger_than_delim
);
5637 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
5651 result
->emplace_back(std::move(entry
));
5655 // Either the back-end telling us truncated, or we don't consume all
5656 // items returned per the amount caller request
5657 truncated
= (truncated
|| eiter
!= ent_map
.end());
5662 *is_truncated
= truncated
;
5668 * create a rados pool, associated meta info
5669 * returns 0 on success, -ERR# otherwise.
5671 int RGWRados::create_pool(const rgw_pool
& pool
)
5675 librados::Rados
*rad
= get_rados_handle();
5676 ret
= rad
->pool_create(pool
.name
.c_str(), 0);
5679 else if (ret
== -ERANGE
) {
5682 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret
)
5683 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5692 int RGWRados::init_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
5694 librados::IoCtx index_ctx
; // context for new bucket
5696 string dir_oid
= dir_oid_prefix
;
5697 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5702 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
5704 map
<int, string
> bucket_objs
;
5705 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
5707 return CLSRGWIssueBucketIndexInit(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5710 void RGWRados::create_bucket_id(string
*bucket_id
)
5712 uint64_t iid
= instance_id();
5713 uint64_t bid
= next_bucket_id();
5714 char buf
[get_zone_params().get_id().size() + 48];
5715 snprintf(buf
, sizeof(buf
), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid
, (long long)bid
);
5720 * create a bucket with name bucket and the given list of attrs
5721 * returns 0 on success, -ERR# otherwise.
5723 int RGWRados::create_bucket(RGWUserInfo
& owner
, rgw_bucket
& bucket
,
5724 const string
& zonegroup_id
,
5725 const string
& placement_rule
,
5726 const string
& swift_ver_location
,
5727 const RGWQuotaInfo
* pquota_info
,
5728 map
<std::string
, bufferlist
>& attrs
,
5729 RGWBucketInfo
& info
,
5731 obj_version
*pep_objv
,
5732 real_time creation_time
,
5733 rgw_bucket
*pmaster_bucket
,
5734 uint32_t *pmaster_num_shards
,
5737 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5738 string selected_placement_rule_name
;
5739 RGWZonePlacementInfo rule_info
;
5741 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
5743 ret
= select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
5744 &selected_placement_rule_name
, &rule_info
);
5748 if (!pmaster_bucket
) {
5749 create_bucket_id(&bucket
.marker
);
5750 bucket
.bucket_id
= bucket
.marker
;
5752 bucket
.marker
= pmaster_bucket
->marker
;
5753 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
5756 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
5759 objv_tracker
.write_version
= *pobjv
;
5761 objv_tracker
.generate_new_write_ver(cct
);
5764 info
.bucket
= bucket
;
5765 info
.owner
= owner
.user_id
;
5766 info
.zonegroup
= zonegroup_id
;
5767 info
.placement_rule
= selected_placement_rule_name
;
5768 info
.index_type
= rule_info
.index_type
;
5769 info
.swift_ver_location
= swift_ver_location
;
5770 info
.swift_versioning
= (!swift_ver_location
.empty());
5771 if (pmaster_num_shards
) {
5772 info
.num_shards
= *pmaster_num_shards
;
5774 info
.num_shards
= bucket_index_max_shards
;
5776 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
5777 info
.requester_pays
= false;
5778 if (real_clock::is_zero(creation_time
)) {
5779 info
.creation_time
= ceph::real_clock::now();
5781 info
.creation_time
= creation_time
;
5784 info
.quota
= *pquota_info
;
5787 int r
= init_bucket_index(info
, info
.num_shards
);
5792 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
5793 if (ret
== -EEXIST
) {
5794 librados::IoCtx index_ctx
;
5795 map
<int, string
> bucket_objs
;
5796 int r
= open_bucket_index(info
, index_ctx
, bucket_objs
);
5800 /* we need to reread the info and return it, caller will have a use for it */
5801 RGWObjVersionTracker instance_ver
= info
.objv_tracker
;
5802 info
.objv_tracker
.clear();
5803 RGWObjectCtx
obj_ctx(this);
5804 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, NULL
);
5809 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
5813 /* only remove it if it's a different bucket instance */
5814 if (info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
5815 /* remove bucket meta instance */
5816 string entry
= bucket
.get_key();
5817 r
= rgw_bucket_instance_remove_entry(this, entry
, &instance_ver
);
5821 map
<int, string
>::const_iterator biter
;
5822 for (biter
= bucket_objs
.begin(); biter
!= bucket_objs
.end(); ++biter
) {
5823 // Do best effort removal
5824 index_ctx
.remove(biter
->second
);
5827 /* ret == -ENOENT here */
5832 /* this is highly unlikely */
5833 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
5837 int RGWRados::select_new_bucket_location(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& request_rule
,
5838 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
5841 /* first check that rule exists within the specific zonegroup */
5842 RGWZoneGroup zonegroup
;
5843 int ret
= get_zonegroup(zonegroup_id
, zonegroup
);
5845 ldout(cct
, 0) << "could not find zonegroup " << zonegroup_id
<< " in current period" << dendl
;
5849 /* now check that tag exists within zonegroup */
5850 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5851 string rule
= request_rule
;
5853 rule
= user_info
.default_placement
;
5855 rule
= zonegroup
.default_placement
;
5859 ldout(cct
, 0) << "misconfiguration, should not have an empty placement rule name" << dendl
;
5863 map
<string
, RGWZoneGroupPlacementTarget
>::iterator titer
= zonegroup
.placement_targets
.find(rule
);
5864 if (titer
== zonegroup
.placement_targets
.end()) {
5865 ldout(cct
, 0) << "could not find placement rule " << rule
<< " within zonegroup " << dendl
;
5869 /* now check tag for the rule, whether user is permitted to use rule */
5870 RGWZoneGroupPlacementTarget
& target_rule
= titer
->second
;
5871 if (!target_rule
.user_permitted(user_info
.placement_tags
)) {
5872 ldout(cct
, 0) << "user not permitted to use placement rule" << dendl
;
5876 if (pselected_rule_name
)
5877 *pselected_rule_name
= rule
;
5879 return select_bucket_location_by_rule(rule
, rule_info
);
5882 int RGWRados::select_bucket_location_by_rule(const string
& location_rule
, RGWZonePlacementInfo
*rule_info
)
5884 if (location_rule
.empty()) {
5885 /* we can only reach here if we're trying to set a bucket location from a bucket
5886 * created on a different zone, using a legacy / default pool configuration
5888 return select_legacy_bucket_placement(rule_info
);
5892 * make sure that zone has this rule configured. We're
5893 * checking it for the local zone, because that's where this bucket object is going to
5896 map
<string
, RGWZonePlacementInfo
>::iterator piter
= get_zone_params().placement_pools
.find(location_rule
);
5897 if (piter
== get_zone_params().placement_pools
.end()) {
5898 /* couldn't find, means we cannot really place data for this bucket in this zone */
5899 if (get_zonegroup().equals(zonegroup_id
)) {
5900 /* that's a configuration error, zone should have that rule, as we're within the requested
5904 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5909 RGWZonePlacementInfo
& placement_info
= piter
->second
;
5912 *rule_info
= placement_info
;
5918 int RGWRados::select_bucket_placement(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& placement_rule
,
5919 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
5921 if (!get_zone_params().placement_pools
.empty()) {
5922 return select_new_bucket_location(user_info
, zonegroup_id
, placement_rule
,
5923 pselected_rule_name
, rule_info
);
5926 if (pselected_rule_name
) {
5927 pselected_rule_name
->clear();
5930 return select_legacy_bucket_placement(rule_info
);
5933 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo
*rule_info
)
5936 map
<string
, bufferlist
> m
;
5938 bool write_map
= false;
5940 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5942 RGWObjectCtx
obj_ctx(this);
5943 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, avail_pools
, map_bl
, NULL
, NULL
);
5949 bufferlist::iterator iter
= map_bl
.begin();
5951 } catch (buffer::error
& err
) {
5952 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
5958 ret
= omap_get_all(obj
, header
, m
);
5963 if (ret
< 0 || m
.empty()) {
5964 vector
<rgw_pool
> pools
;
5965 string s
= string("default.") + default_storage_pool_suffix
;
5966 pools
.push_back(rgw_pool(s
));
5967 vector
<int> retcodes
;
5969 ret
= create_pools(pools
, retcodes
);
5972 ret
= omap_set(obj
, s
, bl
);
5980 ::encode(m
, new_bl
);
5981 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
5983 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
5987 map
<string
, bufferlist
>::iterator miter
;
5990 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
5991 v
.push_back(miter
->first
);
5995 ret
= get_random_bytes((char *)&r
, sizeof(r
));
5999 int i
= r
% v
.size();
6003 pool_name
= miter
->first
;
6006 rule_info
->data_pool
= pool_name
;
6007 rule_info
->data_extra_pool
= pool_name
;
6008 rule_info
->index_pool
= pool_name
;
6009 rule_info
->index_type
= RGWBIType_Normal
;
6014 bool RGWRados::get_obj_data_pool(const string
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
6016 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_rule
, obj
, pool
);
6019 bool RGWRados::obj_to_raw(const string
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
6021 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
6023 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
6026 int RGWRados::update_placement_map()
6029 map
<string
, bufferlist
> m
;
6030 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6031 int ret
= omap_get_all(obj
, header
, m
);
6036 ::encode(m
, new_bl
);
6037 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
6039 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
6045 int RGWRados::add_bucket_placement(const rgw_pool
& new_pool
)
6047 librados::Rados
*rad
= get_rados_handle();
6048 int ret
= rad
->pool_lookup(new_pool
.name
.c_str());
6049 if (ret
< 0) // DNE, or something
6052 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6053 bufferlist empty_bl
;
6054 ret
= omap_set(obj
, new_pool
.to_str(), empty_bl
);
6056 // don't care about return value
6057 update_placement_map();
6062 int RGWRados::remove_bucket_placement(const rgw_pool
& old_pool
)
6064 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6065 int ret
= omap_del(obj
, old_pool
.to_str());
6067 // don't care about return value
6068 update_placement_map();
6073 int RGWRados::list_placement_set(set
<rgw_pool
>& names
)
6076 map
<string
, bufferlist
> m
;
6078 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6079 int ret
= omap_get_all(obj
, header
, m
);
6084 map
<string
, bufferlist
>::iterator miter
;
6085 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
6086 names
.insert(rgw_pool(miter
->first
));
6089 return names
.size();
6092 int RGWRados::create_pools(vector
<rgw_pool
>& pools
, vector
<int>& retcodes
)
6094 vector
<librados::PoolAsyncCompletion
*> completions
;
6097 librados::Rados
*rad
= get_rados_handle();
6098 for (auto iter
= pools
.begin(); iter
!= pools
.end(); ++iter
) {
6099 librados::PoolAsyncCompletion
*c
= librados::Rados::pool_async_create_completion();
6100 completions
.push_back(c
);
6101 rgw_pool
& pool
= *iter
;
6102 int ret
= rad
->pool_create_async(pool
.name
.c_str(), c
);
6103 rets
.push_back(ret
);
6106 vector
<int>::iterator riter
;
6107 vector
<librados::PoolAsyncCompletion
*>::iterator citer
;
6109 assert(rets
.size() == completions
.size());
6110 for (riter
= rets
.begin(), citer
= completions
.begin(); riter
!= rets
.end(); ++riter
, ++citer
) {
6112 PoolAsyncCompletion
*c
= *citer
;
6115 r
= c
->get_return_value();
6117 ldout(cct
, 0) << "WARNING: async pool_create returned " << r
<< dendl
;
6121 retcodes
.push_back(r
);
6126 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
6129 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
6132 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
6133 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
6137 int r
= open_pool_ctx(pool
, *ioctx
);
6142 ioctx
->locator_set_key(key
);
6147 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
6149 get_obj_bucket_and_oid_loc(obj
, ref
->oid
, ref
->key
);
6152 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
6153 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
6157 int r
= open_pool_ctx(pool
, ref
->ioctx
);
6162 ref
->ioctx
.locator_set_key(ref
->key
);
6167 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
, rgw_pool
*pool
)
6174 if (ref
->oid
.empty()) {
6175 ref
->oid
= obj
.pool
.to_str();
6176 ref
->pool
= get_zone_params().domain_root
;
6178 ref
->pool
= obj
.pool
;
6183 r
= open_pool_ctx(ref
->pool
, ref
->ioctx
);
6187 ref
->ioctx
.locator_set_key(ref
->key
);
6192 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
, rgw_pool
*pool
)
6194 return get_raw_obj_ref(obj
, ref
, pool
);
6198 * fixes an issue where head objects were supposed to have a locator created, but ended
6201 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
6203 const rgw_bucket
& bucket
= bucket_info
.bucket
;
6207 rgw_obj
obj(bucket
, key
);
6209 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
6211 if (locator
.empty()) {
6212 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
6216 librados::IoCtx ioctx
;
6218 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
6220 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
6223 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
6228 struct timespec mtime_ts
;
6229 map
<string
, bufferlist
> attrs
;
6230 librados::ObjectReadOperation op
;
6231 op
.getxattrs(&attrs
, NULL
);
6232 op
.stat2(&size
, &mtime_ts
, NULL
);
6233 #define HEAD_SIZE 512 * 1024
6234 op
.read(0, HEAD_SIZE
, &data
, NULL
);
6236 ret
= ioctx
.operate(oid
, &op
, NULL
);
6238 lderr(cct
) << "ERROR: ioctx.operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
6242 if (size
> HEAD_SIZE
) {
6243 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
6247 if (size
!= data
.length()) {
6248 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
6253 librados::ObjectWriteOperation wop
;
6255 wop
.mtime2(&mtime_ts
);
6257 map
<string
, bufferlist
>::iterator iter
;
6258 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6259 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
6264 ioctx
.locator_set_key(locator
);
6265 ioctx
.operate(oid
, &wop
);
6269 ioctx
.locator_set_key(string());
6271 ret
= ioctx
.remove(oid
);
6273 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
6281 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
6282 const string
& src_oid
, const string
& src_locator
,
6283 librados::IoCtx
& dst_ioctx
,
6284 const string
& dst_oid
, const string
& dst_locator
)
6287 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6289 uint64_t chunk_size
= COPY_BUF_SIZE
;
6293 struct timespec mtime_ts
;
6296 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
6300 src_ioctx
.locator_set_key(src_locator
);
6301 dst_ioctx
.locator_set_key(dst_locator
);
6305 ObjectReadOperation rop
;
6306 ObjectWriteOperation wop
;
6309 rop
.stat2(&size
, &mtime_ts
, NULL
);
6310 mtime
= real_clock::from_timespec(mtime_ts
);
6312 rop
.read(ofs
, chunk_size
, &data
, NULL
);
6313 ret
= src_ioctx
.operate(src_oid
, &rop
, NULL
);
6318 if (data
.length() == 0) {
6323 wop
.create(true); /* make it exclusive */
6324 wop
.mtime2(&mtime_ts
);
6325 mtime
= real_clock::from_timespec(mtime_ts
);
6327 wop
.write(ofs
, data
);
6328 ret
= dst_ioctx
.operate(dst_oid
, &wop
);
6329 ofs
+= data
.length();
6330 done
= data
.length() != chunk_size
;
6334 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
6335 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
6340 src_ioctx
.remove(src_oid
);
6345 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
6350 * fixes an issue where head objects were supposed to have a locator created, but ended
6353 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
)
6355 const rgw_bucket
& bucket
= bucket_info
.bucket
;
6356 rgw_obj
obj(bucket
, key
);
6363 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6368 RGWObjState
*astate
= NULL
;
6369 RGWObjectCtx
rctx(this);
6370 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
6374 if (astate
->has_manifest
) {
6375 RGWObjManifest::obj_iterator miter
;
6376 RGWObjManifest
& manifest
= astate
->manifest
;
6377 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
6378 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
6383 rgw_raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
6385 if (loc
.key
.ns
.empty()) {
6386 /* continue, we're only interested in tail objects */
6390 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
6391 ref
.ioctx
.locator_set_key(locator
);
6393 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
6395 r
= ref
.ioctx
.stat(oid
, NULL
, NULL
);
6401 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
6403 /* create a new ioctx with the bad locator */
6404 librados::IoCtx src_ioctx
;
6405 src_ioctx
.dup(ref
.ioctx
);
6406 src_ioctx
.locator_set_key(bad_loc
);
6408 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
6410 /* cannot find a broken part */
6413 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
6418 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ref
.ioctx
, oid
, locator
);
6420 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
6429 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
, const rgw_obj
& obj
)
6433 RGWObjectCtx
obj_ctx(store
);
6435 RGWBucketInfo bucket_info
;
6436 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
6441 ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
6443 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6446 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6451 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
, int sid
)
6456 RGWObjectCtx
obj_ctx(store
);
6458 RGWBucketInfo bucket_info
;
6459 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
6464 ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, shard_id
, &bucket_obj
);
6466 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6469 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6475 /* Execute @handler on last item in bucket listing for bucket specified
6476 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6477 * to objects matching these criterias. */
6478 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
6479 const std::string
& obj_prefix
,
6480 const std::string
& obj_delim
,
6481 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
6483 RGWRados::Bucket
target(this, bucket_info
);
6484 RGWRados::Bucket::List
list_op(&target
);
6486 list_op
.params
.prefix
= obj_prefix
;
6487 list_op
.params
.delim
= obj_delim
;
6489 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
6490 << ", obj_prefix=" << obj_prefix
6491 << ", obj_delim=" << obj_delim
6494 bool is_truncated
= false;
6496 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
6497 /* We need to rewind to the last object in a listing. */
6499 /* List bucket entries in chunks. */
6500 static constexpr int MAX_LIST_OBJS
= 100;
6501 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
6503 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
6507 } else if (!entries
.empty()) {
6508 last_entry
= entries
.back();
6510 } while (is_truncated
);
6513 return handler(*last_entry
);
6516 /* Empty listing - no items we can run handler on. */
6521 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
6522 const rgw_user
& user
,
6523 RGWBucketInfo
& bucket_info
,
6526 if (! swift_versioning_enabled(bucket_info
)) {
6530 obj_ctx
.obj
.set_atomic(obj
);
6532 RGWObjState
* state
= nullptr;
6533 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false);
6538 if (!state
->exists
) {
6545 const string
& src_name
= obj
.get_oid();
6546 char buf
[src_name
.size() + 32];
6547 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
6548 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
6549 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
6551 RGWBucketInfo dest_bucket_info
;
6553 r
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, NULL
);
6555 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
6557 return -ERR_PRECONDITION_FAILED
;
6562 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
6563 return -ERR_PRECONDITION_FAILED
;
6566 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
6567 obj_ctx
.obj
.set_atomic(dest_obj
);
6571 r
= copy_obj(obj_ctx
,
6575 NULL
, /* req_info *info */
6581 NULL
, /* time_t *src_mtime */
6582 NULL
, /* time_t *mtime */
6583 NULL
, /* const time_t *mod_ptr */
6584 NULL
, /* const time_t *unmod_ptr */
6585 false, /* bool high_precision_time */
6586 NULL
, /* const char *if_match */
6587 NULL
, /* const char *if_nomatch */
6588 RGWRados::ATTRSMOD_NONE
,
6589 true, /* bool copy_if_newer */
6591 RGW_OBJ_CATEGORY_MAIN
,
6592 0, /* uint64_t olh_epoch */
6593 real_time(), /* time_t delete_at */
6594 NULL
, /* string *version_id */
6595 NULL
, /* string *ptag */
6596 NULL
, /* string *petag */
6597 NULL
, /* void (*progress_cb)(off_t, void *) */
6598 NULL
); /* void *progress_data */
6599 if (r
== -ECANCELED
|| r
== -ENOENT
) {
6600 /* Has already been overwritten, meaning another rgw process already
6608 int RGWRados::swift_versioning_restore(RGWObjectCtx
& obj_ctx
,
6609 const rgw_user
& user
,
6610 RGWBucketInfo
& bucket_info
,
6612 bool& restored
) /* out */
6614 if (! swift_versioning_enabled(bucket_info
)) {
6618 /* Bucket info of the bucket that stores previous versions of our object. */
6619 RGWBucketInfo archive_binfo
;
6621 int ret
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
,
6622 bucket_info
.swift_ver_location
, archive_binfo
,
6628 /* Abort the operation if the bucket storing our archive belongs to someone
6629 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6630 * into consideration. For we can live with that.
6632 * TODO: delegate this check to un upper layer and compare with ACLs. */
6633 if (bucket_info
.owner
!= archive_binfo
.owner
) {
6637 /* This code will be executed on latest version of the object. */
6638 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
6639 std::string no_client_id
;
6640 std::string no_op_id
;
6641 std::string no_zone
;
6643 /* We don't support object versioning of Swift API on those buckets that
6644 * are already versioned using the S3 mechanism. This affects also bucket
6645 * storing archived objects. Otherwise the delete operation would create
6646 * a deletion marker. */
6647 if (archive_binfo
.versioned()) {
6649 return -ERR_PRECONDITION_FAILED
;
6652 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6653 * irrelevant and may be safely skipped. */
6654 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
6656 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
6657 obj_ctx
.obj
.set_atomic(archive_obj
);
6658 obj_ctx
.obj
.set_atomic(obj
);
6660 int ret
= copy_obj(obj_ctx
,
6664 nullptr, /* req_info *info */
6667 archive_obj
, /* src obj */
6668 bucket_info
, /* dest bucket info */
6669 archive_binfo
, /* src bucket info */
6670 nullptr, /* time_t *src_mtime */
6671 nullptr, /* time_t *mtime */
6672 nullptr, /* const time_t *mod_ptr */
6673 nullptr, /* const time_t *unmod_ptr */
6674 false, /* bool high_precision_time */
6675 nullptr, /* const char *if_match */
6676 nullptr, /* const char *if_nomatch */
6677 RGWRados::ATTRSMOD_NONE
,
6678 true, /* bool copy_if_newer */
6680 RGW_OBJ_CATEGORY_MAIN
,
6681 0, /* uint64_t olh_epoch */
6682 real_time(), /* time_t delete_at */
6683 nullptr, /* string *version_id */
6684 nullptr, /* string *ptag */
6685 nullptr, /* string *petag */
6686 nullptr, /* void (*progress_cb)(off_t, void *) */
6687 nullptr); /* void *progress_data */
6688 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
6689 /* Has already been overwritten, meaning another rgw process already
6692 } else if (ret
< 0) {
6698 /* Need to remove the archived copy. */
6699 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
6700 archive_binfo
.versioning_status());
6705 const std::string
& obj_name
= obj
.get_oid();
6706 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
6709 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
6714 * Write/overwrite an object to the bucket storage.
6715 * bucket: the bucket to store the object in
6716 * obj: the object name/key
6717 * data: the object contents/value
6718 * size: the amount of data to write (data must be this long)
6719 * accounted_size: original size of data before compression, encryption
6720 * mtime: if non-NULL, writes the given mtime to the bucket storage
6721 * attrs: all the given attrs are written to bucket storage for the given object
6722 * exclusive: create object exclusively
6723 * Returns: 0 on success, -ERR# otherwise.
6725 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
6726 map
<string
, bufferlist
>& attrs
, bool assume_noent
,
6729 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
6732 RGWRados
*store
= target
->get_store();
6734 ObjectWriteOperation op
;
6737 int r
= target
->get_state(&state
, false, assume_noent
);
6741 rgw_obj
& obj
= target
->get_obj();
6743 if (obj
.get_oid().empty()) {
6744 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
6748 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
6752 bool is_olh
= state
->is_olh
;
6754 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
6756 const string
*ptag
= meta
.ptag
;
6757 if (!ptag
&& !index_op
->get_optag()->empty()) {
6758 ptag
= index_op
->get_optag();
6760 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false);
6764 if (real_clock::is_zero(meta
.set_mtime
)) {
6765 meta
.set_mtime
= real_clock::now();
6768 if (state
->is_olh
) {
6769 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
6772 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
6773 op
.mtime2(&mtime_ts
);
6776 /* if we want to overwrite the data, we also want to overwrite the
6777 xattrs, so just remove the object */
6778 op
.write_full(*meta
.data
);
6782 string content_type
;
6785 map
<string
, bufferlist
>::iterator iter
;
6787 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
6788 const string
& name
= iter
->first
;
6789 op
.rmxattr(name
.c_str());
6793 if (meta
.manifest
) {
6794 /* remove existing manifest attr */
6795 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
6796 if (iter
!= attrs
.end())
6800 ::encode(*meta
.manifest
, bl
);
6801 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
6804 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6805 const string
& name
= iter
->first
;
6806 bufferlist
& bl
= iter
->second
;
6811 op
.setxattr(name
.c_str(), bl
);
6813 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
6815 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
6816 content_type
= bl
.c_str();
6817 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
6821 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
6822 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
6825 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
6827 ::encode(store
->get_zone_short_id(), bl
);
6828 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
6837 bool orig_exists
= state
->exists
;
6838 uint64_t orig_size
= state
->accounted_size
;
6840 bool versioned_target
= (meta
.olh_epoch
> 0 || !obj
.key
.instance
.empty());
6842 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
6845 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
6848 if (!index_op
->is_prepared()) {
6849 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
6854 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
6855 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
6856 or -ENOENT if was removed, or -EEXIST if it did not exist
6857 before and now it does */
6858 if (r
== -EEXIST
&& assume_noent
) {
6859 target
->invalidate_state();
6865 epoch
= ref
.ioctx
.get_last_version();
6866 poolid
= ref
.ioctx
.get_id();
6868 r
= target
->complete_atomic_modification();
6870 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
6873 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
6874 meta
.set_mtime
, etag
, content_type
, &acl_bl
,
6875 meta
.category
, meta
.remove_objs
, meta
.user_data
);
6880 *meta
.mtime
= meta
.set_mtime
;
6883 /* note that index_op was using state so we couldn't invalidate it earlier */
6884 target
->invalidate_state();
6888 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, meta
.olh_epoch
, real_time(), false, meta
.zones_trace
);
6894 if (!real_clock::is_zero(meta
.delete_at
)) {
6895 rgw_obj_index_key obj_key
;
6896 obj
.key
.get_index_key(&obj_key
);
6898 r
= store
->objexp_hint_add(meta
.delete_at
,
6899 obj
.bucket
.tenant
, obj
.bucket
.name
, obj
.bucket
.bucket_id
, obj_key
);
6901 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
6902 /* ignoring error, nothing we can do at this point */
6905 meta
.canceled
= false;
6907 /* update quota cache */
6908 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
6909 accounted_size
, orig_size
);
6913 int ret
= index_op
->cancel();
6915 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
6918 meta
.canceled
= true;
6920 /* we lost in a race. There are a few options:
6921 * - existing object was rewritten (ECANCELED)
6922 * - non existing object was created (EEXIST)
6923 * - object was removed (ENOENT)
6924 * should treat it as a success
6926 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
6927 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
6931 if (meta
.if_match
!= NULL
) {
6932 // only overwrite existing object
6933 if (strcmp(meta
.if_match
, "*") == 0) {
6935 r
= -ERR_PRECONDITION_FAILED
;
6936 } else if (r
== -ECANCELED
) {
6942 if (meta
.if_nomatch
!= NULL
) {
6943 // only create a new object
6944 if (strcmp(meta
.if_nomatch
, "*") == 0) {
6946 r
= -ERR_PRECONDITION_FAILED
;
6947 } else if (r
== -ENOENT
) {
6957 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
6958 map
<string
, bufferlist
>& attrs
)
6960 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
6962 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
6963 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
6964 index_op
.set_zones_trace(meta
.zones_trace
);
6966 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
6969 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, (void *)&index_op
);
6971 assume_noent
= false;
6974 if (!assume_noent
) {
6975 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, (void *)&index_op
);
6980 /** Write/overwrite a system object. */
6981 int RGWRados::put_system_obj_impl(rgw_raw_obj
& obj
, uint64_t size
, real_time
*mtime
,
6982 map
<std::string
, bufferlist
>& attrs
, int flags
,
6984 RGWObjVersionTracker
*objv_tracker
,
6985 real_time set_mtime
/* 0 for don't set */)
6989 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
6993 ObjectWriteOperation op
;
6995 if (flags
& PUT_OBJ_EXCL
) {
6996 if (!(flags
& PUT_OBJ_CREATE
))
6998 op
.create(true); // exclusive create
7001 op
.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK
);
7006 objv_tracker
->prepare_op_for_write(&op
);
7009 if (real_clock::is_zero(set_mtime
)) {
7010 set_mtime
= real_clock::now();
7013 struct timespec mtime_ts
= real_clock::to_timespec(set_mtime
);
7014 op
.mtime2(&mtime_ts
);
7015 op
.write_full(data
);
7019 for (map
<string
, bufferlist
>::iterator iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
7020 const string
& name
= iter
->first
;
7021 bufferlist
& bl
= iter
->second
;
7026 op
.setxattr(name
.c_str(), bl
);
7029 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
7035 objv_tracker
->apply_write();
7045 int RGWRados::put_system_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
7046 off_t ofs
, bool exclusive
,
7047 RGWObjVersionTracker
*objv_tracker
)
7051 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
7056 ObjectWriteOperation op
;
7062 objv_tracker
->prepare_op_for_write(&op
);
7069 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
7074 objv_tracker
->apply_write();
7080 * Write/overwrite an object to the bucket storage.
7081 * bucket: the bucket to store the object in
7082 * obj: the object name/key
7083 * data: the object contents/value
7084 * offset: the offet to write to in the object
7085 * If this is -1, we will overwrite the whole object.
7086 * size: the amount of data to write (data must be this long)
7087 * attrs: all the given attrs are written to bucket storage for the given object
7088 * Returns: 0 on success, -ERR# otherwise.
7091 int RGWRados::aio_put_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
7092 off_t ofs
, bool exclusive
,
7096 int r
= get_raw_obj_ref(obj
, &ref
);
7101 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
7104 ObjectWriteOperation op
;
7114 r
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
7121 int RGWRados::aio_wait(void *handle
)
7123 AioCompletion
*c
= (AioCompletion
*)handle
;
7125 int ret
= c
->get_return_value();
7130 bool RGWRados::aio_completed(void *handle
)
7132 AioCompletion
*c
= (AioCompletion
*)handle
;
7133 return c
->is_safe();
7136 class RGWRadosPutObj
: public RGWGetDataCB
7140 RGWPutObjDataProcessor
*filter
;
7141 boost::optional
<RGWPutObj_Compress
>& compressor
;
7142 CompressorRef
& plugin
;
7143 RGWPutObjProcessor_Atomic
*processor
;
7144 RGWOpStateSingleOp
*opstate
;
7145 void (*progress_cb
)(off_t
, void *);
7146 void *progress_data
;
7147 bufferlist extra_data_bl
;
7148 uint64_t extra_data_len
;
7150 map
<string
, bufferlist
> src_attrs
;
7152 RGWRadosPutObj(CephContext
* cct
,
7153 CompressorRef
& plugin
,
7154 boost::optional
<RGWPutObj_Compress
>& compressor
,
7155 RGWPutObjProcessor_Atomic
*p
,
7156 RGWOpStateSingleOp
*_ops
,
7157 void (*_progress_cb
)(off_t
, void *),
7158 void *_progress_data
) :
7161 compressor(compressor
),
7165 progress_cb(_progress_cb
),
7166 progress_data(_progress_data
),
7170 int process_attrs(void) {
7171 if (extra_data_bl
.length()) {
7173 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
7174 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
7178 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
7180 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
7181 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
7184 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
7185 //do not compress if object is encrypted
7186 compressor
= boost::in_place(cct
, plugin
, filter
);
7187 filter
= &*compressor
;
7192 int handle_data(bufferlist
& bl
, off_t ofs
, off_t len
) override
{
7194 progress_cb(ofs
, progress_data
);
7196 if (extra_data_len
) {
7197 size_t extra_len
= bl
.length();
7198 if (extra_len
> extra_data_len
)
7199 extra_len
= extra_data_len
;
7202 bl
.splice(0, extra_len
, &extra
);
7203 extra_data_bl
.append(extra
);
7205 extra_data_len
-= extra_len
;
7206 if (extra_data_len
== 0) {
7207 int res
= process_attrs();
7211 if (bl
.length() == 0) {
7215 data_len
+= bl
.length();
7218 bool need_opstate
= true;
7221 void *handle
= NULL
;
7223 uint64_t size
= bl
.length();
7224 int ret
= filter
->handle_data(bl
, ofs
, &handle
, &obj
, &again
);
7228 if (need_opstate
&& opstate
) {
7229 /* need to update opstate repository with new state. This is ratelimited, so we're not
7230 * really doing it every time
7232 ret
= opstate
->renew_state();
7234 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret
<< dendl
;
7235 int r
= filter
->throttle_data(handle
, obj
, size
, false);
7237 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r
<< dendl
;
7239 /* could not renew state! might have been marked as cancelled */
7242 need_opstate
= false;
7245 ret
= filter
->throttle_data(handle
, obj
, size
, false);
7253 bufferlist
& get_extra_data() { return extra_data_bl
; }
7255 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
7257 void set_extra_data_len(uint64_t len
) override
{
7258 extra_data_len
= len
;
7261 uint64_t get_data_len() {
7265 int complete(const string
& etag
, real_time
*mtime
, real_time set_mtime
,
7266 map
<string
, bufferlist
>& attrs
, real_time delete_at
, rgw_zone_set
*zones_trace
) {
7267 return processor
->complete(data_len
, etag
, mtime
, set_mtime
, attrs
, delete_at
, NULL
, NULL
, NULL
, zones_trace
);
7270 bool is_canceled() {
7271 return processor
->is_canceled();
7276 * prepare attrset depending on attrs_mod.
7278 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
7279 map
<string
, bufferlist
>& attrs
,
7280 RGWRados::AttrsMod attrs_mod
)
7282 switch (attrs_mod
) {
7283 case RGWRados::ATTRSMOD_NONE
:
7286 case RGWRados::ATTRSMOD_REPLACE
:
7287 if (!attrs
[RGW_ATTR_ETAG
].length()) {
7288 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
7291 case RGWRados::ATTRSMOD_MERGE
:
7292 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
7293 if (attrs
.find(it
->first
) == attrs
.end()) {
7294 attrs
[it
->first
] = it
->second
;
7301 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, rgw_obj
& obj
)
7303 map
<string
, bufferlist
> attrset
;
7307 RGWObjectCtx
rctx(this);
7309 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
7310 RGWRados::Object::Read
read_op(&op_target
);
7312 read_op
.params
.attrs
= &attrset
;
7313 read_op
.params
.lastmod
= &mtime
;
7314 read_op
.params
.obj_size
= &obj_size
;
7316 int ret
= read_op
.prepare();
7320 attrset
.erase(RGW_ATTR_ID_TAG
);
7322 uint64_t max_chunk_size
;
7324 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, obj
, &max_chunk_size
);
7326 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj
.bucket
<< dendl
;
7330 return copy_obj_data(rctx
, dest_bucket_info
, read_op
, obj_size
- 1, obj
, obj
, max_chunk_size
, NULL
, mtime
, attrset
,
7331 RGW_OBJ_CATEGORY_MAIN
, 0, real_time(), NULL
, NULL
, NULL
);
7334 struct obj_time_weight
{
7336 uint32_t zone_short_id
;
7338 bool high_precision
;
7340 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7342 bool compare_low_precision(const obj_time_weight
& rhs
) {
7343 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
7344 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
7353 if (zone_short_id
!= rhs
.zone_short_id
) {
7354 return (zone_short_id
< rhs
.zone_short_id
);
7356 return (pg_ver
< rhs
.pg_ver
);
7360 bool operator<(const obj_time_weight
& rhs
) {
7361 if (!high_precision
|| !rhs
.high_precision
) {
7362 return compare_low_precision(rhs
);
7364 if (mtime
> rhs
.mtime
) {
7367 if (mtime
< rhs
.mtime
) {
7370 if (zone_short_id
!= rhs
.zone_short_id
) {
7371 return (zone_short_id
< rhs
.zone_short_id
);
7373 return (pg_ver
< rhs
.pg_ver
);
7376 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
7378 zone_short_id
= _short_id
;
7382 void init(RGWObjState
*state
) {
7383 mtime
= state
->mtime
;
7384 zone_short_id
= state
->zone_short_id
;
7385 pg_ver
= state
->pg_ver
;
7389 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
7392 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
7393 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
7399 class RGWGetExtraDataCB
: public RGWGetDataCB
{
7400 bufferlist extra_data
;
7402 RGWGetExtraDataCB() {}
7403 int handle_data(bufferlist
& bl
, off_t bl_ofs
, off_t bl_len
) override
{
7404 if (extra_data
.length() < extra_data_len
) {
7405 off_t max
= extra_data_len
- extra_data
.length();
7409 bl
.splice(0, max
, &extra_data
);
7414 bufferlist
& get_extra_data() {
7419 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
7420 const rgw_user
& user_id
,
7421 const string
& client_id
,
7423 const string
& source_zone
,
7425 RGWBucketInfo
& src_bucket_info
,
7426 real_time
*src_mtime
,
7428 const real_time
*mod_ptr
,
7429 const real_time
*unmod_ptr
,
7430 bool high_precision_time
,
7431 const char *if_match
,
7432 const char *if_nomatch
,
7433 map
<string
, bufferlist
> *pattrs
,
7438 /* source is in a different zonegroup, copy from there */
7440 RGWRESTStreamRWRequest
*in_stream_req
;
7442 map
<string
, bufferlist
> src_attrs
;
7443 append_rand_alpha(cct
, tag
, tag
, 32);
7444 obj_time_weight set_mtime_weight
;
7445 set_mtime_weight
.high_precision
= high_precision_time
;
7448 if (source_zone
.empty()) {
7449 if (src_bucket_info
.zonegroup
.empty()) {
7450 /* source is in the master zonegroup */
7451 conn
= rest_master_conn
;
7453 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7454 if (iter
== zonegroup_conn_map
.end()) {
7455 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
7458 conn
= iter
->second
;
7461 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
7462 if (iter
== zone_conn_map
.end()) {
7463 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
7466 conn
= iter
->second
;
7469 RGWGetExtraDataCB cb
;
7471 map
<string
, string
> req_headers
;
7472 real_time set_mtime
;
7474 const real_time
*pmod
= mod_ptr
;
7476 obj_time_weight dest_mtime_weight
;
7478 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
7479 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
7480 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7481 true /* sync manifest */, &cb
, &in_stream_req
);
7486 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, psize
, req_headers
);
7491 bufferlist
& extra_data_bl
= cb
.get_extra_data();
7492 if (extra_data_bl
.length()) {
7494 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
7495 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
7499 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
7501 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
7505 *src_mtime
= set_mtime
;
7509 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
7510 if (iter
!= src_attrs
.end()) {
7511 bufferlist
& etagbl
= iter
->second
;
7512 *petag
= etagbl
.to_str();
7517 *pattrs
= src_attrs
;
7523 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
7524 const rgw_user
& user_id
,
7525 const string
& client_id
,
7526 const string
& op_id
,
7527 bool record_op_state
,
7529 const string
& source_zone
,
7532 RGWBucketInfo
& dest_bucket_info
,
7533 RGWBucketInfo
& src_bucket_info
,
7534 real_time
*src_mtime
,
7536 const real_time
*mod_ptr
,
7537 const real_time
*unmod_ptr
,
7538 bool high_precision_time
,
7539 const char *if_match
,
7540 const char *if_nomatch
,
7543 map
<string
, bufferlist
>& attrs
,
7544 RGWObjCategory category
,
7546 real_time delete_at
,
7549 ceph::buffer::list
*petag
,
7550 void (*progress_cb
)(off_t
, void *),
7551 void *progress_data
,
7552 rgw_zone_set
*zones_trace
)
7554 /* source is in a different zonegroup, copy from there */
7556 RGWRESTStreamRWRequest
*in_stream_req
;
7559 append_rand_alpha(cct
, tag
, tag
, 32);
7560 obj_time_weight set_mtime_weight
;
7561 set_mtime_weight
.high_precision
= high_precision_time
;
7563 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
7564 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.key
.name
,
7565 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
7566 if (version_id
&& *version_id
!= "null") {
7567 processor
.set_version_id(*version_id
);
7569 processor
.set_olh_epoch(olh_epoch
);
7570 int ret
= processor
.prepare(this, NULL
);
7576 if (source_zone
.empty()) {
7577 if (dest_bucket_info
.zonegroup
.empty()) {
7578 /* source is in the master zonegroup */
7579 conn
= rest_master_conn
;
7581 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7582 if (iter
== zonegroup_conn_map
.end()) {
7583 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
7586 conn
= iter
->second
;
7589 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
7590 if (iter
== zone_conn_map
.end()) {
7591 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
7594 conn
= iter
->second
;
7597 string obj_name
= dest_obj
.bucket
.name
+ "/" + dest_obj
.get_oid();
7599 RGWOpStateSingleOp
*opstate
= NULL
;
7601 if (record_op_state
) {
7602 opstate
= new RGWOpStateSingleOp(this, client_id
, op_id
, obj_name
);
7604 ret
= opstate
->set_state(RGWOpState::OPSTATE_IN_PROGRESS
);
7606 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
7612 boost::optional
<RGWPutObj_Compress
> compressor
;
7613 CompressorRef plugin
;
7615 const auto& compression_type
= zone_params
.get_compression_type(
7616 dest_bucket_info
.placement_rule
);
7617 if (compression_type
!= "none") {
7618 plugin
= Compressor::create(cct
, compression_type
);
7620 ldout(cct
, 1) << "Cannot load plugin for compression type "
7621 << compression_type
<< dendl
;
7625 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, opstate
, progress_cb
, progress_data
);
7628 map
<string
, string
> req_headers
;
7629 real_time set_mtime
;
7631 RGWObjState
*dest_state
= NULL
;
7633 const real_time
*pmod
= mod_ptr
;
7635 obj_time_weight dest_mtime_weight
;
7637 if (copy_if_newer
) {
7638 /* need to get mtime for destination */
7639 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
7643 if (!real_clock::is_zero(dest_state
->mtime
)) {
7644 dest_mtime_weight
.init(dest_state
);
7645 pmod
= &dest_mtime_weight
.mtime
;
7649 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
7650 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
7651 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7652 true /* sync manifest */, &cb
, &in_stream_req
);
7657 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, nullptr, req_headers
);
7661 if (compressor
&& compressor
->is_compressed()) {
7663 RGWCompressionInfo cs_info
;
7664 cs_info
.compression_type
= plugin
->get_type_name();
7665 cs_info
.orig_size
= cb
.get_data_len();
7666 cs_info
.blocks
= move(compressor
->get_compression_blocks());
7667 ::encode(cs_info
, tmp
);
7668 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
7671 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7672 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
7674 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
7675 if (iter
!= cb
.get_attrs().end()) {
7677 ::decode(delete_at
, iter
->second
);
7678 } catch (buffer::error
& err
) {
7679 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
7685 *src_mtime
= set_mtime
;
7689 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
7690 if (iter
!= cb
.get_attrs().end()) {
7691 *petag
= iter
->second
;
7695 if (source_zone
.empty()) {
7696 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
7698 attrs
= cb
.get_attrs();
7701 if (copy_if_newer
) {
7702 uint64_t pg_ver
= 0;
7703 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
7704 if (i
!= attrs
.end() && i
->second
.length() > 0) {
7705 bufferlist::iterator iter
= i
->second
.begin();
7707 ::decode(pg_ver
, iter
);
7708 } catch (buffer::error
& err
) {
7709 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
7710 /* non critical error */
7713 set_mtime_weight
.init(set_mtime
, get_zone_short_id(), pg_ver
);
7716 #define MAX_COMPLETE_RETRY 100
7717 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
7718 ret
= cb
.complete(etag
, mtime
, set_mtime
, attrs
, delete_at
, zones_trace
);
7722 if (copy_if_newer
&& cb
.is_canceled()) {
7723 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
7724 obj_ctx
.obj
.invalidate(dest_obj
); /* object was overwritten */
7725 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
7727 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
7730 dest_mtime_weight
.init(dest_state
);
7731 dest_mtime_weight
.high_precision
= high_precision_time
;
7732 if (!dest_state
->exists
||
7733 dest_mtime_weight
< set_mtime_weight
) {
7734 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
7737 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
7743 if (i
== MAX_COMPLETE_RETRY
) {
7744 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
7750 ret
= opstate
->set_state(RGWOpState::OPSTATE_COMPLETE
);
7752 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
7759 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
7763 RGWOpState::OpState state
;
7765 state
= RGWOpState::OPSTATE_ERROR
;
7767 state
= RGWOpState::OPSTATE_COMPLETE
;
7769 int r
= opstate
->set_state(state
);
7771 ldout(cct
, 0) << "ERROR: failed to set opstate r=" << ret
<< dendl
;
7779 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
7780 map
<string
, bufferlist
>& src_attrs
,
7781 RGWRados::Object::Read
& read_op
,
7782 const rgw_user
& user_id
,
7788 RGWRESTStreamWriteRequest
*out_stream_req
;
7790 int ret
= rest_master_conn
->put_obj_init(user_id
, dest_obj
, astate
->size
, src_attrs
, &out_stream_req
);
7792 delete out_stream_req
;
7796 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb());
7800 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
7809 * dest_obj: the object to copy into
7810 * src_obj: the object to copy from
7811 * attrs: usage depends on attrs_mod parameter
7812 * attrs_mod: the modification mode of the attrs, may have the following values:
7813 * ATTRSMOD_NONE - the attributes of the source object will be
7814 * copied without modifications, attrs parameter is ignored;
7815 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7816 * parameter, source object attributes are not copied;
7817 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7818 * are overwritten by values contained in attrs parameter.
7819 * err: stores any errors resulting from the get of the original object
7820 * Returns: 0 on success, -ERR# otherwise.
7822 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
7823 const rgw_user
& user_id
,
7824 const string
& client_id
,
7825 const string
& op_id
,
7827 const string
& source_zone
,
7830 RGWBucketInfo
& dest_bucket_info
,
7831 RGWBucketInfo
& src_bucket_info
,
7832 real_time
*src_mtime
,
7834 const real_time
*mod_ptr
,
7835 const real_time
*unmod_ptr
,
7836 bool high_precision_time
,
7837 const char *if_match
,
7838 const char *if_nomatch
,
7841 map
<string
, bufferlist
>& attrs
,
7842 RGWObjCategory category
,
7844 real_time delete_at
,
7847 ceph::buffer::list
*petag
,
7848 void (*progress_cb
)(off_t
, void *),
7849 void *progress_data
)
7853 rgw_obj shadow_obj
= dest_obj
;
7859 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
7860 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
7862 remote_dest
= !get_zonegroup().equals(dest_bucket_info
.zonegroup
);
7863 remote_src
= !get_zonegroup().equals(src_bucket_info
.zonegroup
);
7865 if (remote_src
&& remote_dest
) {
7866 ldout(cct
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
7870 ldout(cct
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
7872 if (remote_src
|| !source_zone
.empty()) {
7873 return fetch_remote_obj(obj_ctx
, user_id
, client_id
, op_id
, true, info
, source_zone
,
7874 dest_obj
, src_obj
, dest_bucket_info
, src_bucket_info
, src_mtime
, mtime
, mod_ptr
,
7875 unmod_ptr
, high_precision_time
,
7876 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
7877 olh_epoch
, delete_at
, version_id
, ptag
, petag
, progress_cb
, progress_data
);
7880 map
<string
, bufferlist
> src_attrs
;
7881 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
7882 RGWRados::Object::Read
read_op(&src_op_target
);
7884 read_op
.conds
.mod_ptr
= mod_ptr
;
7885 read_op
.conds
.unmod_ptr
= unmod_ptr
;
7886 read_op
.conds
.high_precision_time
= high_precision_time
;
7887 read_op
.conds
.if_match
= if_match
;
7888 read_op
.conds
.if_nomatch
= if_nomatch
;
7889 read_op
.params
.attrs
= &src_attrs
;
7890 read_op
.params
.lastmod
= src_mtime
;
7891 read_op
.params
.obj_size
= &obj_size
;
7893 ret
= read_op
.prepare();
7898 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
7899 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
7901 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
7902 attrs
.erase(RGW_ATTR_ID_TAG
);
7903 attrs
.erase(RGW_ATTR_PG_VER
);
7904 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
7905 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
7906 if (cmp
!= src_attrs
.end())
7907 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
7909 RGWObjManifest manifest
;
7910 RGWObjState
*astate
= NULL
;
7912 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
);
7917 vector
<rgw_raw_obj
> ref_objs
;
7920 /* dest is in a different zonegroup, copy it there */
7921 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
7923 uint64_t max_chunk_size
;
7925 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
7927 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
7933 if (!get_obj_data_pool(src_bucket_info
.placement_rule
, src_obj
, &src_pool
)) {
7934 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
7937 if (!get_obj_data_pool(dest_bucket_info
.placement_rule
, dest_obj
, &dest_pool
)) {
7938 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
7943 bool copy_data
= !astate
->has_manifest
|| (src_pool
!= dest_pool
);
7944 bool copy_first
= false;
7945 if (astate
->has_manifest
) {
7946 if (!astate
->manifest
.has_tail()) {
7949 uint64_t head_size
= astate
->manifest
.get_head_size();
7951 if (head_size
> 0) {
7952 if (head_size
> max_chunk_size
) {
7962 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
7963 if (iter
!= attrs
.end()) {
7964 *petag
= iter
->second
;
7968 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
7969 return copy_obj_data(obj_ctx
, dest_bucket_info
, read_op
, obj_size
- 1, dest_obj
, src_obj
,
7970 max_chunk_size
, mtime
, real_time(), attrs
, category
, olh_epoch
, delete_at
,
7971 version_id
, ptag
, petag
);
7974 RGWObjManifest::obj_iterator miter
= astate
->manifest
.obj_begin();
7976 if (copy_first
) { // we need to copy first chunk, not increase refcount
7981 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
7986 bool versioned_dest
= dest_bucket_info
.versioning_enabled();
7988 if (version_id
&& !version_id
->empty()) {
7989 versioned_dest
= true;
7990 dest_obj
.key
.set_instance(*version_id
);
7991 } else if (versioned_dest
) {
7992 gen_rand_obj_instance_name(&dest_obj
);
7995 bufferlist first_chunk
;
7997 bool copy_itself
= (dest_obj
== src_obj
);
7998 RGWObjManifest
*pmanifest
;
7999 ldout(cct
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
8001 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
8002 RGWRados::Object::Write
write_op(&dest_op_target
);
8011 append_rand_alpha(cct
, tag
, tag
, 32);
8015 manifest
= astate
->manifest
;
8016 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
8017 if (tail_placement
.bucket
.name
.empty()) {
8018 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
8021 for (; miter
!= astate
->manifest
.obj_end(); ++miter
) {
8022 ObjectWriteOperation op
;
8023 cls_refcount_get(op
, tag
, true);
8024 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
8025 ref
.ioctx
.locator_set_key(loc
.loc
);
8027 ret
= ref
.ioctx
.operate(loc
.oid
, &op
);
8032 ref_objs
.push_back(loc
);
8035 pmanifest
= &manifest
;
8037 pmanifest
= &astate
->manifest
;
8038 /* don't send the object's tail for garbage collection */
8039 astate
->keep_tail
= true;
8043 ret
= read_op
.read(0, max_chunk_size
, first_chunk
);
8048 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
8050 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
8053 write_op
.meta
.data
= &first_chunk
;
8054 write_op
.meta
.manifest
= pmanifest
;
8055 write_op
.meta
.ptag
= &tag
;
8056 write_op
.meta
.owner
= dest_bucket_info
.owner
;
8057 write_op
.meta
.mtime
= mtime
;
8058 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
8059 write_op
.meta
.category
= category
;
8060 write_op
.meta
.olh_epoch
= olh_epoch
;
8061 write_op
.meta
.delete_at
= delete_at
;
8063 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
);
8072 vector
<rgw_raw_obj
>::iterator riter
;
8076 /* rollback reference */
8077 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
8078 ObjectWriteOperation op
;
8079 cls_refcount_put(op
, tag
, true);
8081 ref
.ioctx
.locator_set_key(riter
->loc
);
8083 int r
= ref
.ioctx
.operate(riter
->oid
, &op
);
8085 ldout(cct
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
8093 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
8094 RGWBucketInfo
& dest_bucket_info
,
8095 RGWRados::Object::Read
& read_op
, off_t end
,
8098 uint64_t max_chunk_size
,
8100 real_time set_mtime
,
8101 map
<string
, bufferlist
>& attrs
,
8102 RGWObjCategory category
,
8104 real_time delete_at
,
8107 ceph::buffer::list
*petag
)
8109 bufferlist first_chunk
;
8110 RGWObjManifest manifest
;
8113 append_rand_alpha(cct
, tag
, tag
, 32);
8115 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
8116 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.get_oid(),
8117 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
8119 processor
.set_version_id(*version_id
);
8121 processor
.set_olh_epoch(olh_epoch
);
8122 int ret
= processor
.prepare(this, NULL
);
8130 ret
= read_op
.read(ofs
, end
, bl
);
8132 uint64_t read_len
= ret
;
8139 ret
= processor
.handle_data(bl
, ofs
, &handle
, &obj
, &again
);
8143 ret
= processor
.throttle_data(handle
, obj
, read_len
, false);
8149 } while (ofs
<= end
);
8152 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
8153 if (iter
!= attrs
.end()) {
8154 bufferlist
& bl
= iter
->second
;
8155 etag
= string(bl
.c_str(), bl
.length());
8161 uint64_t accounted_size
;
8163 bool compressed
{false};
8164 RGWCompressionInfo cs_info
;
8165 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
8167 ldout(cct
, 0) << "ERROR: failed to read compression info" << dendl
;
8170 // pass original size if compressed
8171 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
8174 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
);
8177 bool RGWRados::is_meta_master()
8179 if (!get_zonegroup().is_master_zonegroup()) {
8183 return (get_zonegroup().master_zone
== zone_public_config
.id
);
8187 * Check to see if the bucket metadata could be synced
8188 * bucket: the bucket to check
8189 * Returns false is the bucket is not synced
8191 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket
& bucket
)
8194 /* no current period */
8195 if (current_period
.get_id().empty()) {
8199 /* zonegroup is not master zonegroup */
8200 if (!get_zonegroup().is_master_zonegroup()) {
8204 /* single zonegroup and a single zone */
8205 if (current_period
.is_single_zonegroup(cct
, this) && get_zonegroup().zones
.size() == 1) {
8209 /* zone is not master */
8210 if (get_zonegroup().master_zone
.compare(zone_public_config
.id
) != 0) {
8217 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
)
8219 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
8220 rgw_obj_index_key marker
;
8225 #define NUM_ENTRIES 1000
8226 int r
= cls_bucket_list(bucket_info
, RGW_NO_SHARD
, marker
, prefix
, NUM_ENTRIES
, true, ent_map
,
8227 &is_truncated
, &marker
);
8232 std::map
<string
, rgw_bucket_dir_entry
>::iterator eiter
;
8233 for (eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
8236 if (rgw_obj_key::oid_to_key_in_ns(eiter
->second
.key
.name
, &obj
, ns
))
8239 } while (is_truncated
);
8245 * bucket: the name of the bucket to delete
8246 * Returns 0 on success, -ERR# otherwise.
8248 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, bool check_empty
)
8250 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8251 librados::IoCtx index_ctx
;
8252 map
<int, string
> bucket_objs
;
8253 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
8258 r
= check_bucket_empty(bucket_info
);
8264 r
= rgw_bucket_delete_bucket_obj(this, bucket
.tenant
, bucket
.name
, objv_tracker
);
8268 /* if the bucket is not synced we can remove the meta file */
8269 if (!is_syncing_bucket_meta(bucket
)) {
8270 RGWObjVersionTracker objv_tracker
;
8271 string entry
= bucket
.get_key();
8272 r
= rgw_bucket_instance_remove_entry(this, entry
, &objv_tracker
);
8276 /* remove bucket index objects*/
8277 map
<int, string
>::const_iterator biter
;
8278 for (biter
= bucket_objs
.begin(); biter
!= bucket_objs
.end(); ++biter
) {
8279 index_ctx
.remove(biter
->second
);
8285 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
8288 map
<string
, bufferlist
> attrs
;
8289 RGWObjectCtx
obj_ctx(this);
8291 if (bucket
.bucket_id
.empty()) {
8292 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
8294 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
);
8297 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
8301 info
.owner
= owner
.get_id();
8303 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
8305 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
8313 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
8317 vector
<rgw_bucket
>::iterator iter
;
8319 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
8320 rgw_bucket
& bucket
= *iter
;
8322 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
8324 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
8327 map
<string
, bufferlist
> attrs
;
8328 RGWObjectCtx
obj_ctx(this);
8329 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
8331 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8336 info
.flags
&= ~BUCKET_SUSPENDED
;
8338 info
.flags
|= BUCKET_SUSPENDED
;
8341 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
8343 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8351 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
8353 RGWBucketInfo bucket_info
;
8354 RGWObjectCtx
obj_ctx(this);
8355 int ret
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
);
8360 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
8364 int RGWRados::Object::complete_atomic_modification()
8366 if (!state
->has_manifest
|| state
->keep_tail
)
8369 cls_rgw_obj_chain chain
;
8370 store
->update_gc_chain(obj
, state
->manifest
, &chain
);
8372 if (chain
.empty()) {
8376 string tag
= state
->obj_tag
.to_str();
8377 return store
->gc
->send_chain(chain
, tag
, false); // do it async
8380 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
8382 RGWObjManifest::obj_iterator iter
;
8383 rgw_raw_obj raw_head
;
8384 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
8385 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
8386 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
8387 if (mobj
== raw_head
)
8389 cls_rgw_obj_key
key(mobj
.oid
);
8390 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
8394 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
, bool sync
)
8396 return gc
->send_chain(chain
, tag
, sync
);
8399 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
, string
& bucket_oid
)
8401 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8402 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8406 if (bucket
.bucket_id
.empty()) {
8407 ldout(cct
, 0) << "ERROR: empty bucket id for bucket operation" << dendl
;
8411 bucket_oid
= dir_oid_prefix
;
8412 bucket_oid
.append(bucket
.bucket_id
);
8417 int RGWRados::open_bucket_index_base(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8418 string
& bucket_oid_base
) {
8419 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8420 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8424 if (bucket
.bucket_id
.empty()) {
8425 ldout(cct
, 0) << "ERROR: empty bucket_id for bucket operation" << dendl
;
8429 bucket_oid_base
= dir_oid_prefix
;
8430 bucket_oid_base
.append(bucket
.bucket_id
);
8436 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8437 map
<int, string
>& bucket_objs
, int shard_id
, map
<int, string
> *bucket_instance_ids
) {
8438 string bucket_oid_base
;
8439 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8444 get_bucket_index_objects(bucket_oid_base
, bucket_info
.num_shards
, bucket_objs
, shard_id
);
8445 if (bucket_instance_ids
) {
8446 get_bucket_instance_ids(bucket_info
, shard_id
, bucket_instance_ids
);
8451 template<typename T
>
8452 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8453 map
<int, string
>& oids
, map
<int, T
>& bucket_objs
,
8454 int shard_id
, map
<int, string
> *bucket_instance_ids
)
8456 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, bucket_instance_ids
);
8460 map
<int, string
>::const_iterator iter
= oids
.begin();
8461 for (; iter
!= oids
.end(); ++iter
) {
8462 bucket_objs
[iter
->first
] = T();
8467 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8468 const string
& obj_key
, string
*bucket_obj
, int *shard_id
)
8470 string bucket_oid_base
;
8471 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8475 RGWObjectCtx
obj_ctx(this);
8477 ret
= get_bucket_index_object(bucket_oid_base
, obj_key
, bucket_info
.num_shards
,
8478 (RGWBucketInfo::BIShardsHashType
)bucket_info
.bucket_index_shard_hash_type
, bucket_obj
, shard_id
);
8480 ldout(cct
, 10) << "get_bucket_index_object() returned ret=" << ret
<< dendl
;
8486 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8487 int shard_id
, string
*bucket_obj
)
8489 string bucket_oid_base
;
8490 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8494 RGWObjectCtx
obj_ctx(this);
8496 get_bucket_index_object(bucket_oid_base
, bucket_info
.num_shards
,
8497 shard_id
, bucket_obj
);
8501 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
8502 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
8504 for (const auto& pair
: header
.stats
) {
8505 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
8506 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
8508 RGWStorageStats
& s
= stats
[category
];
8510 s
.category
= category
;
8511 s
.size
+= header_stats
.total_size
;
8512 s
.size_rounded
+= header_stats
.total_size_rounded
;
8513 s
.size_utilized
+= header_stats
.actual_size
;
8514 s
.num_objects
+= header_stats
.num_entries
;
8518 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
8519 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
8520 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
8522 librados::IoCtx index_ctx
;
8523 // key - bucket index object id
8524 // value - bucket index check OP returned result with the given bucket index object (shard)
8525 map
<int, string
> oids
;
8526 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
8528 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, bucket_objs_ret
);
8533 ret
= CLSRGWIssueBucketCheck(index_ctx
, oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8538 // Aggregate results (from different shards if there is any)
8539 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
8540 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
8541 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
8542 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
8548 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
8550 librados::IoCtx index_ctx
;
8551 map
<int, string
> bucket_objs
;
8553 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
8558 return CLSRGWIssueBucketRebuild(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8561 int RGWRados::bucket_set_reshard(RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
8563 librados::IoCtx index_ctx
;
8564 map
<int, string
> bucket_objs
;
8566 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
8571 return CLSRGWIssueSetBucketResharding(index_ctx
, bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8574 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
8576 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
8577 std::string oid
, key
;
8578 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
8582 RGWObjState
*state
= NULL
;
8584 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false);
8588 if (!state
->is_atomic
) {
8589 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
8593 if (state
->obj_tag
.length() == 0) {// check for backward compatibility
8594 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
8598 string tag
= state
->obj_tag
.c_str();
8600 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
8602 return gc
->defer_chain(tag
, false);
8605 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
8607 list
<string
> prefixes
;
8608 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
8609 cls_rgw_remove_obj(op
, prefixes
);
8612 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
8614 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
8617 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
8619 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
8625 * bucket: name of the bucket storing the object
8626 * obj: name of the object to delete
8627 * Returns: 0 on success, -ERR# otherwise.
8629 int RGWRados::Object::Delete::delete_obj()
8631 RGWRados
*store
= target
->get_store();
8632 rgw_obj
& src_obj
= target
->get_obj();
8633 const string
& instance
= src_obj
.key
.instance
;
8634 rgw_obj obj
= src_obj
;
8636 if (instance
== "null") {
8637 obj
.key
.instance
.clear();
8640 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
8642 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
8643 if (instance
.empty() || explicit_marker_version
) {
8644 rgw_obj marker
= obj
;
8646 if (!params
.marker_version_id
.empty()) {
8647 if (params
.marker_version_id
!= "null") {
8648 marker
.key
.set_instance(params
.marker_version_id
);
8650 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
8651 store
->gen_rand_obj_instance_name(&marker
);
8654 result
.version_id
= marker
.key
.instance
;
8655 result
.delete_marker
= true;
8657 struct rgw_bucket_dir_entry_meta meta
;
8659 meta
.owner
= params
.obj_owner
.get_id().to_str();
8660 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
8662 if (real_clock::is_zero(params
.mtime
)) {
8663 meta
.mtime
= real_clock::now();
8665 meta
.mtime
= params
.mtime
;
8668 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, params
.zones_trace
);
8673 rgw_bucket_dir_entry dirent
;
8675 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
8679 result
.delete_marker
= dirent
.is_delete_marker();
8680 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, params
.zones_trace
);
8684 result
.version_id
= instance
;
8688 int r
= target
->get_bucket_shard(&bs
);
8690 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
8694 r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
8696 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
8704 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
8710 r
= target
->get_state(&state
, false);
8714 ObjectWriteOperation op
;
8716 if (!real_clock::is_zero(params
.unmod_since
)) {
8717 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
8718 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
8719 if (!params
.high_precision_time
) {
8724 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
8725 if (ctime
> unmod
) {
8726 return -ERR_PRECONDITION_FAILED
;
8729 /* only delete object if mtime is less than or equal to params.unmod_since */
8730 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
8732 uint64_t obj_size
= state
->size
;
8734 if (!real_clock::is_zero(params
.expiration_time
)) {
8736 real_time delete_at
;
8738 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
8740 bufferlist::iterator iter
= bl
.begin();
8741 ::decode(delete_at
, iter
);
8742 } catch (buffer::error
& err
) {
8743 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
8747 if (params
.expiration_time
!= delete_at
) {
8748 return -ERR_PRECONDITION_FAILED
;
8751 return -ERR_PRECONDITION_FAILED
;
8755 if (!state
->exists
) {
8756 target
->invalidate_state();
8760 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true);
8764 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
8766 RGWRados::Bucket
bop(store
, bucket_info
);
8767 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
8769 index_op
.set_zones_trace(params
.zones_trace
);
8770 index_op
.set_bilog_flags(params
.bilog_flags
);
8773 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
);
8777 store
->remove_rgw_head_obj(op
);
8778 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8779 bool need_invalidate
= false;
8780 if (r
== -ECANCELED
) {
8781 /* raced with another operation, we can regard it as removed */
8782 need_invalidate
= true;
8785 bool removed
= (r
>= 0);
8787 int64_t poolid
= ref
.ioctx
.get_id();
8789 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
8790 if (obj_tombstone_cache
) {
8791 tombstone_entry entry
{*state
};
8792 obj_tombstone_cache
->add(obj
, entry
);
8794 r
= index_op
.complete_del(poolid
, ref
.ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
8796 int ret
= index_op
.cancel();
8798 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
8802 int ret
= target
->complete_atomic_modification();
8804 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
8806 /* other than that, no need to propagate error */
8809 if (need_invalidate
) {
8810 target
->invalidate_state();
8816 /* update quota cache */
8817 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_size
);
8822 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
8823 const RGWBucketInfo
& bucket_info
,
8825 int versioning_status
,
8826 uint16_t bilog_flags
,
8827 const real_time
& expiration_time
,
8828 rgw_zone_set
*zones_trace
)
8830 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
8831 RGWRados::Object::Delete
del_op(&del_target
);
8833 del_op
.params
.bucket_owner
= bucket_info
.owner
;
8834 del_op
.params
.versioning_status
= versioning_status
;
8835 del_op
.params
.bilog_flags
= bilog_flags
;
8836 del_op
.params
.expiration_time
= expiration_time
;
8837 del_op
.params
.zones_trace
= zones_trace
;
8839 return del_op
.delete_obj();
8842 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
8846 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
8851 ObjectWriteOperation op
;
8854 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8861 int RGWRados::delete_system_obj(rgw_raw_obj
& obj
, RGWObjVersionTracker
*objv_tracker
)
8864 ldout(cct
, 1) << "delete_system_obj got empty object name "
8865 << obj
<< ", returning EINVAL" << dendl
;
8870 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
8875 ObjectWriteOperation op
;
8878 objv_tracker
->prepare_op_for_write(&op
);
8882 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8889 int RGWRados::delete_obj_index(const rgw_obj
& obj
)
8891 std::string oid
, key
;
8892 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
8894 RGWObjectCtx
obj_ctx(this);
8896 RGWBucketInfo bucket_info
;
8897 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
);
8899 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
8903 RGWRados::Bucket
bop(this, bucket_info
);
8904 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
8906 real_time removed_mtime
;
8907 int r
= index_op
.complete_del(-1 /* pool */, 0, removed_mtime
, NULL
);
8912 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
8916 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
8917 if (mi
!= manifest
.obj_end()) {
8918 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
8920 tag
= mi
.get_location().get_raw_obj(store
).oid
;
8924 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
8925 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
8927 hash
.Update((const byte
*)manifest_bl
.c_str(), manifest_bl
.length());
8929 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
8930 if (iter
!= attrset
.end()) {
8931 bufferlist
& bl
= iter
->second
;
8932 hash
.Update((const byte
*)bl
.c_str(), bl
.length());
8936 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
8937 tag
.append(md5_str
);
8939 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
8941 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
8944 static bool is_olh(map
<string
, bufferlist
>& attrs
)
8946 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
8947 return (iter
!= attrs
.end());
8950 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
8952 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
8953 return (iter
!= attrs
.end());
8956 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8957 RGWObjState
*olh_state
, RGWObjState
**target_state
)
8959 assert(olh_state
->is_olh
);
8962 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
8966 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false);
8974 int RGWRados::get_system_obj_state_impl(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
8980 RGWRawObjState
*s
= rctx
->raw
.get_state(obj
);
8981 ldout(cct
, 20) << "get_system_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
8989 int r
= raw_obj_stat(obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), objv_tracker
);
8992 s
->has_attrs
= true;
8993 s
->mtime
= real_time();
9000 s
->has_attrs
= true;
9001 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
9003 if (s
->obj_tag
.length())
9004 ldout(cct
, 20) << "get_system_obj_state: setting s->obj_tag to "
9005 << s
->obj_tag
.c_str() << dendl
;
9007 ldout(cct
, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl
;
9012 int RGWRados::get_system_obj_state(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
9017 ret
= get_system_obj_state_impl(rctx
, obj
, state
, objv_tracker
);
9018 } while (ret
== -EAGAIN
);
9023 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
9024 RGWObjState
**state
, bool follow_olh
, bool assume_noent
)
9030 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
9032 RGWObjState
*s
= rctx
->obj
.get_state(obj
);
9033 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
9036 if (s
->is_olh
&& need_follow_olh
) {
9037 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
9044 rgw_raw_obj raw_obj
;
9045 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
9049 if (!assume_noent
) {
9050 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
);
9055 s
->has_attrs
= true;
9056 tombstone_entry entry
;
9057 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
9058 s
->mtime
= entry
.mtime
;
9059 s
->zone_short_id
= entry
.zone_short_id
;
9060 s
->pg_ver
= entry
.pg_ver
;
9061 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
9062 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
9064 s
->mtime
= real_time();
9072 s
->has_attrs
= true;
9073 s
->accounted_size
= s
->size
;
9075 auto iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
9076 const bool compressed
= (iter
!= s
->attrset
.end());
9078 // use uncompressed size for accounted_size
9080 RGWCompressionInfo info
;
9081 auto p
= iter
->second
.begin();
9083 s
->accounted_size
= info
.orig_size
;
9084 } catch (buffer::error
&) {
9085 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
9090 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
9091 if (iter
!= s
->attrset
.end()) {
9092 bufferlist bl
= iter
->second
;
9093 bufferlist::iterator it
= bl
.begin();
9094 it
.copy(bl
.length(), s
->shadow_obj
);
9095 s
->shadow_obj
[bl
.length()] = '\0';
9097 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
9099 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
9100 if (manifest_bl
.length()) {
9101 bufferlist::iterator miter
= manifest_bl
.begin();
9103 ::decode(s
->manifest
, miter
);
9104 s
->has_manifest
= true;
9105 s
->manifest
.set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
9106 broken due to old bugs */
9107 s
->size
= s
->manifest
.get_obj_size();
9109 s
->accounted_size
= s
->size
;
9110 } catch (buffer::error
& err
) {
9111 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
9114 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
.get_obj_size() << dendl
;
9115 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20) && s
->manifest
.has_explicit_objs()) {
9116 RGWObjManifest::obj_iterator mi
;
9117 for (mi
= s
->manifest
.obj_begin(); mi
!= s
->manifest
.obj_end(); ++mi
) {
9118 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
9122 if (!s
->obj_tag
.length()) {
9124 * Uh oh, something's wrong, object with manifest should have tag. Let's
9125 * create one out of the manifest, would be unique
9127 generate_fake_tag(this, s
->attrset
, s
->manifest
, manifest_bl
, s
->obj_tag
);
9131 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
9132 if (aiter
!= s
->attrset
.end()) {
9133 bufferlist
& pg_ver_bl
= aiter
->second
;
9134 if (pg_ver_bl
.length()) {
9135 bufferlist::iterator pgbl
= pg_ver_bl
.begin();
9137 ::decode(s
->pg_ver
, pgbl
);
9138 } catch (buffer::error
& err
) {
9139 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
9143 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
9144 if (aiter
!= s
->attrset
.end()) {
9145 bufferlist
& zone_short_id_bl
= aiter
->second
;
9146 if (zone_short_id_bl
.length()) {
9147 bufferlist::iterator zbl
= zone_short_id_bl
.begin();
9149 ::decode(s
->zone_short_id
, zbl
);
9150 } catch (buffer::error
& err
) {
9151 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
9155 if (s
->obj_tag
.length())
9156 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
9158 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
9160 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9161 * it exist, and not only if is_olh() returns true
9163 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
9164 if (iter
!= s
->attrset
.end()) {
9165 s
->olh_tag
= iter
->second
;
9168 if (is_olh(s
->attrset
)) {
9171 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
9173 if (need_follow_olh
) {
9174 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
9181 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
9182 bool follow_olh
, bool assume_noent
)
9187 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, assume_noent
);
9188 } while (ret
== -EAGAIN
);
9193 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
)
9195 RGWObjState
*astate
;
9196 int r
= get_state(&astate
, true);
9201 *pmanifest
= &astate
->manifest
;
9206 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
)
9209 int r
= source
->get_state(&state
, true);
9214 if (!state
->get_attr(name
, dest
))
9221 int RGWRados::Object::Stat::stat_async()
9223 RGWObjectCtx
& ctx
= source
->get_ctx();
9224 rgw_obj
& obj
= source
->get_obj();
9225 RGWRados
*store
= source
->get_store();
9227 RGWObjState
*s
= ctx
.obj
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
9231 result
.size
= s
->size
;
9232 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
9233 result
.attrs
= s
->attrset
;
9234 result
.has_manifest
= s
->has_manifest
;
9235 result
.manifest
= s
->manifest
;
9241 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
9243 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
9248 librados::ObjectReadOperation op
;
9249 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
9250 op
.getxattrs(&result
.attrs
, NULL
);
9251 state
.completion
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9252 state
.io_ctx
.locator_set_key(loc
);
9253 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
9255 ldout(store
->ctx(), 5) << __func__
9256 << ": ERROR: aio_operate() returned ret=" << r
9265 int RGWRados::Object::Stat::wait()
9267 if (!state
.completion
) {
9271 state
.completion
->wait_for_safe();
9272 state
.ret
= state
.completion
->get_return_value();
9273 state
.completion
->release();
9275 if (state
.ret
!= 0) {
9282 int RGWRados::Object::Stat::finish()
9284 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
9285 if (iter
!= result
.attrs
.end()) {
9286 bufferlist
& bl
= iter
->second
;
9287 bufferlist::iterator biter
= bl
.begin();
9289 ::decode(result
.manifest
, biter
);
9290 } catch (buffer::error
& err
) {
9291 RGWRados
*store
= source
->get_store();
9292 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
9295 result
.has_manifest
= true;
9302 * Get an attribute for a system object.
9303 * obj: the object to get attr
9304 * name: name of the attr to retrieve
9305 * dest: bufferlist to store the result in
9306 * Returns: 0 on success, -ERR# otherwise.
9308 int RGWRados::system_obj_get_attr(rgw_raw_obj
& obj
, const char *name
, bufferlist
& dest
)
9312 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
9317 ObjectReadOperation op
;
9320 op
.getxattr(name
, &dest
, &rval
);
9322 r
= ref
.ioctx
.operate(ref
.oid
, &op
, NULL
);
9329 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
9330 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
9331 ObjectOperation
& op
, RGWObjState
**pstate
)
9336 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false);
9340 RGWObjState
*state
= *pstate
;
9342 if (!state
->is_atomic
) {
9343 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not appending atomic test" << dendl
;
9347 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
9348 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9350 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
9355 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, bool assume_noent
)
9357 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, assume_noent
);
9360 void RGWRados::Object::invalidate_state()
9362 ctx
.obj
.invalidate(obj
);
9365 void RGWRados::SystemObject::invalidate_state()
9367 ctx
.raw
.invalidate(obj
);
9370 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
9371 const char *if_match
, const char *if_nomatch
, bool removal_op
)
9373 int r
= get_state(&state
, false);
9377 bool need_guard
= (state
->has_manifest
|| (state
->obj_tag
.length() != 0) ||
9378 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
9381 if (!state
->is_atomic
) {
9382 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
9386 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
9393 /* first verify that the object wasn't replaced under */
9394 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
9395 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9396 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9400 if (strcmp(if_match
, "*") == 0) {
9401 // test the object is existing
9402 if (!state
->exists
) {
9403 return -ERR_PRECONDITION_FAILED
;
9407 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9408 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
9409 return -ERR_PRECONDITION_FAILED
;
9415 if (strcmp(if_nomatch
, "*") == 0) {
9416 // test the object is NOT existing
9417 if (state
->exists
) {
9418 return -ERR_PRECONDITION_FAILED
;
9422 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9423 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
9424 return -ERR_PRECONDITION_FAILED
;
9431 if (state
->exists
) {
9433 store
->remove_rgw_head_obj(op
);
9440 /* the object is being removed, no need to update its tag */
9445 state
->write_tag
= *ptag
;
9447 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
9450 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
9452 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
9454 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
9459 int RGWRados::system_obj_set_attr(void *ctx
, rgw_raw_obj
& obj
, const char *name
, bufferlist
& bl
,
9460 RGWObjVersionTracker
*objv_tracker
)
9462 map
<string
, bufferlist
> attrs
;
9464 return system_obj_set_attrs(ctx
, obj
, attrs
, NULL
, objv_tracker
);
9467 int RGWRados::system_obj_set_attrs(void *ctx
, rgw_raw_obj
& obj
,
9468 map
<string
, bufferlist
>& attrs
,
9469 map
<string
, bufferlist
>* rmattrs
,
9470 RGWObjVersionTracker
*objv_tracker
)
9474 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
9478 ObjectWriteOperation op
;
9481 objv_tracker
->prepare_op_for_write(&op
);
9484 map
<string
, bufferlist
>::iterator iter
;
9486 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9487 const string
& name
= iter
->first
;
9488 op
.rmxattr(name
.c_str());
9492 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9493 const string
& name
= iter
->first
;
9494 bufferlist
& bl
= iter
->second
;
9499 op
.setxattr(name
.c_str(), bl
);
9507 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9515 * Set an attr on an object.
9516 * bucket: name of the bucket holding the object
9517 * obj: name of the object to set the attr on
9518 * name: the attr to set
9519 * bl: the contents of the attr
9520 * Returns: 0 on success, -ERR# otherwise.
9522 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
9524 map
<string
, bufferlist
> attrs
;
9526 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
);
9529 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
,
9530 map
<string
, bufferlist
>& attrs
,
9531 map
<string
, bufferlist
>* rmattrs
)
9534 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
9538 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
9540 ObjectWriteOperation op
;
9541 RGWObjState
*state
= NULL
;
9543 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
);
9547 map
<string
, bufferlist
>::iterator iter
;
9549 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9550 const string
& name
= iter
->first
;
9551 op
.rmxattr(name
.c_str());
9555 const rgw_bucket
& bucket
= obj
.bucket
;
9557 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9558 const string
& name
= iter
->first
;
9559 bufferlist
& bl
= iter
->second
;
9564 op
.setxattr(name
.c_str(), bl
);
9566 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
9571 rgw_obj_index_key obj_key
;
9572 obj
.key
.get_index_key(&obj_key
);
9574 objexp_hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
9575 } catch (buffer::error
& err
) {
9576 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
9584 RGWObjectCtx
obj_ctx(this);
9587 RGWRados::Bucket
bop(this, bucket_info
);
9588 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9592 append_rand_alpha(cct
, tag
, tag
, 32);
9593 state
->write_tag
= tag
;
9594 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
9599 bl
.append(tag
.c_str(), tag
.size() + 1);
9601 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
9604 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9607 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
9608 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
9609 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
9610 string
etag(etag_bl
.c_str(), etag_bl
.length());
9611 string
content_type(content_type_bl
.c_str(), content_type_bl
.length());
9612 uint64_t epoch
= ref
.ioctx
.get_last_version();
9613 int64_t poolid
= ref
.ioctx
.get_id();
9614 real_time mtime
= real_clock::now();
9615 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
9616 mtime
, etag
, content_type
, &acl_bl
,
9617 RGW_OBJ_CATEGORY_MAIN
, NULL
);
9619 int ret
= index_op
.cancel();
9621 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
9629 state
->obj_tag
.swap(bl
);
9631 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9632 state
->attrset
.erase(iter
->first
);
9635 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9636 state
->attrset
[iter
->first
] = iter
->second
;
9643 int RGWRados::Object::Read::prepare()
9645 RGWRados
*store
= source
->get_store();
9646 CephContext
*cct
= store
->ctx();
9650 map
<string
, bufferlist
>::iterator iter
;
9652 RGWObjState
*astate
;
9653 int r
= source
->get_state(&astate
, true);
9657 if (!astate
->exists
) {
9661 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
9663 state
.obj
= astate
->obj
;
9664 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
9666 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, &state
.io_ctx
);
9671 *params
.attrs
= astate
->attrset
;
9672 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
9673 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
9674 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
9679 /* Convert all times go GMT to make them compatible */
9680 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
9681 obj_time_weight src_weight
;
9682 src_weight
.init(astate
);
9683 src_weight
.high_precision
= conds
.high_precision_time
;
9685 obj_time_weight dest_weight
;
9686 dest_weight
.high_precision
= conds
.high_precision_time
;
9688 if (conds
.mod_ptr
) {
9689 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
9690 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
9691 if (!(dest_weight
< src_weight
)) {
9692 return -ERR_NOT_MODIFIED
;
9696 if (conds
.unmod_ptr
) {
9697 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
9698 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
9699 if (dest_weight
< src_weight
) {
9700 return -ERR_PRECONDITION_FAILED
;
9704 if (conds
.if_match
|| conds
.if_nomatch
) {
9705 r
= get_attr(RGW_ATTR_ETAG
, etag
);
9709 if (conds
.if_match
) {
9710 string if_match_str
= rgw_string_unquote(conds
.if_match
);
9711 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-Match: " << if_match_str
<< dendl
;
9712 if (if_match_str
.compare(etag
.c_str()) != 0) {
9713 return -ERR_PRECONDITION_FAILED
;
9717 if (conds
.if_nomatch
) {
9718 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
9719 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
9720 if (if_nomatch_str
.compare(etag
.c_str()) == 0) {
9721 return -ERR_NOT_MODIFIED
;
9726 if (params
.obj_size
)
9727 *params
.obj_size
= astate
->size
;
9729 *params
.lastmod
= astate
->mtime
;
9734 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
9741 } else if (end
< 0) {
9746 if (ofs
>= (off_t
)obj_size
) {
9749 if (end
>= (off_t
)obj_size
) {
9756 int RGWRados::SystemObject::get_state(RGWRawObjState
**pstate
, RGWObjVersionTracker
*objv_tracker
)
9758 return store
->get_system_obj_state(&ctx
, obj
, pstate
, objv_tracker
);
9761 int RGWRados::stat_system_obj(RGWObjectCtx
& obj_ctx
,
9762 RGWRados::SystemObject::Read::GetObjState
& state
,
9764 map
<string
, bufferlist
> *attrs
,
9767 RGWObjVersionTracker
*objv_tracker
)
9769 RGWRawObjState
*astate
= NULL
;
9771 int r
= get_system_obj_state(&obj_ctx
, obj
, &astate
, objv_tracker
);
9775 if (!astate
->exists
) {
9780 *attrs
= astate
->attrset
;
9781 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
9782 map
<string
, bufferlist
>::iterator iter
;
9783 for (iter
= attrs
->begin(); iter
!= attrs
->end(); ++iter
) {
9784 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
9790 *obj_size
= astate
->size
;
9792 *lastmod
= astate
->mtime
;
9798 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
9800 RGWRados
*store
= target
->get_store();
9804 #define NUM_RESHARD_RETRIES 10
9805 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
9806 int ret
= get_bucket_shard(&bs
);
9808 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9812 if (r
!= -ERR_BUSY_RESHARDING
) {
9815 ldout(store
->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
9816 string new_bucket_id
;
9817 r
= store
->block_while_resharding(bs
, &new_bucket_id
);
9818 if (r
== -ERR_BUSY_RESHARDING
) {
9824 ldout(store
->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
9825 i
= 0; /* resharding is finished, make sure we can retry */
9826 r
= target
->update_bucket_id(new_bucket_id
);
9828 ldout(store
->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
9845 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker
*objv_tracker
)
9847 RGWRados
*store
= source
->get_store();
9848 rgw_raw_obj
& obj
= source
->get_obj();
9850 return store
->stat_system_obj(source
->get_ctx(), state
, obj
, stat_params
.attrs
,
9851 stat_params
.lastmod
, stat_params
.obj_size
, objv_tracker
);
9854 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
)
9859 RGWRados
*store
= target
->get_store();
9861 if (write_tag
&& write_tag
->length()) {
9862 optag
= string(write_tag
->c_str(), write_tag
->length());
9864 if (optag
.empty()) {
9865 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
9869 int r
= guard_reshard(nullptr, [&](BucketShard
*bs
) -> int {
9870 return store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
, zones_trace
);
9881 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
9882 uint64_t size
, uint64_t accounted_size
,
9883 ceph::real_time
& ut
, const string
& etag
,
9884 const string
& content_type
,
9886 RGWObjCategory category
,
9887 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
)
9892 RGWRados
*store
= target
->get_store();
9895 int ret
= get_bucket_shard(&bs
);
9897 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9901 rgw_bucket_dir_entry ent
;
9902 obj
.key
.get_index_key(&ent
.key
);
9903 ent
.meta
.size
= size
;
9904 ent
.meta
.accounted_size
= accounted_size
;
9905 ent
.meta
.mtime
= ut
;
9906 ent
.meta
.etag
= etag
;
9908 ent
.meta
.user_data
= *user_data
;
9911 if (acl_bl
&& acl_bl
->length()) {
9912 int ret
= store
->decode_policy(*acl_bl
, &owner
);
9914 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
9917 ent
.meta
.owner
= owner
.get_id().to_str();
9918 ent
.meta
.owner_display_name
= owner
.get_display_name();
9919 ent
.meta
.content_type
= content_type
;
9921 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
9923 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9925 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9931 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
9932 real_time
& removed_mtime
,
9933 list
<rgw_obj_index_key
> *remove_objs
)
9938 RGWRados
*store
= target
->get_store();
9941 int ret
= get_bucket_shard(&bs
);
9943 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9947 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
9949 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9951 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9958 int RGWRados::Bucket::UpdateIndex::cancel()
9963 RGWRados
*store
= target
->get_store();
9966 int ret
= guard_reshard(&bs
, [&](BucketShard
*bs
) -> int {
9967 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
, zones_trace
);
9971 * need to update data log anyhow, so that whoever follows needs to update its internal markers
9972 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
9973 * have no way to tell that they're all caught up
9975 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9977 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9983 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
)
9985 RGWRados
*store
= source
->get_store();
9986 CephContext
*cct
= store
->ctx();
9988 std::string oid
, key
;
9989 rgw_raw_obj read_obj
;
9990 uint64_t read_ofs
= ofs
;
9991 uint64_t len
, read_len
;
9992 bool reading_from_head
= true;
9993 ObjectReadOperation op
;
9995 bool merge_bl
= false;
9996 bufferlist
*pbl
= &bl
;
9998 uint64_t max_chunk_size
;
10000 RGWObjState
*astate
;
10001 int r
= source
->get_state(&astate
, true);
10008 len
= end
- ofs
+ 1;
10010 if (astate
->has_manifest
&& astate
->manifest
.has_tail()) {
10011 /* now get the relevant object part */
10012 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
10014 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
10015 read_obj
= iter
.get_location().get_raw_obj(store
);
10016 len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
10017 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
10018 reading_from_head
= (read_obj
== state
.head_obj
);
10020 read_obj
= state
.head_obj
;
10023 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
10025 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
10029 if (len
> max_chunk_size
)
10030 len
= max_chunk_size
;
10033 state
.io_ctx
.locator_set_key(read_obj
.loc
);
10037 if (reading_from_head
) {
10038 /* only when reading from the head object do we need to do the atomic test */
10039 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
);
10043 if (astate
&& astate
->prefetch_data
) {
10044 if (!ofs
&& astate
->data
.length() >= len
) {
10046 return bl
.length();
10049 if (ofs
< astate
->data
.length()) {
10050 unsigned copy_len
= min((uint64_t)astate
->data
.length() - ofs
, len
);
10051 astate
->data
.copy(ofs
, copy_len
, bl
);
10052 read_len
-= copy_len
;
10053 read_ofs
+= copy_len
;
10055 return bl
.length();
10063 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
10064 op
.read(read_ofs
, read_len
, pbl
, NULL
);
10066 r
= state
.io_ctx
.operate(read_obj
.oid
, &op
, NULL
);
10067 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10074 bl
.append(read_bl
);
10077 return bl
.length();
10080 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados
*store
, rgw_raw_obj
& obj
, rgw_rados_ref
**pref
)
10084 int r
= store
->get_raw_obj_ref(obj
, &ref
, &pool
);
10095 int RGWRados::get_system_obj(RGWObjectCtx
& obj_ctx
, RGWRados::SystemObject::Read::GetObjState
& read_state
,
10096 RGWObjVersionTracker
*objv_tracker
, rgw_raw_obj
& obj
,
10097 bufferlist
& bl
, off_t ofs
, off_t end
,
10098 map
<string
, bufferlist
> *attrs
,
10099 rgw_cache_entry_info
*cache_info
)
10102 ObjectReadOperation op
;
10107 len
= end
- ofs
+ 1;
10109 if (objv_tracker
) {
10110 objv_tracker
->prepare_op_for_read(&op
);
10113 ldout(cct
, 20) << "rados->read ofs=" << ofs
<< " len=" << len
<< dendl
;
10114 op
.read(ofs
, len
, &bl
, NULL
);
10117 op
.getxattrs(attrs
, NULL
);
10120 rgw_rados_ref
*ref
;
10121 int r
= read_state
.get_ref(this, obj
, &ref
);
10123 ldout(cct
, 20) << "read_state.get_ref() on obj=" << obj
<< " returned " << r
<< dendl
;
10126 r
= ref
->ioctx
.operate(ref
->oid
, &op
, NULL
);
10128 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10131 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10133 uint64_t op_ver
= ref
->ioctx
.get_last_version();
10135 if (read_state
.last_ver
> 0 &&
10136 read_state
.last_ver
!= op_ver
) {
10137 ldout(cct
, 5) << "raced with an object write, abort" << dendl
;
10141 read_state
.last_ver
= op_ver
;
10143 return bl
.length();
10146 int RGWRados::SystemObject::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
, RGWObjVersionTracker
*objv_tracker
)
10148 RGWRados
*store
= source
->get_store();
10149 rgw_raw_obj
& obj
= source
->get_obj();
10151 return store
->get_system_obj(source
->get_ctx(), state
, objv_tracker
, obj
, bl
, ofs
, end
, read_params
.attrs
, read_params
.cache_info
);
10154 int RGWRados::SystemObject::Read::get_attr(const char *name
, bufferlist
& dest
)
10156 RGWRados
*store
= source
->get_store();
10157 rgw_raw_obj
& obj
= source
->get_obj();
10159 return store
->system_obj_get_attr(obj
, name
, dest
);
10162 struct get_obj_data
;
10164 struct get_obj_aio_data
{
10165 struct get_obj_data
*op_data
;
10170 struct get_obj_io
{
10175 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
);
10177 struct get_obj_data
: public RefCountedObject
{
10182 map
<off_t
, get_obj_io
> io_map
;
10183 map
<off_t
, librados::AioCompletion
*> completion_map
;
10184 uint64_t total_read
;
10187 list
<get_obj_aio_data
> aio_data
;
10188 RGWGetDataCB
*client_cb
;
10189 std::atomic
<bool> cancelled
= { false };
10190 std::atomic
<int64_t> err_code
= { 0 };
10192 list
<bufferlist
> read_list
;
10194 explicit get_obj_data(CephContext
*_cct
)
10196 rados(NULL
), ctx(NULL
),
10197 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10199 throttle(cct
, "get_obj_data", cct
->_conf
->rgw_get_obj_window_size
, false) {}
10200 ~get_obj_data() override
{ }
10201 void set_cancelled(int r
) {
10206 bool is_cancelled() {
10210 int get_err_code() {
10214 int wait_next_io(bool *done
) {
10216 map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
10217 if (iter
== completion_map
.end()) {
10222 off_t cur_ofs
= iter
->first
;
10223 librados::AioCompletion
*c
= iter
->second
;
10226 c
->wait_for_safe_and_cb();
10227 int r
= c
->get_return_value();
10230 completion_map
.erase(cur_ofs
);
10232 if (completion_map
.empty()) {
10242 void add_io(off_t ofs
, off_t len
, bufferlist
**pbl
, AioCompletion
**pc
) {
10243 Mutex::Locker
l(lock
);
10245 const auto& io_iter
= io_map
.insert(
10246 map
<off_t
, get_obj_io
>::value_type(ofs
, get_obj_io()));
10248 assert(io_iter
.second
); // assert new insertion
10250 get_obj_io
& io
= (io_iter
.first
)->second
;
10253 struct get_obj_aio_data aio
;
10256 aio
.op_data
= this;
10258 aio_data
.push_back(aio
);
10260 struct get_obj_aio_data
*paio_data
= &aio_data
.back(); /* last element */
10262 librados::AioCompletion
*c
= librados::Rados::aio_create_completion((void *)paio_data
, NULL
, _get_obj_aio_completion_cb
);
10263 completion_map
[ofs
] = c
;
10267 /* we have a reference per IO, plus one reference for the calling function.
10268 * reference is dropped for each callback, plus when we're done iterating
10269 * over the parts */
10273 void cancel_io(off_t ofs
) {
10274 ldout(cct
, 20) << "get_obj_data::cancel_io() ofs=" << ofs
<< dendl
;
10276 map
<off_t
, AioCompletion
*>::iterator iter
= completion_map
.find(ofs
);
10277 if (iter
!= completion_map
.end()) {
10278 AioCompletion
*c
= iter
->second
;
10280 completion_map
.erase(ofs
);
10285 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10286 * need IoCtx to live, as io callback may still be called
10290 void cancel_all_io() {
10291 ldout(cct
, 20) << "get_obj_data::cancel_all_io()" << dendl
;
10292 Mutex::Locker
l(lock
);
10293 for (map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
10294 iter
!= completion_map
.end(); ++iter
) {
10295 librados::AioCompletion
*c
= iter
->second
;
10300 int get_complete_ios(off_t ofs
, list
<bufferlist
>& bl_list
) {
10301 Mutex::Locker
l(lock
);
10303 map
<off_t
, get_obj_io
>::iterator liter
= io_map
.begin();
10305 if (liter
== io_map
.end() ||
10306 liter
->first
!= ofs
) {
10310 map
<off_t
, librados::AioCompletion
*>::iterator aiter
;
10311 aiter
= completion_map
.find(ofs
);
10312 if (aiter
== completion_map
.end()) {
10313 /* completion map does not hold this io, it was cancelled */
10317 AioCompletion
*completion
= aiter
->second
;
10318 int r
= completion
->get_return_value();
10322 for (; aiter
!= completion_map
.end(); ++aiter
) {
10323 completion
= aiter
->second
;
10324 if (!completion
->is_safe()) {
10325 /* reached a request that is not yet complete, stop */
10329 r
= completion
->get_return_value();
10331 set_cancelled(r
); /* mark it as cancelled, so that we don't continue processing next operations */
10337 map
<off_t
, get_obj_io
>::iterator old_liter
= liter
++;
10338 bl_list
.push_back(old_liter
->second
.bl
);
10339 io_map
.erase(old_liter
);
10346 static int _get_obj_iterate_cb(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, const rgw_raw_obj
& read_obj
, off_t obj_ofs
, off_t read_ofs
, off_t len
, bool is_head_obj
, RGWObjState
*astate
, void *arg
)
10348 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
10350 return d
->rados
->get_obj_iterate_cb(d
->ctx
, astate
, bucket_info
, obj
, read_obj
, obj_ofs
, read_ofs
, len
, is_head_obj
, arg
);
10353 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
)
10355 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
10356 struct get_obj_data
*d
= aio_data
->op_data
;
10358 d
->rados
->get_obj_aio_completion_cb(cb
, arg
);
10362 void RGWRados::get_obj_aio_completion_cb(completion_t c
, void *arg
)
10364 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
10365 struct get_obj_data
*d
= aio_data
->op_data
;
10366 off_t ofs
= aio_data
->ofs
;
10367 off_t len
= aio_data
->len
;
10369 list
<bufferlist
> bl_list
;
10370 list
<bufferlist
>::iterator iter
;
10373 ldout(cct
, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs
<< " len=" << len
<< dendl
;
10374 d
->throttle
.put(len
);
10376 r
= rados_aio_get_return_value(c
);
10378 ldout(cct
, 0) << "ERROR: got unexpected error when trying to read object: " << r
<< dendl
;
10379 d
->set_cancelled(r
);
10383 if (d
->is_cancelled()) {
10387 d
->data_lock
.Lock();
10389 r
= d
->get_complete_ios(ofs
, bl_list
);
10394 d
->read_list
.splice(d
->read_list
.end(), bl_list
);
10397 d
->data_lock
.Unlock();
10403 int RGWRados::flush_read_list(struct get_obj_data
*d
)
10405 d
->data_lock
.Lock();
10406 list
<bufferlist
> l
;
10407 l
.swap(d
->read_list
);
10409 d
->read_list
.clear();
10411 d
->data_lock
.Unlock();
10415 list
<bufferlist
>::iterator iter
;
10416 for (iter
= l
.begin(); iter
!= l
.end(); ++iter
) {
10417 bufferlist
& bl
= *iter
;
10418 r
= d
->client_cb
->handle_data(bl
, 0, bl
.length());
10420 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r
<< dendl
;
10425 d
->data_lock
.Lock();
10428 d
->set_cancelled(r
);
10430 d
->data_lock
.Unlock();
10434 int RGWRados::get_obj_iterate_cb(RGWObjectCtx
*ctx
, RGWObjState
*astate
,
10435 const RGWBucketInfo
& bucket_info
,
10436 const rgw_obj
& obj
,
10437 const rgw_raw_obj
& read_obj
,
10439 off_t read_ofs
, off_t len
,
10440 bool is_head_obj
, void *arg
)
10442 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
10443 ObjectReadOperation op
;
10444 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
10452 /* only when reading from the head object do we need to do the atomic test */
10453 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &astate
);
10458 obj_ofs
< astate
->data
.length()) {
10459 unsigned chunk_len
= min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
10461 d
->data_lock
.Lock();
10462 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
10463 d
->data_lock
.Unlock();
10468 d
->total_read
+= chunk_len
;
10472 read_ofs
+= chunk_len
;
10473 obj_ofs
+= chunk_len
;
10479 d
->throttle
.get(len
);
10480 if (d
->is_cancelled()) {
10481 return d
->get_err_code();
10484 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10487 d
->add_io(obj_ofs
, len
, &pbl
, &c
);
10489 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
10490 op
.read(read_ofs
, len
, pbl
, NULL
);
10492 librados::IoCtx
io_ctx(d
->io_ctx
);
10493 io_ctx
.locator_set_key(read_obj
.loc
);
10495 r
= io_ctx
.aio_operate(read_obj
.oid
, c
, &op
, NULL
);
10497 ldout(cct
, 0) << "rados->aio_operate r=" << r
<< dendl
;
10501 // Flush data to client if there is any
10502 r
= flush_read_list(d
);
10509 ldout(cct
, 20) << "cancelling io r=" << r
<< " obj_ofs=" << obj_ofs
<< dendl
;
10510 d
->set_cancelled(r
);
10511 d
->cancel_io(obj_ofs
);
10516 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
)
10518 RGWRados
*store
= source
->get_store();
10519 CephContext
*cct
= store
->ctx();
10521 struct get_obj_data
*data
= new get_obj_data(cct
);
10524 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
10526 data
->rados
= store
;
10527 data
->io_ctx
.dup(state
.io_ctx
);
10528 data
->client_cb
= cb
;
10530 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
, ofs
, end
, cct
->_conf
->rgw_get_obj_max_req_size
, _get_obj_iterate_cb
, (void *)data
);
10532 data
->cancel_all_io();
10537 r
= data
->wait_next_io(&done
);
10539 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
10540 data
->cancel_all_io();
10543 r
= store
->flush_read_list(data
);
10545 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
10546 data
->cancel_all_io();
10556 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
10557 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
10558 off_t ofs
, off_t end
,
10559 uint64_t max_chunk_size
,
10560 int (*iterate_obj_cb
)(const RGWBucketInfo
&, const rgw_obj
& obj
,
10561 const rgw_raw_obj
&, off_t
, off_t
, off_t
, bool,
10562 RGWObjState
*, void *),
10565 rgw_raw_obj head_obj
;
10566 rgw_raw_obj read_obj
;
10567 uint64_t read_ofs
= ofs
;
10569 bool reading_from_head
= true;
10570 RGWObjState
*astate
= NULL
;
10572 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
10574 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false);
10582 len
= end
- ofs
+ 1;
10584 if (astate
->has_manifest
) {
10585 /* now get the relevant object stripe */
10586 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
10588 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
.obj_end();
10590 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
10591 off_t stripe_ofs
= iter
.get_stripe_ofs();
10592 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
10594 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
10595 read_obj
= iter
.get_location().get_raw_obj(this);
10596 uint64_t read_len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
10597 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
10599 if (read_len
> max_chunk_size
) {
10600 read_len
= max_chunk_size
;
10603 reading_from_head
= (read_obj
== head_obj
);
10604 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
10614 while (ofs
<= end
) {
10615 read_obj
= head_obj
;
10616 uint64_t read_len
= min(len
, max_chunk_size
);
10618 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
10631 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
10634 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10639 return ref
.ioctx
.operate(ref
.oid
, op
);
10642 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
10645 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10652 return ref
.ioctx
.operate(ref
.oid
, op
, &outbl
);
10655 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
10657 ObjectWriteOperation op
;
10659 assert(olh_obj
.key
.instance
.empty());
10661 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
10663 if (!state
.exists
) {
10666 op
.assert_exists();
10670 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10671 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10672 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10673 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10674 * log will reflect that.
10676 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10677 * is used for object data instance, olh_tag for olh instance.
10680 /* guard against racing writes */
10681 bucket_index_guard_olh_op(state
, op
);
10687 int ret
= gen_rand_alphanumeric_lower(cct
, &obj_tag
, 32);
10689 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10693 bl
.append(obj_tag
.c_str(), obj_tag
.size());
10694 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
10696 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
10697 state
.obj_tag
= bl
;
10701 ret
= gen_rand_alphanumeric_lower(cct
, &olh_tag
, 32);
10703 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10707 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
10708 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
10710 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
10711 state
.olh_tag
= olh_bl
;
10712 state
.is_olh
= true;
10715 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
10719 RGWOLHPendingInfo pending_info
;
10720 pending_info
.time
= real_clock::now();
10721 ::encode(pending_info
, bl
);
10723 #define OLH_PENDING_TAG_LEN 32
10724 /* tag will start with current time epoch, this so that entries are sorted by time */
10726 utime_t
ut(pending_info
.time
);
10727 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
10731 int ret
= gen_rand_alphanumeric_lower(cct
, &s
, OLH_PENDING_TAG_LEN
- op_tag
->size());
10733 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10738 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
10739 attr_name
.append(*op_tag
);
10741 op
.setxattr(attr_name
.c_str(), bl
);
10743 ret
= obj_operate(bucket_info
, olh_obj
, &op
);
10748 state
.exists
= true;
10749 state
.attrset
[attr_name
] = bl
;
10754 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
10758 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
10759 if (ret
== -EEXIST
) {
10766 int RGWRados::guard_reshard(BucketShard
*bs
, const rgw_obj
& obj_instance
, std::function
<int(BucketShard
*)> call
)
10769 const rgw_obj
*pobj
= &obj_instance
;
10772 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
10773 r
= bs
->init(pobj
->bucket
, *pobj
);
10775 ldout(cct
, 5) << "bs.init() returned ret=" << r
<< dendl
;
10779 if (r
!= -ERR_BUSY_RESHARDING
) {
10782 ldout(cct
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
10783 string new_bucket_id
;
10784 r
= block_while_resharding(bs
, &new_bucket_id
);
10785 if (r
== -ERR_BUSY_RESHARDING
) {
10791 ldout(cct
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
10792 i
= 0; /* resharding is finished, make sure we can retry */
10795 obj
.bucket
.update_bucket_id(new_bucket_id
);
10806 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
, string
*new_bucket_id
)
10808 std::shared_ptr
<RGWReshardWait
> waiter
= reshard_wait
;
10810 return waiter
->block_while_resharding(bs
, new_bucket_id
);
10813 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
10814 bool delete_marker
,
10815 const string
& op_tag
,
10816 struct rgw_bucket_dir_entry_meta
*meta
,
10817 uint64_t olh_epoch
,
10818 real_time unmod_since
, bool high_precision_time
, rgw_zone_set
*_zones_trace
)
10821 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10826 rgw_zone_set zones_trace
;
10827 if (_zones_trace
) {
10828 zones_trace
= *_zones_trace
;
10830 zones_trace
.insert(get_zone().id
);
10833 BucketShard
bs(this);
10835 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
10836 r
= guard_reshard(&bs
, obj_instance
, [&](BucketShard
*bs
) -> int {
10837 librados::ObjectWriteOperation op
;
10838 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
10839 return cls_rgw_bucket_link_olh(bs
->index_ctx
, op
,
10840 bs
->bucket_obj
, key
, olh_state
.olh_tag
, delete_marker
, op_tag
, meta
, olh_epoch
,
10841 unmod_since
, high_precision_time
,
10842 get_zone().log_data
, zones_trace
);
10845 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
10852 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
10854 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
10855 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
10858 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
10859 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
10862 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10867 rgw_zone_set zones_trace
;
10868 if (_zones_trace
) {
10869 zones_trace
= *_zones_trace
;
10871 zones_trace
.insert(get_zone().id
);
10873 BucketShard
bs(this);
10875 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
10876 r
= guard_reshard(&bs
, obj_instance
, [&](BucketShard
*bs
) -> int {
10877 librados::ObjectWriteOperation op
;
10878 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
10879 return cls_rgw_bucket_unlink_instance(bs
->index_ctx
, op
, bs
->bucket_obj
, key
, op_tag
,
10880 olh_tag
, olh_epoch
, get_zone().log_data
, zones_trace
);
10883 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
10890 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
10891 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
10892 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
10893 bool *is_truncated
)
10896 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10901 BucketShard
bs(this);
10902 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10904 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10908 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10910 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10912 ret
= guard_reshard(&bs
, obj_instance
, [&](BucketShard
*bs
) -> int {
10913 ObjectReadOperation op
;
10914 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
10915 return cls_rgw_get_olh_log(bs
->index_ctx
, bs
->bucket_obj
, op
,
10916 key
, ver_marker
, olh_tag
, log
, is_truncated
);
10919 ldout(cct
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
10926 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
10929 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10934 BucketShard
bs(this);
10935 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10937 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10941 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10943 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10945 ret
= guard_reshard(&bs
, obj_instance
, [&](BucketShard
*pbs
) -> int {
10946 ObjectWriteOperation op
;
10947 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
10948 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
10949 return pbs
->index_ctx
.operate(pbs
->bucket_obj
, &op
);
10952 ldout(cct
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
10959 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
10962 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10967 BucketShard
bs(this);
10969 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10971 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10973 int ret
= guard_reshard(&bs
, obj_instance
, [&](BucketShard
*pbs
) -> int {
10974 ObjectWriteOperation op
;
10975 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
10976 return cls_rgw_clear_olh(pbs
->index_ctx
, op
, pbs
->bucket_obj
, key
, olh_tag
);
10979 ldout(cct
, 5) << "cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
10986 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
10987 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
10988 uint64_t *plast_ver
, rgw_zone_set
* zones_trace
)
10994 librados::ObjectWriteOperation op
;
10996 uint64_t last_ver
= log
.rbegin()->first
;
10997 *plast_ver
= last_ver
;
10999 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
11001 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
11002 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
11004 bool need_to_link
= false;
11005 cls_rgw_obj_key key
;
11006 bool delete_marker
= false;
11007 list
<cls_rgw_obj_key
> remove_instances
;
11008 bool need_to_remove
= false;
11010 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
11011 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
11012 for (; viter
!= iter
->second
.end(); ++viter
) {
11013 rgw_bucket_olh_log_entry
& entry
= *viter
;
11015 ldout(cct
, 20) << "olh_log_entry: op=" << (int)entry
.op
11016 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
11017 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
11018 switch (entry
.op
) {
11019 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
11020 remove_instances
.push_back(entry
.key
);
11022 case CLS_RGW_OLH_OP_LINK_OLH
:
11023 need_to_link
= true;
11024 need_to_remove
= false;
11026 delete_marker
= entry
.delete_marker
;
11028 case CLS_RGW_OLH_OP_UNLINK_OLH
:
11029 need_to_remove
= true;
11030 need_to_link
= false;
11033 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
11036 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
11037 attr_name
.append(entry
.op_tag
);
11038 op
.rmxattr(attr_name
.c_str());
11043 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
11048 const rgw_bucket
& bucket
= obj
.bucket
;
11050 if (need_to_link
) {
11051 rgw_obj
target(bucket
, key
);
11053 info
.target
= target
;
11054 info
.removed
= delete_marker
;
11056 ::encode(info
, bl
);
11057 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
11060 /* first remove object instances */
11061 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
11062 liter
!= remove_instances
.end(); ++liter
) {
11063 cls_rgw_obj_key
& key
= *liter
;
11064 rgw_obj
obj_instance(bucket
, key
);
11065 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
11066 if (ret
< 0 && ret
!= -ENOENT
) {
11067 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
11072 /* update olh object */
11073 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
11074 if (r
== -ECANCELED
) {
11078 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
11082 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
11084 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
11088 if (need_to_remove
) {
11089 ObjectWriteOperation rm_op
;
11091 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
11092 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
11093 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
11096 r
= ref
.ioctx
.operate(ref
.oid
, &rm_op
);
11097 if (r
== -ECANCELED
) {
11098 return 0; /* someone else won this race */
11101 * only clear if was successful, otherwise we might clobber pending operations on this object
11103 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
11105 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
11115 * read olh log and apply it
11117 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
11119 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
11121 uint64_t ver_marker
= 0;
11124 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
11128 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
11132 } while (is_truncated
);
11137 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
11138 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
, rgw_zone_set
*zones_trace
)
11142 rgw_obj olh_obj
= target_obj
;
11143 olh_obj
.key
.instance
.clear();
11145 RGWObjState
*state
= NULL
;
11150 #define MAX_ECANCELED_RETRY 100
11151 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
11152 if (ret
== -ECANCELED
) {
11153 obj_ctx
.obj
.invalidate(olh_obj
);
11156 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
11161 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
11163 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
11164 if (ret
== -ECANCELED
) {
11169 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
, op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
, zones_trace
);
11171 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
11172 if (ret
== -ECANCELED
) {
11180 if (i
== MAX_ECANCELED_RETRY
) {
11181 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
11185 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
11186 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
11190 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11197 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
11198 uint64_t olh_epoch
, rgw_zone_set
*zones_trace
)
11202 rgw_obj olh_obj
= target_obj
;
11203 olh_obj
.key
.instance
.clear();
11205 RGWObjState
*state
= NULL
;
11210 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
11211 if (ret
== -ECANCELED
) {
11212 obj_ctx
.obj
.invalidate(olh_obj
);
11215 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
11219 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
11221 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11222 if (ret
== -ECANCELED
) {
11228 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
11230 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
11232 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11233 if (ret
== -ECANCELED
) {
11241 if (i
== MAX_ECANCELED_RETRY
) {
11242 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
11246 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
11247 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
11251 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11258 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
11260 #define OBJ_INSTANCE_LEN 32
11261 char buf
[OBJ_INSTANCE_LEN
+ 1];
11263 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
11264 no underscore for instance name due to the way we encode the raw keys */
11266 target_obj
->key
.set_instance(buf
);
11269 static void filter_attrset(map
<string
, bufferlist
>& unfiltered_attrset
, const string
& check_prefix
,
11270 map
<string
, bufferlist
> *attrset
)
11273 map
<string
, bufferlist
>::iterator iter
;
11274 for (iter
= unfiltered_attrset
.lower_bound(check_prefix
);
11275 iter
!= unfiltered_attrset
.end(); ++iter
) {
11276 if (!boost::algorithm::starts_with(iter
->first
, check_prefix
))
11278 (*attrset
)[iter
->first
] = iter
->second
;
11282 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
11284 map
<string
, bufferlist
> unfiltered_attrset
;
11286 ObjectReadOperation op
;
11287 op
.getxattrs(&unfiltered_attrset
, NULL
);
11290 int r
= obj_operate(bucket_info
, obj
, &op
);
11295 map
<string
, bufferlist
> attrset
;
11297 filter_attrset(unfiltered_attrset
, RGW_ATTR_OLH_PREFIX
, &attrset
);
11299 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
11300 if (iter
== attrset
.end()) { /* not an olh */
11305 bufferlist::iterator biter
= iter
->second
.begin();
11306 ::decode(*olh
, biter
);
11307 } catch (buffer::error
& err
) {
11308 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
11315 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
11316 map
<string
, bufferlist
> *rm_pending_entries
)
11318 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
11320 real_time now
= real_clock::now();
11322 while (iter
!= pending_entries
.end()) {
11323 bufferlist::iterator biter
= iter
->second
.begin();
11324 RGWOLHPendingInfo pending_info
;
11326 ::decode(pending_info
, biter
);
11327 } catch (buffer::error
& err
) {
11328 /* skipping bad entry, we could remove it but it might hide a bug */
11329 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
11334 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
11336 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
11337 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
11338 pending_entries
.erase(cur_iter
);
11340 /* entries names are sorted by time (rounded to a second) */
11346 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
11348 ObjectWriteOperation op
;
11350 bucket_index_guard_olh_op(state
, op
);
11352 for (map
<string
, bufferlist
>::iterator iter
= pending_attrs
.begin(); iter
!= pending_attrs
.end(); ++iter
) {
11353 op
.rmxattr(iter
->first
.c_str());
11357 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
11362 /* update olh object */
11363 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
11364 if (r
== -ENOENT
|| r
== -ECANCELED
) {
11365 /* raced with some other change, shouldn't sweat about it */
11369 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
11376 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
11378 map
<string
, bufferlist
> pending_entries
;
11379 filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
11381 map
<string
, bufferlist
> rm_pending_entries
;
11382 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
11384 if (!rm_pending_entries
.empty()) {
11385 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
11387 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
11391 if (!pending_entries
.empty()) {
11392 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
11394 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
11400 map
<string
, bufferlist
>::iterator iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
11401 assert(iter
!= state
->attrset
.end());
11404 bufferlist::iterator biter
= iter
->second
.begin();
11405 ::decode(olh
, biter
);
11406 } catch (buffer::error
& err
) {
11407 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
11415 *target
= olh
.target
;
11420 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
11421 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
11422 RGWObjVersionTracker
*objv_tracker
)
11425 int r
= get_raw_obj_ref(obj
, &ref
);
11430 map
<string
, bufferlist
> unfiltered_attrset
;
11432 struct timespec mtime_ts
;
11434 ObjectReadOperation op
;
11435 if (objv_tracker
) {
11436 objv_tracker
->prepare_op_for_read(&op
);
11439 op
.getxattrs(&unfiltered_attrset
, NULL
);
11441 if (psize
|| pmtime
) {
11442 op
.stat2(&size
, &mtime_ts
, NULL
);
11445 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
11448 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &outbl
);
11451 *epoch
= ref
.ioctx
.get_last_version();
11460 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
11462 filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
11468 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
11469 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
)
11471 map
<string
, rgw_bucket_dir_header
> headers
;
11472 map
<int, string
> bucket_instance_ids
;
11473 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
11478 assert(headers
.size() == bucket_instance_ids
.size());
11480 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
11481 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
11482 BucketIndexShardsManager ver_mgr
;
11483 BucketIndexShardsManager master_ver_mgr
;
11484 BucketIndexShardsManager marker_mgr
;
11485 string shard_marker
;
11487 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
11488 accumulate_raw_stats(iter
->second
, stats
);
11489 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.ver
);
11490 ver_mgr
.add(viter
->first
, string(buf
));
11491 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.master_ver
);
11492 master_ver_mgr
.add(viter
->first
, string(buf
));
11493 if (shard_id
>= 0) {
11494 *max_marker
= iter
->second
.max_marker
;
11496 marker_mgr
.add(viter
->first
, iter
->second
.max_marker
);
11499 ver_mgr
.to_string(bucket_ver
);
11500 master_ver_mgr
.to_string(master_ver
);
11501 if (shard_id
< 0) {
11502 marker_mgr
.to_string(max_marker
);
11507 int RGWRados::get_bi_log_status(RGWBucketInfo
& bucket_info
, int shard_id
,
11508 map
<int, string
>& markers
)
11510 map
<string
, rgw_bucket_dir_header
> headers
;
11511 map
<int, string
> bucket_instance_ids
;
11512 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
11516 assert(headers
.size() == bucket_instance_ids
.size());
11518 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
11519 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
11521 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
11522 if (shard_id
>= 0) {
11523 markers
[shard_id
] = iter
->second
.max_marker
;
11525 markers
[viter
->first
] = iter
->second
.max_marker
;
11531 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
11532 RGWGetBucketStats_CB
*cb
;
11534 map
<RGWObjCategory
, RGWStorageStats
> stats
;
11540 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
11541 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true),
11542 lock("RGWGetBucketStatsContext") {}
11544 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
11545 Mutex::Locker
l(lock
);
11548 accumulate_raw_stats(header
, stats
);
11553 // Are we all done?
11554 if (--pendings
== 0) {
11556 cb
->set_response(&stats
);
11558 cb
->handle_response(ret_code
);
11565 Mutex::Locker
l(lock
);
11570 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
11573 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
);
11575 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
11580 get_ctx
->unset_cb();
11586 class RGWGetUserStatsContext
: public RGWGetUserHeader_CB
{
11587 RGWGetUserStats_CB
*cb
;
11590 explicit RGWGetUserStatsContext(RGWGetUserStats_CB
* const cb
)
11593 void handle_response(int r
, cls_user_header
& header
) override
{
11594 const cls_user_stats
& hs
= header
.stats
;
11596 RGWStorageStats stats
;
11598 stats
.size
= hs
.total_bytes
;
11599 stats
.size_rounded
= hs
.total_bytes_rounded
;
11600 stats
.num_objects
= hs
.total_entries
;
11602 cb
->set_response(stats
);
11605 cb
->handle_response(r
);
11611 int RGWRados::get_user_stats(const rgw_user
& user
, RGWStorageStats
& stats
)
11613 string user_str
= user
.to_str();
11615 cls_user_header header
;
11616 int r
= cls_user_get_header(user_str
, &header
);
11620 const cls_user_stats
& hs
= header
.stats
;
11622 stats
.size
= hs
.total_bytes
;
11623 stats
.size_rounded
= hs
.total_bytes_rounded
;
11624 stats
.num_objects
= hs
.total_entries
;
11629 int RGWRados::get_user_stats_async(const rgw_user
& user
, RGWGetUserStats_CB
*ctx
)
11631 string user_str
= user
.to_str();
11633 RGWGetUserStatsContext
*get_ctx
= new RGWGetUserStatsContext(ctx
);
11634 int r
= cls_user_get_header_async(user_str
, get_ctx
);
11644 void RGWRados::get_bucket_meta_oid(const rgw_bucket
& bucket
, string
& oid
)
11646 oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ bucket
.get_key(':');
11649 void RGWRados::get_bucket_instance_obj(const rgw_bucket
& bucket
, rgw_raw_obj
& obj
)
11651 if (!bucket
.oid
.empty()) {
11652 obj
.init(get_zone_params().domain_root
, bucket
.oid
);
11655 get_bucket_meta_oid(bucket
, oid
);
11656 obj
.init(get_zone_params().domain_root
, oid
);
11660 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
11661 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11663 size_t pos
= meta_key
.find(':');
11664 if (pos
== string::npos
) {
11667 string oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ meta_key
;
11668 rgw_bucket_instance_key_to_oid(oid
);
11670 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
11673 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
11674 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11677 if (bucket
.oid
.empty()) {
11678 get_bucket_meta_oid(bucket
, oid
);
11683 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
11686 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx
& obj_ctx
, const string
& oid
, RGWBucketInfo
& info
,
11687 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
,
11688 rgw_cache_entry_info
*cache_info
)
11690 ldout(cct
, 20) << "reading from " << get_zone_params().domain_root
<< ":" << oid
<< dendl
;
11694 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, oid
, epbl
, &info
.objv_tracker
, pmtime
, pattrs
, cache_info
);
11699 bufferlist::iterator iter
= epbl
.begin();
11701 ::decode(info
, iter
);
11702 } catch (buffer::error
& err
) {
11703 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
11706 info
.bucket
.oid
= oid
;
11710 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx
& obj_ctx
,
11711 const string
& tenant_name
,
11712 const string
& bucket_name
,
11713 RGWBucketEntryPoint
& entry_point
,
11714 RGWObjVersionTracker
*objv_tracker
,
11716 map
<string
, bufferlist
> *pattrs
,
11717 rgw_cache_entry_info
*cache_info
)
11720 string bucket_entry
;
11722 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
11723 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, bucket_entry
, bl
, objv_tracker
, pmtime
, pattrs
, cache_info
);
11728 bufferlist::iterator iter
= bl
.begin();
11730 ::decode(entry_point
, iter
);
11731 } catch (buffer::error
& err
) {
11732 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
11738 int RGWRados::convert_old_bucket_info(RGWObjectCtx
& obj_ctx
,
11739 const string
& tenant_name
,
11740 const string
& bucket_name
)
11742 RGWBucketEntryPoint entry_point
;
11743 real_time ep_mtime
;
11744 RGWObjVersionTracker ot
;
11745 map
<string
, bufferlist
> attrs
;
11746 RGWBucketInfo info
;
11748 ldout(cct
, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name
<< dendl
;
11750 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant_name
, bucket_name
, entry_point
, &ot
, &ep_mtime
, &attrs
);
11752 ldout(cct
, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret
<< " bucket=" << bucket_name
<< dendl
;
11756 if (!entry_point
.has_bucket_info
) {
11757 /* already converted! */
11761 info
= entry_point
.old_bucket_info
;
11762 info
.bucket
.oid
= bucket_name
;
11763 info
.ep_objv
= ot
.read_version
;
11765 ot
.generate_new_write_ver(cct
);
11767 ret
= put_linked_bucket_info(info
, false, ep_mtime
, &ot
.write_version
, &attrs
, true);
11769 ldout(cct
, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret
<< dendl
;
11776 int RGWRados::get_bucket_info(RGWObjectCtx
& obj_ctx
,
11777 const string
& tenant
, const string
& bucket_name
, RGWBucketInfo
& info
,
11778 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11780 bucket_info_entry e
;
11781 string bucket_entry
;
11782 rgw_make_bucket_entry_name(tenant
, bucket_name
, bucket_entry
);
11784 if (binfo_cache
->find(bucket_entry
, &e
)) {
11793 RGWBucketEntryPoint entry_point
;
11794 real_time ep_mtime
;
11795 RGWObjVersionTracker ot
;
11796 rgw_cache_entry_info entry_cache_info
;
11797 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant
, bucket_name
, entry_point
, &ot
, &ep_mtime
, pattrs
, &entry_cache_info
);
11799 /* only init these fields */
11800 info
.bucket
.tenant
= tenant
;
11801 info
.bucket
.name
= bucket_name
;
11805 if (entry_point
.has_bucket_info
) {
11806 info
= entry_point
.old_bucket_info
;
11807 info
.bucket
.oid
= bucket_name
;
11808 info
.bucket
.tenant
= tenant
;
11809 info
.ep_objv
= ot
.read_version
;
11810 ldout(cct
, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info
.bucket
<< " owner " << info
.owner
<< dendl
;
11814 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11821 ldout(cct
, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point
.bucket
<< dendl
;
11824 /* read bucket instance info */
11827 get_bucket_meta_oid(entry_point
.bucket
, oid
);
11829 rgw_cache_entry_info cache_info
;
11831 ret
= get_bucket_instance_from_oid(obj_ctx
, oid
, e
.info
, &e
.mtime
, &e
.attrs
, &cache_info
);
11832 e
.info
.ep_objv
= ot
.read_version
;
11835 info
.bucket
.tenant
= tenant
;
11836 info
.bucket
.name
= bucket_name
;
11837 // XXX and why return anything in case of an error anyway?
11846 list
<rgw_cache_entry_info
*> cache_info_entries
;
11847 cache_info_entries
.push_back(&entry_cache_info
);
11848 cache_info_entries
.push_back(&cache_info
);
11851 /* chain to both bucket entry point and bucket instance */
11852 if (!binfo_cache
->put(this, bucket_entry
, &e
, cache_info_entries
)) {
11853 ldout(cct
, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl
;
11859 int RGWRados::put_bucket_entrypoint_info(const string
& tenant_name
, const string
& bucket_name
, RGWBucketEntryPoint
& entry_point
,
11860 bool exclusive
, RGWObjVersionTracker
& objv_tracker
, real_time mtime
,
11861 map
<string
, bufferlist
> *pattrs
)
11864 ::encode(entry_point
, epbl
);
11865 string bucket_entry
;
11866 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
11867 return rgw_bucket_store_info(this, bucket_entry
, epbl
, exclusive
, pattrs
, &objv_tracker
, mtime
);
11870 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
11871 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
11873 info
.has_instance_obj
= true;
11876 ::encode(info
, bl
);
11878 string key
= info
.bucket
.get_key(); /* when we go through meta api, we don't use oid directly */
11879 int ret
= rgw_bucket_instance_store_info(this, key
, bl
, exclusive
, pattrs
, &info
.objv_tracker
, mtime
);
11880 if (ret
== -EEXIST
) {
11881 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11882 * bucket operation on this specific bucket (e.g., being synced from the master), but
11883 * since bucket instace meta object is unique for this specific bucket instace, we don't
11884 * need to return an error.
11885 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11886 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11887 * locally, while in the sync thread we sync the new bucket.
11894 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
11895 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
11897 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
11899 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
11905 return 0; /* done! */
11907 RGWBucketEntryPoint entry_point
;
11908 entry_point
.bucket
= info
.bucket
;
11909 entry_point
.owner
= info
.owner
;
11910 entry_point
.creation_time
= info
.creation_time
;
11911 entry_point
.linked
= true;
11912 RGWObjVersionTracker ot
;
11913 if (pep_objv
&& !pep_objv
->tag
.empty()) {
11914 ot
.write_version
= *pep_objv
;
11916 ot
.generate_new_write_ver(cct
);
11918 *pep_objv
= ot
.write_version
;
11921 ret
= put_bucket_entrypoint_info(info
.bucket
.tenant
, info
.bucket
.name
, entry_point
, exclusive
, ot
, mtime
, NULL
);
11928 int RGWRados::omap_get_vals(rgw_raw_obj
& obj
, bufferlist
& header
, const string
& marker
, uint64_t count
, std::map
<string
, bufferlist
>& m
)
11931 int r
= get_raw_obj_ref(obj
, &ref
);
11936 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, marker
, count
, &m
);
11944 int RGWRados::omap_get_all(rgw_raw_obj
& obj
, bufferlist
& header
,
11945 std::map
<string
, bufferlist
>& m
)
11948 int r
= get_raw_obj_ref(obj
, &ref
);
11953 #define MAX_OMAP_GET_ENTRIES 1024
11954 const int count
= MAX_OMAP_GET_ENTRIES
;
11955 string start_after
;
11958 std::map
<string
, bufferlist
> t
;
11959 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, start_after
, count
, &t
);
11966 start_after
= t
.rbegin()->first
;
11967 m
.insert(t
.begin(), t
.end());
11972 int RGWRados::omap_set(rgw_raw_obj
& obj
, const std::string
& key
, bufferlist
& bl
)
11975 int r
= get_raw_obj_ref(obj
, &ref
);
11979 ldout(cct
, 15) << "omap_set obj=" << obj
<< " key=" << key
<< dendl
;
11981 map
<string
, bufferlist
> m
;
11984 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
11989 int RGWRados::omap_set(rgw_raw_obj
& obj
, std::map
<std::string
, bufferlist
>& m
)
11992 int r
= get_raw_obj_ref(obj
, &ref
);
11997 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
12002 int RGWRados::omap_del(rgw_raw_obj
& obj
, const std::string
& key
)
12005 int r
= get_raw_obj_ref(obj
, &ref
);
12013 r
= ref
.ioctx
.omap_rm_keys(ref
.oid
, k
);
12017 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
12019 RGWObjectCtx
obj_ctx(this);
12021 map
<string
, RGWBucketEnt
>::iterator iter
;
12022 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
12023 RGWBucketEnt
& ent
= iter
->second
;
12024 rgw_bucket
& bucket
= ent
.bucket
;
12027 ent
.size_rounded
= 0;
12029 map
<string
, rgw_bucket_dir_header
> headers
;
12031 RGWBucketInfo bucket_info
;
12032 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
12037 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
12041 map
<string
, rgw_bucket_dir_header
>::iterator hiter
= headers
.begin();
12042 for (; hiter
!= headers
.end(); ++hiter
) {
12043 RGWObjCategory category
= main_category
;
12044 map
<uint8_t, struct rgw_bucket_category_stats
>::iterator iter
= (hiter
->second
.stats
).find((uint8_t)category
);
12045 if (iter
!= hiter
->second
.stats
.end()) {
12046 struct rgw_bucket_category_stats
& stats
= iter
->second
;
12047 ent
.count
+= stats
.num_entries
;
12048 ent
.size
+= stats
.total_size
;
12049 ent
.size_rounded
+= stats
.total_size_rounded
;
12057 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
12060 int r
= get_raw_obj_ref(obj
, &ref
);
12064 librados::Rados
*rad
= get_rados_handle();
12065 librados::AioCompletion
*completion
= rad
->aio_create_completion(NULL
, NULL
, NULL
);
12067 r
= ref
.ioctx
.aio_append(ref
.oid
, completion
, bl
, size
);
12068 completion
->release();
12072 int RGWRados::distribute(const string
& key
, bufferlist
& bl
)
12075 * we were called before watch was initialized. This can only happen if we're updating some system
12076 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12077 * objects, they're currently only read on startup anyway.
12079 if (!watch_initialized
)
12083 pick_control_oid(key
, notify_oid
);
12085 ldout(cct
, 10) << "distributing notification oid=" << notify_oid
<< " bl.length()=" << bl
.length() << dendl
;
12086 return control_pool_ctx
.notify2(notify_oid
, bl
, 0, NULL
);
12089 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
12091 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
12092 librados::NObjectIterator
& iter
= ctx
.iter
;
12094 int r
= open_pool_ctx(pool
, io_ctx
);
12098 iter
= io_ctx
.nobjects_begin();
12103 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
12104 bool *is_truncated
, RGWAccessListFilter
*filter
)
12106 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
12107 librados::NObjectIterator
& iter
= ctx
.iter
;
12109 if (iter
== io_ctx
.nobjects_end())
12114 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
12115 rgw_bucket_dir_entry e
;
12117 string oid
= iter
->get_oid();
12118 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
12120 // fill it in with initial values; we may correct later
12121 if (filter
&& !filter
->filter(oid
, oid
))
12129 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
12131 return objs
.size();
12133 struct RGWAccessListFilterPrefix
: public RGWAccessListFilter
{
12136 explicit RGWAccessListFilterPrefix(const string
& _prefix
) : prefix(_prefix
) {}
12137 bool filter(string
& name
, string
& key
) override
{
12138 return (prefix
.compare(key
.substr(0, prefix
.size())) == 0);
12142 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
12143 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
12144 bool *is_truncated
)
12146 RGWAccessListFilterPrefix
filter(prefix_filter
);
12148 if (!ctx
.initialized
) {
12149 int r
= pool_iterate_begin(pool
, ctx
.iter_ctx
);
12151 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
12154 ctx
.initialized
= true;
12157 vector
<rgw_bucket_dir_entry
> objs
;
12158 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
12161 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
12165 vector
<rgw_bucket_dir_entry
>::iterator iter
;
12166 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
12167 oids
.push_back(iter
->key
.name
);
12170 return oids
.size();
12173 int RGWRados::list_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& marker
, uint32_t max
,
12174 std::list
<rgw_bi_log_entry
>& result
, bool *truncated
)
12176 ldout(cct
, 20) << __func__
<< ": " << bucket_info
.bucket
<< " marker " << marker
<< " shard_id=" << shard_id
<< " max " << max
<< dendl
;
12179 librados::IoCtx index_ctx
;
12180 map
<int, string
> oids
;
12181 map
<int, cls_rgw_bi_log_list_ret
> bi_log_lists
;
12182 map
<int, string
> bucket_instance_ids
;
12183 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, &bucket_instance_ids
);
12187 BucketIndexShardsManager marker_mgr
;
12188 bool has_shards
= (oids
.size() > 1 || shard_id
>= 0);
12189 // If there are multiple shards for the bucket index object, the marker
12190 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12191 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12192 // only contain one record, and the key is the bucket instance id.
12193 r
= marker_mgr
.from_string(marker
, shard_id
);
12197 r
= CLSRGWIssueBILogList(index_ctx
, marker_mgr
, max
, oids
, bi_log_lists
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12201 map
<int, list
<rgw_bi_log_entry
>::iterator
> vcurrents
;
12202 map
<int, list
<rgw_bi_log_entry
>::iterator
> vends
;
12204 *truncated
= false;
12206 map
<int, cls_rgw_bi_log_list_ret
>::iterator miter
= bi_log_lists
.begin();
12207 for (; miter
!= bi_log_lists
.end(); ++miter
) {
12208 int shard_id
= miter
->first
;
12209 vcurrents
[shard_id
] = miter
->second
.entries
.begin();
12210 vends
[shard_id
] = miter
->second
.entries
.end();
12212 *truncated
= (*truncated
|| miter
->second
.truncated
);
12217 bool has_more
= true;
12218 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator viter
;
12219 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator eiter
;
12220 while (total
< max
&& has_more
) {
12223 viter
= vcurrents
.begin();
12224 eiter
= vends
.begin();
12226 for (; total
< max
&& viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
12227 assert (eiter
!= vends
.end());
12229 int shard_id
= viter
->first
;
12230 list
<rgw_bi_log_entry
>::iterator
& liter
= viter
->second
;
12232 if (liter
== eiter
->second
){
12235 rgw_bi_log_entry
& entry
= *(liter
);
12238 snprintf(buf
, sizeof(buf
), "%d", shard_id
);
12240 build_bucket_index_marker(buf
, entry
.id
, &tmp_id
);
12241 entry
.id
.swap(tmp_id
);
12243 marker_mgr
.add(shard_id
, entry
.id
);
12244 result
.push_back(entry
);
12252 for (viter
= vcurrents
.begin(), eiter
= vends
.begin(); viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
12253 assert (eiter
!= vends
.end());
12254 *truncated
= (*truncated
|| (viter
->second
!= eiter
->second
));
12258 // Refresh marker, if there are multiple shards, the output will look like
12259 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12260 // if there is no sharding, the simply marker (without oid) is returned
12262 marker_mgr
.to_string(&marker
);
12264 if (!result
.empty()) {
12265 marker
= result
.rbegin()->id
;
12272 int RGWRados::trim_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& start_marker
, string
& end_marker
)
12274 librados::IoCtx index_ctx
;
12275 map
<int, string
> bucket_objs
;
12277 BucketIndexShardsManager start_marker_mgr
;
12278 BucketIndexShardsManager end_marker_mgr
;
12280 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12285 r
= start_marker_mgr
.from_string(start_marker
, shard_id
);
12290 r
= end_marker_mgr
.from_string(end_marker
, shard_id
);
12295 return CLSRGWIssueBILogTrim(index_ctx
, start_marker_mgr
, end_marker_mgr
, bucket_objs
,
12296 cct
->_conf
->rgw_bucket_index_max_aio
)();
12301 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, rgw_bucket_dir_entry
*dirent
)
12304 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
12309 rgw_cls_bi_entry bi_entry
;
12310 r
= bi_get(obj
.bucket
, obj
, InstanceIdx
, &bi_entry
);
12311 if (r
< 0 && r
!= -ENOENT
) {
12312 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
12317 bufferlist::iterator iter
= bi_entry
.data
.begin();
12319 ::decode(*dirent
, iter
);
12320 } catch (buffer::error
& err
) {
12321 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
12328 int RGWRados::bi_get(rgw_bucket
& bucket
, rgw_obj
& obj
, BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
12330 BucketShard
bs(this);
12331 int ret
= bs
.init(bucket
, obj
);
12333 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
12337 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
12339 ret
= cls_rgw_bi_get(bs
.index_ctx
, bs
.bucket_obj
, index_type
, key
, entry
);
12346 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
12348 cls_rgw_bi_put(op
, bs
.bucket_obj
, entry
);
12351 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
12353 int ret
= cls_rgw_bi_put(bs
.index_ctx
, bs
.bucket_obj
, entry
);
12360 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
12362 BucketShard
bs(this);
12363 int ret
= bs
.init(bucket
, obj
);
12365 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
12369 return bi_put(bs
, entry
);
12372 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
12374 rgw_obj
obj(bucket
, obj_name
);
12375 BucketShard
bs(this);
12376 int ret
= bs
.init(bucket
, obj
);
12378 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
12382 ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, obj_name
, marker
, max
, entries
, is_truncated
);
12383 if (ret
== -ENOENT
) {
12384 *is_truncated
= false;
12392 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
12394 int ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, filter_obj
, marker
, max
, entries
, is_truncated
);
12401 int RGWRados::bi_remove(BucketShard
& bs
)
12403 int ret
= bs
.index_ctx
.remove(bs
.bucket_obj
);
12404 if (ret
== -ENOENT
) {
12408 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
12415 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
12417 BucketShard
bs(this);
12418 int ret
= bs
.init(bucket
, shard_id
);
12420 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
12424 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
12427 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
12429 return gc_pool_ctx
.operate(oid
, op
);
12432 int RGWRados::gc_aio_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
12434 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
12435 int r
= gc_pool_ctx
.aio_operate(oid
, c
, op
);
12440 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
12442 return gc_pool_ctx
.operate(oid
, op
, pbl
);
12445 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
)
12447 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
);
12450 int RGWRados::process_gc()
12452 return gc
->process();
12455 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
12457 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
12460 int RGWRados::process_lc()
12462 return lc
->process();
12465 int RGWRados::process_expire_objects()
12467 obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
12471 int RGWRados::cls_rgw_init_index(librados::IoCtx
& index_ctx
, librados::ObjectWriteOperation
& op
, string
& oid
)
12474 cls_rgw_bucket_init(op
);
12475 return index_ctx
.operate(oid
, &op
);
12478 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
12479 rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
12481 rgw_zone_set zones_trace
;
12482 if (_zones_trace
) {
12483 zones_trace
= *_zones_trace
;
12486 zones_trace
.insert(get_zone().id
);
12489 ObjectWriteOperation o
;
12490 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
12491 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
12492 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), get_zone().log_data
, bilog_flags
, zones_trace
);
12493 return bs
.index_ctx
.operate(bs
.bucket_obj
, &o
);
12496 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
12497 int64_t pool
, uint64_t epoch
,
12498 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
12499 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
12501 ObjectWriteOperation o
;
12502 rgw_bucket_dir_entry_meta dir_meta
;
12503 dir_meta
= ent
.meta
;
12504 dir_meta
.category
= category
;
12506 rgw_bucket_entry_ver ver
;
12509 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
12510 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
12511 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
12512 get_zone().log_data
, bilog_flags
, _zones_trace
);
12513 complete_op_data
*arg
;
12514 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
12515 get_zone().log_data
, bilog_flags
, _zones_trace
, &arg
);
12516 librados::AioCompletion
*completion
= arg
->rados_completion
;
12517 int ret
= bs
.index_ctx
.aio_operate(bs
.bucket_obj
, arg
->rados_completion
, &o
);
12518 completion
->release(); /* can't reference arg here, as it might have already been released */
12522 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
12523 int64_t pool
, uint64_t epoch
,
12524 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
12525 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
12527 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
12530 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
12531 int64_t pool
, uint64_t epoch
,
12533 real_time
& removed_mtime
,
12534 list
<rgw_obj_index_key
> *remove_objs
,
12535 uint16_t bilog_flags
,
12536 rgw_zone_set
*zones_trace
)
12538 rgw_bucket_dir_entry ent
;
12539 ent
.meta
.mtime
= removed_mtime
;
12540 obj
.key
.get_index_key(&ent
.key
);
12541 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
, ent
, RGW_OBJ_CATEGORY_NONE
, remove_objs
, bilog_flags
, zones_trace
);
12544 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
12546 rgw_bucket_dir_entry ent
;
12547 obj
.key
.get_index_key(&ent
.key
);
12548 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
, -1 /* pool id */, 0, ent
, RGW_OBJ_CATEGORY_NONE
, NULL
, bilog_flags
, zones_trace
);
12551 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
12553 librados::IoCtx index_ctx
;
12554 map
<int, string
> bucket_objs
;
12555 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
12559 return CLSRGWIssueSetTagTimeout(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
12562 int RGWRados::cls_bucket_list(RGWBucketInfo
& bucket_info
, int shard_id
, rgw_obj_index_key
& start
, const string
& prefix
,
12563 uint32_t num_entries
, bool list_versions
, map
<string
, rgw_bucket_dir_entry
>& m
,
12564 bool *is_truncated
, rgw_obj_index_key
*last_entry
,
12565 bool (*force_check_filter
)(const string
& name
))
12567 ldout(cct
, 10) << "cls_bucket_list " << bucket_info
.bucket
<< " start " << start
.name
<< "[" << start
.instance
<< "] num_entries " << num_entries
<< dendl
;
12569 librados::IoCtx index_ctx
;
12570 // key - oid (for different shards if there is any)
12571 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12572 map
<int, string
> oids
;
12573 map
<int, struct rgw_cls_list_ret
> list_results
;
12574 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
12578 cls_rgw_obj_key
start_key(start
.name
, start
.instance
);
12579 r
= CLSRGWIssueBucketList(index_ctx
, start_key
, prefix
, num_entries
, list_versions
,
12580 oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12584 // Create a list of iterators that are used to iterate each shard
12585 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vcurrents(list_results
.size());
12586 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vends(list_results
.size());
12587 vector
<string
> vnames(list_results
.size());
12588 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
12589 *is_truncated
= false;
12590 for (; iter
!= list_results
.end(); ++iter
) {
12591 vcurrents
.push_back(iter
->second
.dir
.m
.begin());
12592 vends
.push_back(iter
->second
.dir
.m
.end());
12593 vnames
.push_back(oids
[iter
->first
]);
12594 *is_truncated
= (*is_truncated
|| iter
->second
.is_truncated
);
12597 // Create a map to track the next candidate entry from each shard, if the entry
12598 // from a specified shard is selected/erased, the next entry from that shard will
12599 // be inserted for next round selection
12600 map
<string
, size_t> candidates
;
12601 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
12602 if (vcurrents
[i
] != vends
[i
]) {
12603 candidates
[vcurrents
[i
]->first
] = i
;
12607 map
<string
, bufferlist
> updates
;
12608 uint32_t count
= 0;
12609 while (count
< num_entries
&& !candidates
.empty()) {
12611 // Select the next one
12612 int pos
= candidates
.begin()->second
;
12613 const string
& name
= vcurrents
[pos
]->first
;
12614 struct rgw_bucket_dir_entry
& dirent
= vcurrents
[pos
]->second
;
12616 bool force_check
= force_check_filter
&& force_check_filter(dirent
.key
.name
);
12617 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) || !dirent
.pending_map
.empty() || force_check
) {
12618 /* there are uncommitted ops. We need to check the current state,
12619 * and if the tags are old we need to do cleanup as well. */
12620 librados::IoCtx sub_ctx
;
12621 sub_ctx
.dup(index_ctx
);
12622 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[vnames
[pos
]]);
12623 if (r
< 0 && r
!= -ENOENT
) {
12628 ldout(cct
, 10) << "RGWRados::cls_bucket_list: got " << dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
12629 m
[name
] = std::move(dirent
);
12633 // Refresh the candidates map
12634 candidates
.erase(candidates
.begin());
12636 if (vcurrents
[pos
] != vends
[pos
]) {
12637 candidates
[vcurrents
[pos
]->first
] = pos
;
12641 // Suggest updates if there is any
12642 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
12643 for (; miter
!= updates
.end(); ++miter
) {
12644 if (miter
->second
.length()) {
12645 ObjectWriteOperation o
;
12646 cls_rgw_suggest_changes(o
, miter
->second
);
12647 // we don't care if we lose suggested updates, send them off blindly
12648 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
12649 index_ctx
.aio_operate(miter
->first
, c
, &o
);
12654 // Check if all the returned entries are consumed or not
12655 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
12656 if (vcurrents
[i
] != vends
[i
])
12657 *is_truncated
= true;
12660 *last_entry
= m
.rbegin()->first
;
12665 int RGWRados::cls_obj_usage_log_add(const string
& oid
, rgw_usage_log_info
& info
)
12667 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12671 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12676 ObjectWriteOperation op
;
12677 cls_rgw_usage_log_add(op
, info
);
12679 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
12683 int RGWRados::cls_obj_usage_log_read(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
12684 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
, bool *is_truncated
)
12686 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12690 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12695 *is_truncated
= false;
12697 r
= cls_rgw_usage_log_read(ref
.ioctx
, ref
.oid
, user
, start_epoch
, end_epoch
,
12698 max_entries
, read_iter
, usage
, is_truncated
);
12703 int RGWRados::cls_obj_usage_log_trim(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
12705 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12709 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12714 ObjectWriteOperation op
;
12715 cls_rgw_usage_log_trim(op
, user
, start_epoch
, end_epoch
);
12717 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
12721 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
12723 librados::IoCtx index_ctx
;
12726 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
12728 int r
= open_bucket_index(bucket_info
, index_ctx
, dir_oid
);
12732 bufferlist updates
;
12734 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
12735 rgw_bucket_dir_entry entry
;
12737 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
12738 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12739 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
12740 ::encode(entry
, updates
);
12745 r
= index_ctx
.exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
12750 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
12751 const RGWBucketInfo
& bucket_info
,
12752 rgw_bucket_dir_entry
& list_state
,
12753 rgw_bucket_dir_entry
& object
,
12754 bufferlist
& suggested_updates
)
12756 const rgw_bucket
& bucket
= bucket_info
.bucket
;
12757 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
12761 rgw_obj
obj(bucket
, list_state
.key
);
12764 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
12766 if (loc
!= list_state
.locator
) {
12767 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
12770 io_ctx
.locator_set_key(list_state
.locator
);
12772 RGWObjState
*astate
= NULL
;
12773 RGWObjectCtx
rctx(this);
12774 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
12778 list_state
.pending_map
.clear(); // we don't need this and it inflates size
12779 if (!astate
->exists
) {
12780 /* object doesn't exist right now -- hopefully because it's
12781 * marked as !exists and got deleted */
12782 if (list_state
.exists
) {
12783 /* FIXME: what should happen now? Work out if there are any
12784 * non-bad ways this could happen (there probably are, but annoying
12787 // encode a suggested removal of that key
12788 list_state
.ver
.epoch
= io_ctx
.get_last_version();
12789 list_state
.ver
.pool
= io_ctx
.get_id();
12790 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
12795 string content_type
;
12798 object
.meta
.size
= astate
->size
;
12799 object
.meta
.accounted_size
= astate
->accounted_size
;
12800 object
.meta
.mtime
= astate
->mtime
;
12802 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
12803 if (iter
!= astate
->attrset
.end()) {
12804 etag
= iter
->second
.c_str();
12806 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
12807 if (iter
!= astate
->attrset
.end()) {
12808 content_type
= iter
->second
.c_str();
12810 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
12811 if (iter
!= astate
->attrset
.end()) {
12812 r
= decode_policy(iter
->second
, &owner
);
12814 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
12818 if (astate
->has_manifest
) {
12819 RGWObjManifest::obj_iterator miter
;
12820 RGWObjManifest
& manifest
= astate
->manifest
;
12821 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
12822 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
12824 rgw_raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
12826 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
12827 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
12828 r
= delete_obj_index(loc
);
12830 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
12836 object
.meta
.etag
= etag
;
12837 object
.meta
.content_type
= content_type
;
12838 object
.meta
.owner
= owner
.get_id().to_str();
12839 object
.meta
.owner_display_name
= owner
.get_display_name();
12841 // encode suggested updates
12842 list_state
.ver
.pool
= io_ctx
.get_id();
12843 list_state
.ver
.epoch
= astate
->epoch
;
12844 list_state
.meta
.size
= object
.meta
.size
;
12845 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
12846 list_state
.meta
.mtime
= object
.meta
.mtime
;
12847 list_state
.meta
.category
= main_category
;
12848 list_state
.meta
.etag
= etag
;
12849 list_state
.meta
.content_type
= content_type
;
12850 if (astate
->obj_tag
.length() > 0)
12851 list_state
.tag
= astate
->obj_tag
.c_str();
12852 list_state
.meta
.owner
= owner
.get_id().to_str();
12853 list_state
.meta
.owner_display_name
= owner
.get_display_name();
12855 list_state
.exists
= true;
12856 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
12860 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<string
, struct rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
12862 librados::IoCtx index_ctx
;
12863 map
<int, string
> oids
;
12864 map
<int, struct rgw_cls_list_ret
> list_results
;
12865 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, list_results
, shard_id
, bucket_instance_ids
);
12869 r
= CLSRGWIssueGetDirHeader(index_ctx
, oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12873 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
12874 for(; iter
!= list_results
.end(); ++iter
) {
12875 headers
[oids
[iter
->first
]] = iter
->second
.dir
.header
;
12880 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
12882 librados::IoCtx index_ctx
;
12883 map
<int, string
> bucket_objs
;
12884 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12888 map
<int, string
>::iterator iter
= bucket_objs
.begin();
12889 for (; iter
!= bucket_objs
.end(); ++iter
) {
12890 r
= cls_rgw_get_dir_header_async(index_ctx
, iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
12901 int RGWRados::cls_user_get_header(const string
& user_id
, cls_user_header
*header
)
12903 string buckets_obj_id
;
12904 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
12905 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
12909 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12914 librados::ObjectReadOperation op
;
12916 ::cls_user_get_header(op
, header
, &rc
);
12918 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
12927 int RGWRados::cls_user_get_header_async(const string
& user_id
, RGWGetUserHeader_CB
*ctx
)
12929 string buckets_obj_id
;
12930 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
12931 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
12935 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12940 r
= ::cls_user_get_header_async(ref
.ioctx
, ref
.oid
, ctx
);
12947 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj
& user_obj
, const RGWBucketInfo
& bucket_info
)
12949 map
<string
, struct rgw_bucket_dir_header
> headers
;
12950 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
12952 ldout(cct
, 20) << "cls_bucket_header() returned " << r
<< dendl
;
12956 cls_user_bucket_entry entry
;
12958 bucket_info
.bucket
.convert(&entry
.bucket
);
12960 map
<string
, struct rgw_bucket_dir_header
>::iterator hiter
= headers
.begin();
12961 for (; hiter
!= headers
.end(); ++hiter
) {
12962 map
<uint8_t, struct rgw_bucket_category_stats
>::iterator iter
= hiter
->second
.stats
.begin();
12963 for (; iter
!= hiter
->second
.stats
.end(); ++iter
) {
12964 struct rgw_bucket_category_stats
& header_stats
= iter
->second
;
12965 entry
.size
+= header_stats
.total_size
;
12966 entry
.size_rounded
+= header_stats
.total_size_rounded
;
12967 entry
.count
+= header_stats
.num_entries
;
12971 list
<cls_user_bucket_entry
> entries
;
12972 entries
.push_back(entry
);
12974 r
= cls_user_update_buckets(user_obj
, entries
, false);
12976 ldout(cct
, 20) << "cls_user_update_buckets() returned " << r
<< dendl
;
12983 int RGWRados::cls_user_list_buckets(rgw_raw_obj
& obj
,
12984 const string
& in_marker
,
12985 const string
& end_marker
,
12986 const int max_entries
,
12987 list
<cls_user_bucket_entry
>& entries
,
12988 string
* const out_marker
,
12989 bool * const truncated
)
12993 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12998 librados::ObjectReadOperation op
;
13001 cls_user_bucket_list(op
, in_marker
, end_marker
, max_entries
, entries
, out_marker
, truncated
, &rc
);
13003 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
13012 int RGWRados::cls_user_update_buckets(rgw_raw_obj
& obj
, list
<cls_user_bucket_entry
>& entries
, bool add
)
13016 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
13021 librados::ObjectWriteOperation op
;
13022 cls_user_set_buckets(op
, entries
, add
);
13023 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13030 int RGWRados::complete_sync_user_stats(const rgw_user
& user_id
)
13032 string buckets_obj_id
;
13033 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
13034 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
13035 return cls_user_complete_stats_sync(obj
);
13038 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj
& obj
)
13042 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
13047 librados::ObjectWriteOperation op
;
13048 ::cls_user_complete_stats_sync(op
);
13049 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13056 int RGWRados::cls_user_add_bucket(rgw_raw_obj
& obj
, const cls_user_bucket_entry
& entry
)
13058 list
<cls_user_bucket_entry
> l
;
13059 l
.push_back(entry
);
13061 return cls_user_update_buckets(obj
, l
, true);
13064 int RGWRados::cls_user_remove_bucket(rgw_raw_obj
& obj
, const cls_user_bucket
& bucket
)
13068 int r
= get_system_obj_ref(obj
, &ref
, &p
);
13073 librados::ObjectWriteOperation op
;
13074 ::cls_user_remove_bucket(op
, bucket
);
13075 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13082 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
, rgw_bucket
& bucket
,
13083 RGWQuotaInfo
& bucket_quota
)
13085 if (!cct
->_conf
->rgw_dynamic_resharding
) {
13089 bool need_resharding
= false;
13090 int num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
13091 uint32_t suggested_num_shards
;
13093 int ret
= quota_handler
->check_bucket_shards((uint64_t)cct
->_conf
->rgw_max_objs_per_shard
,
13094 num_source_shards
, bucket_info
.owner
, bucket
, bucket_quota
,
13095 1, need_resharding
, &suggested_num_shards
);
13100 if (need_resharding
) {
13101 return add_bucket_to_reshard(bucket_info
, suggested_num_shards
);
13107 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo
& bucket_info
, uint32_t new_num_shards
)
13109 RGWReshard
reshard(this);
13111 uint32_t num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
13113 new_num_shards
= min(new_num_shards
, get_max_bucket_shards());
13114 if (new_num_shards
<= num_source_shards
) {
13115 ldout(cct
, 20) << "not resharding bucket name=" << bucket_info
.bucket
.name
<< ", orig_num=" << num_source_shards
<< ", new_num_shards=" << new_num_shards
<< dendl
;
13119 cls_rgw_reshard_entry entry
;
13120 entry
.time
= real_clock::now();
13121 entry
.tenant
= bucket_info
.owner
.tenant
;
13122 entry
.bucket_name
= bucket_info
.bucket
.name
;
13123 entry
.bucket_id
= bucket_info
.bucket
.bucket_id
;
13124 entry
.old_num_shards
= num_source_shards
;
13125 entry
.new_num_shards
= new_num_shards
;
13127 return reshard
.add(entry
);
13130 int RGWRados::check_quota(const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
13131 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
, uint64_t obj_size
)
13133 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
);
13136 void RGWRados::get_bucket_index_objects(const string
& bucket_oid_base
,
13137 uint32_t num_shards
, map
<int, string
>& bucket_objects
, int shard_id
)
13140 bucket_objects
[0] = bucket_oid_base
;
13142 char buf
[bucket_oid_base
.size() + 32];
13143 if (shard_id
< 0) {
13144 for (uint32_t i
= 0; i
< num_shards
; ++i
) {
13145 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), i
);
13146 bucket_objects
[i
] = buf
;
13149 if ((uint32_t)shard_id
> num_shards
) {
13152 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
13153 bucket_objects
[shard_id
] = buf
;
13158 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<int, string
> *result
)
13160 const rgw_bucket
& bucket
= bucket_info
.bucket
;
13161 string plain_id
= bucket
.name
+ ":" + bucket
.bucket_id
;
13162 if (!bucket_info
.num_shards
) {
13163 (*result
)[0] = plain_id
;
13166 if (shard_id
< 0) {
13167 for (uint32_t i
= 0; i
< bucket_info
.num_shards
; ++i
) {
13168 snprintf(buf
, sizeof(buf
), ":%d", i
);
13169 (*result
)[i
] = plain_id
+ buf
;
13172 if ((uint32_t)shard_id
> bucket_info
.num_shards
) {
13175 snprintf(buf
, sizeof(buf
), ":%d", shard_id
);
13176 (*result
)[shard_id
] = plain_id
+ buf
;
13181 int RGWRados::get_target_shard_id(const RGWBucketInfo
& bucket_info
, const string
& obj_key
,
13185 switch (bucket_info
.bucket_index_shard_hash_type
) {
13186 case RGWBucketInfo::MOD
:
13187 if (!bucket_info
.num_shards
) {
13192 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
13193 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
13194 sid
= rgw_shards_mod(sid2
, bucket_info
.num_shards
);
13196 *shard_id
= (int)sid
;
13206 void RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, uint32_t num_shards
,
13207 int shard_id
, string
*bucket_obj
)
13210 // By default with no sharding, we use the bucket oid as itself
13211 (*bucket_obj
) = bucket_oid_base
;
13213 char buf
[bucket_oid_base
.size() + 32];
13214 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
13215 (*bucket_obj
) = buf
;
13219 int RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, const string
& obj_key
,
13220 uint32_t num_shards
, RGWBucketInfo::BIShardsHashType hash_type
, string
*bucket_obj
, int *shard_id
)
13223 switch (hash_type
) {
13224 case RGWBucketInfo::MOD
:
13226 // By default with no sharding, we use the bucket oid as itself
13227 (*bucket_obj
) = bucket_oid_base
;
13232 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
13233 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
13234 sid
= rgw_shards_mod(sid2
, num_shards
);
13235 char buf
[bucket_oid_base
.size() + 32];
13236 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), sid
);
13237 (*bucket_obj
) = buf
;
13239 *shard_id
= (int)sid
;
13249 void RGWStateLog::oid_str(int shard
, string
& oid
) {
13250 oid
= RGW_STATELOG_OBJ_PREFIX
+ module_name
+ ".";
13252 snprintf(buf
, sizeof(buf
), "%d", shard
);
13256 int RGWStateLog::get_shard_num(const string
& object
) {
13257 uint32_t val
= ceph_str_hash_linux(object
.c_str(), object
.length());
13258 return val
% num_shards
;
13261 string
RGWStateLog::get_oid(const string
& object
) {
13262 int shard
= get_shard_num(object
);
13264 oid_str(shard
, oid
);
13268 int RGWStateLog::open_ioctx(librados::IoCtx
& ioctx
) {
13270 store
->get_log_pool(pool
);
13271 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ioctx
);
13273 lderr(store
->ctx()) << "ERROR: could not open rados pool" << dendl
;
13279 int RGWStateLog::store_entry(const string
& client_id
, const string
& op_id
, const string
& object
,
13280 uint32_t state
, bufferlist
*bl
, uint32_t *check_state
)
13282 if (client_id
.empty() ||
13285 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
13288 librados::IoCtx ioctx
;
13289 int r
= open_ioctx(ioctx
);
13293 string oid
= get_oid(object
);
13295 librados::ObjectWriteOperation op
;
13297 cls_statelog_check_state(op
, client_id
, op_id
, object
, *check_state
);
13299 utime_t ts
= ceph_clock_now();
13301 cls_statelog_add(op
, client_id
, op_id
, object
, ts
, state
, (bl
? *bl
: nobl
));
13302 r
= ioctx
.operate(oid
, &op
);
13310 int RGWStateLog::remove_entry(const string
& client_id
, const string
& op_id
, const string
& object
)
13312 if (client_id
.empty() ||
13315 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
13318 librados::IoCtx ioctx
;
13319 int r
= open_ioctx(ioctx
);
13323 string oid
= get_oid(object
);
13325 librados::ObjectWriteOperation op
;
13326 cls_statelog_remove_by_object(op
, object
, op_id
);
13327 r
= ioctx
.operate(oid
, &op
);
13335 void RGWStateLog::init_list_entries(const string
& client_id
, const string
& op_id
, const string
& object
,
13338 list_state
*state
= new list_state
;
13339 state
->client_id
= client_id
;
13340 state
->op_id
= op_id
;
13341 state
->object
= object
;
13342 if (object
.empty()) {
13343 state
->cur_shard
= 0;
13344 state
->max_shard
= num_shards
- 1;
13346 state
->cur_shard
= state
->max_shard
= get_shard_num(object
);
13348 *handle
= (void *)state
;
13351 int RGWStateLog::list_entries(void *handle
, int max_entries
,
13352 list
<cls_statelog_entry
>& entries
,
13355 list_state
*state
= static_cast<list_state
*>(handle
);
13357 librados::IoCtx ioctx
;
13358 int r
= open_ioctx(ioctx
);
13364 for (; state
->cur_shard
<= state
->max_shard
&& max_entries
> 0; ++state
->cur_shard
) {
13366 oid_str(state
->cur_shard
, oid
);
13368 librados::ObjectReadOperation op
;
13369 list
<cls_statelog_entry
> ents
;
13371 cls_statelog_list(op
, state
->client_id
, state
->op_id
, state
->object
, state
->marker
,
13372 max_entries
, ents
, &state
->marker
, &truncated
);
13374 r
= ioctx
.operate(oid
, &op
, &ibl
);
13375 if (r
== -ENOENT
) {
13380 ldout(store
->ctx(), 0) << "cls_statelog_list returned " << r
<< dendl
;
13385 state
->marker
.clear();
13388 max_entries
-= ents
.size();
13390 entries
.splice(entries
.end(), ents
);
13396 *done
= (state
->cur_shard
> state
->max_shard
);
13401 void RGWStateLog::finish_list_entries(void *handle
)
13403 list_state
*state
= static_cast<list_state
*>(handle
);
13407 void RGWStateLog::dump_entry(const cls_statelog_entry
& entry
, Formatter
*f
)
13409 f
->open_object_section("statelog_entry");
13410 f
->dump_string("client_id", entry
.client_id
);
13411 f
->dump_string("op_id", entry
.op_id
);
13412 f
->dump_string("object", entry
.object
);
13413 entry
.timestamp
.gmtime_nsec(f
->dump_stream("timestamp"));
13414 if (!dump_entry_internal(entry
, f
)) {
13415 f
->dump_int("state", entry
.state
);
13417 f
->close_section();
13420 RGWOpState::RGWOpState(RGWRados
*_store
) : RGWStateLog(_store
, _store
->ctx()->_conf
->rgw_num_zone_opstate_shards
, string("obj_opstate"))
13424 bool RGWOpState::dump_entry_internal(const cls_statelog_entry
& entry
, Formatter
*f
)
13427 switch ((OpState
)entry
.state
) {
13428 case OPSTATE_UNKNOWN
:
13431 case OPSTATE_IN_PROGRESS
:
13434 case OPSTATE_COMPLETE
:
13437 case OPSTATE_ERROR
:
13440 case OPSTATE_ABORT
:
13443 case OPSTATE_CANCELLED
:
13449 f
->dump_string("state", s
);
13453 int RGWOpState::state_from_str(const string
& s
, OpState
*state
)
13455 if (s
== "unknown") {
13456 *state
= OPSTATE_UNKNOWN
;
13457 } else if (s
== "in-progress") {
13458 *state
= OPSTATE_IN_PROGRESS
;
13459 } else if (s
== "complete") {
13460 *state
= OPSTATE_COMPLETE
;
13461 } else if (s
== "error") {
13462 *state
= OPSTATE_ERROR
;
13463 } else if (s
== "abort") {
13464 *state
= OPSTATE_ABORT
;
13465 } else if (s
== "cancelled") {
13466 *state
= OPSTATE_CANCELLED
;
13474 int RGWOpState::set_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
13476 uint32_t s
= (uint32_t)state
;
13477 return store_entry(client_id
, op_id
, object
, s
, NULL
, NULL
);
13480 int RGWOpState::renew_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
13482 uint32_t s
= (uint32_t)state
;
13483 return store_entry(client_id
, op_id
, object
, s
, NULL
, &s
);
13486 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados
*store
, const string
& cid
, const string
& oid
,
13487 const string
& obj
) : os(store
), client_id(cid
), op_id(oid
), object(obj
)
13489 cct
= store
->ctx();
13490 cur_state
= RGWOpState::OPSTATE_UNKNOWN
;
13493 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state
) {
13494 last_update
= real_clock::now();
13496 return os
.set_state(client_id
, op_id
, object
, state
);
13499 int RGWOpStateSingleOp::renew_state() {
13500 real_time now
= real_clock::now();
13502 int rate_limit_sec
= cct
->_conf
->rgw_opstate_ratelimit_sec
;
13504 if (rate_limit_sec
&& now
- last_update
< make_timespan(rate_limit_sec
)) {
13509 return os
.renew_state(client_id
, op_id
, object
, cur_state
);
13513 uint64_t RGWRados::instance_id()
13515 return get_rados_handle()->get_instance_id();
13518 uint64_t RGWRados::next_bucket_id()
13520 Mutex::Locker
l(bucket_id_lock
);
13521 return ++max_bucket_id
;
13524 RGWRados
*RGWStoreManager::init_storage_provider(CephContext
*cct
, bool use_gc_thread
, bool use_lc_thread
, bool quota_threads
, bool run_sync_thread
, bool run_reshard_thread
)
13526 int use_cache
= cct
->_conf
->rgw_cache_enabled
;
13527 RGWRados
*store
= NULL
;
13529 store
= new RGWRados
;
13531 store
= new RGWCache
<RGWRados
>;
13534 if (store
->initialize(cct
, use_gc_thread
, use_lc_thread
, quota_threads
, run_sync_thread
, run_reshard_thread
) < 0) {
13542 RGWRados
*RGWStoreManager::init_raw_storage_provider(CephContext
*cct
)
13544 RGWRados
*store
= NULL
;
13545 store
= new RGWRados
;
13547 store
->set_context(cct
);
13549 if (store
->init_rados() < 0) {
13557 void RGWStoreManager::close_storage(RGWRados
*store
)
13567 librados::Rados
* RGWRados::get_rados_handle()
13569 if (rados
.size() == 1) {
13572 handle_lock
.get_read();
13573 pthread_t id
= pthread_self();
13574 std::map
<pthread_t
, int>:: iterator it
= rados_map
.find(id
);
13576 if (it
!= rados_map
.end()) {
13577 handle_lock
.put_read();
13578 return &rados
[it
->second
];
13580 handle_lock
.put_read();
13581 handle_lock
.get_write();
13582 const uint32_t handle
= next_rados_handle
;
13583 rados_map
[id
] = handle
;
13584 if (++next_rados_handle
== rados
.size()) {
13585 next_rados_handle
= 0;
13587 handle_lock
.put_write();
13588 return &rados
[handle
];
13593 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
13596 int ret
= get_raw_obj_ref(obj
, &ref
);
13598 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
13602 ObjectWriteOperation op
;
13603 list
<string
> prefixes
;
13604 cls_rgw_remove_obj(op
, prefixes
);
13606 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13607 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
13609 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
13614 handles
.push_back(c
);
13619 int RGWRados::delete_obj_aio(const rgw_obj
& obj
,
13620 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
13621 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
)
13624 int ret
= get_obj_head_ref(bucket_info
, obj
, &ref
);
13626 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
13630 if (keep_index_consistent
) {
13631 RGWRados::Bucket
bop(this, bucket_info
);
13632 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
13634 ret
= index_op
.prepare(CLS_RGW_OP_DEL
, &astate
->write_tag
);
13636 lderr(cct
) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
13641 ObjectWriteOperation op
;
13642 list
<string
> prefixes
;
13643 cls_rgw_remove_obj(op
, prefixes
);
13645 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13646 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
13648 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
13653 handles
.push_back(c
);
13655 if (keep_index_consistent
) {
13656 ret
= delete_obj_index(obj
);
13658 lderr(cct
) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;
13665 int rgw_compression_info_from_attrset(map
<string
, bufferlist
>& attrs
, bool& need_decompress
, RGWCompressionInfo
& cs_info
) {
13666 map
<string
, bufferlist
>::iterator value
= attrs
.find(RGW_ATTR_COMPRESSION
);
13667 if (value
!= attrs
.end()) {
13668 bufferlist::iterator bliter
= value
->second
.begin();
13670 ::decode(cs_info
, bliter
);
13671 } catch (buffer::error
& err
) {
13674 if (cs_info
.blocks
.size() == 0) {
13677 if (cs_info
.compression_type
!= "none")
13678 need_decompress
= true;
13680 need_decompress
= false;
13683 need_decompress
= false;